In [None]:
# Load key=value pairs from a local .env file into the process environment
# (python-dotenv's IPython extension); later cells read them via os.environ.
%load_ext dotenv
%dotenv

In [None]:
import os, time, uuid
import numpy as np
from InstructorEmbedding import INSTRUCTOR
from PyPDF2 import PdfReader
import re
import unicodedata
from pathlib import Path

## Text processing

In [None]:
def read_pdf_text(pdf_path: str) -> str:
    """Return the text of every page of the PDF at ``pdf_path`` joined by newlines.

    Pages for which ``extract_text()`` returns a falsy value contribute an
    empty string, so the number of joined segments equals the page count.
    """
    page_texts = [page.extract_text() or "" for page in PdfReader(pdf_path).pages]
    return "\n".join(page_texts)


def normalize_unicode(s: str) -> str:
    """Apply NFKC normalization, then map typographic quotes/dashes to ASCII.

    Curly double quotes become ``"``, curly single quotes become ``'`` and
    en/em dashes become ``-``.
    """
    ascii_punctuation = str.maketrans({
        "“": '"',
        "”": '"',
        "‘": "'",
        "’": "'",
        "–": "-",
        "—": "-",
    })
    return unicodedata.normalize("NFKC", s).translate(ascii_punctuation)


def dehyphenate_linebreaks(s: str) -> str:
    """Rejoin words that were hyphen-split across a line break.

    A word character, a hyphen, optional whitespace containing a newline and
    another word character are reduced to the two word characters, e.g.
    "classi-\\nfication" -> "classification".
    """
    split_word = re.compile(r"(\w)-\s*\n\s*(\w)")
    return split_word.sub(lambda m: m.group(1) + m.group(2), s)


def collapse_linebreaks(s: str) -> str:
    """Turn hard line breaks inside paragraphs into spaces, keeping paragraph breaks.

    Steps:
      1. Normalize Windows/Mac line endings to "\\n".
      2. Collapse every run of blank lines (two or more newlines, optionally
         padded with spaces/tabs) to exactly one blank line ("\\n\\n").
      3. Replace each remaining lone newline (plus surrounding spaces/tabs)
         with a single space.

    Paragraph breaks are canonicalized *before* lone newlines are replaced and
    the lone-newline pattern refuses to match a newline that directly follows
    another newline; otherwise the second newline of a "\\n\\n" break would be
    turned into a space and the paragraph boundary lost.

    A newline at the very end of the string is left in place.
    """
    # 1) normalize line endings
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    # 2) any run of 2+ newlines (with optional horizontal padding) -> exactly two
    s = re.sub(r"[ \t]*\n(?:[ \t]*\n)+[ \t]*", "\n\n", s)
    # 3) a newline that is neither preceded nor followed by a newline -> space
    s = re.sub(r"(?<!\n)[ \t]*\n[ \t]*(?=[^\n])", " ", s)
    return s


def remove_page_furniture(s: str) -> str:
    """Blank out lines that look like headers, footers or page numbers.

    Each pattern is anchored with ``^``/``$`` and applied case-insensitively in
    multiline mode; matching text is replaced by the empty string (the line's
    newline is not part of the match).
    """
    furniture_patterns = (
        r"^\s*page\s+\d+(\s+of\s+\d+)?\s*$",                         # Page X (of Y)
        r"^\s*this project is co[- ]funded.*$",                      # common boilerplate
        r"^\s*centre for humanitarian data.*$",                      # example header
        r"^\s*european union.*$",                                    # example footer
        r"^\s*acknowledg(e)?ments.*$",                               # section heading noise (optional)
    )
    # One alternation of non-capturing groups, so a single pass handles all patterns.
    alternation = "|".join("(?:" + pattern + ")" for pattern in furniture_patterns)
    return re.sub(alternation, "", s, flags=re.I | re.M)


def fix_ocr_letter_spacing(s: str) -> str:
    """Repair letter-spaced OCR words such as "S O M A L I A" -> "Somalia".

    Heuristic: a run of three or more single-letter tokens separated by one
    space/tab each is joined into one word. An all-uppercase result is
    title-cased, anything else is lower-cased.

    Afterwards, runs of spaces/tabs between two word characters are reduced
    to a single space.

    Deliberately NOT done:
      * Gluing an isolated single letter to the following word. Without a
        dictionary this cannot be told apart from legitimate one-letter
        tokens, and it corrupts ordinary text ("a test" -> "atest",
        "don't know" -> "don'tknow").
      * Touching newlines. Only horizontal whitespace is considered, so
        paragraph breaks between two words (e.g. after a heading) survive.
    """
    def _collapse(match):
        # Drop the separators (spaces or tabs) between the single letters.
        word = re.sub(r"[ \t]+", "", match.group(0))
        # Capitalize if it looked like a title word (all uppercase)
        return word.title() if word.isupper() else word.lower()

    # 3+ single-letter tokens separated by exactly one space/tab each
    s = re.sub(r"\b(?:[A-Za-z][ \t]){2,}[A-Za-z]\b", _collapse, s)

    # Normalize multiple spaces/tabs between word characters to one space
    s = re.sub(r"(?<=\w)[ \t]+(?=\w)", " ", s)
    return s


def remove_inline_refs_urls_emails(s: str) -> str:
    """Delete numeric citation markers, raw URLs and e-mail addresses.

    The substitutions run in the listed order; each match is replaced by the
    empty string. Citation markers also take their leading whitespace with
    them, URLs and e-mails do not.
    """
    removals = (
        (r"\s*\[\d+\]", 0),              # bracketed numeric refs like [12]
        (r"\s*\(\d+\)", 0),              # parenthesised numeric refs like (12)
        (r"\bhttps?://\S+\b", re.I),     # http/https URLs
        (r"\bwww\.\S+\b", re.I),         # bare www. URLs
        (r"\b\S+@\S+\.\S+\b", 0),        # e-mail addresses
    )
    for pattern, flags in removals:
        s = re.sub(pattern, "", s, flags=flags)
    return s


def tidy_spaces_punctuation(s: str) -> str:
    """Normalize spacing around punctuation and strip unwanted characters.

    Steps, in order:
      1. Replace every character outside tab, LF, CR and printable ASCII
         (0x20-0x7E) with a space. NOTE(review): this drops *all* non-ASCII
         text (accented letters, bullets, ...), not only control characters.
      2. Collapse runs of spaces/tabs to one space.
      3. Remove spaces/tabs directly before ``, . ; : ? ! %``. Newlines are
         left alone so a paragraph that starts with punctuation is not merged
         into the previous one.
      4. Insert one space after such punctuation when a non-space character
         follows, except inside numbers (digit, then ``.``/``,``/``:``, then
         digit), so "3.14", "10,000" and "12:30" stay intact.
      5. Drop spaces/tabs at the end of lines, then strip the whole string.
    """
    # 1) keep only tab/LF/CR and printable ASCII
    s = re.sub(r"[^\x09\x0A\x0D\x20-\x7E]", " ", s)
    # 2) collapse multiple spaces
    s = re.sub(r"[ \t]{2,}", " ", s)
    # 3) remove horizontal whitespace before punctuation
    s = re.sub(r"[ \t]+([,.;:?!%])", r"\1", s)
    # 4) single space after punctuation, but never split digit<sep>digit
    s = re.sub(r"([,.;:?!%])(?!(?<=\d[.,:])\d)([^\s])", r"\1 \2", s)
    # 5) trim trailing spaces on each line
    s = re.sub(r"[ \t]+(\n)", r"\1", s)
    return s.strip()


def dedupe_paragraphs(s: str) -> str:
    """Keep only the first occurrence of each paragraph.

    Paragraphs are the pieces between "\\n\\n" separators, compared after
    stripping surrounding whitespace; empty pieces are discarded. The
    survivors are re-joined with "\\n\\n" in their original order.
    """
    # dict keys keep insertion order, so they double as an ordered "seen" set
    first_seen = {}
    for piece in s.split("\n\n"):
        paragraph = piece.strip()
        if paragraph:
            first_seen.setdefault(paragraph, None)
    return "\n\n".join(first_seen)



def clean_text(raw: str) -> str:
    """Run the full cleaning pipeline on raw extracted PDF text.

    Order matters:
      1. ``normalize_unicode``      - canonical characters first.
      2. ``remove_page_furniture``  - its patterns are anchored to whole
         lines, so it has to see the text while the original line structure
         still exists. Running it after ``collapse_linebreaks`` means page
         numbers in the middle of a page no longer match, and a header at the
         start of a page can take the entire merged page with it via ``.*$``.
      3. ``dehyphenate_linebreaks`` - rejoin words split at line ends.
      4. ``collapse_linebreaks``    - merge lines into paragraphs.
      5. ``fix_ocr_letter_spacing``, ``remove_inline_refs_urls_emails``,
         ``tidy_spaces_punctuation``, ``dedupe_paragraphs`` - paragraph-level
         clean-up.
    """
    text = normalize_unicode(raw)
    text = remove_page_furniture(text)   # must precede line collapsing (line-anchored patterns)
    text = dehyphenate_linebreaks(text)
    text = collapse_linebreaks(text)
    text = fix_ocr_letter_spacing(text)
    text = remove_inline_refs_urls_emails(text)
    text = tidy_spaces_punctuation(text)
    text = dedupe_paragraphs(text)
    return text


def process_text_by_pages(folder_path, max_files=2):
    """Extract and clean the text of each page of the PDFs in ``folder_path``.

    Args:
        folder_path: Directory to scan (non-recursive). Only entries whose
            name ends with ".pdf" (case-insensitive) are processed.
        max_files: Maximum number of PDFs to process. Defaults to 2, which is
            the limit this function previously hard-coded. Pass ``None`` to
            process every PDF in the folder.

    Returns:
        dict mapping each processed filename to a list of
        ``{"page_number": int, "text": str}`` dicts, one per page, with page
        numbers starting at 1 and ``text`` produced by ``clean_text``.

    File names are processed in sorted order so that the selection made by
    ``max_files`` (and the order of the returned pages) is the same on every
    run; ``os.listdir`` alone returns entries in arbitrary order.
    """
    pdf_names = sorted(
        name for name in os.listdir(folder_path) if name.lower().endswith(".pdf")
    )
    if max_files is not None:
        pdf_names = pdf_names[:max(max_files, 0)]

    all_pdf_chunks = {}  # {filename: [{"page_number": n, "text": ...}, ...]}
    for filename in pdf_names:
        reader = PdfReader(os.path.join(folder_path, filename))
        page_chunks = [
            {
                "page_number": page_number,
                # extract_text() may return a falsy value; treat as empty
                "text": clean_text(page.extract_text() or ""),
            }
            for page_number, page in enumerate(reader.pages, start=1)
        ]
        all_pdf_chunks[filename] = page_chunks
        print(f"Processed {filename} into {len(page_chunks)} page chunks.")

    return all_pdf_chunks

## Embedding Text

In [None]:
def text_embed(texts):
    """Embed ``texts`` with the ``hkunlp/instructor-xl`` INSTRUCTOR model.

    Args:
        texts: Non-empty iterable of strings to embed.

    Returns:
        ``(dim, embs)`` where ``embs`` is a float32 array with one row per
        input text (requested with ``normalize_embeddings=True``, i.e.
        cosine-friendly) and ``dim`` is ``embs.shape[1]``.

    Raises:
        ValueError: if ``texts`` is empty (there would be no embedding width
            to report).

    The model is loaded once and cached on the function object, so repeated
    calls do not reload the weights.
    """
    texts = list(texts)
    if not texts:
        raise ValueError("text_embed() needs at least one text to embed")

    # Load lazily, keep for later calls.
    model = getattr(text_embed, "_model", None)
    if model is None:
        model = INSTRUCTOR("hkunlp/instructor-xl")
        text_embed._model = model

    instruction = "Represent the sentence for semantic search"
    pairs = [[instruction, t] for t in texts]
    embs = model.encode(pairs, normalize_embeddings=True)  # cosine-friendly
    embs = np.asarray(embs, dtype="float32")
    dim = embs.shape[1]

    return dim, embs

In [None]:
# Folder containing the source PDFs, configured through PDF_FOLDER_PATH.
# Fail early with a clear message instead of passing the string "None" on.
folder_path = os.environ.get("PDF_FOLDER_PATH")
if not folder_path:
    raise RuntimeError("PDF_FOLDER_PATH is not set; define it in .env or the environment")

pdf_page_chunks = process_text_by_pages(folder_path)

# One {"text": ...} entry per non-empty page, in processing order. The order
# matters: row i of the embedding matrix belongs to texts[i].
texts = []
for filename, pages in pdf_page_chunks.items():
    print(f"\n📄 {filename} - {len(pages)} pages")
    for page in pages:
        if page["text"]:  # pages without extractable text have nothing to embed
            texts.append({"text": page["text"]})

# text_embed() expects plain strings, not the wrapping dicts.
embedded_text = text_embed([entry["text"] for entry in texts])
print(f"Embedded {len(texts)} pages; embedding dimension = {embedded_text[0]}")

In [None]:
# (id, values, metadata) tuples for the upsert; row `position` of the
# embedding matrix is paired with texts[position].
vectors = [
    (
        f"id-{position}",                       # unique ID
        embedding.tolist(),                     # plain list of floats
        {"text": texts[position]["text"]},      # metadata: the page text
    )
    for position, embedding in enumerate(embedded_text[1])
]

In [None]:
from pinecone import Pinecone, ServerlessSpec

In [None]:
# Pinecone client; credentials come from environment variables.
pc = Pinecone(
    api_key=os.environ.get("PINECONE_API_KEY"),
    environment=os.environ.get("PINECONE_ENV"),
)

In [None]:
# Display the result of pc.list_indexes() as this cell's output.
pc.list_indexes()

In [None]:
index_name = "pdf-test"
# The index dimension must equal the width of the vectors that get upserted.
# Take it from the embeddings computed above (text_embed returns (dim, embs))
# instead of hard-coding a number that can drift from the model's output size.
dimension = int(embedded_text[0])
metric = "cosine"  # embeddings are requested normalized, which suits cosine similarity

In [None]:
# Start from a clean slate: drop the index if it already exists.
if any(existing.name == index_name for existing in pc.list_indexes()):
    pc.delete_index(index_name)
    print(f"{index_name} succesfully deleted.")
else:
    print(f"{index_name} not in index list.")

In [None]:
# Create the serverless index using the name, dimension and metric set above.
pc.create_index(
    name=index_name,
    dimension=dimension,
    metric=metric,
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

In [None]:
# Handle for the index named by index_name; the upsert cell below writes through it.
index = pc.Index(index_name)

In [None]:
# Send the vectors in small batches instead of one request: each vector
# carries its full page text as metadata, so a single request for every page
# can exceed Pinecone's per-request size limit.
UPSERT_BATCH_SIZE = 50
for batch_start in range(0, len(vectors), UPSERT_BATCH_SIZE):
    index.upsert(vectors=vectors[batch_start:batch_start + UPSERT_BATCH_SIZE])
print("✅ Inserted embeddings into Pinecone")