In [None]:
# Install required packages (works in Google Colab, Carnets, or iSH)
!pip install --upgrade pip
!pip install PyPDF2 pandas nltk

In [None]:
# ---- IMPORT MODULES ----
import re
import pandas as pd
from collections import Counter
from PyPDF2 import PdfReader
import nltk
from nltk.util import ngrams

# Download NLTK stopwords if not already available.
# The corpus reader is imported after the download call and aliased to
# nltk_stopwords; the config cell calls nltk_stopwords.words('english') on it.
nltk.download('stopwords')
from nltk.corpus import stopwords as nltk_stopwords

In [None]:
# ---- CONFIG ----
# Input document: replace with your PDF path or upload in Colab.
pdf_path = "input.pdf"

# Destination files for the two exports written by later cells.
output_csv_keywords = "purview_keywords.csv"
output_csv_regex = "purview_regex.csv"

# Filtering thresholds.
min_word_length = 3       # shortest token (in characters) kept as a useful word
min_phrase_frequency = 2  # fewest occurrences for a multi-word phrase to be kept

# ---- STOPWORDS ----
# English stop words from NLTK, held in a set for membership tests.
english_stopwords = nltk_stopwords.words('english')
stopwords_set = set(english_stopwords)

In [None]:
# ---- EXTRACT PDF TEXT ----
# Read every page of the PDF and concatenate the extracted text.
# Pages for which extract_text() returns nothing (None or "") are skipped;
# each kept page is followed by a single space.
reader = PdfReader(pdf_path)
page_chunks = []
for page in reader.pages:
    page_text = page.extract_text()
    if not page_text:
        continue
    page_chunks.append(page_text + " ")
full_text = "".join(page_chunks)

In [None]:
# ---- CLEAN AND TOKENIZE ----
# Lower-case the text, blank out every character that is neither a word
# character nor whitespace, then split on whitespace.
text_lower = full_text.lower()
text_clean = re.sub(r"[^\w\s]", " ", text_lower)  # Remove punctuation

# Keep a word only if it is not a stop word, is at least min_word_length
# characters long, and is not made up solely of digits.
tokens = []
for word in text_clean.split():
    if word in stopwords_set or len(word) < min_word_length or word.isdigit():
        continue
    tokens.append(word)

In [None]:
# ---- GENERATE N-GRAMS (multi-word phrases) ----
# Count every 2-gram and 3-gram over the filtered token stream; each n-gram is
# stored as its words joined by single spaces.
ngram_counts = Counter()
for n in (2, 3):
    ngram_counts.update(" ".join(ng) for ng in ngrams(tokens, n))

# Keep only the phrases seen at least min_phrase_frequency times
# (in first-seen order).
common_phrases = [
    phrase
    for phrase, freq in ngram_counts.items()
    if freq >= min_phrase_frequency
]

In [None]:
# ---- DETECT REGEX CANDIDATES ----
# Turn concrete values found in the PDF text into generalized regex "shapes":
# every digit becomes \d, every uppercase letter becomes [A-Z], and runs of
# whitespace become \s+. Other characters (e.g. hyphens) are kept literally.
# Shapes are collected in a set so that identical ones are stored once.
#
# The replacements are passed to re.sub as callables. A replacement *string*
# is parsed as a template in which a backslash followed by an ASCII letter
# (such as \d) is an unknown escape and raises re.error on Python 3.7+;
# a callable's return value is inserted verbatim.
regex_patterns = set()

# Numeric sequences (IDs, account numbers, references)
numeric_sequences = re.findall(r"\b\d[\d\s\-]{4,}\b", full_text)
for seq in numeric_sequences:
    seq_clean = seq.strip()
    regex_seq = re.sub(r"\d", lambda _m: r"\d", seq_clean)
    regex_seq = re.sub(r"\s+", lambda _m: r"\s+", regex_seq)
    regex_patterns.add(regex_seq)

# Alphanumeric patterns (codes like INV12345, REF-2023-ABC)
alphanumeric_sequences = re.findall(r"\b[A-Z]{2,}[A-Z0-9\-]{2,}\b", full_text)
for seq in alphanumeric_sequences:
    regex_seq = re.sub(r"[A-Z]", lambda _m: "[A-Z]", seq)
    regex_seq = re.sub(r"[0-9]", lambda _m: r"\d", regex_seq)
    regex_patterns.add(regex_seq)

# Date-like patterns
regex_patterns.add(r"\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b")

In [None]:
# ---- DETECT KEYWORDS ----
# Build one frequency table covering single words and frequent phrases.
# Single words are counted directly from the token list. Phrases take their
# count from ngram_counts: common_phrases lists each phrase only once, so
# counting that list would report every phrase with a count of 1.
candidate_keywords = tokens + common_phrases
word_counts = Counter(tokens)
word_counts.update({phrase: ngram_counts[phrase] for phrase in common_phrases})

# Remove overly generic terms
generic_words = {
    "document", "page", "statement", "date", "amount",
    "details", "number", "total", "payment", "invoice"
}
for gw in generic_words:
    word_counts.pop(gw, None)

# One row per keyword, most frequent first, written to the keywords CSV.
df_keywords = pd.DataFrame(
    list(word_counts.items()), columns=["Keyword", "Count"]
).sort_values(by="Count", ascending=False)
df_keywords.to_csv(output_csv_keywords, index=False)

In [None]:
# ---- SAVE REGEX ----
# regex_patterns is a set, whose iteration order for strings can differ from
# one run to the next; sort it so the CSV and the printed sample are stable.
sorted_regex_patterns = sorted(regex_patterns)
df_regex = pd.DataFrame({"RegexPattern": sorted_regex_patterns})
df_regex.to_csv(output_csv_regex, index=False)

# ---- OUTPUT SUMMARY ----
# Rank the phrases by their n-gram count so the "top" list really shows the
# most frequent ones (ties stay in first-seen order).
top_phrases = sorted(
    common_phrases, key=lambda phrase: ngram_counts[phrase], reverse=True
)[:20]

print("Keyword list saved to:", output_csv_keywords)
print("Regex patterns saved to:", output_csv_regex)
print("\nTop detected multi-word phrases:")
print(top_phrases)
print("\nTop keywords:")
print(df_keywords.head(20))
print("\nSample regex patterns:")
print(sorted_regex_patterns[:10])