In [None]:
# Install required packages (works in Google Colab, Carnets, or iSH)
!pip install --upgrade pip
!pip install PyPDF2 pandas nltk

In [None]:
# ---- IMPORT MODULES ----
import re
import pandas as pd
from collections import Counter
from PyPDF2 import PdfReader
import nltk
from nltk.util import ngrams

# Fetch the NLTK "stopwords" corpus; the config cell below builds its English
# stopword set from it via `nltk_stopwords`.
# NOTE(review): this call runs on every execution of the cell. Presumably
# nltk.download skips packages that are already up to date — confirm, and
# check the behaviour when the kernel has no network access.
nltk.download('stopwords')
from nltk.corpus import stopwords as nltk_stopwords

In [None]:
# ---- CONFIG ----
# Input document: replace with your PDF path, or upload the file in Colab.
pdf_path = "input.pdf"

# Destinations for the two CSV files this notebook writes.
output_csv_keywords = "purview_keywords.csv"
output_csv_regex = "purview_regex.csv"

# Filtering thresholds.
min_word_length = 3        # words with fewer characters are discarded
min_phrase_frequency = 2   # multi-word phrases seen fewer times are discarded

# ---- STOPWORDS ----
# English stopwords as a set, for fast membership tests during filtering.
stopwords_set = {*nltk_stopwords.words('english')}

In [None]:
# ---- EXTRACT PDF TEXT ----
reader = PdfReader(pdf_path)

# Pages whose extraction result is falsy (None or empty string) are skipped.
# Every kept page is followed by one space so that the last word of a page
# does not fuse with the first word of the next one.
extracted_pages = (page.extract_text() for page in reader.pages)
full_text = "".join(page_text + " " for page_text in extracted_pages if page_text)

In [None]:
# ---- CLEAN AND TOKENIZE ----
text_lower = full_text.lower()

# Remove punctuation. `\w` also matches "_", so underscores are listed
# explicitly: otherwise runs such as "____" (form blanks) survive as tokens,
# end up counted as keywords, and become indistinguishable from the "_" used
# later to join multi-word phrases.
text_clean = re.sub(r"[^\w\s]|_", " ", text_lower)

# Keep tokens that are long enough, not purely numeric, and not stopwords.
tokens = [
    t
    for t in text_clean.split()
    if len(t) >= min_word_length and not t.isdigit() and t not in stopwords_set
]

In [None]:
# ---- GENERATE N-GRAMS (multi-word phrases) ----
# ngram_counts : "_"-joined phrase -> number of occurrences in `tokens`
# ngram_words  : "_"-joined phrase -> tuple of the words it was built from
#                (kept so the phrase can be located in the text again without
#                 having to split on "_", which a word might itself contain)
ngram_counts = Counter()
ngram_words = {}
for n in range(2, 4):  # 2-grams and 3-grams
    for ng in ngrams(tokens, n):
        ngram_str = "_".join(ng)
        ngram_counts[ngram_str] += 1
        ngram_words.setdefault(ngram_str, ng)

# Keep only frequent n-grams (in first-seen order)
common_phrases = [k for k, v in ngram_counts.items() if v >= min_phrase_frequency]

# Replace multi-word phrases in the text so each is treated as a single token.
# This is done in ONE regex pass instead of repeated str.replace calls because:
#   * `\b` anchors stop a phrase from matching inside longer words
#     ("art work" must not rewrite "smart workshop");
#   * `\s+` between words matches newlines / repeated spaces, which the
#     whitespace-split tokens above already treated as adjacency;
#   * listing 3-word phrases before 2-word ones lets the longest phrase win
#     at any position rather than depending on replacement order.
# Phrases whose words are only adjacent in `tokens` because a filtered-out
# word sat between them do not occur in the text and are simply left unmerged.
processed_text = text_clean
if common_phrases:
    longest_first = sorted(common_phrases, key=lambda p: len(ngram_words[p]), reverse=True)
    phrase_pattern = re.compile(
        r"\b(?:"
        + "|".join(r"\s+".join(map(re.escape, ngram_words[p])) for p in longest_first)
        + r")\b"
    )
    processed_text = phrase_pattern.sub(
        lambda m: "_".join(m.group(0).split()), text_clean
    )

In [None]:
# ---- GENERATE KEYWORDS ----
# Candidate keywords are runs of ASCII letters/underscores, so phrases that
# were merged with "_" are picked up as single candidates.
words = re.findall(r"\b[a-zA-Z_]+\b", processed_text)
words_filtered = list(
    filter(
        lambda w: len(w) >= min_word_length
        and not w.isdigit()
        and w not in stopwords_set,
        words,
    )
)

word_counts = Counter(words_filtered)

# Optionally remove overly common English words (can be customized)
common_english = {"document", "page", "statement", "date"}
for word in common_english:
    word_counts.pop(word, None)  # no-op when the word was never counted

# One row per keyword, most frequent first, written out as CSV.
df_keywords = pd.DataFrame(word_counts.items(), columns=["Keyword", "Count"])
df_keywords = df_keywords.sort_values(by="Count", ascending=False)
df_keywords.to_csv(output_csv_keywords, index=False)

In [None]:
# ---- GENERATE NUMERIC REGEX PATTERNS ----
regex_patterns = set()

# Find numeric sequences in the document.
# The scan runs on the raw extracted text (`full_text`): in the cleaned text
# every punctuation character has already been replaced by a space, so hyphen
# separators could never be seen and the generated patterns would contain
# `\s+` where the real document has "-" or "/", i.e. they would not match it.
# A sequence must start AND end with a digit (minimum 3 characters overall),
# which keeps trailing separators such as the "-" in "2020-present" out of the
# generated pattern.
numeric_sequences = re.findall(r"\b\d[\d\s\-]+\d\b", full_text)
for seq in numeric_sequences:
    regex_seq = re.sub(r"\d", r"\\d", seq)           # each digit  -> \d
    regex_seq = re.sub(r"\s+", r"\\s+", regex_seq)   # whitespace  -> \s+
    regex_patterns.add(regex_seq)

# Add generic regex patterns
generic_patterns = [
    r"\b\d{1,2}/\d{1,2}/\d{2,4}\b",   # Dates
    r"\b\d{4}\s\d{4}\s\d{4}\b",       # 12-digit sequences like card numbers
    r"\b[A-Z]{2,}\d{4,}\b"            # IDs like TX12345
]
regex_patterns.update(generic_patterns)

# Sort before writing: set iteration order for strings varies between runs,
# which would otherwise reshuffle the CSV on every execution.
df_regex = pd.DataFrame({"RegexPattern": sorted(regex_patterns)})
df_regex.to_csv(output_csv_regex, index=False)

In [None]:
# ---- OUTPUT SUMMARY ----
print("Keyword list saved to:", output_csv_keywords)
print("Regex patterns saved to:", output_csv_regex)

# Rank phrases by how often they occurred so the first 20 really are the
# "top" ones; sorted() is stable, so ties keep their existing relative order.
top_phrases = sorted(common_phrases, key=lambda p: ngram_counts[p], reverse=True)[:20]
print("Top detected multi-word phrases:", top_phrases)

# Print the table on its own lines: placing it after the label on the same
# line shifts the header row out of alignment with the data rows.
print("Top keywords:")
print(df_keywords.head(20).to_string(index=False))