In [None]:
# Install required packages
!pip install --upgrade pip
!pip install PyPDF2 pandas nltk

In [None]:
# ---- IMPORT MODULES ----
import re
import pandas as pd
from collections import Counter
from PyPDF2 import PdfReader
import nltk
from nltk.util import ngrams

nltk.download('stopwords')
from nltk.corpus import stopwords as nltk_stopwords

In [None]:
# ---- CONFIG ----
# Input document and the two CSV files this notebook writes.
pdf_path = 'input.pdf'  # Replace with your PDF path or upload in Colab
output_csv_keywords = 'purview_keywords.csv'
output_csv_regex = 'purview_regex.csv'
# Tokens shorter than this many characters are discarded during tokenization.
min_word_length = 3
# An n-gram must occur at least this many times to be kept as a phrase.
min_phrase_frequency = 2

# NLTK's English stopword list as a set; tokens found in it are dropped.
stopwords_set = set(nltk_stopwords.words('english'))

In [None]:
# ---- EXTRACT PDF TEXT ----
# Collect the text of every page that yields any, each followed by a single
# space so words on adjacent pages do not run together.
reader = PdfReader(pdf_path)
extracted_pages = (pdf_page.extract_text() for pdf_page in reader.pages)
full_text = ''.join(chunk + ' ' for chunk in extracted_pages if chunk)

In [None]:
# ---- CLEAN AND TOKENIZE ----
# Lower-case the text and blank out everything that is neither a word
# character nor whitespace, so a plain split() yields the raw words.
text_lower = full_text.lower()
text_clean = re.sub(r'[^\w\s]', ' ', text_lower)

# Keep a word only if it is long enough, is not purely digits and is not a
# stopword.
tokens = []
for raw_word in text_clean.split():
    if len(raw_word) < min_word_length or raw_word.isdigit():
        continue
    if raw_word in stopwords_set:
        continue
    tokens.append(raw_word)

In [None]:
# ---- GENERATE MULTI-WORD PHRASES ----
# Tally every 2-gram and 3-gram of the token stream as a space-joined string.
ngram_counts = Counter(
    ' '.join(gram)
    for size in (2, 3)
    for gram in ngrams(tokens, size)
)

# Keep only the phrases that reach the configured minimum frequency.
common_phrases = [
    phrase for phrase, occurrences in ngram_counts.items()
    if occurrences >= min_phrase_frequency
]

In [None]:
# ---- DETECT REGEX CANDIDATES ----
# Each candidate is a (pattern, description, sample) tuple.
regex_candidates = []

# Numeric sequences (IDs, references): a digit followed by at least four more
# digits, whitespace characters or hyphens.
numeric_sequences = re.findall(r'\b\d[\d\s\-]{4,}\b', full_text)
for trimmed in map(str.strip, numeric_sequences):
    # Generalize the sample: every digit becomes \d, every whitespace run \s+.
    digits_generalized = re.sub(r'\d', r'\\d', trimmed)
    generalized = re.sub(r'\s+', r'\\s+', digits_generalized)
    regex_candidates.append((generalized, 'Numeric sequence (ID / reference)', trimmed))

# Alphanumeric patterns: two or more capitals followed by at least two more
# capitals, digits or hyphens. In the generated pattern every capital becomes
# the class [A-Z] and every digit becomes \d; hyphens are kept as they are.
alphanumeric_sequences = re.findall(r'\b[A-Z]{2,}[A-Z0-9\-]{2,}\b', full_text)
regex_candidates.extend(
    (
        re.sub(r'[0-9]', r'\\d', re.sub(r'[A-Z]', '[A-Z]', code)),
        'Alphanumeric code (reference / ID)',
        code,
    )
    for code in alphanumeric_sequences
)

# Date-like patterns: every match shares the same fixed regex, so only the
# sample value differs between candidates.
date_pattern = r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b'
date_sequences = re.findall(date_pattern, full_text)
regex_candidates += [(date_pattern, 'Date', found) for found in date_sequences]

# Deduplicate: the first description/sample seen for a pattern wins.
regex_dict = {}
for pattern, label, example in regex_candidates:
    regex_dict.setdefault(pattern, (label, example))

# One row per unique pattern, written to CSV and previewed below.
regex_rows = [(pattern, *details) for pattern, details in regex_dict.items()]
df_regex = pd.DataFrame(regex_rows, columns=['RegexPattern','Description','SampleValue'])
df_regex.to_csv(output_csv_regex, index=False)
print('Regex patterns saved to:', output_csv_regex)
df_regex.head(10)

In [None]:
# ---- DETECT KEYWORDS INCLUDING MULTI-WORD PHRASES ----
candidate_keywords = tokens + common_phrases

# Single words are counted by occurrence in the token stream. Phrases take
# their frequency from ngram_counts instead: common_phrases holds each phrase
# exactly once, so counting the combined list would report every phrase with
# a count of 1 regardless of how often it occurs.
word_counts = Counter(tokens)
word_counts.update({phrase: ngram_counts[phrase] for phrase in common_phrases})

# Remove generic and sensitive terms
generic_words = {'document','page','statement','date','amount','details','number','total','payment','invoice','name','address','phone','email'}

def is_sensitive(phrase):
    """Return True when ``phrase`` looks like a personal name or an address.

    Two heuristics are tried, and a match on either one is enough:

    * two adjacent capitalized words (an upper-case letter followed by
      lower-case letters), e.g. ``"John Smith"``;
    * a standalone run of one to five digits followed by whitespace and a
      word, e.g. ``"123 main"``.

    NOTE(review): the name heuristic needs upper-case letters, while the
    keyword candidates built in this notebook come from lower-cased text, so
    it cannot fire for them. Confirm whether original-case text should be
    checked instead.
    """
    sensitive_patterns = (
        r'\b[A-Z][a-z]+ [A-Z][a-z]+\b',  # capitalized names
        r'\b\d{1,5}\s+\w+',              # addresses
    )
    return any(re.search(pattern, phrase) for pattern in sensitive_patterns)

# Keep every candidate that is neither a generic word nor flagged as sensitive.
filtered_keywords = {
    term: frequency
    for term, frequency in word_counts.items()
    if not (term.lower() in generic_words or is_sensitive(term))
}

# Most frequent keywords first; written to CSV and previewed below.
df_keywords = pd.DataFrame(
    filtered_keywords.items(), columns=['Keyword','Count']
).sort_values(by='Count', ascending=False)
df_keywords.to_csv(output_csv_keywords, index=False)
print('Keyword list (including multi-word phrases) saved to:', output_csv_keywords)
df_keywords.head(20)