In [None]:
# Install required packages
!pip install --upgrade pip
!pip install PyPDF2 pandas nltk

In [None]:
# ---- IMPORT MODULES ----
import re
import pandas as pd
from collections import Counter
from PyPDF2 import PdfReader
import nltk
from nltk.util import ngrams

nltk.download('stopwords')
from nltk.corpus import stopwords as nltk_stopwords

In [None]:
# ---- CONFIG ----
# Path of the PDF whose text is extracted and analysed.
pdf_path = 'input4.pdf'  # Replace with your PDF path or upload in Colab
# Destination CSV for the keyword / multi-word phrase counts.
output_csv_keywords = 'purview_keywords.csv'
# Destination CSV for the regex pattern candidates.
output_csv_regex = 'purview_regex.csv'
# Tokens shorter than this many characters are dropped during tokenization.
min_word_length = 3
# An n-gram must occur at least this many times to be kept as a phrase.
min_phrase_frequency = 2

# NLTK's English stopword list, held as a set for membership tests.
stopwords_set = set(nltk_stopwords.words('english'))

In [None]:
# ---- EXTRACT PDF TEXT ----
# Gather the text of each page (followed by a single space) and join once.
reader = PdfReader(pdf_path)
page_chunks = []
for pdf_page in reader.pages:
    extracted = pdf_page.extract_text()
    if not extracted:
        # Pages that yield no text contribute nothing.
        continue
    page_chunks.append(extracted + ' ')
full_text = ''.join(page_chunks)

In [None]:
# ---- CLEAN AND TOKENIZE ----
text_lower = full_text.lower()
# Every character that is neither a word character nor whitespace becomes a space.
text_clean = re.sub(r'[^\w\s]', ' ', text_lower)

# Keep whitespace-separated words that are not stopwords, are long enough,
# and are not made up purely of digits.
tokens = []
for word in text_clean.split():
    if word in stopwords_set:
        continue
    if len(word) < min_word_length:
        continue
    if word.isdigit():
        continue
    tokens.append(word)

In [None]:
# ---- GENERATE MULTI-WORD PHRASES ----
# Count every 2-gram and 3-gram of the token list, keyed by the space-joined words.
ngram_counts = Counter(
    ' '.join(gram)
    for size in (2, 3)
    for gram in ngrams(tokens, size)
)

# Keep only the phrases seen at least `min_phrase_frequency` times.
common_phrases = []
for phrase, seen in ngram_counts.items():
    if seen >= min_phrase_frequency:
        common_phrases.append(phrase)

In [None]:
# ---- DETECT REGEX CANDIDATES ----
# Each detector is a (description, pattern) pair. The pattern that is searched
# for is also the pattern that is reported, so every output row pairs a regex
# with a sample that the regex actually matched.
#
# Patterns use single backslashes inside raw strings: r'\.' is a literal dot,
# whereas r'\\.' demands a literal backslash followed by any character and
# therefore cannot match ordinary text such as "batch.azure.com".
regex_detectors = [
    # Date-like patterns
    ('Date',
     r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b'),
    ('Regex_Ranker_CSCAN_AZURE0070_combined_ranker_2',
     r'(?i)Key|Credential'),
    ('Regex_Ranker_CSCAN_AZURE0050',
     r'(?i)iotHub'),
    ('Regex_Ranker_CSCAN_AZURE0070_combined_ranker',
     r'(?i)EndpointSuffix=([a-z0-9\._]{10,50})[;"\']'),
    ('Regex_Ranker_CSCAN_AZURE0070_2',
     r'(?i)Endpoint=(https?://[a-z0-9_]{3,50}\.(table|blob|queue|file)\.[a-z0-9\.]{10,50})/?;'),
    ('Regex_Ranker_CSCAN_AZURE0130_AzureBatch',
     r'(?i)batch\.azure\.com'),
    ('Regex_Ranker_CSCAN_AZURE0070_combined_ranker_AccountName',
     r'(?i)AccountName=([a-z0-9_]+);'),
    ('Regex_Ranker_CSCAN_AZURE0080_AccountEndpoint_38496415',
     r'(?i)AccountEndpoint=(https?://[a-z0-9_\.]+\.documents\.azure\.com(:\d+)?)/?[;"\']'),
    ('Regex_Ranker_CSCAN_AZURE0070_combined_ranker_3',
     r'(?i)Account|Storage|Access|Primary[^v]|Secondary[^v]|Blob'),
    # NOTE(review): without re.MULTILINE the leading '^' only matches at the
    # very start of the text -- confirm that this is the intended anchor.
    ('Regex_Ranker_CSCAN_AZURE0070_combined_ranker_2',
     r'(?i)^\Wcore\.windows\.net'),
]
# NOTE(review): r'(?i)\Wrefresh.?token' previously appeared only as a reported
# pattern and was never searched for; no description for it is available here.
# Add it to `regex_detectors` once its intended name is known.

regex_candidates = []
for description, pattern in regex_detectors:
    # finditer + group(0) records the whole matched text. re.findall would
    # return only the capture group(s) -- a tuple when there are several --
    # which is not a usable sample value.
    for match in re.finditer(pattern, full_text):
        regex_candidates.append((pattern, description, match.group(0)))


# Deduplicate
# Only the first (description, sample) seen for each distinct pattern is kept.
regex_dict = {}
for regex, desc, sample in regex_candidates:
    regex_dict.setdefault(regex, (desc, sample))

# One output row per distinct pattern, in first-seen order.
regex_rows = [(pattern, info[0], info[1]) for pattern, info in regex_dict.items()]
df_regex = pd.DataFrame(regex_rows, columns=['RegexPattern','Description','SampleValue'])
df_regex.to_csv(output_csv_regex, index=False)
print('Regex patterns saved to:', output_csv_regex)
df_regex.head(10)

In [None]:
# ---- DETECT KEYWORDS INCLUDING MULTI-WORD PHRASES ----
candidate_keywords = tokens + common_phrases

# Single words are counted by occurrence in the token list. `common_phrases`
# holds each phrase exactly once, so counting that list would give every
# phrase a count of 1; the real phrase frequency comes from `ngram_counts`.
# Phrases always contain a space and tokens never do, so the keys cannot clash.
word_counts = Counter(tokens)
for phrase in common_phrases:
    word_counts[phrase] = ngram_counts[phrase]

# Remove generic and sensitive terms
generic_words = {'document','page','statement','date','amount','details','number','total','payment','invoice','name','address','phone','email'}

def is_sensitive(phrase):
    """Return True when `phrase` looks like a personal name or a street address.

    Two heuristics are applied, and either one is enough:
    * two adjacent capitalised words (e.g. a first name and a surname);
    * a standalone run of one to five digits followed by whitespace and a word.

    NOTE(review): this notebook passes lower-cased keywords, so the
    capitalised-name heuristic cannot fire on them -- confirm whether the
    check should run on the original-case text instead.
    """
    heuristics = (
        r'\b[A-Z][a-z]+ [A-Z][a-z]+\b',  # capitalized names
        r'\b\d{1,5}\s+\w+',              # addresses
    )
    return any(re.search(pattern, phrase) for pattern in heuristics)

# Drop generic words and anything flagged by is_sensitive; keep the counts.
filtered_keywords = {
    kw: count
    for kw, count in word_counts.items()
    if not (kw.lower() in generic_words or is_sensitive(kw))
}

# Most frequent keywords first.
df_keywords = pd.DataFrame(
    filtered_keywords.items(), columns=['Keyword','Count']
).sort_values(by='Count', ascending=False)
df_keywords.to_csv(output_csv_keywords, index=False)
print('Keyword list (including multi-word phrases) saved to:', output_csv_keywords)
df_keywords.head(20)