In [None]:
# Install required packages
!pip install --upgrade pip
!pip install PyPDF2 pandas nltk

In [None]:
# ---- IMPORT MODULES ----
import re
import pandas as pd
from collections import Counter
from PyPDF2 import PdfReader
import requests
import nltk
from nltk.util import ngrams

# ---- Load stopwords from GitHub ----
# One stopword per line is expected in the downloaded file.
STOPWORDS_URL = "https://raw.githubusercontent.com/tp12121212/purview_sit_analyzer/refs/heads/main/english.txt"

# A timeout stops the cell from hanging indefinitely when the host is unreachable.
response = requests.get(STOPWORDS_URL, timeout=30)
if response.status_code == 200:
    # Strip padding and skip blank lines so '' or ' the' never end up in the set.
    stopwords = {line.strip() for line in response.text.splitlines() if line.strip()}
else:
    raise Exception(f"Failed to fetch stopwords from GitHub. Status code: {response.status_code}")

print(f"Loaded {len(stopwords)} stopwords")
# Show a small sorted sample rather than the whole set.
print('Sample stopwords:', sorted(stopwords)[:10])


In [None]:
# ---- CONFIG ----
# Path of the PDF that the extraction cell opens with PdfReader.
pdf_path = 'input.pdf'  # Replace with your PDF path or upload in Colab
# Destination CSV files written by the keyword and regex cells respectively.
output_csv_keywords = 'purview_keywords.csv'
output_csv_regex = 'purview_regex.csv'
# Tokens shorter than this many characters are dropped during tokenisation.
min_word_length = 3
# An n-gram must occur at least this many times to be kept as a common phrase.
min_phrase_frequency = 2

# Alias of the stopword set loaded earlier; the tokeniser filters against this name.
stopwords_set = stopwords

In [None]:
# ---- EXTRACT PDF TEXT ----
# Concatenate the text of every page that yields any, each followed by a single
# space, into full_text (the input of the tokenising and regex cells).
reader = PdfReader(pdf_path)
page_chunks = []
for page in reader.pages:
    page_text = page.extract_text()
    # extract_text() may return nothing for a page; such pages are skipped.
    if page_text:
        page_chunks.append(page_text + ' ')
# Join once instead of growing a string inside the loop.
full_text = ''.join(page_chunks)
# Report sizes only: the document may hold sensitive data and can be very long.
print(f'Extracted {len(full_text)} characters from {len(reader.pages)} pages')

In [None]:
# ---- CLEAN AND TOKENIZE ----
# Lower-case the text, then replace every character that is neither a word
# character nor whitespace with a space.
text_lower = full_text.lower()
text_clean = re.sub(r'[^\w\s]', ' ', text_lower)
# Keep tokens that are not stopwords, are long enough and are not purely digits.
tokens = [t for t in text_clean.split() if t not in stopwords_set and len(t) >= min_word_length and not t.isdigit()]
# Summarise instead of echoing the whole document twice into the cell output.
print(f'text_clean: {len(text_clean)} characters')
print(f'tokens: {len(tokens)} kept ({len(set(tokens))} distinct)')

In [None]:
# ---- GENERATE MULTI-WORD PHRASES ----
# Tally every bigram and trigram of the filtered token stream, keyed by the
# space-joined phrase (all 2-grams first, then all 3-grams).
ngram_counts = Counter(
    ' '.join(gram)
    for size in (2, 3)
    for gram in ngrams(tokens, size)
)

# Phrases seen at least min_phrase_frequency times, in first-seen order.
common_phrases = [
    phrase
    for phrase, seen in ngram_counts.items()
    if seen >= min_phrase_frequency
]

In [None]:
import re
import pandas as pd

# ---- DETECT REGEX CANDIDATES ----
# Collects one (pattern, description, matched_text, line_number) tuple per match
# found by the find_with_lines calls that follow.
regex_candidates = []

def find_with_lines(pattern, text, desc):
    """Record every match of ``pattern`` in ``text`` together with its line number.

    Each match is appended to the module-level ``regex_candidates`` list as
    ``(pattern, desc, matched_text, line_number)``.

    Args:
        pattern: Regular expression source, passed to ``re.finditer`` with no
            extra flags (inline flags inside the pattern still apply).
        text: The text to scan.
        desc: Label stored alongside every match of this pattern.

    Returns:
        None. Results are accumulated in ``regex_candidates``.

    Raises:
        re.error: If ``pattern`` is not a valid regular expression.
    """
    line_number = 1      # 1-based line of the most recent match start
    counted_upto = 0     # offset up to which newlines are already counted
    for match in re.finditer(pattern, text):
        start = match.start()
        # Matches arrive left to right, so only the newlines between the previous
        # match start and this one need counting (no rescan from offset 0).
        line_number += text.count('\n', counted_upto, start)
        counted_upto = start
        regex_candidates.append((pattern, desc, match.group(0), line_number))

# Driver-licence, date and bank-statement patterns. Every call scans full_text and
# appends its matches to regex_candidates under the description given as third argument.

# NSW_Drivers_Licence_number
# The (?-i:...) group switches case-insensitivity off locally; this pattern never
# switches it on, so the group has no visible effect here.
find_with_lines(r'\b(?:[0-9]{8}|(?-i:[0-9]{4}[A-Z]{2}))\b', full_text, 'NSW_Drivers_Licence_number')

# NSW_Drivers_Licence_Card_number
find_with_lines(r'\b[0-9][ ][0-9]{3}[ ][0-9]{3}[ ][0-9]{3}\b', full_text, 'NSW_Drivers_Licence_Card_number')

# Date-like patterns
find_with_lines(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b', full_text, 'Date')

# Regex_Ranker_CSCAN_AZURE0090_b08f98f2_d980_4598_814b_67c29887ebff_33624151_CEP2_0
find_with_lines(r'userPWD=', full_text, 'Regex_Ranker_CSCAN_AZURE0090_b08f98f2_d980_4598_814b_67c29887ebff_33624151_CEP2_0')

# regex_nab_bank_statements
find_with_lines(r'\b08[2-4](?:[- ]?\d{3})\b', full_text, 'regex_nab_bank_statements')

# regex_commbank_bsb
# NOTE(review): find_with_lines calls re.finditer without re.MULTILINE, so the ^ and $
# alternatives only apply at the very start/end of full_text, not at every line.
# The matched text includes the delimiter characters on either side.
find_with_lines(r'(?:^|[\s,;:\(\)\[\]\""\' ])(?:(?:(?:06|76)[0-9]{4})|(?:06|76) [0-9]{4}|(?:06[0-9]|76[-0-9])( |-)[0-9]{3})(?:$|[\s,;:\(\)\[\]\""\' ]|\.\s|\.$)', full_text, 'regex_commbank_bsb')

# Stgeorge_bank_statement_supporting_regex
find_with_lines(r'\b(112|113|114|115|116|117|118|119|330|332|333|334|335|336)[ -]?\d{3}\b', full_text, 'Stgeorge_bank_statement_supporting_regex')

# Patterns labelled Regex_Ranker_CSCAN_AZURE* (plus one disabled scanner pattern).
# Most start with (?i) and are therefore case-insensitive.

# Regex_Scanner_JsonWebToken_JwtToken_51810644_CEP2_0
# NOTE(review): disabled; the pattern places (?i)/(?-i) in the middle of the
# expression, which is presumably why it was commented out - confirm.
#find_with_lines(
#    r'(eyJ(?i)[a-z0-9\-_%]+\.(?-i)eyJ(?i)[a-z0-9\-_%]+\.[a-z0-9\-_%]+)|([rR]efresh_?[tT]oken|REFRESH_?TOKEN)["\']?\s{0,4}[:=]{1,2}\s{0,4}["\']?((\w+-)+\w+["\']?)',#
#    full_text,
#    'Regex_Scanner_JsonWebToken_JwtToken_51810644_CEP2_0'
#)

# Regex_canada_passport_number_Copy
# (?ix): case-insensitive and verbose mode, so lower-case letters match as well.
find_with_lines(r'(?ix)\b([A-Z]{2}\d{6})\b', full_text, 'Regex_canada_passport_number_Copy')

# Regex_Ranker_CSCAN_AZURE0030_9beb734f_ba2b_452b_b422_589f5ac467ef_43332040_CEP2_0
find_with_lines(r'(?i)Shared(Access(Policy)?Key|SecretValue)\s?=', full_text, 'Regex_Ranker_CSCAN_AZURE0030_9beb734f_ba2b_452b_b422_589f5ac467ef_43332040_CEP2_0')

# Regex_Ranker_CSCAN_AZURE0070_combined_ranker_CEP2_0_17ba94a9_a24c_4c84_a838_22a1c0c192e7_26987408_CEP2_0
# No word boundaries: matches "key"/"credential" anywhere, including inside longer words.
find_with_lines(r'(?i)Key|Credential', full_text, 'Regex_Ranker_CSCAN_AZURE0070_combined_ranker_CEP2_0_17ba94a9_a24c_4c84_a838_22a1c0c192e7_26987408_CEP2_0')

# Regex_Ranker_CSCAN_AZURE0050_Partial_8305ad49_df2a_4e1e_a008_fc63cb1db966_49652976_CEP2_0
find_with_lines(r'(?i)iotHub', full_text, 'Regex_Ranker_CSCAN_AZURE0050_Partial_8305ad49_df2a_4e1e_a008_fc63cb1db966_49652976_CEP2_0')

# Regex_Ranker_CSCAN_AZURE0070_combined_ranker_CEP2_0_EndpointSuffix_22118023_CEP2_0
find_with_lines(r'(?i)EndpointSuffix=([a-z0-9._]{10,50})[;"\']', full_text, 'Regex_Ranker_CSCAN_AZURE0070_combined_ranker_CEP2_0_EndpointSuffix_22118023_CEP2_0')

# Regex_Ranker_CSCAN_AZURE0070_combined_ranker_CEP2_0_Endpoint_24827179_CEP2_0
find_with_lines(r'(?i)Endpoint=(https?://[a-z0-9_]{3,50}\.(table|blob|queue|file)\.[a-z0-9\.]{10,50})/?;', full_text, 'Regex_Ranker_CSCAN_AZURE0070_combined_ranker_CEP2_0_Endpoint_24827179_CEP2_0')

# Regex_Ranker_CSCAN_AZURE0130_AzureBatch_a5d6121e_e9af_4b7d_a7da_9aad47e4c66d_53046711_CEP2_0
find_with_lines(r'(?i)batch\.azure\.com', full_text, 'Regex_Ranker_CSCAN_AZURE0130_AzureBatch_a5d6121e_e9af_4b7d_a7da_9aad47e4c66d_53046711_CEP2_0')

# Regex_Ranker_CSCAN_AZURE0070_combined_ranker_CEP2_0_AccountName_40041277_CEP2_0
find_with_lines(r'(?i)AccountName=([a-z0-9_]+);', full_text, 'Regex_Ranker_CSCAN_AZURE0070_combined_ranker_CEP2_0_AccountName_40041277_CEP2_0')

# Regex_Ranker_CSCAN_AZURE0080_AccountEndpoint_38496415_CEP2_0
find_with_lines(r'(?i)AccountEndpoint=(https?://[a-z0-9_.]+\.documents\.azure\.com(:\d+)?)/?[;"\']', full_text, 'Regex_Ranker_CSCAN_AZURE0080_AccountEndpoint_38496415_CEP2_0')

# Regex_Ranker_CSCAN_AZURE0070_combined_ranker_CEP2_0_d37929c5_be80_4f59_951f_5dc6f21d8892_17911681_CEP2_0
# Bare substring alternatives; "Primary"/"Secondary" additionally consume one following
# character that is not "v" (either case, because of (?i)).
find_with_lines(r'(?i)Account|Storage|Access|Primary[^v]|Secondary[^v]|Blob', full_text, 'Regex_Ranker_CSCAN_AZURE0070_combined_ranker_CEP2_0_d37929c5_be80_4f59_951f_5dc6f21d8892_17911681_CEP2_0')

# Regex_Ranker_CSCAN_AZURE0070_combined_ranker_CEP2_0_EndpointSuffix_64844482_CEP2_0
# NOTE(review): ^ without re.MULTILINE anchors to the very start of full_text only.
find_with_lines(r'(?i)^\Wcore\.windows\.net', full_text, 'Regex_Ranker_CSCAN_AZURE0070_combined_ranker_CEP2_0_EndpointSuffix_64844482_CEP2_0')

# Regex_Ranker_CSCAN_AZURE0140_RefreshToken_74aa94f2_34ed_40bf_ba88_0bc17398a9cf_9381496_CEP2_0
find_with_lines(r'(?i)\Wrefresh.?token', full_text, 'Regex_Ranker_CSCAN_AZURE0140_RefreshToken_74aa94f2_34ed_40bf_ba88_0bc17398a9cf_9381496_CEP2_0')

# Login-credential, symmetric-key and server/database-name patterns
# (Regex_Scanner_* / Regex_Ranker_* labels). All are case-insensitive via (?i).
# NOTE(review): the $ alternatives below only apply at the end of full_text because
# find_with_lines passes no re.MULTILINE flag.

# Regex_Scanner_LoginCredentials_Login_2383799_CEP2_0
# Two alternatives: a user-name field followed by a password field, or the reverse order.
find_with_lines(r'(?i)[^a-z\$](DB_USER|user id|uid|(sql)?user(name)?|service\s?account)\s{0,4}[^\w\s,]([ -~\r\n\s]{2,120}?|[ -~]{2,30}?)([^a-z\s\$]|\s)\s{0,4}(DB_PASS|(sql|service)?password|pwd)\s{0,4}[^\a-z,\+&\)\]\}\[\{_][ -~\r\n\s]{2,700}?([;|<,})]|$)|[^a-z\s\$]\s{0,4}(DB_PASS|password|pwd)\s{0,4}[^\a-z,\+&\)\]\}\[\{_][ -~\r\n\s]{2,60}?[^a-z\$](DB_USER|user id|uid|user(name)?)\s{0,4}[^\w\s,]([ -~\r\n\s]{2,60}?|[ -~]{2,30}?)([;|<,})]|$)', full_text,'Regex_Scanner_LoginCredentials_Login_2383799_CEP2_0')

# Regex_Scanner_SymmetricKey128_SymmetricKey_61027830_CEP2_0
# 22 base64 characters followed by "==", delimited on both sides.
find_with_lines(r'(?i)[^\w/\+\._\$,\\]([a-z0-9/\+]{22}==)([^\w/\+\.\$]|$)', full_text, 'Regex_Scanner_SymmetricKey128_SymmetricKey_61027830_CEP2_0')

# Regex_Scanner_SymmetricKey360_SymmetricKey_31201899_CEP2_0
# Exactly 60 base64 characters with a delimiter character required on both sides.
find_with_lines(r'(?i)[^\w/\+\.\-\$,\\]([a-z0-9/\+]{60})[^\w/\+\.\-\$,\\]', full_text, 'Regex_Scanner_SymmetricKey360_SymmetricKey_31201899_CEP2_0')

# Regex_Ranker_CSCAN_AZURE0020_ServerName_21454193_CEP2_0
find_with_lines(r'(?i)(tcp:)?([a-z\-_0-9:\.]{1,50}(\.database\.azure\.com|\.database(\.secure)?\.windows\.net|\.cloudapp\.net|\.database\.usgovcloudapi\.net|\.database\.chinacloudapi\.cn|\.database\.cloudapi\.de))', full_text, 'Regex_Ranker_CSCAN_AZURE0020_ServerName_21454193_CEP2_0')

# Regex_Ranker_CSCAN_AZURE0070_combined_ranker_CEP2_0_AccountKey_41560081_CEP2_0
find_with_lines(r'(?i)(Storage)?.?Account.?Key', full_text, 'Regex_Ranker_CSCAN_AZURE0070_combined_ranker_CEP2_0_AccountKey_41560081_CEP2_0')

# Regex_Filter_TrivialValue2_20601768_CEP2_0
find_with_lines(r'(?i)(sample|example)\.(com|net)', full_text, 'Regex_Filter_TrivialValue2_20601768_CEP2_0')

# Regex_Ranker_CSCAN_AZURE0080_Partial_1eb5757c_210e_46a5_876e_b0ad231103e9_10923418_CEP2_0
find_with_lines(r'(?i)(Doc(ument)?|cosmos)Db(Conn(ection)?Str(ing)?|(Access)?Key)', full_text, 'Regex_Ranker_CSCAN_AZURE0080_Partial_1eb5757c_210e_46a5_876e_b0ad231103e9_10923418_CEP2_0')

# Regex_Ranker_CSCAN_AZURE0020_UserName_58870012_CEP2_0
find_with_lines(r'(?i)(DB_USER|user id|uid|user(name)?)(\s{0,4}=\s{0,4}|["\']\s{0,4}=>\s{0,4}["\']|["\'\\]+\s{0,4}:\s{0,4}["\'\\]+)([^;|"\'\s\r\n<`,)]{2,50}?)[;|"\'\s\r\n<`,]', full_text, 'Regex_Ranker_CSCAN_AZURE0020_UserName_58870012_CEP2_0')

# Regex_Ranker_CSCAN_AZURE0020_0eae114f_baad_4dba_a100_c5b34d217964_60068066_CEP2_0
find_with_lines(r'(?i)(^|[^a-z])(DB_[a-z]*?NAME|initial catalog|database(name)?)(\s{0,4}=\s{0,4}|["\']\s{0,4}=>\s{0,4}["\']|["\'\\]+\s{0,4}:\s{0,4}["\'\\]+)([^;"\'\s\r\n]{4,50}?)[;|"\'\s\r\n<`,]', full_text, 'Regex_Ranker_CSCAN_AZURE0020_0eae114f_baad_4dba_a100_c5b34d217964_60068066_CEP2_0')

# Regex_Ranker_CSCAN_GENERAL0030_combined_ranker_CEP2_0_ServerName_53517805_CEP2_0
find_with_lines(r'(?i)(^|[^a-z])((Remote ?LU( ?Alias)?|host(name)?|data source|server|addr|(network )?address)(\s{0,4}=\s{0,4}|:\s{0,4}["\'\\]+)|jdbc:sqlserver://)', full_text, 'Regex_Ranker_CSCAN_GENERAL0030_combined_ranker_CEP2_0_ServerName_53517805_CEP2_0')

# Regex_Scanner_PasswordContextInCode_Password_62074597_CEP2_0
# This one is commented out but can be fixed similarly if needed

# Australian passport and driver-licence patterns (case-insensitive via (?i)).

# Regex_australia_passport_number_Copy
# One or two prefix letters followed by seven digits.
find_with_lines(r'(?i)\b(([AC-FNUX]|P[A-FUWXZ])\d{7})\b', full_text, 'Regex_australia_passport_number_Copy')

# Regex_australia_drivers_license_number_Copy
find_with_lines(r'(?i)\b((\d{2}[ -]?\d{2}[ -]?\d{4})|(\d{3}[ -]?\d{3}[ -]?\d{3,4})|(\d{1}[ -]?\d{3}[ -]?\d{3}[ -]?\d{3})|([A-Za-z]\d{5})|([A-Za-z]{2}\d{4})|(\d{7})|(\d{4}[A-Za-z]{2}))\b', full_text, 'Regex_australia_drivers_license_number_Copy')

# Regex_australia_drivers_license_number_Custom_v2
# Same alternatives as the _Copy pattern above minus the final (\d{4}[A-Za-z]{2}) form.
find_with_lines(r'(?i)\b((\d{2}[ -]?\d{2}[ -]?\d{4})|(\d{3}[ -]?\d{3}[ -]?\d{3,4})|(\d{1}[ -]?\d{3}[ -]?\d{3}[ -]?\d{3})|([A-Za-z]\d{5})|([A-Za-z]{2}\d{4})|(\d{7}))\b', full_text, 'Regex_australia_drivers_license_number_Custom_v2')

# Regex_ipv6_address_Copy
# Eight colon-separated groups of 1-4 hex digits between delimiter characters.
# The trailing delimiter alternative is '\.\s' (full stop followed by whitespace),
# the same form regex_commbank_bsb uses; a doubled backslash here would instead
# demand a literal backslash followed by the letter "s".
find_with_lines(r'(?i)(?:^|[\s,;\(\)\[\]"\'])((?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4})(?:$|[\s,;\(\)\[\]"\']|\.\s|\.$)', full_text, 'Regex_ipv6_address_Copy')

# Azure key / connection-string patterns (CEP_* labels) plus statement markers.
# NOTE(review): several patterns below end with a non-word character ('"' or '=')
# immediately followed by \b. Such a \b only succeeds when the next character is a
# word character, so values followed by whitespace, punctuation or the end of the
# text are not matched - confirm this is intended.

# Regex_Ranker_CSCAN_AZURE0130_Prefix_a853ad91_d200_41b2_8dc4_4ce41dda3b81_3056034_CEP2_0
# NOTE(review): $ without re.MULTILINE anchors to the end of full_text only.
find_with_lines(r'(?i)((SharedAccess(Policy)?|SAS|Primary|Secondary)Key|SharedAccessSignature|SharedKey)[ -~]{0,50}$', full_text, 'Regex_Ranker_CSCAN_AZURE0130_Prefix_a853ad91_d200_41b2_8dc4_4ce41dda3b81_3056034_CEP2_0')

# CEP_Regex_AzurePublishSettingPasswords
# Case-sensitive: only lower-case "userpwd" and a lower-case/digit value match.
find_with_lines(r'\b(userpwd="[a-z0-9]{60}")\b', full_text, 'CEP_Regex_AzurePublishSettingPasswords')

# regex_statement_number
# Accepts statement numbers 1-200.
find_with_lines(r'\b(Statement No\. (?:[1-9]|[1-9][0-9]|1[0-9]{2}|200)\b)\b', full_text, 'regex_statement_number')

# regex_page_number
# Only "Page 1 of N" with N between 1 and 20.
find_with_lines(r'\b(Page 1 of ([1-9]|1[0-9]|20))\b', full_text, 'regex_page_number')

# CEP_Regex_AzureServiceBusConnectionString
find_with_lines(r'\b(EndPoint\s{0,2}=\s{0,2}[\x20-\x7F]{1,200}?servicebus\.windows\.net[\x20-\x7F]{1,200}?SharedAccessKey\s{0,2}=\s{0,2}[a-zA-Z0-9/+]{43}=)\b', full_text, 'CEP_Regex_AzureServiceBusConnectionString')

# CEP_AzureEmulatorStorageAccountFilter
# A single fixed key value; "+" and "/" do not occur unescaped in a way that alters it.
find_with_lines(r'\b(Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==)\b', full_text, 'CEP_AzureEmulatorStorageAccountFilter')

# CEP_Regex_AzureStorageAccountKey
find_with_lines(r'\b(DefaultEndpointsProtocol\s{0,2}=\s{0,2}[\x20-\x7F]{1,200}?AccountKey\s{0,2}=\s{0,2}[a-zA-Z0-9/+]{86}==)\b', full_text, 'CEP_Regex_AzureStorageAccountKey')

# CEP_CommonExampleKeywords
# Case-sensitive list of placeholder host/company names.
find_with_lines(r'\b(contoso|fabrikam|northwind|sandbox|onebox|localhost|127\.0\.0\.1|testacs\.com|s-int\.net)\b', full_text, 'CEP_CommonExampleKeywords')

# Plain digit-run patterns. Their lengths overlap (e.g. the 6-10 digit pattern at the
# end also covers 7-, 9- and 10-digit numbers), so one number in the text can be
# recorded under several descriptions.

# Regex_canada_phin_Copy
find_with_lines(r'\b(\d{9})\b', full_text, 'Regex_canada_phin_Copy')

# Regex_canada_bank_account_number_Copy
find_with_lines(r'\b(\d{7}|\d{12})\b', full_text, 'Regex_canada_bank_account_number_Copy')

# Regex_australia_bank_account_number_bsb
find_with_lines(r'\b(\d{3}-\d{3})\b', full_text, 'Regex_australia_bank_account_number_bsb')

# Regex_canada_health_service_number_Copy
find_with_lines(r'\b(\d{10})\b', full_text, 'Regex_canada_health_service_number_Copy')

# regex_transaction_amount
# NOTE(review): the leading \b sits before the optional "$" and "-"; when either sign
# is present the \b needs a word character directly in front of it, so a sign that
# follows whitespace is left out of the matched text - confirm this is acceptable.
find_with_lines(r'\b(\$?-?\d{1,3}(,\d{3}){0,3}\.\d{2}\b)\b', full_text, 'regex_transaction_amount')

# Regex_australia_bank_account_number
find_with_lines(r'\b([0-9]{6,10})\b', full_text, 'Regex_australia_bank_account_number')

# Connection strings, password placeholders, bank BSBs, storage keys and IPv4.
# NOTE(review): many patterns here end with a non-word character (';', '"', '>', '*',
# '=') followed by \b. That \b only succeeds when a word character comes next, so a
# value followed by whitespace or the end of the text is not matched - confirm intent.
# NOTE(review): ^ alternatives apply only at the start of full_text because
# find_with_lines passes no re.MULTILINE flag.

# CEP_Regex_SQLServerConnectionString
find_with_lines(r'\b((User Id|User ID|uid|UserId)[\x20-\x7F]{1,200}(Password|[^a-z]pwd)=[^$%>@";\[\{][^;/"]{7,128}(;|"))\b', full_text, 'CEP_Regex_SQLServerConnectionString')

# CEP_Regex_AzureConnectionString
find_with_lines(r'\b((Server|server|data source)\s{0,2}=\s{0,2}[\x20-\x7F]{1,200}?((cloudapp\.(azure\.com|net))|(database\.windows\.net))[\x20-\x7F]{1,300}?(Password|password|pwd)\s{0,2}=\s{0,2}[^;"\']+[;"\'])\b', full_text, 'CEP_Regex_AzureConnectionString')

# CEP_PasswordPlaceHolder
# Password/pwd set to asterisks or to a <...> placeholder.
find_with_lines(r'\b((Password|pwd)\s{0,2}=\s{0,2}\*+|(Password|pwd)=<[a-zA-Z0-9\*\-\_\s]{1,200}>)\b', full_text, 'CEP_PasswordPlaceHolder')

# Westpac_statement_bsb_regex
find_with_lines(r'\b((03[2-9]|73[0-9])[ -]?\d{3})\b', full_text, 'Westpac_statement_bsb_regex')

# Westpac_statement_supporting_regex
# Same pattern as Westpac_statement_bsb_regex; every hit is recorded under both labels.
find_with_lines(r'\b((03[2-9]|73[0-9])[ -]?\d{3})\b', full_text, 'Westpac_statement_supporting_regex')

# Regex_Ecuador_Unique_Identification_Number
# Ten digits whose first two are 01-30, 90 or 99.
find_with_lines(r'\b((0[1-9]|[1-2][0-9]|30|90|99)\d{8})\b', full_text, 'Regex_Ecuador_Unique_Identification_Number')

# CEP_Regex_AzureStorageAccountKeyGeneric
find_with_lines(r'\b((>|\'|=|"|#)?[a-zA-Z0-9/+]{86}==)\b', full_text, 'CEP_Regex_AzureStorageAccountKeyGeneric')

# Regex_canada_bank_account_transit_number_Copy
find_with_lines(r'\b((\d{5}-\d{3})|(0\d{8}))\b', full_text, 'Regex_canada_bank_account_transit_number_Copy')

# Regex_australia_passport_number_custom
# Case-sensitive variant of Regex_australia_passport_number_Copy (no (?i) prefix).
find_with_lines(r'\b(([AC-FNUX]|P[A-FUWXZ])\d{7})\b', full_text, 'Regex_australia_passport_number_custom')

# Regex_ipv4_address_Copy
# First octet 1-255 without leading zero; remaining octets 0-255.
find_with_lines(r'\b((?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-5])(?:\.(?:0|[1-9]\d?|1\d\d|2[0-4]\d|25[0-5])){3})\b', full_text, 'Regex_ipv4_address_Copy')

# CEP_Regex_AzureRedisCacheConnectionString
find_with_lines(r'(?:^|[\s,;:\(\)\[\]"\'\.])(redis\.cache\.windows\.net[\x20-\x7F]{1,200}?(password|pwd)\s{0,2}=\s{0,2}[a-zA-Z0-9/+]{43}=)\b', full_text, 'CEP_Regex_AzureRedisCacheConnectionString')

# CEP_Regex_AzureDocumentDBAuthKey
find_with_lines(r'(?:^|[\s,;:\(\)\[\]"\'\.])(DocumentDb[\x20-\x7F]{3,200}?[>\'="][a-zA-Z0-9/+]{86}==)\b', full_text, 'CEP_Regex_AzureDocumentDBAuthKey')

# CEP_Regex_AzureSAS
find_with_lines(r'(?:^|[\s,;:\(\)\[\]"\'&])(sig\s{0,2}=\s{0,2}[a-zA-Z0-9%]{43,53}%3[Dd][^a-zA-Z0-9%])\b', full_text, 'CEP_Regex_AzureSAS')

# International identity-document, tax and SWIFT patterns.
# Several of these are the same bare digit-run pattern under different labels
# (e.g. \b(\d{8})\b for both the Austrian and Croatian licence), so one number in the
# text yields one row per label.
# NOTE(review): $ alternatives apply only at the end of full_text because
# find_with_lines passes no re.MULTILINE flag.

# Regex_Ukraine_Passport_Domestic
find_with_lines(r'\b(\d{9})(?:$|[\s,;:\(\)\[\]"\'#|]|\.$)', full_text, 'Regex_Ukraine_Passport_Domestic')

# Regex_austria_eu_drivers_license_number
find_with_lines(r'\b(\d{8})\b', full_text, 'Regex_austria_eu_drivers_license_number')

# Regex_croatia_eu_drivers_license_number
find_with_lines(r'\b(\d{8})\b', full_text, 'Regex_croatia_eu_drivers_license_number')

# Regex_austria_eu_ssn_or_equivalent
find_with_lines(r'\b(\d{4}([0]?[1-9]|1[0-9]|2[0-9]|3[0-1])([0]?[1-9]|1[0-9])\d{2})\b', full_text, 'Regex_austria_eu_ssn_or_equivalent')

# Regex_France_Health_Insurance_Number
find_with_lines(r'\b(\d{21}|\d{10} \d{10} \d)(?:$|[\s,;:\(\)\[\]"\'#|]|\.$)', full_text, 'Regex_France_Health_Insurance_Number')

# Regex_argentina_national_id
find_with_lines(r'\b(\d{2}\.\d{3}\.\d{3}|\d{8})\b', full_text, 'Regex_argentina_national_id')

# Regex_belgium_eu_tax_file_number
find_with_lines(r'\b(\d{2}[01]\d[0123]\d{6})\b', full_text, 'Regex_belgium_eu_tax_file_number')

# Regex_austria_eu_tax_file_number
find_with_lines(r'\b(\d{2}-?\d{3}/?\d{4})\b', full_text, 'Regex_austria_eu_tax_file_number')

# Regex_Russian_Passport_Number_International
find_with_lines(r'\b(\d{2}[ -]?\d{7})\b', full_text, 'Regex_Russian_Passport_Number_International')

# Regex_belgium_eu_ssn_or_equivalent
find_with_lines(r'\b(\d{11})\b', full_text, 'Regex_belgium_eu_ssn_or_equivalent')

# Regex_Russian_Passport_Number_Domestic
find_with_lines(r'\b(\d{10}|\d{4} \d{6}|\d{2}[ -]\d{2} \d{6})(?:$|[\s,;:\(\)\[\]"\'#|]|\.$)', full_text, 'Regex_Russian_Passport_Number_Domestic')

# Regex_belgium_eu_drivers_license_number
find_with_lines(r'\b(\d{10})\b', full_text, 'Regex_belgium_eu_drivers_license_number')

# Regex_austria_eu_national_id_card
# 24 characters drawn from letters, digits, "+", "/", backslash ("=" allowed in the last two).
find_with_lines(r'\b([a-zA-Z0-9+/\\]{22}[a-zA-Z0-9+/=\\][a-zA-Z0-9+/=\\])\b', full_text, 'Regex_austria_eu_national_id_card')

# Regex_belgium_eu_passport_number
find_with_lines(r'\b([a-zA-Z]{2}\d{6})\b', full_text, 'Regex_belgium_eu_passport_number')

# Regex_Ukraine_Passport_International
find_with_lines(r'\b([A-Za-z]{2}\d{6})(?:$|[\s,;:\(\)\[\]"\'#|]|\.$)', full_text, 'Regex_Ukraine_Passport_International')

# Regex_austria_eu_passport_number
find_with_lines(r'\b([a-zA-Z]\s?\d{7})\b', full_text, 'Regex_austria_eu_passport_number')

# Regex_swift
# Four upper-case letters, a two-letter code from the list, then 2 or 5 word characters.
find_with_lines(r'\b([A-Z]{4}(AF|AX|AL|DZ|AS|AD|AO|AI|AQ|AG|AR|AM|AW|AU|AT|AZ|BS|BH|BD|BB|BY|BE|BZ|BJ|BM|BT|BO|BQ|BA|BW|BV|BR|IO|BN|BG|BF|BI|KH|CM|CA|CV|KY|CF|TD|CL|CN|CX|CC|CO|KM|CG|CD|CK|CR|CI|HR|CU|CW|CY|CZ|DK|DJ|DM|DO|EC|EG|SV|GQ|ER|EE|ET|FK|FO|FJ|FI|FR|GF|PF|TF|GA|GM|GE|DE|GH|GI|GR|GL|GD|GP|GU|GT|GG|GN|GW|GY|HT|HM|VA|HN|HK|HU|IS|IN|ID|IR|IQ|IE|IM|IL|IT|JM|JP|JE|JO|KZ|KE|KI|KP|KR|KW|KG|LA|LV|LB|LS|LR|LY|LI|LT|LU|MO|MK|MG|MW|MY|MV|ML|MT|MH|MQ|MR|MU|YT|MX|FM|MD|MC|MN|ME|MS|MA|MZ|MM|NA|NR|NP|NL|NC|NZ|NI|NE|NG|NU|NF|MP|NO|OM|PK|PW|PS|PA|PG|PY|PE|PH|PN|PL|PT|PR|QA|RE|RO|RU|RW|BL|SH|KN|LC|MF|PM|VC|WS|SM|ST|SA|SN|RS|SC|SL|SG|SX|SK|SI|SB|SO|ZA|GS|SS|ES|LK|SD|SR|SJ|SZ|SE|CH|SY|TW|TJ|TZ|TH|TL|TG|TK|TO|TT|TN|TR|TM|TC|TV|UG|UA|AE|GB|US|UM|UY|UZ|VU|VE|VN|VG|VI|WF|EH|YE|ZM|ZW)(\w{2}|\w{5}))\b', full_text, 'Regex_swift')

# Regex_Finland_European_Health_Insurance_Number
# Fixed prefix 8024680246 followed by ten more digits.
find_with_lines(r'\b((8024680246)[- ]?\d{10})(?:$|[\s,;:\(\)\[\]"\'#|]|\.$)', full_text, 'Regex_Finland_European_Health_Insurance_Number')

# Regex_colombia_national_id_number
find_with_lines(r'\b((\d{2}((\.\d{3}){2}|(,\d{3}){2}|\d{6}))|(1((\.\d{3}){3}|(,\d{3}){3}|\d{9})))\b', full_text, 'Regex_colombia_national_id_number')

# Regex_australia_drivers_license_number_Custom_v3
# Uses an explicit leading delimiter instead of \b, so the matched text includes
# that delimiter character.
find_with_lines(r'(?:^|[\s,;:\(\)\[\]"\' ])((\d{2}[ -]?\d{2}[ -]?\d{4})|(\d{3}[ -]?\d{3}[ -]?\d{3,4})|(\d{1}[ -]?\d{3}[ -]?\d{3}[ -]?\d{3})|([A-Za-z]\d{5})|([A-Za-z]{2}\d{4})|(\d{7})|(\d{4}[A-Za-z]{2}))\b', full_text, 'Regex_australia_drivers_license_number_Custom_v3')

# Generic contact / network patterns.

# Regex_Email
find_with_lines(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b', full_text, 'Regex_Email')

# Regex_IP_Address
# Four dot-separated groups of 1-3 digits; octet values are not range-checked
# (Regex_ipv4_address_Copy is the stricter variant).
find_with_lines(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', full_text, 'Regex_IP_Address')

# Regex_URL
# http/https URLs up to the last word character before the next whitespace.
find_with_lines(r'\bhttps?:\/\/[^\s]*\b', full_text, 'Regex_URL')

# Regex_JWT
# Three dot-separated base64url segments. Header and payload are JSON objects, and
# the base64 encoding of '{"' followed by a letter always begins with "eyJ", so both
# of the first two segments are required to start with it. (Wrapping a sample token
# in [...] would turn it into a character class that matches any dotted triple of
# those letters, not JWTs.)
find_with_lines(r'\beyJ[A-Za-z0-9_-]+\.eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+', full_text, 'Regex_JWT')

# Service token formats and private-key headers.

# Regex_Dropbox_Access_Token
# Any standalone run of exactly 64 alphanumeric characters.
find_with_lines(r'\b[a-zA-Z0-9]{64}\b', full_text, 'Regex_Dropbox_Access_Token')

# Regex_SendGrid_API_Key
find_with_lines(r'\bSG\.[a-zA-Z0-9-_]{22,}\b', full_text, 'Regex_SendGrid_API_Key')

# Regex_Slack_Bot_Token
find_with_lines(r'\bxoxb-[a-zA-Z0-9]{10,}\b', full_text, 'Regex_Slack_Bot_Token')

# Regex_Stripe_Publishable_Key
find_with_lines(r'\bpk_(live|test)_[a-zA-Z0-9]{24}\b', full_text, 'Regex_Stripe_Publishable_Key')

# Regex_Twilio_Account_SID
find_with_lines(r'\bAC[a-zA-Z0-9]{32}\b', full_text, 'Regex_Twilio_Account_SID')

# Regex_Firebase_URL
# The leading .* makes the match start at the beginning of the line containing the host.
find_with_lines(r'.*firebaseio\.com', full_text, 'Regex_Firebase_URL')

# Regex_Slack_Token
# NOTE(review): inside [...] the "|" is a literal, so [p|b|o|a] also accepts "|";
# [pboa] is probably what was meant - confirm.
find_with_lines(r'(xox[p|b|o|a]-[0-9]{12}-[0-9]{12}-[0-9]{12}-[a-z0-9]{32})', full_text, 'Regex_Slack_Token')

# Regex_RSA_Private_Key
find_with_lines(r'-----BEGIN RSA PRIVATE KEY-----', full_text, 'Regex_RSA_Private_Key')

# Regex_SSH_DSA_Private_Key
find_with_lines(r'-----BEGIN DSA PRIVATE KEY-----', full_text, 'Regex_SSH_DSA_Private_Key')

# Regex_SSH_EC_Private_Key
find_with_lines(r'-----BEGIN EC PRIVATE KEY-----', full_text, 'Regex_SSH_EC_Private_Key')

# Regex_PGP_Private_Key
find_with_lines(r'-----BEGIN PGP PRIVATE KEY BLOCK-----', full_text, 'Regex_PGP_Private_Key')

# Cloud / SaaS API key and token patterns.
# NOTE(review): the [x|X]-style classes used below treat "|" as a literal member, so
# each position also accepts "|"; [xX] is probably what was meant - confirm.
# The ".*" in those patterns does not cross newlines (no DOTALL flag is set), so the
# keyword and the secret must sit on the same line of full_text.

# Regex_AWS_Access_Key_ID
find_with_lines(r'AKIA[0-9A-Z]{16}', full_text, 'Regex_AWS_Access_Key_ID')

# Regex_Amazon_MWS
find_with_lines(r'amzn\.mws\.[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}', full_text, 'Regex_Amazon_MWS')

# Regex_AWS_API_Key
# Same pattern as Regex_AWS_Access_Key_ID; every hit is recorded under both labels.
find_with_lines(r'AKIA[0-9A-Z]{16}', full_text, 'Regex_AWS_API_Key')

# Regex_Facebook_Access_Token
find_with_lines(r'EAACEdEose0cBA[0-9A-Za-z]+', full_text, 'Regex_Facebook_Access_Token')

# Regex_Facebook_OAuth
find_with_lines(r'[f|F][a|A][c|C][e|E][b|B][o|O][o|O][k|K].*[\'"][0-9a-f]{32}[\'"]', full_text, 'Regex_Facebook_OAuth')

# Regex_GitHub
find_with_lines(r'[g|G][i|I][t|T][h|H][u|U][b|B].*[\'"][0-9a-zA-Z]{35,40}[\'"]', full_text, 'Regex_GitHub')

# Regex_Generic_API_Key
find_with_lines(r'[a|A][p|P][i|I]_?[k|K][e|E][y|Y].*[\'"][0-9a-zA-Z]{32,45}[\'"]', full_text, 'Regex_Generic_API_Key')

# Regex_Generic_Secret
find_with_lines(r'[s|S][e|E][c|C][r|R][e|E][t|T].*[\'"][0-9a-zA-Z]{32,45}[\'"]', full_text, 'Regex_Generic_Secret')

# Regex_Google_API_Key
find_with_lines(r'AIza[0-9A-Za-z\-_]{35}', full_text, 'Regex_Google_API_Key')

# Regex_GCP_Service_Account
# Literal text with exactly one space after the colon.
find_with_lines(r'"type": "service_account"', full_text, 'Regex_GCP_Service_Account')

# Regex_Google_OAuth_Access_Token
find_with_lines(r'ya29\.[0-9A-Za-z\-_]+', full_text, 'Regex_Google_OAuth_Access_Token')

# Regex_Heroku_API_Key
# The UUID part accepts upper-case hex digits only.
find_with_lines(r'[h|H][e|E][r|R][o|O][k|K][u|U].*[0-9A-F]{8}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{12}', full_text, 'Regex_Heroku_API_Key')

# Regex_MailChimp_API_Key
find_with_lines(r'[0-9a-f]{32}-us[0-9]{1,2}', full_text, 'Regex_MailChimp_API_Key')

# Regex_Mailgun_API_Key
find_with_lines(r'key-[0-9a-zA-Z]{32}', full_text, 'Regex_Mailgun_API_Key')

# Regex_URL_Password
# scheme://user:password@... followed by a quote or whitespace.
find_with_lines(r'[a-zA-Z]{3,10}://[^/\s:@]{3,20}:[^/\s:@]{3,20}@.{1,100}["\'\s]', full_text, 'Regex_URL_Password')

# Regex_Braintree_Access_Token
find_with_lines(r'access_token\$production\$[0-9a-z]{16}\$[0-9a-f]{32}', full_text, 'Regex_Braintree_Access_Token')

# Regex_Picatic_API_Key
# Shares the sk_live_ prefix with Regex_Stripe_API_Key; a long enough lower-case key
# matches both patterns.
find_with_lines(r'sk_live_[0-9a-z]{32}', full_text, 'Regex_Picatic_API_Key')

# Regex_Slack_Webhook
find_with_lines(r'https://hooks\.slack\.com/services/T[a-zA-Z0-9_]{8}/B[a-zA-Z0-9_]{8}/[a-zA-Z0-9_]{24}', full_text, 'Regex_Slack_Webhook')

# Regex_Stripe_API_Key
find_with_lines(r'sk_live_[0-9a-zA-Z]{24}', full_text, 'Regex_Stripe_API_Key')

# Regex_Stripe_Restricted_API_Key
find_with_lines(r'rk_live_[0-9a-zA-Z]{24}', full_text, 'Regex_Stripe_Restricted_API_Key')

# Regex_Square_Access_Token
find_with_lines(r'sq0atp-[0-9A-Za-z\-_]{22}', full_text, 'Regex_Square_Access_Token')

# Regex_Square_OAuth_Secret
find_with_lines(r'sq0csp-[0-9A-Za-z\-_]{43}', full_text, 'Regex_Square_OAuth_Secret')

# Regex_Twilio_API_Key
find_with_lines(r'SK[0-9a-fA-F]{32}', full_text, 'Regex_Twilio_API_Key')

# Regex_Twitter_Access_Token
find_with_lines(r'[t|T][w|W][i|I][t|T][t|T][e|E][r|R].*[1-9][0-9]+-[0-9a-zA-Z]{40}', full_text, 'Regex_Twitter_Access_Token')


# Deduplicate (optional - keeps first seen): rows sharing the same pattern, matched
# text and line number collapse into one, keeping the first description recorded.
regex_dict = {}
for regex, desc, sample, line in regex_candidates:
    regex_dict.setdefault((regex, sample, line), (desc, sample, line))

# Rebuild flat rows: the pattern comes from the key, the remaining fields from the value.
deduped_rows = [(key[0], *value) for key, value in regex_dict.items()]
df_regex = pd.DataFrame(
    deduped_rows,
    columns=['RegexPattern', 'Description', 'SampleValue', 'LineNumber'],
)

df_regex.to_csv(output_csv_regex, index=True)
print('Regex patterns saved to:', output_csv_regex)
df_regex.head(10)


In [None]:
# ---- DETECT KEYWORDS INCLUDING MULTI-WORD PHRASES ----
candidate_keywords = tokens + common_phrases
# Single words are counted by how often they occur in the token stream.
word_counts = Counter(tokens)
# common_phrases lists each phrase once, so counting the combined list would give
# every phrase a count of 1; take the real frequency from ngram_counts instead.
# Phrases contain a space and tokens never do, so the two key sets cannot collide.
for phrase in common_phrases:
    word_counts[phrase] += ngram_counts[phrase]

# Remove generic and sensitive terms
generic_words = {'document','page','statement','date','amount','details','number','total','payment','invoice','name','address','phone','email'}

def is_sensitive(phrase):
    """Return True when ``phrase`` looks like a personal name or a street address.

    Two heuristics are tried in order:
      * two consecutive capitalised words (e.g. "John Smith");
      * a standalone run of 1-5 digits followed by whitespace and a word
        (e.g. "12 main").

    The name heuristic is case-sensitive, so it cannot fire on lower-cased input.
    """
    heuristics = (
        r'\b[A-Z][a-z]+ [A-Z][a-z]+\b',  # capitalized names
        r'\b\d{1,5}\s+\w+',              # addresses
    )
    return any(re.search(rule, phrase) for rule in heuristics)

# Keep keywords that are neither in the generic list nor flagged by is_sensitive,
# preserving the order in which word_counts holds them.
filtered_keywords = {
    keyword: occurrences
    for keyword, occurrences in word_counts.items()
    if not (keyword.lower() in generic_words or is_sensitive(keyword))
}

# Tabulate, order by descending count, and write the result to CSV.
keyword_rows = list(filtered_keywords.items())
df_keywords = pd.DataFrame(keyword_rows, columns=['Keyword','Count']).sort_values(
    by='Count', ascending=False
)
df_keywords.to_csv(output_csv_keywords, index=True)
print('Keyword list (including multi-word phrases) saved to:', output_csv_keywords)
df_keywords.head(20)