In [4]:
import pandas as pd
import os
import smtplib
import json
from email.message import EmailMessage
from openai import AzureOpenAI
import time

# Document ID to Document Name mapping.
# Each integer DocumentID maps to a list of names/aliases for that document.
# NOTE(review): the first entry of each list looks like the source file stem
# (often a versioned file name) and the rest look like abbreviations or full
# titles -- confirm against the data source.
# NOTE(review): the alias "CRS" is listed under both 15 and 40, so an alias
# alone does not always identify a single document.
DOCUMENT_ID_MAP = {
    1: ["AML_VER09.211223", "AML", "Anti-Money Laundering and Sanctions Rules and Guidance", "Anti-Money Laundering and Sanctions Rules and Guidance (AML)"],
    2: ["CIB_VER04.030220", "CIB", "Captive Insurance Business Rules (CIB)", "Captive Insurance Business Rules"],
    3: ["COBS_VER15.150823", "COBS", "COBs", "Conduct of Business Rulebook"],
    4: ["FEES_VER16.181223", "Fees Rules (FEES)", "FEES"],
    5: ["FP_VER01.110319", "Fund Passporting Rules (FP)", "FP"],
    6: ["FUNDS_VER08.040723", "FUNDS", "Fund Rules (FUNDS)"],
    7: ["GEN_VER08.181223", "GEN", "General Rulebook (GEN)", "General Rulebook"],
    8: ["GLO_VER19.181223", "GLO", "GLOSSARY (GLO)"],
    9: ["IFR_VER07.181223", "IFR", "Islamic Finance Rules (IFR)", "Islamic Finance Rules"],
    10: ["MIR_VER07.181223", "MIR", "Market Infrastructure Rulebook (MIR)", "Market Infrastructure Rulebook"],
    11: ["MKT_VER08.181223", "MKT", "Market Rules (MKT)", "Market Rules"],
    12: ["PIN_VER05.181223", "PIN", "Prudential – Insurance Business (PIN)", "Prudential – Insurance Business"],
    13: ["PRU_VER13.181223", "PRU", "Prudential – Investment, Insurance Intermediation and Banking Rules (PRU)", "Prudential – Investment, Insurance Intermediation and Banking Rules"],
    14: ["BRR Regulations (December 2018)", "BRR Regulations", "BRR", "BANK RECOVERY AND RESOLUTION REGULATIONS 2018", "BANK RECOVERY AND RESOLUTION REGULATIONS"],
    15: ["CRS Regulations 2017 (Consolidated_October 2023)", "CRS", "COMMON REPORTING STANDARD REGULATIONS 2017", "COMMON REPORTING STANDARD REGULATIONS"],
    16: ["Foreign Tax Account Compliance Regulations 2022", "FOREIGN ACCOUNT TAX COMPLIANCE REGULATIONS"],
    17: ["FSMR (Consolidated_December 2023)", "FSMR", "FINANCIAL SERVICES AND MARKETS REGULATIONS 2015"],
    18: ["Guidance – Regulatory Framework for Fund Managers of Venture Capital Funds (VER03.181223)", "Guidance – Regulatory Framework for Fund Managers of Venture Capital Funds"],
    19: ["Guidance - Virtual Asset Activities in ADGM (VER05.181223)", "Guidance – Regulation of Virtual Asset Activities in ADGM"],
    20: ["ADGM_Guidance_-_Application_of_English_Laws", "Guidance - English Law in ADGM"],
    21: ["API - Guidance Note_Final 14 October 2019 Eng", "Guidance – Application Programming Interfaces (APIs) in ADGM"],
    22: ["CMC_VER03.270922", "CMC", "Code of Market Conduct (CMC)", "Code of Market Conduct"],
    23: ["CONF_VER03.18042019", "CONF", "FSRA Confidentiality Policy"],
    24: ["Draft Guidance - FSRA Guiding Principles for Virtual Assets Regulation and Supervision (IA)", "Guiding Principles for the Financial Services Regulatory Authority’s Approach to Virtual Asset Regulation and Supervision"],
    25: ["Environmental Social and Governance Disclosures Guidance_VER01.040723", "ESG Disclosures Guidance", "Environmental Social Governance Guidance"],
    26: ["FinTech RegLab Guidance_VER01.31082016", "FinTech Regulatory Laboratory Guidance"],
    27: ["GPM_VER03.120623", "GPM", "Guidance & Policies Manual (GPM)", "Guidance & Policies Manual"],
    28: ["Guidance - Continuous Disclosure_VER01.280922", "Guidance – Continuous Disclosure"],
    29: ["Guidance - Digital Securities Offerings and Virtual Assets under the Financial Services and Markets Regulations_240220", "Guidance –Regulation of Digital Security Offerings and Virtual Assets under the Financial Services and Markets Regulations"],
    30: ["Guidance - Disclosure Requirements for Mining Reporting Entities_VER01.280922", "Guidance – Disclosure Requirements for Mining Reporting Entities"],
    31: ["Guidance - Disclosure Requirements for Petroleum Reporting Entities_VER01.280922", "Guidance – Disclosure Requirements for Petroleum Reporting Entities"],
    32: ["Guidance - Private Credit Funds_VER01.040523", "Supplementary Guidance – Private Credit Funds"],
    33: ["Guidance Regulation of Digital Securities Activities in ADGM_240224", "Guidance – Regulation of Digital Securities Activities in ADGM", "Guidance – Regulation of Digital Securities Activities"],
    34: ["Guidance - Regulation of Spot Commodities Activities in ADGM (VER02.181223)", "Guidance – Regulation of Spot Commodity Activities in ADGM"],
    35: ["Guidance_Regulatory Framework for PFP and Multilateral Trading Facilities dealing with Private Capital Markets (VER02.181223)", "Guidance – Regulatory Framework for Private Financing Platforms and Multilateral Trading Facilities dealing with Private Capital Markets"],
    36: ["SFWG_Guidance on Principles for the Effective Management of Climate-related Financial Risks", "Guidance - Principles for the Effective Management of Climate-related Financial Risks"],
    37: ["Supplementary Guidance Authorisation of Digital Investment Management (Robo-advisory) Activities", "Supplementary Guidance – Authorisation of Digital Investment Management (“Robo-advisory”) Activities"],
    38: ["Supplementary Guidance OTCLPs (VER02.181223)", "Supplementary Guidance – Regulatory Framework for Authorised Persons dealing in OTC Leveraged Products for Retail Clients"],
    39: ["Sustainable Finance Supplementary Guidance_VER01.040723", "Supplementary Guidance – Sustainable Finance Regulatory Framework"],
    40: ["UAE_CRS_Guidance_Notes_17 June 2020 (002)", "Guidance Notes for the Common Reporting Standard (CRS) United Arab Emirates", "CRS"]
}

def map_document_name(doc_id):
    """Return all known names for ``doc_id`` as one ``"; "``-separated string.

    An ID that is not a key of ``DOCUMENT_ID_MAP`` yields the literal
    string ``"Unknown"``.
    """
    if doc_id in DOCUMENT_ID_MAP:
        aliases = DOCUMENT_ID_MAP[doc_id]
    else:
        aliases = ["Unknown"]
    return "; ".join(aliases)

# Initialize Azure OpenAI environment.
# Non-secret settings get a default here but can be overridden by exporting
# the variable before the notebook/script starts.
os.environ.setdefault("OPENAI_API_TYPE", "azure")
os.environ.setdefault("AZURE_OPENAI_ENDPOINT", "https://llm-adgm-mbzuai.openai.azure.com/")
os.environ.setdefault("OPENAI_API_VERSION", "2024-02-15-preview")

# The API key is a credential and must never live in the source file: supply
# it through the AZURE_OPENAI_KEY environment variable instead.
# NOTE: a key that was previously committed in this file should be treated as
# compromised and rotated.
if not os.getenv("AZURE_OPENAI_KEY"):
    raise RuntimeError(
        "AZURE_OPENAI_KEY is not set. Export it in the environment before "
        "running; do not hard-code the key in this file."
    )

# Shared client used by few_shot_learning for every chat-completion request.
client = AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_KEY"),
    api_version=os.getenv("OPENAI_API_VERSION")
)

def send_email(subject, body):
    """Send a plain-text notification email; never raises on delivery failure.

    The message goes through the Mailtrap SMTP relay with STARTTLS. The SMTP
    password is read from the ``MAILTRAP_SMTP_PASSWORD`` environment variable
    so that the credential is not stored in the source file.

    Args:
        subject: Text for the ``Subject`` header.
        body: Plain-text message body.

    Returns:
        None. Success or failure is reported on stdout only, so a mail
        problem cannot interrupt the long-running extraction loop.
    """
    sender = "Private Person <mailtrap@demomailtrap.com>"
    receiver = "A Test User <tubaextrainbox@gmail.com>"

    smtp_password = os.getenv("MAILTRAP_SMTP_PASSWORD")
    if not smtp_password:
        # Without a credential there is no point opening a connection.
        print("Failed to send email: MAILTRAP_SMTP_PASSWORD is not set")
        return

    message = EmailMessage()
    message.set_content(body)
    message['Subject'] = subject
    message['From'] = sender
    message['To'] = receiver

    try:
        # The timeout keeps an unreachable or stalled relay from hanging the run.
        with smtplib.SMTP("live.smtp.mailtrap.io", 587, timeout=30) as server:
            server.starttls()
            server.login("api", smtp_password)
            server.send_message(message)
        print("Email sent successfully!")
    except Exception as e:
        # Deliberately broad: notifications are best-effort.
        print(f"Failed to send email: {e}")






## Load Training and JSON Data

In [5]:
# Load the manually labelled training data used to build the few-shot examples.
training_csv_path = 'cross_reference_dataset_manually_labelled.csv'
df_training = pd.read_csv(training_csv_path)
df_training.dropna(how='all', inplace=True)  # discard rows where every column is empty
# Missing or non-numeric DocumentID values are coerced to 0.
df_training['DocumentID'] = pd.to_numeric(df_training['DocumentID'], errors='coerce').fillna(0).astype(int)


# Load the new data from 'filtered_passages_ready_for_reference_extraction.json'.
# Read explicitly as UTF-8: the default text encoding is platform dependent and
# regulatory text commonly contains non-ASCII punctuation (dashes, curly quotes).
input_json_path = 'filtered_passages_ready_for_reference_extraction.json'
with open(input_json_path, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

df_json = pd.DataFrame(json_data)

# Optionally restrict the run to a small random sample for testing
#df_json = df_json.sample(n=5, random_state=42)

# Convert DocumentID to integer and map SourceDocumentName.
# Unparseable IDs become 0, which map_document_name renders as "Unknown".
df_json['DocumentID'] = pd.to_numeric(df_json['DocumentID'], errors='coerce').fillna(0).astype(int)
df_json['SourceDocumentName'] = df_json['DocumentID'].apply(map_document_name)


# Output CSV file to save incremental results
output_csv_path = 'reference_text_extracted.csv'

# Load already processed data if available so an interrupted run can resume;
# processed_ids holds the IDs that are already present in the output file.
if os.path.exists(output_csv_path):
    processed_data = pd.read_csv(output_csv_path)
    processed_ids = set(processed_data['ID'])
else:
    processed_data = pd.DataFrame(columns=[
        "SourceDocumentName", "ID", "DocumentID", "PassageID", "Passage", "ReferenceText", "ReferenceType"
    ])
    processed_ids = set()

#print(df_json)
print(df_training)

    SourceDocumentName                                    ID  DocumentID  \
0     AML_VER09.211223  ba2e367e-866d-49dd-af15-4ea075c30714           1   
1     AML_VER09.211223  ba2e367e-866d-49dd-af15-4ea075c30714           1   
2     AML_VER09.211223  ba2e367e-866d-49dd-af15-4ea075c30714           1   
3     AML_VER09.211223  ba2e367e-866d-49dd-af15-4ea075c30714           1   
4     AML_VER09.211223  611c820f-2988-4e25-9091-4a3b299501b4           1   
..                 ...                                   ...         ...   
326  COBS_VER15.150823  714c73ba-17f9-4065-b680-42d546523e0d           3   
327  COBS_VER15.150823  c42abaa9-4c93-4d5b-b989-aa4ec57cf600           3   
328  COBS_VER15.150823  c69593e4-12f2-4843-b91b-48b91755223f           3   
329  COBS_VER15.150823  951e1a4b-b181-47da-95f0-c4933c92f6c3           3   
330  COBS_VER15.150823  b4306039-437d-4433-9ef2-c5da009e38eb           3   

               PassageID                                            Passage  \
0     9.

## Few-Shot Prompt Creation

In [6]:
# Build the few-shot prompt prefix from a DataFrame of labelled examples
def create_few_shot_prompt(df):
    """Assemble the few-shot prompt prefix from the labelled rows of ``df``.

    Each row whose ``Passage``, ``ReferenceText`` and ``ReferenceType`` are
    all non-null becomes one numbered example; the number is the row's index
    label plus one, so skipped rows leave gaps in the numbering.

    Args:
        df: DataFrame with the columns ``Passage``, ``ReferenceText``,
            ``ReferenceType`` and ``PassageID``.

    Returns:
        The instruction header, followed by all examples, followed by the
        line that introduces the passage to classify.
    """
    example_chunks = []
    for row_label, record in df.iterrows():
        src_passage = record['Passage']
        ref_text = record['ReferenceText']
        ref_type = record['ReferenceType']
        src_passage_id = record['PassageID']

        # Rows missing any of the three labelled fields contribute no example.
        if pd.isnull(src_passage) or pd.isnull(ref_text) or pd.isnull(ref_type):
            continue

        example_lines = [
            f"Example {row_label + 1}:",
            f"SourcePassageID: {src_passage_id}",
            f"SourcePassage: {src_passage}",
            f"SourceReferenceText: {ref_text}",
            f"ReferenceType: {ref_type}",
            "",  # the two empty entries produce the blank line after each example
            "",
        ]
        example_chunks.append("\n".join(example_lines))

    header = (
        "You are an AI that identifies reference types for regulatory passages based on the source passage, reference text, and document mapping.\n"
        "Below are examples of source passages, their corresponding reference texts, and reference types:\n\n"
    )
    footer = "Now, determine the reference type for the following passage:\n"
    return header + "".join(example_chunks) + footer

# Generate the few-shot prompt once from the cleaned training data.
# few_shot_learning reads this module-level string and prepends it to every
# request, so it must be defined before that function is called.
few_shot_prompt = create_few_shot_prompt(df_training)


## Few-Shot Learning Function

In [7]:
def _parse_reference_response(response_content):
    """Split a model reply into ``(reference_text, reference_type)``.

    Blank lines are ignored so that a reply with an empty line between the
    reference text and the reference type is still read correctly. Returns
    ``None`` when fewer than two non-empty lines are present.
    """
    lines = [line.strip() for line in response_content.strip().split('\n')]
    lines = [line for line in lines if line]
    if len(lines) < 2:
        return None
    return lines[0], lines[1]


def few_shot_learning(item, index):
    """Ask the model for the reference text and reference type of one passage.

    Args:
        item: Mapping/row providing at least ``'Passage'`` and ``'ID'``.
        index: Position of the item in the caller's loop. Currently unused;
            kept so existing callers keep working.

    Returns:
        A ``(reference_text, reference_type)`` tuple of strings:
        ``("None", "None")`` when the passage is missing, ``("Error", "Error")``
        when the API call fails or the reply has fewer than two non-empty
        lines, otherwise the first two non-empty lines of the reply.
    """
    source_passage = item['Passage']
    item_id = item['ID']

    if pd.isnull(source_passage) or source_passage == "Missing Passage":
        print(f"Passage is missing for ID: {item_id}")
        return "None", "None"

    # Append the current passage to the shared few-shot prefix.
    prompt = (
        few_shot_prompt +
        f"Passage: '{source_passage}'\n"
        "ReferenceText: "
    )

    messages = [
        {"role": "system", "content": "You are an AI that identifies reference texts and types."},
        {"role": "user", "content": prompt}
    ]

    try:
        # Call the Azure OpenAI ChatCompletion API.
        # temperature=0.0 keeps the output as repeatable as the service allows.
        # NOTE(review): max_tokens=50 cuts longer replies short; such replies
        # end up reported as "Unexpected response format" below.
        response = client.chat.completions.create(
            model="gpt-4-turbo-1106",
            messages=messages,
            temperature=0.0,
            max_tokens=50,
            top_p=1.0,
            frequency_penalty=0,
            presence_penalty=0
        )

        # Extract generated reference text and type from the response.
        response_content = response.choices[0].message.content.strip()

        # At least two non-empty lines are required: text first, type second.
        parsed = _parse_reference_response(response_content)
        if parsed is None:
            print(f"Unexpected response format for ID '{item_id}': {response_content}")
            return "Error", "Error"

        reference_text, reference_type = parsed
        return reference_text, reference_type

    except Exception as e:
        # Any API or parsing failure is reported through the "Error" sentinel
        # so the caller can skip this item and continue with the next one.
        print(f"Error generating ReferenceText/Type for ID '{item_id}': {e}")
        return "Error", "Error"

## Processing and Saving Results

In [None]:
# Apply few-shot learning to define 'ReferenceText' and 'ReferenceType' for
# every passage in df_json that is not already present in the output CSV.
total_items = len(df_json)
subject = "Initial Process Start"
body = f"Processing started. Total items: {total_items}"
send_email(subject, body)

processed_count = 0  # items successfully extracted during this run only
start_time = time.time()

# Canonical labels, matched in this order as case-insensitive substrings of
# the model's answer; anything else is stored as 'Unresolved'.
CANONICAL_REFERENCE_TYPES = ('Internal', 'External', 'Outsource')

for index, row in df_json.iterrows():
    if row['ID'] in processed_ids:
        continue  # Skip already processed IDs

    if pd.isnull(row['Passage']) or row['Passage'] == "Missing Passage":
        continue  # Nothing to extract from

    reference_text, reference_type = few_shot_learning(row, index)

    # Skip the item if ReferenceText or ReferenceType could not be extracted.
    # It is not added to processed_ids, so a later run will try it again.
    if reference_text.lower() in ['none', 'error'] or reference_type.lower() in ['none', 'error']:
        print(f"Skipping ID '{row['ID']}' as extraction failed.")
        continue

    # Normalise the free-text type to one of the canonical labels.
    lowered_type = reference_type.lower()
    reference_type = next(
        (label for label in CANONICAL_REFERENCE_TYPES if label.lower() in lowered_type),
        'Unresolved',
    )

    new_row = row.copy()
    new_row['ReferenceText'] = reference_text
    new_row['ReferenceType'] = reference_type
    processed_data = pd.concat([processed_data, pd.DataFrame([new_row])], ignore_index=True)
    processed_ids.add(row['ID'])

    # Incremental saving after each processed item so progress survives a crash
    processed_data.to_csv(output_csv_path, index=False)
    processed_count += 1

    # Print progress
    print(f"(Processed {index+1}/{total_items}) Document: '{row['DocumentID']}', Passage: '{row['PassageID']}', "
          f"ReferenceText: '{reference_text}', ReferenceType: '{reference_type}'")

    if (time.time() - start_time) >= 1800:  # 1800 seconds = 30 minutes
        subject = "Processing Update"
        body = f"Processed items: {processed_count} / {total_items}"
        send_email(subject, body)
        start_time = time.time()  # Reset the timer

# Final email after completion
subject = "Process Completed"
body = f"Processing completed. Total processed items: {processed_count} / {total_items}"
send_email(subject, body)


Email sent successfully!
Unexpected response format for ID '0d07644a-53d6-4729-990b-c4f6b2386712': The passage provided does not contain a specific rule number or title to identify a corresponding rule or regulation. Without this information, it is not possible to determine the reference type as internal, external, or otherwise. Additional context or a specific rule reference is needed
Skipping ID '0d07644a-53d6-4729-990b-c4f6b2386712' as extraction failed.
Unexpected response format for ID '8fdad81d-b309-441b-a66d-afa6d42d5275': The provided passage does not contain a specific reference text or rule number to identify. It appears to be a regulatory requirement related to Customer Due Diligence (CDD) and Anti-Money Laundering/Terrorist Financing and Sanctions (AML
Skipping ID '8fdad81d-b309-441b-a66d-afa6d42d5275' as extraction failed.
(Processed 27/1565) Document: '1', Passage: '9.1.1.(4)', ReferenceText: '(3)(c)', ReferenceType: 'Internal'
(Processed 28/1565) Document: '1', Passage: 

## Statistics

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the processed CSV file
input_csv_path = 'reference_text_extracted.csv'  # Replace with the actual path to your CSV file
df = pd.read_csv(input_csv_path)

# Ensure that DocumentID is treated as an integer
df['DocumentID'] = df['DocumentID'].astype(int)
print(len(df['DocumentID']))  # number of rows loaded

# Total Reference Text Count per Document (count() ignores missing values)
reference_text_count = df.groupby('DocumentID')['ReferenceText'].count().reset_index()
reference_text_count.columns = ['DocumentID', 'Total Reference Text Count']

# ReferenceType Count per Document
reference_type_count = df.groupby(['DocumentID', 'ReferenceType']).size().reset_index(name='Count')

# Overall Statistics
total_reference_texts = df['ReferenceText'].count()
total_reference_types = df['ReferenceType'].value_counts().reset_index()
total_reference_types.columns = ['ReferenceType', 'Total Count']

# Save results to CSV files
reference_text_count.to_csv('reference_text_count_per_document.csv', index=False)
reference_type_count.to_csv('reference_type_count_per_document.csv', index=False)
total_reference_types.to_csv('overall_reference_type_counts.csv', index=False)

# Visualization

# 1. Bar Chart for Total Reference Text Count per Document
plt.figure(figsize=(10, 6))
plt.bar(reference_text_count['DocumentID'], reference_text_count['Total Reference Text Count'], color='skyblue')
plt.xlabel('Document ID')
plt.ylabel('Total Reference Text Count')
plt.title('Total Reference Text Count per Document')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('total_reference_text_count_per_document.png')
plt.show()

# 2. Stacked Bar Chart for ReferenceType Count per Document
# Document/type combinations that never occur are shown as 0.
pivot_data = reference_type_count.pivot(index='DocumentID', columns='ReferenceType', values='Count').fillna(0)

pivot_data.plot(kind='bar', stacked=True, figsize=(12, 6), colormap='viridis')
plt.xlabel('Document ID')
plt.ylabel('ReferenceType Count')
plt.title('ReferenceType Count per Document')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('reference_type_count_per_document.png')
plt.show()

# 3. Pie Chart for Overall ReferenceType Distribution
# The extraction step writes up to four types (Internal, External, Outsource,
# Unresolved). A three-colour list would be cycled, giving the fourth wedge
# the same colour as the first, so provide one colour per wedge and fall back
# to matplotlib's default colour cycle if there are more wedges than colours.
pie_palette = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99']
wedge_count = len(total_reference_types)
pie_colors = pie_palette[:wedge_count] if wedge_count <= len(pie_palette) else None

plt.figure(figsize=(8, 8))
plt.pie(
    total_reference_types['Total Count'],
    labels=total_reference_types['ReferenceType'],
    autopct='%1.1f%%',
    startangle=90,
    colors=pie_colors
)
plt.title('Overall ReferenceType Distribution')
plt.tight_layout()
plt.savefig('overall_reference_type_distribution.png')
plt.show()

print("\nStatistics and visualizations have been saved.")
