## Imports

In [None]:
import os
import openai
import json
import tiktoken
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('max_colwidth', 0)

from pypdf import PdfReader
# Fall back to the placeholder only when no key is configured yet, so a real
# key that is already exported in the environment is not overwritten.
os.environ.setdefault("OPENAI_API_KEY", "OPEN_AI_KEY")

from openai import OpenAI
# Constructed without arguments; presumably the SDK picks the key up from the
# OPENAI_API_KEY environment variable set above — confirm against the SDK docs.
client = OpenAI()
# Tokenizer handle; in the visible cells it is only referenced by
# commented-out token counting.
encoding = tiktoken.get_encoding("cl100k_base")

In [None]:
def call_chatgpt4(prompt):
    """Send ``prompt`` as the user message of a chat completion and return the reply parsed as JSON.

    The request goes through the module-level ``client`` with a fixed system
    message and fixed settings (model "gpt-4-turbo", temperature 0.0, seed 42,
    top_p 0.98, max_tokens 4096, ``response_format`` of type "json_object").

    Args:
        prompt: Text placed in the "user" message.

    Returns:
        The result of ``json.loads`` applied to the first choice's message
        content.
    """
    conversation = [
        {"role": "system", "content": "I am an auditor. My job is to perform regulatory assessment and compliance evaluation."},
        {"role": "user", "content": prompt},
    ]
    request_options = {
        "model": "gpt-4-turbo",
        "messages": conversation,
        "temperature": 0.0,
        "seed": 42,
        "top_p": 0.98,
        "max_tokens": 4096,
        "response_format": {"type": "json_object"},
    }
    completion = client.chat.completions.create(**request_options)

    # Only the first choice is read; its content is expected to be a JSON document.
    reply_text = completion.choices[0].message.content
    return json.loads(reply_text)

## Document Path declaration
There are two main types of documents
1. Regulatory Document
2. Policy Document(s)

In [None]:
# PDF inputs: the regulatory checklist and the two privacy policy documents.
regulatory_doc_filename, policy_doc_filename, grab_policy_document = (
    "DPTM-Checklist.pdf",
    "DBS Bank Privacy Policy.pdf",
    "Privacy Notice_GrabSG.pdf",
)

# Checklist categories iterated over by the extraction and mapping loops.
list_of_categories = [
    "Governance and Transparency",
    "Management of Personal Data",
    "Care of Personal Data",
    "Individual’s Rights",
]

In [None]:
def text_from_pdf(filename):
    """Return the text of every page of a PDF, concatenated in page order.

    Pages are joined with no separator between them.

    Args:
        filename: Path (or other source accepted by ``PdfReader``) of the PDF.

    Returns:
        A single string holding the extracted text of all pages; an empty
        string when the document has no pages.
    """
    reader = PdfReader(filename)
    # `or ""` keeps the join safe if a page yields no text (None or empty).
    return "".join(page.extract_text() or "" for page in reader.pages)

In [None]:
# Extract the full text of the checklist and of both policy PDFs.
regulatory_text, policy_text, grab_policy_text = (
    text_from_pdf(pdf_path)
    for pdf_path in (regulatory_doc_filename, policy_doc_filename, grab_policy_document)
)

### TRACE-inspired single-step prompt

In [None]:
def regulatory_mapping(regulatory_text, policy_text):
    """Build the single-step prompt that extracts, maps and judges in one call.

    The prompt embeds the checklist text between ``<regdata>`` tags and the
    policy text between ``<poldata>`` tags, then asks for a JSON object with
    one entry per requirement (requirement, mapped policy clauses, judgement,
    reasoning and type of evidence).

    Args:
        regulatory_text: Text of the regulatory checklist.
        policy_text: Text of the policy document to assess.

    Returns:
        The fully formatted prompt string.
    """
    # NOTE: the misspelled JSON key "Regrequirement_Policy_Judegment" is kept
    # on purpose — the cells that consume the model response index by it.
    analysis_prompt = """You are an auditor in charge of assessing data protection regulations in Singapore. 
    Follow the detailed instructions below to provide the regulatory-policy mapping and judgement:
    1. Extract all data protection requirements from the Data Protection Trustmark checklist document that is provided between the <regdata> XML like tags.
    <regdata>
    {regulatory_text}
    </regdata>
    2. For each regulatory requirement item, identify and extract the exact clauses from the policy document provided that maps to the requirement. 
    The policy document is below between the <poldata> XML like tags below:
    <poldata>
    {policy_text}
    </poldata>
    3. Taking each regulatory requirement extracted from step 1 as the major premise and the corresponding policy documentation extracted from step 2 as the minor premise, 
    provide the following
    - Judgement: Not Met, Partially Met and Completely Met
    - Explanation of the Judgement: reasoning behind the judgement
    - Type of Evidence: make an educated guess on the type of evidence required to check compliance of the policy. 
    Pick from Inspection, Observation, Inquiry, External Confirmation, Reperformance, Recalculation, Analytical Procedures and others.

    Provide the output in the following strictly serializable JSON format with correct indent all over the output. 
     {{
            "Regrequirement_Policy_Judegment": [
            {{
            "Regulatory_Requirement_Description" : The complete description of the checklist requirement as string,
            "Policy_Documentation" : The exact clause or clauses from the policy documentation as a string. Make sure to get complete information from the policy text,
            "Judgement" : as a string,
            "Judgement_Reasoning" : as a string,
            "Type_of_Evidence" : as a string
            }},
            ....
            ]
        }}
        """
    # Doubled braces in the template are literal JSON braces; only the two
    # named fields are substituted.
    return analysis_prompt.format(regulatory_text=regulatory_text, policy_text=policy_text)

In [None]:
# Single-step run: build the combined prompt, show it together with its
# character count, then send it to the model.
singlestepprompt = regulatory_mapping(regulatory_text, policy_text)
print(singlestepprompt, len(singlestepprompt), sep="\n")
response_json_chatgpt4 = call_chatgpt4(singlestepprompt)

In [None]:
# Tabulate the single-step judgements (one row per returned entry) and save them.
# The key spelling "Judegment" matches the key requested in the single-step prompt.
df = pd.DataFrame(response_json_chatgpt4["Regrequirement_Policy_Judegment"])
df.to_csv("SingleStep_DPTM_DBS_Trial2.csv")

In [None]:
# Trial run of the single-step prompt on one hand-picked checklist requirement
# instead of the full checklist text; the result is displayed as a table.
reg_text="""Organisation shall have data protection policies and practices approved by
management, setting out the organisation’s approach to managing
personal data (include management of special categories of personal data
such as personal data of a sensitive nature) for  Customers, Job applicants, visitors etc via External data protection
notices"""
singlestepprompt = regulatory_mapping(reg_text,policy_text)
response_json_chatgpt4 = call_chatgpt4(singlestepprompt)
pd.DataFrame(response_json_chatgpt4["Regrequirement_Policy_Judegment"])

In [None]:
# Rebuild the table from the latest response and show the policy clause of the
# row labelled 0 in full.
df = pd.DataFrame(response_json_chatgpt4["Regrequirement_Policy_Judegment"])
df["Policy_Documentation"][0]

### TRACE-inspired Workflow Prompting

### Step 1: Checklist requirement extraction from regulatory document

In [None]:
def regulatory_requirement_extraction_forcategory(regulatory_text_fromfile, category_of_interest):
    """Build the Step 1 prompt that extracts checklist requirements for one category.

    The prompt embeds the regulatory text between ``<data>`` tags and asks
    for a JSON object whose "Checklist_requirements" list has one entry per
    checklist item of the requested category.

    Args:
        regulatory_text_fromfile: Text of the regulatory checklist document.
        category_of_interest: Name of the checklist category to restrict the
            extraction to.

    Returns:
        The fully formatted prompt string.
    """
    analysis_prompt = """
    You are an auditor in charge of assessing privacy regulations in Singapore. The Data Protection Trustmark (DPTM) checklist applicable for Singapore is provided between the <data> XML like tags.
    Your objective is to extract all regulatory requirements from the document. 
    Follow the detailed instructions below to analyze regulatory document:
    
    1. Read through the entire regulatory document . 
    2. For each checklist item, identify the category of the checklist requirement, the subcategory and finally who it applies to and the corresponding policy document on which details needs to be found.  
    3. Use only the information provided in the regulatory document
    4. For each item on the checklist corresponding to ONLY the category "{category_of_interest}" extract the following information:
        Category: The principle on which the checklist requirement is based. 
        Title: The subcategory to which the checklist requirement belongs.
        Stakeholder: The stakeholder to whom the checklist requirement applies
        Description: The exact checklist requirement from the "checklist" column of the given document. Do not summarise or combine the checklist description from multiple rows. If the same checklist is applicable to different stakeholders make sure to include each as a separate row in the table. If there are different items that need to be checked, list each requirement as a separate row.  
        Mapped Policy Document: An educated guess of the policy document that would have details for the checklist requirement. If not known, specify UNKNOWN
        
        The regulatory document is provided below:

        <data>
        {regulatory_text}
        </data>

        Provide the output in the following strictly in serializable JSON format with correct indent all over the output.    
    
        {{
            "Checklist_requirements": [
            {{
            "Category" : The principle on which the checklist requirement is based as a string,
            "Title": A short 5-6 word subcategory to which the checklist requirement belongs to as a string,
            "Stakeholder" : The stakeholder to whom the checklist requirement applies to as a string,
            "Description" : The complete description of the checklist requirement as string,
            "Mapped_Policy_Document" : "The policy document name (5-6 words) to check for this requirement"
            }},
            ....
            ]
        }}
   

    """
    # Doubled braces in the template are literal JSON braces; only the two
    # named fields are substituted.
    return analysis_prompt.format(
        regulatory_text=regulatory_text_fromfile,
        category_of_interest=category_of_interest,
    )

In [None]:
# Extract the checklist requirements one category at a time, then stack the
# per-category tables into a single DataFrame.
category_frames = []
for category in list_of_categories:
    prompt_withcategory = regulatory_requirement_extraction_forcategory(regulatory_text, category)
    print(category)
    print(len(prompt_withcategory))
    response_json_chatgpt4 = call_chatgpt4(prompt_withcategory)
    print("Num of requirements extracted")
    print(len(response_json_chatgpt4["Checklist_requirements"]))
    df = pd.DataFrame(response_json_chatgpt4["Checklist_requirements"])
    category_frames.append(df)

# A single concat avoids re-copying the growing table on every iteration and
# keeps the result defined (as an empty table) even when nothing was extracted.
if category_frames:
    df_checklist_req = pd.concat(category_frames, ignore_index=True)
else:
    df_checklist_req = pd.DataFrame(
        columns=["Category", "Title", "Stakeholder", "Description", "Mapped_Policy_Document"]
    )

df_checklist_req.shape

In [None]:
#df_checklist_req.to_csv("Step1_Reg_Requirements.csv")

In [None]:
# Read from file
# Reloads the Step 1 requirements from CSV; this rebinds df_checklist_req,
# replacing whatever the extraction loop produced in this session.
df_checklist_req = pd.read_csv("Step1_Reg_Requirements.csv", index_col=0)
print(df_checklist_req.shape)
df_checklist_req.head()

In [None]:
# Read from file
# Loads the checklist table from "DPTM_Checklist_Full.csv" (first column used
# as the index); this is the table the Step 2 mapping loop filters by category.
df_checklist_req_full = pd.read_csv("DPTM_Checklist_Full.csv", index_col=0)
print(df_checklist_req_full.shape)
df_checklist_req_full.head()

In [None]:
# Number of checklist requirements per category in the full checklist table.
df_checklist_req_full["Category"].value_counts()

## Step 2: Policy Information extraction from policy document for regulatory checklist requirement

We extract the list of regulatory requirements for which we want to look for policy clauses.

In [None]:
# df_external_policy = df_checklist_req[df_checklist_req["Mapped_Policy_Document"].str.contains("External")]
# requirements_of_interest = df_external_policy["Description"].tolist()

In [None]:
# Keep only the "Individual’s Rights" rows and collect their descriptions.
df_rights_policy = df_checklist_req.loc[df_checklist_req["Category"].eq("Individual’s Rights")]
requirements_of_interest = df_rights_policy["Description"].tolist()
print(len(requirements_of_interest), requirements_of_interest, sep="\n")

In [None]:
def policy_extraction_forregrequirement(policy_text_fromfile, requirements_of_interest):
    """Build the Step 2 prompt that maps policy clauses to regulatory requirements.

    The requirements are interpolated with ``str.format`` (so a list appears
    in its printed form) and the policy text is embedded between ``<data>``
    tags. The prompt asks for a JSON object whose "Regrequirement_Policy_Map"
    list pairs each requirement with the matching policy clauses and section.

    Args:
        policy_text_fromfile: Text of the policy document.
        requirements_of_interest: The regulatory requirements to look up.

    Returns:
        The fully formatted prompt string.
    """
    prompt_template = """
    You are an auditor in charge of assessing privacy regulations in Singapore. 
    Your objective is to extract relevant clauses from the policy document corresponding to these regulatory requirements {requirements_of_interest}
    The organization's policy is provided between the <data> XML like tags.
    The policy document text is provided below:

        <data>
        {policy_text}
        </data>

    
    Follow the detailed instructions below to map the policy to the corresponding regulatory requirement:    
    1. Read through the entire policy document . 
    2. For each regulatory requirement item, identify and extract the exact clauses from the policy document provided that maps to the requirement.        

        Provide the output in the following strictly serializable JSON format with correct indent all over the output.    
    
        {{
            "Regrequirement_Policy_Map": [
            {{
            "Regulatory_Requirement_Description" : The complete description of the checklist requirement as string,
            "Policy_Documentation" : The exact clause or clauses from the policy documentation as a string. Make sure to get complete information from the policy text,
            "Policy_Category" : The section of the policy document under which the policy is described under as a string
            }},
            ....
            ]
        }}
   

    """
    # Doubled braces above are literal JSON braces; only these two fields are filled in.
    substitutions = {
        "requirements_of_interest": requirements_of_interest,
        "policy_text": policy_text_fromfile,
    }
    return prompt_template.format(**substitutions)

In [None]:
# For each category, ask the model to map policy clauses to that category's
# requirements, then stack the per-category mappings into one table.
policy_map_frames = []
for category in list_of_categories:
    df_policy = df_checklist_req_full[df_checklist_req_full["Category"] == category]
    requirements_of_interest = df_policy["Description"].tolist()
    print(len(requirements_of_interest))
    mappingprompt = policy_extraction_forregrequirement(policy_text, requirements_of_interest)
    print(category)
    print(len(mappingprompt))
    response_json_chatgpt4 = call_chatgpt4(mappingprompt)
    print("Num of policy clause extracted:")
    print(len(response_json_chatgpt4["Regrequirement_Policy_Map"]))
    df = pd.DataFrame(response_json_chatgpt4["Regrequirement_Policy_Map"])
    policy_map_frames.append(df)

# A single concat avoids re-copying the growing table on every iteration and
# keeps the result defined (as an empty table) even when nothing was mapped.
if policy_map_frames:
    df_reg_policy_map = pd.concat(policy_map_frames, ignore_index=True)
else:
    df_reg_policy_map = pd.DataFrame(
        columns=["Regulatory_Requirement_Description", "Policy_Documentation", "Policy_Category"]
    )

df_reg_policy_map.to_csv("Step2_Reg_Policy_Mapping_DBS_All.csv")

### Step 3 : Regulatory Prompting

In [None]:
def regrequirement_policy_alignment(reg_requirement,policy_documentation):
    """Build the Step 3 prompt that judges one requirement against its policy text.

    The requirement is presented as the major premise and the policy
    documentation as the minor premise; the prompt asks for a JSON object
    whose "Regrequirement_Policy_Judgement" list carries the judgement, its
    reasoning and the type of evidence.

    Args:
        reg_requirement: Description of the regulatory requirement.
        policy_documentation: Policy clause(s) mapped to that requirement.

    Returns:
        The fully formatted prompt string.
    """
    regulatory_prompt = """
    You are an auditor in charge of assessing privacy regulations in Singapore.
    Your objective is to ensure that the policy documentation of an organization is in alignment with the regulatory requirements set by the regulators.

    Taking the regulatory requirement below as the major premise 
    {reg_requirement}.
    
    and the corresponding policy documentation as the minor premise
    {policy_documentation}.

    Based on the above assertions, extract the following:
    Judgement: Not Met, Partially Met and Completely Met
    Explanation of the Judgement: reasoning behind the judgement
    Type of Evidence: make an educated guess on the type of evidence required to check compliance of the policy. 
    Pick from Inspection, Observation, Inquiry, External Confirmation, Reperformance, Recalculation, Analytical Procedures and others.

    Provide the output in the following strictly serializable JSON format with correct indent all over the output.    
    
        {{
            "Regrequirement_Policy_Judgement": [
            {{
            "Judgement" : as a string,
            "Judgement_Reasoning" : as a string,
            "Type_of_Evidence" : as a string
            }},
            ....
            ]
        }}

    """
    # Doubled braces in the template are literal JSON braces; only the two
    # named fields are substituted.
    return regulatory_prompt.format(reg_requirement=reg_requirement,policy_documentation=policy_documentation)

In [None]:
# Judge every requirement/policy pair produced by Step 2 and collect the results.
judgement_frames = []
for _, row in df_reg_policy_map.iterrows():
    reg_requirement = row["Regulatory_Requirement_Description"]
    policy_documentation = row["Policy_Documentation"]
    regulatoryprompt = regrequirement_policy_alignment(reg_requirement, policy_documentation)
    response_json_chatgpt4 = call_chatgpt4(regulatoryprompt)
    df = pd.DataFrame(response_json_chatgpt4["Regrequirement_Policy_Judgement"])
    # Carry the inputs alongside the judgement so each output row is self-describing.
    df["Regulatory_Requirement_Description"] = reg_requirement
    df["Policy_Documentation"] = policy_documentation
    judgement_frames.append(df)

# Accumulating in a list (instead of branching on the row's index label) works
# for any index, including one that does not start at 0, and avoids re-copying
# the growing table on every iteration.
if judgement_frames:
    df_reg_policy_judgement_map = pd.concat(judgement_frames, ignore_index=True)
else:
    df_reg_policy_judgement_map = pd.DataFrame(
        columns=[
            "Judgement",
            "Judgement_Reasoning",
            "Type_of_Evidence",
            "Regulatory_Requirement_Description",
            "Policy_Documentation",
        ]
    )

In [None]:
# Save the Step 3 judgements with the requirement and policy columns first.
df_reg_policy_judgement_map[["Regulatory_Requirement_Description","Policy_Documentation","Judgement", "Judgement_Reasoning", "Type_of_Evidence"]].to_csv("Step3_Reg_Policy_Mapping_with_Judgement_DBS.csv")

### Step 4: Regulatory Requirement to Evidence Mapping

In [None]:
# Recruitment privacy notices used as evidence documents in Steps 4 and 5.
grab_recruitment_document = "Grab Recruitment Privacy Notice_Grab Careers.pdf"
dbs_recruitment_document = "DBS_Recruitment Policy.pdf"

# Extract the text of both evidence PDFs.
# NOTE(review): the DBS variable is spelled "db_..." (not "dbs_..."); a later
# cell reads it under this exact name, so it must not be renamed here alone.
grab_recruitment_evidence_text = text_from_pdf(grab_recruitment_document)
db_recruitment_evidence_text = text_from_pdf(dbs_recruitment_document)

In [None]:
# Requirements whose description mentions "job" (case-insensitive).
# `case=False, na=False` replaces the earlier `.apply(str.lower)`, which raised
# TypeError on missing or non-string descriptions; such rows are now excluded.
_requirement_descriptions = df_reg_policy_map["Regulatory_Requirement_Description"]
jobapplicants_reg_requirements = _requirement_descriptions[
    _requirement_descriptions.str.contains("job", case=False, na=False)
].to_list()
jobapplicants_reg_requirements

In [None]:
# Context sentence passed to the evidence-mapping prompt for the DBS document.
context_of_evidence_dbs = (
    "This evidence document is the privacy policy notice that appears as the first step \n"
    "when candidates click apply for a job posting via the careers webpage of DBS https://www.dbs.com/careers/"
)

In [None]:
# Context sentence passed to the evidence-mapping prompt for the Grab document
# ("privay" corrected to "privacy", since this text is sent to the model).
context_of_evidence_grab = """This evidence document is Grab's recruitment privacy policy that is available in 
Grab's careers webpage https://www.grab.careers/en/privacy-policy/"""

In [None]:
def evidence_forregrequirement(evidence_text_fromfile, context_of_evidence, requirements_of_interest):
    """Build the Step 4 prompt that maps evidence clauses to regulatory requirements.

    The requirements are interpolated with ``str.format`` (so a list appears
    in its printed form), the evidence text is embedded between ``<data>``
    tags, and the context sentence follows it. The prompt asks for a JSON
    object whose "Regrequirement_Evidence_Map" list pairs each requirement
    with the matching evidence clauses.

    Args:
        evidence_text_fromfile: Text of the evidence document.
        context_of_evidence: Free-text description of the evidence document.
        requirements_of_interest: The regulatory requirements to look up.

    Returns:
        The fully formatted prompt string.
    """
    prompt_template = """
    You are an auditor in charge of assessing evidence for privacy regulations in Singapore. 
    Your objective is to check the relevance of the evidence text given and extract relevant clauses from the evidence corresponding to these regulatory requirements {requirements_of_interest}
    The evidence text is provided between the <data> XML like tags below.
        <data>
        {evidence_text_fromfile}
        </data>

    {context_of_evidence}
    
    Follow the detailed instructions below to map the policy to the corresponding regulatory requirement:    
    1. Read through the entire evidence document . 
    2. For each regulatory requirement item, identify and extract the exact clauses from the evidence document provided that maps to the requirement.        

        Provide the output in the following strictly serializable JSON format with correct indent all over the output.    
    
        {{
            "Regrequirement_Evidence_Map": [
            {{
            "Regulatory_Requirement_Description" : The complete description of the checklist requirement as string,
            "Evidence_Documentation" : The exact clause or clauses from the evidence documentation as a string. Make sure to get complete information from the evidence text
            }},
            ....
            ]
        }}
   

    """
    # Doubled braces above are literal JSON braces; only these three fields are filled in.
    substitutions = {
        "requirements_of_interest": requirements_of_interest,
        "evidence_text_fromfile": evidence_text_fromfile,
        "context_of_evidence": context_of_evidence,
    }
    return prompt_template.format(**substitutions)

In [None]:
# NOTE(review): `df` here is whatever DataFrame an earlier cell last bound to
# that name, not a Step 4 evidence mapping, and the filename contains a double
# underscore. The DBS evidence mapping is written under the single-underscore
# name by the cell that builds df_reg_evidence_map_dbs — confirm whether this
# cell is a leftover.
df.to_csv("Step4__Reg_Evidence_Mapping_DBS_JobApplicants.csv")

In [None]:
# Step 4 for DBS: map the job-applicant requirements to clauses of the DBS
# recruitment evidence text, save the mapping, and display it.
mapping_prompt = evidence_forregrequirement(db_recruitment_evidence_text,context_of_evidence_dbs,jobapplicants_reg_requirements)
response_json_chatgpt4 = call_chatgpt4(mapping_prompt)
#print(len(response_json_chatgpt4["Regrequirement_Evidence_Map"]))
df_reg_evidence_map_dbs = pd.DataFrame(response_json_chatgpt4["Regrequirement_Evidence_Map"])
df_reg_evidence_map_dbs.to_csv("Step4_Reg_Evidence_Mapping_DBS_JobApplicants.csv")
df_reg_evidence_map_dbs

In [None]:
# Step 4 for Grab: map the job-applicant requirements to clauses of the Grab
# recruitment evidence text, save the mapping, and display it.
mapping_prompt = evidence_forregrequirement(grab_recruitment_evidence_text,context_of_evidence_grab,jobapplicants_reg_requirements)
response_json_chatgpt4 = call_chatgpt4(mapping_prompt)
#print(len(response_json_chatgpt4["Regrequirement_Evidence_Map"]))
df_reg_evidence_map_grab = pd.DataFrame(response_json_chatgpt4["Regrequirement_Evidence_Map"])
df_reg_evidence_map_grab.to_csv("Step4_Reg_Evidence_Mapping_Grab_JobApplicants.csv")
df_reg_evidence_map_grab

### Step 5: Evidence Judgement

In [None]:
def regrequirement_evidence_alignment(reg_requirement,evidence_documentation):
    """Build the Step 5 prompt that judges one requirement against its evidence text.

    The requirement is presented as the major premise and the evidence
    documentation as the minor premise; the prompt asks for a JSON object
    whose "Regrequirement_Evidence_Judgement" list carries the judgement and
    its reasoning.

    Args:
        reg_requirement: Description of the regulatory requirement.
        evidence_documentation: Evidence clause(s) mapped to that requirement.

    Returns:
        The fully formatted prompt string.
    """
    prompt_template = """
    You are an auditor in charge of assessing evidence for privacy regulations in Singapore. 
    Your objective is to ensure that the evidence documentation of an organization is in alignment with the regulatory requirements set by the regulators.

    Taking the regulatory requirement below as the major premise 
    {reg_requirement}.
    
    and the corresponding evidence documentation as the minor premise
    {evidence_documentation}.

    Based on the above assertions, extract the following:
    Judgement: Not Met, Partially Met and Completely Met
    Explanation of the Judgement: reasoning behind the judgement

    Provide the output in the following strictly serializable JSON format with correct indent all over the output.    
    
        {{
            "Regrequirement_Evidence_Judgement": [
            {{
            "Judgement" : as a string,
            "Judgement_Reasoning" : as a string
            }},
            ....
            ]
        }}

    """
    # Doubled braces above are literal JSON braces; only these two fields are filled in.
    substitutions = {
        "reg_requirement": reg_requirement,
        "evidence_documentation": evidence_documentation,
    }
    return prompt_template.format(**substitutions)

In [None]:
# Judge every requirement/evidence pair from the DBS evidence mapping and
# collect the results.
evidence_judgement_frames = []
for index, row in df_reg_evidence_map_dbs.iterrows():
    reg_requirement = row["Regulatory_Requirement_Description"]
    evidence_documentation = row["Evidence_Documentation"]
    regulatoryprompt = regrequirement_evidence_alignment(reg_requirement, evidence_documentation)
    print(index)  # progress indicator: label of the row being judged
    response_json_chatgpt4 = call_chatgpt4(regulatoryprompt)
    df = pd.DataFrame(response_json_chatgpt4["Regrequirement_Evidence_Judgement"])
    # Carry the inputs alongside the judgement so each output row is self-describing.
    df["Regulatory_Requirement_Description"] = reg_requirement
    df["Evidence_Documentation"] = evidence_documentation
    evidence_judgement_frames.append(df)

# Accumulating in a list (instead of branching on the row's index label) works
# for any index, including one that does not start at 0, and avoids re-copying
# the growing table on every iteration.
if evidence_judgement_frames:
    df_reg_evidence_judgement_map = pd.concat(evidence_judgement_frames, ignore_index=True)
else:
    df_reg_evidence_judgement_map = pd.DataFrame(
        columns=[
            "Judgement",
            "Judgement_Reasoning",
            "Regulatory_Requirement_Description",
            "Evidence_Documentation",
        ]
    )

In [None]:
# Save the Step 5 evidence judgements for the DBS job-applicant requirements.
df_reg_evidence_judgement_map.to_csv("Step5_Reg_Evidence_Mapping_with_Judgement_DBS_jobapplicants.csv")