In [None]:
!pip install openai tiktoken tabulate


In [3]:
import fitz
import tiktoken
import os
import re
import openai
import dotenv
from tabulate import tabulate
from typing import List, Dict, Any, Tuple
from openai import OpenAI


# Load environment variables from .env file
dotenv.load_dotenv()
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


In [4]:
my_path = "dataset/pdf/IT Outsourcing.PDF"

# Extract Filename
filename = os.path.basename(my_path)
filename = os.path.splitext(filename)[0]
print(filename)


IT Outsourcing


In [5]:
doc = fitz.open(my_path)

with open(f'dataset/Extracted Text/{filename}.txt', "w", encoding="utf-8") as output_file:
    for page in doc:
        text = page.get_text()
        output_file.write(text + "\n")


# Read the extracted text from file
with open(f'dataset/Extracted Text/{filename}.txt', 'r', encoding='utf-8') as file:
    text = file.read()


In [14]:
PROMPT = """


### LLM Prompt for Extracting and Structuring RBI Document into CSV Format

**Objective**:  
You are an advanced Large Language Model tasked with processing a raw text file containing the Reserve Bank of India (RBI) Master Direction on Outsourcing of Information Technology Services (Reference: RBI/2023-24/102, dated April 10, 2023). Your goal is to extract the relevant content and convert it into a structured CSV format with the following columns: `Chapter`, `Section No.`, `Section`, `Sub-Section`, and `TPM Policy sub point`. The output must exclude the index and appendices (Appendix I, II, III), focus on the introductory section and Chapters I to X, and align with the provided example format. The CSV must handle HTML tags, ensure proper quoting, and assign TPM policy references where applicable.

**Input**:  
The input is a raw text file containing the full RBI Master Direction document, structured as follows:  
- A header with document reference (RBI/2023-24/102), date (April 10, 2023), and recipient details.  
- An introductory section explaining the purpose and context of the Directions.  
- Ten chapters (I to X), covering:  
  - Chapter I: Preliminary (definitions, applicability, commencement).  
  - Chapter II: Role of the Regulated Entity (regulatory requirements, assessments).  
  - Chapter III: Governance Framework (policies, roles).  
  - Chapter IV: Evaluation and Engagement of Service Providers (due diligence).  
  - Chapter V: Outsourcing Agreement (legal agreements, clauses).  
  - Chapter VI: Risk Management (framework, business continuity).  
  - Chapter VII: Monitoring and Control of Outsourced Activities (audits, oversight).  
  - Chapter VIII: Outsourcing within a Group / Conglomerate.  
  - Chapter IX: Cross-Border Outsourcing.  
  - Chapter X: Exit Strategy.  
- An index (to be ignored).  
- Appendices I, II, and III (to be ignored).  

The text may include HTML tags (e.g., `<span style="font-weight: bold;">`) for formatting, numbered sections (e.g., 1, 2), subsections (e.g., (a), (b), (i)), and detailed provisions. The content includes regulatory guidelines, definitions, and requirements for Regulated Entities (REs).

**Instructions**:  

1. **Document Processing**:  
   - Parse the raw text to identify and extract the main content, including the introductory section and Chapters I to X.  
   - Exclude the index and appendices (Appendix I, II, III) from processing, as they are not to be included in the output.  
   - Recognize the document hierarchy:  
     - Chapters (e.g., "I - Preliminary", "II - Role of the Regulated Entity").  
     - Sections (e.g., "1. Short Title & Commencement", "2. Applicability").  
     - Subsections (e.g., "(a)", "(b)", "(i)", or detailed text with lists).  
   - Strip any HTML tags (e.g., `<span style="...">`) and retain only the plain text content, preserving the meaning and structure.  
   - For the introductory section, treat it as a standalone section under `Chapter: General` with no `Section No.` or `Section` title, as shown in the example.

2. **CSV Structure**:  
   - Create a CSV table with the following columns:  
     - `Chapter`: The chapter number and title (e.g., "I - Preliminary", "II - Role of the Regulated Entity").  
     - `Section No.`: The section number (e.g., "1", "2", or empty for chapter-level entries or the introductory section).  
     - `Section`: The section title (e.g., "Short Title & Commencement", "Applicability", or empty for the introductory section).  
     - `Sub-Section`: The subsection identifier or full text of the subsection (e.g., "(a)", "(b)", or detailed text including lists or conditions, as shown in the example).  
     - `TPM Policy sub point`: The corresponding policy reference (e.g., "1.2 Definition", "3.2 Supplier engagement process (Procurement FIM)") or "not specifically mentioned" if no reference applies.  
   - Each row should represent a section or subsection, with chapter-level entries (e.g., introductory section) included as needed.  
   - For sections with multiple subsections (e.g., Section 16 in Chapter V with clauses a–w), group related clauses into a single row (e.g., a–f, g–m, n–s, t–w) to match the example format, ensuring all clauses are included concisely.

3. **Content Extraction and Summarization**:  
   - For the introductory section, summarize the purpose in the `Sub-Section` column (e.g., "The RBI issued these Directions to regulate outsourcing of IT and IT-enabled Services by Regulated Entities to manage risks, ensuring no compromise to customer obligations or RBI supervision.") if no specific subsection text is provided.  
   - For numbered sections and subsections, extract the exact text as provided in the document, removing HTML tags and formatting.  
   - For subsections with detailed lists or conditions (e.g., Section 1(I)(i), 16(a)–(w)), include the full text in the `Sub-Section` column, ensuring clarity and completeness.  
   - Assign `TPM Policy sub point` values based on the provided example data. Use the following references where applicable, or "not specifically mentioned" if no reference is provided:  
     - "1.1B" (e.g., for applicability to specific entities).  
     - "1.2 Definition" (e.g., for definitions like "Group", "Material Outsourcing").  
     - "1.2 Definition of TPEM, 9. Governance, roles and responsibilities" (e.g., for regulatory obligations).  
     - "3.2 Supplier engagement process (Procurement FIM)" (e.g., for agreements, conflicts of interest).  
     - "Section 3" (e.g., for outsourcing need assessment).  
     - "4.3 Outsourcing assessments" (e.g., for compliance with laws).  
     - "10. Redressal of customer grievances" (e.g., for grievance mechanisms).  
     - "6.4 Outsourcing register" (e.g., for inventory of services).  
     - "9.1 Country Governance" (e.g., for IT outsourcing policy).  
     - "9. Governance, Roles and Responsibilities" (e.g., for Board roles).  
     - "3.6 IT Governance" (e.g., for Senior Management roles).  
     - "3. Engagement Initiation: Pre-Requisite Activities" (e.g., for due diligence).  
     - "3. Engagement Initiation: Pre-Requisite Activities, 4. Third Party Assessment" (e.g., for due diligence aspects).  
     - "4. Third Party Assessment, 7.2 Ongoing Risk Management, 6. Approval Requirements for Outsourcing engagements, 3.2.5 Information Security Review, 3. Sub-contracting" (e.g., for risk management framework).  
     - "3.2.3 Business Continuity Plan" (e.g., for BCP requirements).  
     - "3.2.4 Exit Strategy Plan" (e.g., for contingency plans).  
     - "3.2.5 Information Security Review, 3.5 Data sharing approval" (e.g., for data isolation).  
     - "7.1.2 Monitoring performance metrics" (e.g., for monitoring structures).  
     - "7.4 Independent Review / Audit Review" (e.g., for audits).  
     - "8.4 Disengagement / Termination" (e.g., for termination publicity).  
     - "5.1 Supplier agreements" (e.g., for access to data).  
     - "Definition" (e.g., for group outsourcing).  
     - "1.3. Intended Audience and Outcomes" (e.g., for group entity selection).  
     - "1.1.B Notes, 5.2.4 Transfer Pricing and Taxation, 1.3. Intended Audience and Outcomes" (e.g., for arm's length relationships).  
     - "3.8 Additional guidelines laid down by RBI for off-shore outsourcing" (e.g., for cross-border requirements).  

4. **CSV Formatting**:  
   - Ensure the output is a valid CSV format with comma-separated values.  
   - Enclose fields in double quotes (`"`) if they contain commas, quotes, or newlines to prevent parsing errors (e.g., `"I - Preliminary"`, `"The RBI issued these Directions, ensuring compliance"`).  
   - Include a header row with the column names: `Chapter`, `Section No.`, `Section`, `Sub-Section`, `TPM Policy sub point`.  
   - Avoid including non-CSV content (e.g., notes, comments) within the CSV. If a note is needed, place it outside the CSV content, after the closing ``` marker.  
   - Ensure each row has exactly 5 columns, matching the headers.

5. **Handling Special Cases**:  
   - For the introductory section, use `Chapter: General`, leave `Section No.` and `Section` empty, and provide a summarized description in `Sub-Section` (e.g., "The RBI issued these Directions to regulate outsourcing of IT and IT-enabled Services by Regulated Entities to manage risks, ensuring no compromise to customer obligations or RBI supervision.").  
   - For sections with complex subsections (e.g., Section 16 in Chapter V), group clauses into logical sets (e.g., a–f, g–m, n–s, t–w) to match the example format, ensuring all clauses are included in the `Sub-Section` column.  
   - For definitions (Section 3), include each defined term (e.g., "Group", "Material Outsourcing") as a separate row with its definition in `Sub-Section`.  
   - If the document contains ambiguities (e.g., unclear section boundaries), assume the structure follows the provided example and assign `TPM Policy sub point` as "not specifically mentioned" unless a clear match exists.  
   - Remove footnotes (e.g., "2*", "3*") and their explanations, including them only if part of the main subsection text.

6. **Output Requirements**:  
   - Provide the CSV content as plain text within triple backticks (```csv ... ```) to ensure proper formatting.  
   - Include the introductory section and all relevant sections from Chapters I to X, excluding the index and appendices.  
   - If the document is too large to process fully, provide a complete CSV for the introductory section, Chapter I, and Chapter II, and include a note outside the CSV indicating that the structure can be extended to remaining chapters.  
   - Ensure the CSV is valid and can be parsed by standard CSV parsers (e.g., Python’s `csv` module).  
   - Users should be able to copy the CSV content into a text editor and save it as `rbi_it_outsourcing.csv` for further use.

7. **Constraints**:  
   - Do not generate images or other non-text outputs.  
   - Do not modify the document’s content or invent details; base all entries on the provided text.  
   - If any part of the document is unclear (e.g., missing section titles), make reasonable assumptions based on context and note them in a comment outside the CSV (e.g., "Assumed introductory section summary based on document purpose").  
   - Exclude the index and appendices, even if referenced in the main text.  

8. **Example Output Format**:  
```csv
Chapter,Section No.,Section,Sub-Section,TPM Policy sub point
"General",,"Introduction","The RBI issued these Directions to regulate outsourcing of IT and IT-enabled Services by Regulated Entities to manage risks, ensuring no compromise to customer obligations or RBI supervision.","not specifically mentioned"
"I - Preliminary","1","Short Title & Commencement","a. These Directions shall be called the Reserve Bank of India (Outsourcing of Information Technology Services) Directions, 2023.","not specifically mentioned"
"I - Preliminary","1","Short Title & Commencement","b. These Directions shall come into effect from October 1, 2023.","not specifically mentioned"
"I - Preliminary","2","Applicability","(a) These Directions shall be applicable to the following entities, collectively referred to as ‘regulated entities’ or ’REs’ in these directions: i) Commercial Banks (Includes banks incorporated outside India licensed to operate in India (‘Foreign Banks’), Local Area Banks (LABs), Small Finance Banks (SFBs), Payments Banks (PBs)) (ii) Primary Co-operative Banks / Urban Co-operative Banks (iii) Non-Banking Financial Companies (iv) Credit Information Companies. (v) EXIM Bank, NABARD, NaBFID, NHB and SIDBI","1.1B"
...
```
Note: The above CSV includes the introductory section and Chapter I. The structure can be extended to Chapters II–X.

**Task**:  
Process the provided raw text file containing the RBI Master Direction document. Extract the introductory section and Chapters I to X, excluding the index and appendices. Generate a structured CSV output with the columns `Chapter`, `Section No.`, `Section`, `Sub-Section`, and `TPM Policy sub point`, following the instructions above. Ensure all fields are properly quoted to handle commas, and assign TPM policy references based on the provided example. If the output is too long, provide a sample covering the introductory section, Chapter I, and Chapter II, with a note indicating how to extend it. Place any notes or comments outside the CSV content.

**Additional Notes**:  
- If the raw text contains HTML tags, strip them completely and use the plain text content.  
- For lengthy subsections (e.g., Section 16 with clauses a–w), group clauses into logical sets to match the example format, ensuring all content is included.  
- If TPM policy references are unclear, default to "not specifically mentioned" unless a clear match exists from the provided example.  
- Ensure the CSV is user-friendly for regulatory compliance tracking, with clear subsection details and policy references.

**Response Format**:  
```csv
Chapter,Section No.,Section,Sub-Section,TPM Policy sub point
...
```
Note: Any additional comments or assumptions should appear here, outside the CSV.

"""


In [15]:
# Get the encoding for gpt-4o-mini
encoding = tiktoken.encoding_for_model("gpt-4o-mini")
tokens = encoding.encode(text)
num_tokens = len(tokens)
print(f"Number of tokens: {num_tokens}")

# Calulate the number of tokens in the prompt
prompt_tokens = encoding.encode(PROMPT)
num_prompt_tokens = len(prompt_tokens)
print(f"Number of tokens in prompt: {num_prompt_tokens}")

# Calculate the total number of tokens
total_tokens = num_tokens + num_prompt_tokens
print(f"Total number of tokens: {total_tokens}")


Number of tokens: 11444
Number of tokens in prompt: 3101
Total number of tokens: 14545


In [16]:
import csv
from io import StringIO
from typing import List

def structure_text_with_gpt(text):
    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": PROMPT + text}
        ],
        temperature=0,
    )
    structured_text = response.choices[0].message.content
    return structured_text




In [17]:
# Structure the text using GPT
structured_text = structure_text_with_gpt(text)
print("Structured Text:")
print(structured_text)


Structured Text:
```csv
Chapter,Section No.,Section,Sub-Section,TPM Policy sub point
"General",,"Introduction","The RBI issued these Directions to regulate outsourcing of IT and IT-enabled Services by Regulated Entities to manage risks, ensuring no compromise to customer obligations or RBI supervision.","not specifically mentioned"
"I - Preliminary","1","Short Title & Commencement","(a) These Directions shall be called the Reserve Bank of India (Outsourcing of Information Technology Services) Directions, 2023.","not specifically mentioned"
"I - Preliminary","1","Short Title & Commencement","(b) These Directions shall come into effect from October 1, 2023.","not specifically mentioned"
"I - Preliminary","1","Short Title & Commencement","I. With respect to existing outsourcing arrangements that are already in force as on the date of issuance of this Master Direction, REs shall ensure that: i. the agreements that are due for renewal before October 1, 2023 shall comply with the provisions 

In [19]:
# Parse the structured text into a list of lists
structured_data = parse_structured_text(structured_text)
# Print the structured data as a table
print("Parsed Structured Data:")
print_table(structured_data)


Parsed Structured Data:
+-----------------+-------------------------------------------+-------------------------------------------------------------------------+--------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------+------------------+------------------------------+
| Document_ID     | Chapter                                   | Section                                                                 | Subsection   | Description                                                                                                                                                                                                                                                  

In [23]:
import csv
with open(f'dataset/Excel Sheets/{filename}.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Document_ID", "Chapter", "Section", "Sub Section" , "Description", "Compliance_Requirements", "Effective_Date", "Applicability", "Role Assigned To"])
    writer.writerows(structured_data)
    
    

In [None]:
# !pip install PyPDF2


In [None]:
# First, install PyPDF2 if you haven't already
# pip install PyPDF2

from PyPDF2 import PdfReader

def extract_pdf_to_txt(pdf_path, txt_path):
    try:
        # Create a PDF reader object
        pdf_reader = PdfReader(pdf_path)
        
        # Get the number of pages
        num_pages = len(pdf_reader.pages)
        
        # Initialize a string to store all extracted text
        extracted_text = ""
        
        # Extract text from each page
        for page_num in range(num_pages):
            # Get the page object
            page = pdf_reader.pages[page_num]
            # Extract text from page
            text = page.extract_text()
            # Add page number and text to the extracted content
            extracted_text += f"\n--- Page {page_num + 1} ---\n{text}\n"
        
        # Write the extracted text to a file
        with open(txt_path, 'w', encoding='utf-8') as txt_file:
            txt_file.write(extracted_text)
            
        print(f"Text successfully extracted and saved to {txt_path}")
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")

# Example usage
pdf_file_path = "dataset/IT Outsourcing.PDF"  # Replace with your PDF file path
txt_file_path = "dataset/output.txt"     # Replace with desired output text file path

extract_pdf_to_txt(pdf_file_path, txt_file_path)


In [None]:
import re
import csv

# Read the extracted text from file
with open('dataset/output.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Split text into lines
lines = text.split('\n')

# Initialize variables
current_chapter = ""
current_section = ""
current_section_heading = ""
current_subsection = ""
data = []

# Regular expressions
chapter_pattern = re.compile(r'^(CHAPTER\s*[–-]\s*[IVX]+|Chapter\s+[IVX]+)', re.IGNORECASE)
section_pattern = re.compile(r'^\d+\.\s+')
subsection_pattern = re.compile(r'^[a-z]\)\s+')

# Parse the text
for line in lines:
    line = line.strip()
    if not line:
        continue

    # Check for chapter
    if chapter_pattern.match(line):
        current_chapter = line.strip()
        current_section = ""
        current_section_heading = ""
        current_subsection = ""
        continue

    # Check for section
    if section_pattern.match(line):
        current_section = re.match(r'^\d+\.\s+', line).group(0).strip()
        # Extract the heading (text after the section number until the end of the line or next subsection)
        content = line[len(current_section):].strip()
        current_section_heading = content  # Assume this is the heading until a subsection appears
        current_subsection = ""
        data.append([current_chapter, current_section, current_section_heading, "", content])
        continue

    # Check for subsection
    if subsection_pattern.match(line):
        current_subsection = re.match(r'^[a-z]\)\s+', line).group(0).strip()
        content = line[len(current_subsection):].strip()
        # Use the current section heading for this subsection
        data.append([current_chapter, current_section, current_section_heading, current_subsection, content])
        continue

    # Append to previous content if no new chapter, section, or subsection
    if data:
        data[-1][-1] += " " + line

# Write to CSV
csv_file = "dataset/structured_output_with_headings.csv"
with open(csv_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Chapter", "Section", "Section Heading", "Subsection", "Content"])
    writer.writerows(data)

print(f"Data successfully written to {csv_file}")


In [None]:
import fitz


In [None]:
my_path = "dataset/DPSC - Digital Payment Security Controls.PDF"
doc = fitz.open(my_path)

with open("dataset/output.txt", "w", encoding="utf-8") as output_file:
    for page in doc:
        text = page.get_text()
        output_file.write(text + "\n")


In [None]:
import csv
import re

# Read the extracted text from file
with open('dataset/output.txt', 'r', encoding='utf-8') as file:
    pdf_text = file.read()

# Function to clean text (remove extra whitespace, newlines)
def clean_text(text):
    return ' '.join(text.split())

# Function to parse the index and return a dictionary of chapters and their sections
def parse_index(text):
    chapters = {}
    lines = text.split('\n')
    index_start = None
    index_end = None
    
    # Find index boundaries
    for i, line in enumerate(lines):
        if 'Index' in line.strip():
            index_start = i
        elif index_start is not None and (re.match(r'^\d+$', line.strip()) or 'Master Direction' in line):
            index_end = i
            break
    
    if index_start is None or index_end is None:
        return chapters
    
    current_chapter = None
    for line in lines[index_start:index_end]:
        line = line.strip()
        if not line:
            continue
        
        chapter_match = re.match(r'Chapter\s+([IVXLC]+)\s*[-–]?\s*(.+)', line, re.IGNORECASE)
        if chapter_match:
            chapter_num = chapter_match.group(1)
            chapter_title = chapter_match.group(2).strip()
            chapters[chapter_num] = {'title': chapter_title, 'sections': []}
            current_chapter = chapter_num
        elif current_chapter and not re.match(r'^\d+$', line) and line not in ['Introduction', 'Acronyms']:
            chapters[current_chapter]['sections'].append(line.strip())
    
    return chapters

# Function to parse the PDF content and structure it
def parse_pdf_to_structure(text):
    data = []
    chapters = parse_index(text)
    current_chapter = None
    current_section = None
    current_subsection = None
    current_content = []
    subsections_list = []
    
    lines = text.split('\n')
    
    for line in lines:
        line = line.strip()
        if not line or re.match(r'^\d+$', line):  # Skip empty lines and page numbers
            continue
        
        # Detect Chapter (match both "CHAPTER – I" and "Chapter III" formats)
        chapter_match = re.match(r'CHAPTER\s*[-–]?\s*([IVXLC]+)\s*(.*)', line, re.IGNORECASE)
        if chapter_match:
            # Save previous chapter data if exists
            if current_chapter and current_content:
                if subsections_list and current_subsection:
                    data.append({
                        'Chapter': current_chapter,
                        'Section': current_section,
                        'Subsection': current_subsection,
                        'Content': clean_text(' '.join(current_content))
                    })
                elif not subsections_list:
                    data.append({
                        'Chapter': current_chapter,
                        'Section': current_section,
                        'Subsection': '',
                        'Content': clean_text(' '.join(current_content))
                    })
            
            # Set new chapter
            chapter_num = chapter_match.group(1)
            chapter_title = chapter_match.group(2).strip() or chapters.get(chapter_num, {}).get('title', '')
            current_chapter = f"CHAPTER – {chapter_num} {chapter_title}".upper()
            current_section = chapters.get(chapter_num, {}).get('title', chapter_title).upper()
            subsections_list = chapters.get(chapter_num, {}).get('sections', [])
            current_subsection = None if subsections_list else ''
            current_content = []
            continue
        
        # Handle content for current chapter
        if current_chapter:
            # For chapters with subsections (e.g., I, II)
            if subsections_list:
                cleaned_line = re.sub(r'^\d+\.\s*', '', line).strip()
                if cleaned_line in subsections_list:
                    if current_subsection and current_content:
                        data.append({
                            'Chapter': current_chapter,
                            'Section': current_section,
                            'Subsection': current_subsection,
                            'Content': clean_text(' '.join(current_content))
                        })
                    current_subsection = cleaned_line
                    current_content = [line]
                elif current_subsection:
                    current_content.append(line)
            # For chapters without subsections (e.g., III, IV, V)
            else:
                current_content.append(line)
    
    # Append the last chapter's data
    if current_chapter and current_content:
        if subsections_list and current_subsection:
            data.append({
                'Chapter': current_chapter,
                'Section': current_section,
                'Subsection': current_subsection,
                'Content': clean_text(' '.join(current_content))
            })
        elif not subsections_list:
            data.append({
                'Chapter': current_chapter,
                'Section': current_section,
                'Subsection': '',
                'Content': clean_text(' '.join(current_content))
            })
    
    return data

# Parse the text
structured_data = parse_pdf_to_structure(pdf_text)

# Write to CSV
csv_file = "dataset/structured_output.csv"
with open(csv_file, 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['Chapter', 'Section', 'Subsection', 'Content'])
    writer.writeheader()
    writer.writerows(structured_data)

print(f"CSV file '{csv_file}' has been created successfully.")
