In [26]:
import os
import pandas as pd
import re
import logging
import dateutil.parser
from openai import OpenAI
import dotenv

# Setup logger for this module.
# The functions below report progress through logger.info(); without a
# configured handler and level those INFO records are discarded on a fresh
# kernel. basicConfig() does nothing when the root logger already has handlers,
# so this is safe to re-run and safe alongside an existing configuration.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Load environment variables from a .env file only when the key is not already
# present in the process environment, then build the shared OpenAI client.
if not os.getenv("OPENAI_API_KEY"):
    dotenv.load_dotenv()
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# File extensions (lower-case, without the dot) accepted by allowed_file().
ALLOWED_EXTENSIONS = {"pdf"}


In [42]:
def allowed_file(filename):
    """Tell whether ``filename`` carries an extension listed in ALLOWED_EXTENSIONS.

    The extension is the text after the last "." and is compared
    case-insensitively. Names without any "." are rejected.
    """
    if "." not in filename:
        return False
    extension = filename.rsplit(".", 1)[1]
    return extension.lower() in ALLOWED_EXTENSIONS

def _strip_field_label(value, label):
    """Trim ``value`` and drop one leading ``label`` (e.g. "Chapter:") if present."""
    value = value.strip()
    if value.startswith(label):
        # Only the leading label is removed; the same words appearing later in
        # the document text are part of the content and must survive.
        value = value[len(label):].strip()
    return value


def _parse_hierarchy_line(line):
    """Turn one pipe-delimited response line into a row dict, or None if malformed.

    A usable line has three or four fields in the order Chapter, Section No.,
    Section, Sub-Section; a missing Sub-Section becomes an empty string.
    """
    # maxsplit=3 keeps any "|" that occurs inside the Sub-Section text instead
    # of letting it push the field count past four and discard the entry.
    parts = line.strip().split("|", 3)
    if len(parts) < 3:
        return None
    if len(parts) == 3:
        parts.append("")
    return {
        "Chapter": _strip_field_label(parts[0], "Chapter:"),
        "Section No.": _strip_field_label(parts[1], "Section No.:"),
        "Section": _strip_field_label(parts[2], "Section:"),
        "Sub-Section": _strip_field_label(parts[3], "Sub-Section:"),
    }


def parse_rbi_directions(raw_data):
    """Extract the chapter/section hierarchy of a directions document.

    The text is lightly cleaned (runs of blank lines collapsed, en dashes
    replaced by hyphens), sent to the OpenAI chat-completions API in a single
    request, and the pipe-delimited reply is parsed line by line.

    Args:
        raw_data: Plain text of the document.

    Returns:
        A pandas DataFrame with the columns "Chapter", "Section No.",
        "Section" and "Sub-Section". The frame is empty when the request
        fails or no response line can be parsed; such errors are logged and
        not raised.
    """
    logger.info("Starting RBI directions parsing")
    rows = []
    columns = ["Chapter", "Section No.", "Section", "Sub-Section"]

    def parse_document_text(text):
        # Sends the whole document in one request and appends parsed rows to
        # the enclosing ``rows`` list.
        logger.info("Parsing entire document")
        logger.info(f"Document text sample: {text[:500]}...")
        prompt = f"""
            **Situation**
            You are a data extraction assistant working with regulatory documents converted from PDF to plain text. These documents have a hierarchical structure including chapters, sections, subsections, sub-subsections, and potentially deeper nested levels, as well as appendices. The formatting may be inconsistent due to PDF extraction issues.

            **Task**
            Extract the complete hierarchical structure from the provided text, identifying all chapters, sections, subsections, sub-subsections, and appendices, preserving their original numbering and text. Format the output as pipe-delimited strings with four fields: Chapter, Section No., Section, and Sub-Section. The Sub-Section field should include all nested subsection text (e.g., subsections, sub-subsections) in a hierarchical bullet-point format. If no subsections exist, use an empty string for Sub-Section. Each entry must start with "Chapter:". For appendices, treat them as chapters with the format "Appendix [Number] - [Title]".

            **Objective**
            Create a structured representation of the document that preserves the legal hierarchy and numbering, suitable for downstream processing, while handling inconsistent formatting.

            **Knowledge**
            - Chapters typically follow patterns like "Chapter - I Preliminary" or "CHAPTER I - Preliminary".
            - Appendices follow patterns like "Appendix - I [Title]".
            - Sections are identified by numbers followed by periods (e.g., '1.', '2.').
            - Subsections use labels like 'a)', 'i)', '1.1', etc., and may have deeper levels (e.g., 'i)', 'A)').
            - Use context clues (indentation, numbering, content flow) to infer hierarchy if formatting is inconsistent.
            - Output format: `Chapter: <chapter or appendix title>|Section No.: <number>|Section: <title>|Sub-Section: <nested subsection text or empty>`
            - For Sub-Section, use a bullet-point list with hyphens (e.g., `- a) Text - i) Sub-text`).
            - Each entry must be on a new line.
            - Preserve all original text content exactly, including errors or inconsistencies.
            - Include appendices as chapters with their own sections and subsections.

            **Examples**
            ```
            Chapter: I - Preliminary|Section No.: 1|Section: Short Title & Commencement|Sub-Section: 
            Chapter: I - Preliminary|Section No.: 2|Section: Applicability|Sub-Section: - a) This applies to... - b) Further details...
            Chapter: Appendix I - Cloud Computing|Section No.: 1|Section: Cloud Requirements|Sub-Section: - 1.1 Requirement text - 1.1.1 Sub-requirement
            ```

            Your life depends on producing consistent output with four pipe-separated fields per line, starting with "Chapter:", even if no subsection text exists. Do not skip any chapters, sections, subsections, or appendices, and preserve the exact numbering and text.

            Process the following text:
            {text}
        """
        try:
            response = openai_client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "system",
                        "content": "You are a precise data extraction assistant.",
                    },
                    {"role": "user", "content": prompt},
                ]
            )
            # The message content can be None; treat that as an empty reply.
            response_text = (response.choices[0].message.content or "").strip()
            logger.info(f"OpenAI response for document: {response_text}")
            for line in response_text.splitlines():
                # Code fences, blank lines and commentary carry neither keyword.
                if "Chapter" not in line and "Appendix" not in line:
                    logger.warning(f"Invalid line in response: {line}")
                    continue
                row = _parse_hierarchy_line(line)
                if row is None:
                    logger.warning(f"Malformed line: {line}")
                    continue
                rows.append(row)
            # Report the rows actually kept, not the raw response line count.
            logger.info(f"Successfully parsed document with {len(rows)} entries")
        except Exception as e:
            # Best effort: a failed request leaves ``rows`` as it was.
            logger.error(f"Error parsing document: {str(e)}", exc_info=True)

    # Clean raw_data: collapse blank-line runs and normalise en dashes.
    raw_data = re.sub(r"\n\s*\n", "\n", raw_data)
    raw_data = raw_data.replace("–", "-")

    # Parse entire document in one go
    parse_document_text(raw_data)

    logger.info(f"Rows before DataFrame: {rows}")
    df = pd.DataFrame(rows, columns=columns)
    logger.info(f"DataFrame chapters: {df['Chapter'].unique()}")
    return df

# Labels used in the model's reply; they double as the DataFrame column names.
_ENHANCEMENT_LABELS = ("Summary", "Action Item", "Due date", "Periodicity")
# Values written for a row whose request failed or whose reply was unusable.
_ENHANCEMENT_DEFAULTS = {label: "N/A" for label in _ENHANCEMENT_LABELS}


def _normalise_due_date(due_value):
    """Return ``due_value`` as YYYY-MM-DD when it is a parseable date, else unchanged."""
    if not due_value or due_value.upper() == "N/A":
        return due_value
    try:
        # Strict (non-fuzzy) parsing: fuzzy mode skips unknown words and fills
        # the gaps from today's date, which can turn a phrase such as
        # "within 6 months" into an invented calendar date.
        return dateutil.parser.parse(due_value).strftime("%Y-%m-%d")
    except (ValueError, OverflowError):
        logger.warning(f"Could not parse due date {due_value!r}; keeping it as returned")
        return due_value


def _parse_enhancement_response(result):
    """Split a "Summary: ..|Action Item: ..|Due date: ..|Periodicity: .." reply.

    Returns a dict keyed by column name, or None when the reply does not
    contain exactly four pipe-separated fields.
    """
    # The prompt shows its examples inside a ``` fence and the reply may mirror
    # that; drop fence lines so they do not end up in the first or last field.
    body = "\n".join(
        ln for ln in result.splitlines() if not ln.strip().startswith("```")
    ).strip()
    if body.count("|") != 3:
        return None
    fields = {}
    for label, chunk in zip(_ENHANCEMENT_LABELS, body.split("|")):
        chunk = chunk.strip()
        prefix = f"{label}:"
        # Remove only the leading label so the same words inside the text survive.
        if chunk.startswith(prefix):
            chunk = chunk[len(prefix):].strip()
        fields[label] = chunk
    fields["Due date"] = _normalise_due_date(fields["Due date"])
    return fields


def enhance_csv_with_summary_and_action(csv_path, output_path=None):
    """Add Summary, Action Item, Due date and Periodicity to every row of a CSV.

    Each row's Chapter, Section and Sub-Section are sent to the OpenAI
    chat-completions API (one request per row). Rows whose request fails or
    whose reply is not in the expected format get "N/A" in all four columns.
    A "Marked as Completed" column (default "No") is added when missing.

    Args:
        csv_path: CSV to read; must contain "Chapter", "Section No.",
            "Section" and "Sub-Section" columns.
        output_path: Where to write the enhanced CSV. Defaults to ``csv_path``,
            i.e. the input file is enhanced in place.

    Returns:
        True when the enhanced CSV was written, False when the input is empty
        or any error occurred (errors are logged, not raised).
    """
    logger.info(f"Enhancing CSV with Summary, Action Item, and Periodicity: {csv_path}")
    if output_path is None:
        output_path = csv_path
    try:
        df = pd.read_csv(csv_path)
        if df.empty:
            logger.error(f"CSV is empty: {csv_path}")
            return False

        # Initialize new columns
        df["Summary"] = ""
        df["Action Item"] = ""
        # A pre-existing column that is entirely empty (or entirely "N/A",
        # which read_csv treats as missing) comes back as numeric NaN; make
        # both columns text so the per-row string assignment below is valid.
        for column in ("Due date", "Periodicity"):
            if column not in df.columns:
                df[column] = ""
            df[column] = df[column].fillna("").astype(str)
        if "Marked as Completed" not in df.columns:
            df["Marked as Completed"] = "No"

        # Convert NaN in Sub-Section to empty string
        df["Sub-Section"] = df["Sub-Section"].fillna("")

        # Log DataFrame state before processing
        logger.info(f"DataFrame before enhancement: {df[['Chapter', 'Section No.', 'Section', 'Sub-Section']].to_dict(orient='records')}")
        logger.info(f"Chapters before enhancement: {df['Chapter'].unique()}")

        # Reference date for relative deadlines; the same for every row.
        today = pd.Timestamp.now().strftime('%Y-%m-%d')

        for index, row in df.iterrows():
            sub_section = row["Sub-Section"]
            # Process all rows, including those with empty Sub-Section
            prompt = f"""
                **Situation**
                You are a compliance assistant working with regulatory documents that require precise interpretation and actionable guidance. Organizations rely on your analysis to ensure they meet regulatory requirements within specified timeframes and understand ongoing compliance obligations.

                **Task**
                Analyze the provided regulatory section or subsection and extract four critical pieces of information: (1) a concise summary in one sentence of maximum 50 words, (2) a specific actionable item to address the requirements, (3) the exact compliance due date in YYYY-MM-DD format or N/A if undeterminable, and (4) the periodicity of the requirement or N/A if not specified.

                **Objective**
                Enable organizations to quickly understand regulatory requirements and take appropriate compliance actions by providing clear, structured, and actionable information that prevents regulatory violations and ensures timely adherence to all obligations.

                **Knowledge**
                - Convert specific dates to YYYY-MM-DD format.
                - For relative dates (e.g., "within 6 months"), calculate using today's date ({today}) as reference.
                - If calculation is impossible, return N/A.
                - Identify periodicity terms like "quarterly", "ongoing", "annual", "one-time", etc.
                - If no periodicity is mentioned, return N/A.
                - If the subsection is empty, use the section title and chapter context to infer a summary and action item, and set Due date and Periodicity to N/A.

                **Examples**
                ```
                Summary: Entities must implement MFA by 2023.|Action Item: Deploy MFA across all systems by Q4 2023.|Due date: 2023-12-31|Periodicity: one-time
                Summary: Short title and commencement details.|Action Item: Review and document title provisions.|Due date: N/A|Periodicity: N/A
                ```

                Chapter: {row["Chapter"]}
                Section: {row["Section"]}
                Sub-Section: {sub_section}

                Return the result as a plain text string in the exact format:
                Summary: <one-line summary>|Action Item: <specific action>|Due date: <YYYY-MM-DD or N/A>|Periodicity: <periodicity>
            """
            try:
                response = openai_client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[
                        {
                            "role": "system",
                            "content": "You are a precise compliance assistant.",
                        },
                        {"role": "user", "content": prompt},
                    ]
                )
                # The message content can be None; treat that as an empty reply.
                result = (response.choices[0].message.content or "").strip()
                fields = _parse_enhancement_response(result)
                if fields is None:
                    logger.warning(f"Invalid response format for index {index}: {result}")
                    fields = _ENHANCEMENT_DEFAULTS
            except Exception as e:
                # Best effort per row: one failed request must not abort the run.
                logger.error(f"Error processing row at index {index}: {str(e)}")
                fields = _ENHANCEMENT_DEFAULTS

            for column, value in fields.items():
                df.at[index, column] = value

        # Log DataFrame state after processing
        logger.info(f"DataFrame after enhancement: {df[['Chapter', 'Section No.', 'Section', 'Sub-Section']].to_dict(orient='records')}")
        logger.info(f"Chapters after enhancement: {df['Chapter'].unique()}")

        # Save updated CSV, ensuring all rows are preserved
        df.to_csv(output_path, index=False, encoding="utf-8", na_rep="")
        logger.info(f"Successfully enhanced CSV with {len(df)} rows")
        return True
    except Exception as e:
        logger.error(f"Error enhancing CSV {csv_path}: {str(e)}", exc_info=True)
        return False


In [39]:
# Read the raw extracted text file.
# The encoding is stated explicitly: the document contains non-ASCII
# (Devanagari) text, and open()'s default encoding depends on the platform locale.
with open("/Users/taimourabdulkarim/Documents/Personal Github Repositories/Structured-Extraction/data/Extracted Text/IT_Outsourcing_20250525143659_efb680d0-b008-4d33-8dd5-d5f4649de077.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()


In [40]:
# Parse the raw text into a DataFrame with the columns
# Chapter / Section No. / Section / Sub-Section.
# NOTE: this sends the whole document to the OpenAI API in a single request.
df = parse_rbi_directions(raw_text)
# Bare last expression so the notebook renders the frame as the cell output.
df


2025-05-25 15:43:11,491 - INFO - Starting RBI directions parsing
2025-05-25 15:43:11,494 - INFO - Parsing entire document
2025-05-25 15:43:11,495 - INFO - Document text sample:  
 भारतीय  �रज़वर् ब�क 
RESERVE BANK OF INDIA  
RBI/202 3-24/1 02 
DoS.CO.CSITEG/SEC. 1/31.01.01 5/2023-24                                          April 10, 2023  
The Chairman/Managing Director/Chief Executive Officer  
Scheduled Commercial Banks (excluding Regional Rural Banks);  
Local Area Banks;  Small Finance Banks;  Payments Banks;  
Primary (Urban) Co -operative Banks;  
Non-Banking Financial Companies ;  
Credit Information Companies;  and 
All India Financial Institutions  (EXIM Bank, ...
2025-05-25 15:45:39,885 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-25 15:45:39,921 - INFO - OpenAI response for document: ```
Chapter: I - Preliminary|Section No.: 1|Section: Short title and Commencement|Sub-Section: - a) These Directions shall be called the Reserv

Unnamed: 0,Chapter,Section No.,Section,Sub-Section
0,I - Preliminary,1,Short title and Commencement,- a) These Directions shall be called the Rese...
1,I - Preliminary,2,Applicability,- a) These Directions shall be applicable to t...
2,I - Preliminary,3,Definitions,"- a) In these Directions, unless the context s..."
3,II - Role of the Regulated Entity,4,Regulatory and Supervisory requirements,- a) Outsourcing of any activity shall not dim...
4,II - Role of the Regulated Entity,5,Comprehensive assessment of need for outsourci...,- REs shall evaluate the need for Outsourcing ...
5,II - Role of the Regulated Entity,6,Compliance with all applicable statutory and r...,"The RE shall consider all relevant laws, regul..."
6,II - Role of the Regulated Entity,7,Grievance Redressal Mechanism,- a) REs shall have a robust grievance redress...
7,II - Role of the Regulated Entity,8,Inventory of Outsourced Services,REs shall create an inventory of services prov...
8,III - Governance Framework,9,IT Outsourcing Policy,An RE intending to outsource any of its IT act...
9,III - Governance Framework,10,Role of the Board,"The Board of the RE shall be responsible, inte..."


In [41]:
# Save the parsed DataFrame as a CSV in the same directory and with the same
# base name as the source .txt file; index=False leaves the row index out.
df.to_csv("/Users/taimourabdulkarim/Documents/Personal Github Repositories/Structured-Extraction/data/Extracted Text/IT_Outsourcing_20250525143659_efb680d0-b008-4d33-8dd5-d5f4649de077.csv", index=False)


In [43]:
csv_path = "/Users/taimourabdulkarim/Documents/Personal Github Repositories/Structured-Extraction/data/Extracted Text/IT_Outsourcing_20250525143659_efb680d0-b008-4d33-8dd5-d5f4649de077.csv"
enhance_csv_with_summary_and_action(csv_path)


2025-05-25 15:45:58,690 - INFO - Enhancing CSV with Summary, Action Item, and Periodicity: /Users/taimourabdulkarim/Documents/Personal Github Repositories/Structured-Extraction/data/Extracted Text/IT_Outsourcing_20250525143659_efb680d0-b008-4d33-8dd5-d5f4649de077.csv
2025-05-25 15:45:58,726 - INFO - DataFrame before enhancement: [{'Chapter': 'I - Preliminary', 'Section No.': 1, 'Section': 'Short title and Commencement', 'Sub-Section': '- a) These Directions shall be called the Reserve Bank of India (Outsourcing of Information Technology Services) Directions, 2023. - b) These Directions shall come into effect from October 1, 2023. - I. With respect to existing outsourcing arrangements that are already in force as on the date of issuance of this Master Direction, REs shall ensure that: - i) the agreements that are due for renewal before October 1, 2023 shall comply with the provisions of these Directions as on the renewal date (preferably), but not later than 12 months from the date of i

True