<a href="https://colab.research.google.com/github/visha1Sagar/BRSR-Report-Generation/blob/main/rep_gen_sat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install --quiet --upgrade langchain langchain-community langchain-chroma gradio langchain-openai pypdf langchain-experimental

In [4]:
import os
import json
from openai import AzureOpenAI
from langchain_openai import AzureChatOpenAI

In [5]:
from google.colab import userdata


In [6]:
# Azure OpenAI connection settings, looked up by name through Colab's
# `userdata` helper so that no credential is written into the notebook itself.
AZURE_OPENAI_API_KEY = userdata.get("AZURE_OPENAI_API_KEY")
api_version = userdata.get("api_version")
AZURE_ENDPOINT = userdata.get("AZURE_ENDPOINT")

In [7]:
from langchain_core.prompts import PromptTemplate


In [8]:
!pip install --quiet pdfplumber

In [9]:
import pdfplumber
import re
import pandas as pd

def split_principle_with_tables(pdf_path):
    """Split a PDF into one text chunk (plus tables) per "Principle N:" heading.

    The text of every page is concatenated, each heading "Principle 1:" ..
    "Principle 9:" is located (case-insensitively), and the text running from
    one heading up to the next heading (or the end of the document) is
    assigned to that principle. A table is attributed to a principle when the
    page it was extracted from overlaps that principle's text span, so a page
    shared by two principles contributes its tables to both.

    Args:
        pdf_path: Path handed to ``pdfplumber.open``.

    Returns:
        dict: ``{principle_number: {"text": str, "tables": [DataFrame, ...]}}``
        with keys 1 to 9. A principle whose heading never appears keeps
        ``""`` and ``[]``. If a heading occurs more than once, the last
        occurrence overwrites the earlier ones.
    """
    principles = {number: f"Principle {number}:" for number in range(1, 10)}
    principle_chunks = {number: {"text": "", "tables": []} for number in principles}

    all_text = ""
    page_spans = []   # (start, end) character offsets of each page inside all_text
    page_tables = []  # tables extracted from each page, parallel to page_spans

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_start = len(all_text)
            # Guard against pages with no extractable text: some pdfplumber
            # releases return None there, which cannot be concatenated.
            all_text += (page.extract_text() or "") + "\n"
            page_spans.append((page_start, len(all_text)))
            page_tables.append(page.extract_tables() or [])

    # Locate every heading. The label is escaped so it is matched literally,
    # and the resulting (position, principle) pairs are ordered by position.
    headings = sorted(
        (match.start(), number)
        for number, label in principles.items()
        for match in re.finditer(re.escape(label), all_text, flags=re.IGNORECASE)
    )

    for idx, (start_pos, number) in enumerate(headings):
        end_pos = headings[idx + 1][0] if idx + 1 < len(headings) else len(all_text)
        chunk = principle_chunks[number]
        chunk["text"] = all_text[start_pos:end_pos].strip()

        # Use the recorded page offsets instead of estimating pages from a
        # ratio: a page belongs to this principle when its character span
        # overlaps [start_pos, end_pos). Empty tables are dropped.
        chunk["tables"] = [
            pd.DataFrame(table)
            for (page_start, page_end), tables in zip(page_spans, page_tables)
            if page_start < end_pos and page_end > start_pos
            for table in tables
            if table
        ]

    return principle_chunks


In [2]:
# Split the report PDF into per-principle text/table chunks (keys 1-9).
# The path is absolute under /content, so the file must already be present
# at that location in the runtime before this cell is run.
ct = split_principle_with_tables("/content/SPIL-AR2022-23-Business-Responsibility-and-Sustainability-Report.pdf")

In [7]:
import os
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
import pdfplumber
import pandas as pd

In [8]:


# Chat model client used by every prompt helper and by the pandas agent.
# The key, API version and endpoint are the values loaded earlier in the
# notebook; nothing is hard-coded here.
# NOTE(review): no deployment/model name is passed (an earlier draft of this
# cell mentioned a deployed "gpt-4" model) -- confirm the endpoint or library
# defaults select the intended deployment.
llm = AzureChatOpenAI(
    api_key=AZURE_OPENAI_API_KEY,
    api_version=api_version,
    azure_endpoint=AZURE_ENDPOINT,
)


In [None]:
# Load the spreadsheet that the pandas dataframe agent queries.
# Relative path: the workbook must sit in the notebook's working directory.
df = pd.read_excel("ghg_data08.xlsx")

In [None]:
from langchain.agents import AgentType
from langchain_experimental.agents import create_pandas_dataframe_agent


# Build an agent that answers natural-language questions about `df`.
# SECURITY: `allow_dangerous_code=True` is an explicit opt-in; the agent
# executes model-generated Python against the dataframe (see the
# `python_repl_ast` invocations in this notebook's captured output), so only
# run it on trusted data in a disposable environment.
pandas_df_agent = create_pandas_dataframe_agent(
    llm,
    df,
    verbose=True,  # print the intermediate tool calls while the agent runs
    agent_type=AgentType.OPENAI_FUNCTIONS,
    allow_dangerous_code=True,
    handle_parsing_errors=True,
)




In [None]:

# Ask the dataframe agent for a single BRSR figure.
# `Chain.run` is deprecated in LangChain; `invoke` takes the question under
# the "input" key and returns a dict whose "output" entry is the final answer,
# so `response` is still the answer text.
response = pandas_df_agent.invoke({"input": "Total electricity consumption (A)"})["output"]





[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': "total_electricity_consumption = df[(df['Category'] == 'Electricity purchase')]['Total_Value'].sum()"}`


[0m[36;1m[1;3m[0m[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': "total_electricity_consumption = df[(df['Category'] == 'Electricity purchase')]['Total_Value'].sum() \ntotal_electricity_consumption"}`


[0m[36;1m[1;3m5436207.0[0m[32;1m[1;3mThe total electricity consumption (A) is 5,436,207 kWh.[0m

[1m> Finished chain.[0m


## Splitting the template

In [9]:
from langchain.prompts import PromptTemplate


# Prompts with templates
# Step 1 prompt: given the raw text of a BRSR template PDF ({pdf_content}),
# ask the model to list the data points needed to fill that template.
# Formatted by run_template_data_extraction().
template_data_extraction_prompt = PromptTemplate(
    input_variables=["pdf_content"],
    template="""
You are a specialized AI assistant assigned to identify and list specific data points required to complete a section of a Business Responsibility and Sustainability Report (BRSR) template.
Based on the following PDF content, focus on identifying only the essential data items necessary for precise and accurate completion of the template.
Please avoid making assumptions or including irrelevant data points in your response. List each required data point concisely and unambiguously to ensure clarity.

PDF Content:
{pdf_content}
"""
)

# Step 2 prompt: given the spreadsheet rendered as text ({csv_content}), the
# list of required data points from step 1 ({response}) and the report
# section being written ({topic}), ask the model to pull out the matching data.
# Formatted by run_csv_data_extraction().
csv_data_extraction_prompt = PromptTemplate(
    input_variables=["csv_content", "response", "topic"],
    template="""
As an AI assistant focused on extracting and preparing data from a CSV file, your task is to accurately identify and process specific data fields outlined in the ChatGPT response: {response}.
Begin by locating each of these data fields within the CSV content provided, ensuring they are precisely represented and appropriately transformed as needed. Beyond these predefined fields, scan the content for any additional data relevant to the topic: {topic}.
Ensure all data is clearly quantified and accurately captured to support BRSR reporting requirements.

CSV Content:
{csv_content}
"""
)

# Step 3 prompt: draft the report section named {topic} from the extracted
# {data}. Formatted by run_report_part_writing().
# NOTE(review): the text refers to an "attached sample" of the section, but
# the template declares no placeholder for such a sample -- only {data} and
# {topic} are ever substituted. Confirm whether a sample should be passed in.
report_part_writing_prompt = PromptTemplate(
    input_variables=["data", "topic"],
    template="""
You are an AI assistant tasked with drafting the section "{topic}" of a Business Responsibility and Sustainability Report (BRSR) using the data provided: {data}. Use the attached sample of the "{topic}" section as a general guide for structure, formatting, and tone, ensuring that the draft is both comprehensive and aligned with BRSR standards.

Guidelines for Drafting the Section:

- Adherence to Template Structure: Follow the sequence and structure of the attached sample to maintain alignment with BRSR reporting standards. Include main headings and subheadings as presented in the template but feel free to adapt the content and add context where necessary to fit the new data.
- Quantitative Data in Tables: Present all quantitative data in table format to enhance readability and clarity. Where tables are not feasible, ensure that quantitative details are still clear, precise, and easy to locate.
- Handling Missing Data: If specific data fields are missing from {data}, label them as "Not Available" within the draft to indicate incomplete data transparently.
- Structured Flexibility for Completeness: Use the provided template as a structural foundation, but adapt it to ensure all relevant data is logically organized. Feel free to create additional minor headings or sub-sections if it aids in clarity and does not diverge from the BRSR’s standards.
- Professional and Clear Presentation: Ensure that the report section is presented professionally, with an emphasis on clarity, data accuracy, and alignment with BRSR conventions.

Final Output Expectations: The draft should accurately reflect both the content provided and the intended structure, maintaining all essential headings and a professional tone while integrating the data in a clear, organized, and reader-friendly manner.
"""
)

# Prompt for rewording one passage of an already generated report:
# {report} is the surrounding report text, {text} the passage to locate in it,
# and {message} the user's instruction on how to rephrase it.
# No cell shown in this notebook formats this template yet.
paraphrase_text_part = PromptTemplate(
    input_variables=["text", "message", "report"],
    template="""
You are an AI assistant assigned to paraphrase part of reports, given context : {report},
1. Find {text} in it and
2. Paraphrase it given user's message : {message}

The final output should be paraphrased text based on given user's message .
"""
)

In [21]:
# principle_sections = [
#     "Principle 1: Businesses should conduct and govern themselves with integrity, and in a manner that is ethical, transparent, and accountable.",
#     "Principle 2: Businesses should provide goods and services in a manner that is sustainable and safe.",
#     "Principle 3: Businesses should respect and promote the well-being of all employees, including those in their value chains.",
#     "Principle 4: Businesses should respect the interests of and be responsive to all their stakeholders.",
#     "Principle 5: Businesses should respect and promote human rights.",
#     "Principle 6: Businesses should respect and make efforts to protect and restore the environment.",
#     "Principle 7: Businesses, when engaging in influencing public and regulatory policy, should do so in a manner that is responsible and transparent.",
#     "Principle 8: Businesses should promote inclusive growth and equitable development.",
#     "Principle 9: Businesses should engage with and provide value to their consumers in a responsible manner."
# ]

In [24]:
# Heading labels ("Principle 1:" .. "Principle 9:") that mark where each
# principle starts in the PDF text; list position + 1 is the principle number.
principle_sections = [f"Principle {number}:" for number in range(1, 10)]

In [27]:


import pdfplumber
import re

def split_by_principle_strict(pdf_path):
    """Split a PDF's text into one chunk per heading in ``principle_sections``.

    The text of all pages is concatenated, each label from the module-level
    ``principle_sections`` list is searched for (case-insensitive, literal
    match), and the text from one heading up to the next heading (or the end
    of the document) is stored under that label's 1-based position.

    Args:
        pdf_path: Path handed to ``pdfplumber.open``.

    Returns:
        dict: ``{principle_number: text}`` with one key per entry of
        ``principle_sections``. A label that never appears maps to ``""``;
        if a label appears more than once, the last occurrence wins.
    """
    principles = dict(enumerate(principle_sections, start=1))
    principle_chunks = {number: "" for number in principles}

    # Combine text from all pages for global processing.
    all_text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Guard against pages with no extractable text: some pdfplumber
            # releases return None there, which cannot be concatenated.
            all_text += (page.extract_text() or "") + "\n"

    # Find headings and their positions. Labels are escaped so punctuation in
    # a heading is matched literally rather than interpreted as regex syntax.
    headings = sorted(
        (match.start(), number)
        for number, label in principles.items()
        for match in re.finditer(re.escape(label), all_text, flags=re.IGNORECASE)
    )

    # Each chunk runs from its heading to the next heading (or end of text).
    for idx, (start_pos, number) in enumerate(headings):
        end_pos = headings[idx + 1][0] if idx + 1 < len(headings) else len(all_text)
        principle_chunks[number] = all_text[start_pos:end_pos].strip()

    return principle_chunks


def extract_text_from_csv(file_path):
    """Render a spreadsheet file as plain text suitable for a prompt.

    Args:
        file_path: Path to a ``.xlsx`` or ``.csv`` file. The extension is
            compared case-insensitively.

    Returns:
        str: For ``.csv``, the table rendered with
        ``DataFrame.to_string(index=False)``. For ``.xlsx``, every sheet
        rendered the same way, joined by a newline. For any other extension,
        an error message string naming the file (no exception is raised, so
        callers receive the message as if it were content).
    """
    extension = file_path.rsplit(".", 1)[-1].lower()

    if extension == "xlsx":
        # The context manager closes the workbook once all sheets are parsed.
        with pd.ExcelFile(file_path) as workbook:
            sheet_texts = [
                workbook.parse(sheet_name).to_string(index=False)
                for sheet_name in workbook.sheet_names
            ]
        # A newline between sheets keeps the last row of one sheet from
        # running straight into the header row of the next.
        return "\n".join(sheet_texts)

    if extension == "csv":
        return pd.read_csv(file_path).to_string(index=False)

    return "Couldn't read Spreadsheet - Error - filename :" + file_path

# Function to call Azure OpenAI with each prompt type
def run_template_data_extraction(file_path):
    """Ask the model which data points a BRSR template PDF requires.

    Reads the PDF's text, fills ``template_data_extraction_prompt`` with it
    and sends the prompt to ``llm``.

    Args:
        file_path: Path to the template PDF.

    Returns:
        str: The model's reply text, or ``"Error: <message>"`` if reading the
        PDF or calling the model raised an exception.
    """
    try:
        # The PDF is read by the helper below because no `extract_text_from_pdf`
        # is defined in this notebook; relying on it failed on a fresh kernel.
        pdf_content = _read_pdf_text(file_path)
        prompt = template_data_extraction_prompt.format(pdf_content=pdf_content)
        response = llm.invoke(prompt)
        return response.content
    except Exception as e:
        # Best-effort contract kept from the original: report the failure as
        # text instead of raising, so downstream cells still receive a string.
        return f"Error: {e}"


def _read_pdf_text(file_path):
    """Return the text of every page of the PDF at ``file_path``, newline-joined."""
    with pdfplumber.open(file_path) as pdf:
        # Pages with no extractable text contribute an empty string.
        return "\n".join((page.extract_text() or "") for page in pdf.pages)

def run_csv_data_extraction(file_path, response_text, topic):
    """Have the model pull the requested data fields out of a spreadsheet.

    The spreadsheet is rendered to text with ``extract_text_from_csv``, merged
    into ``csv_data_extraction_prompt`` together with ``response_text`` and
    ``topic``, and the filled prompt is sent to ``llm``.

    Args:
        file_path: Path to the spreadsheet file.
        response_text: Text substituted for the prompt's ``{response}`` slot.
        topic: Text substituted for the prompt's ``{topic}`` slot.

    Returns:
        str: The model's reply text, or ``"Error: <message>"`` if any step
        raised an exception.
    """
    try:
        filled_prompt = csv_data_extraction_prompt.format(
            csv_content=extract_text_from_csv(file_path),
            response=response_text,
            topic=topic,
        )
        return llm.invoke(filled_prompt).content
    except Exception as exc:
        return f"Error: {exc}"

def run_report_part_writing(data, topic):
    """Draft one report section from extracted data.

    Fills ``report_part_writing_prompt`` with ``data`` and ``topic`` and sends
    it to ``llm``.

    Args:
        data: Text substituted for the prompt's ``{data}`` slots.
        topic: Text substituted for the prompt's ``{topic}`` slots.

    Returns:
        str: The model's reply text, or ``"Error: <message>"`` if the model
        call raised an exception. Prompt formatting happens outside the
        ``try`` block, so a formatting failure propagates to the caller.
    """
    filled_prompt = report_part_writing_prompt.format(data=data, topic=topic)
    try:
        return llm.invoke(filled_prompt).content
    except Exception as exc:
        return f"Error: {exc}"





In [None]:
# Example usage
# Step 1: ask the model which data points the BRSR template PDF requires.
# The result is the reply text, or an "Error: ..." string if the call failed.
template_data_response = run_template_data_extraction("brsr.pdf")


In [None]:
# Print the step-1 reply so the list of required data points can be reviewed.
print(template_data_response)

### Essential Data Points for BRSR Completion

1. **Total Energy Consumption**:
   - Total electricity consumption (FY 2022-23, FY 2021-22)
   - Total fuel consumption (FY 2022-23, FY 2021-22)
   - Energy consumption through other sources (Steam) (FY 2022-23, FY 2021-22)
   - Total energy consumption (A+B+C) (FY 2022-23, FY 2021-22)
   - Energy intensity per rupee of turnover (Total energy consumption/ turnover in Million)

2. **Designated Consumers**:
   - Whether the entity has designated consumers under the PAT Scheme (Y/N)
   - Targets achieved under the PAT scheme (Yes/No)
   - Remedial actions taken if targets were not achieved.

3. **Water Withdrawal Details**:
   - Water withdrawal by source (in kilolitres) (Surface water, Groundwater, Third party water, Seawater/Desalinated Water, Others) (FY 2022-23, FY 2021-22)
   - Total volume of water withdrawal (in kilolitres) (i + ii + iii + iv + v) (FY 2022-23, FY 2021-22)
   - Total volume of water consumption (in kilolitres) (FY 2022

In [None]:
# Step 2: have the model locate the data points listed in
# `template_data_response` inside the spreadsheet, for the given section.
csv_data_extraction_response = run_csv_data_extraction("ghg_data08.xlsx",template_data_response , "Principle 6 of Section C")


In [None]:
# Step 3: draft the report section from the data extracted in step 2.
report_response = run_report_part_writing(csv_data_extraction_response, "Principle 6 of Section C")

In [None]:
# Print the drafted section (the reply is Markdown text).
print(report_response)

# Principle 6: Environmental Management

This section outlines our commitment to environmental sustainability, detailing our energy consumption, water usage, air emissions, greenhouse gas emissions, and waste management practices. We strive to minimize our environmental footprint through responsible resource management and compliance with applicable regulations.

## 1. Total Energy Consumption

| Category                              | FY 2022-23 (kWh) | FY 2021-22 (kWh) |
|---------------------------------------|------------------|------------------|
| Total Electricity Consumption          | 1,275,247        | 1,246,798        |
| Total Fuel Consumption (Natural Gas)  | 790,508          | 782,260          |
| **Total Energy Consumption (A + B)**  | **2,065,755**    | **2,029,058**    |

- **Energy Intensity per Rupee of Turnover:** Not Available

## 2. Designated Consumers

- **Whether the entity has designated consumers under the PAT Scheme:** Yes
- **Targets achieved under the PAT 

# Principle 6: Environmental Management

This section outlines our commitment to environmental sustainability, detailing our energy consumption, water usage, air emissions, greenhouse gas emissions, and waste management practices. We strive to minimize our environmental footprint through responsible resource management and compliance with applicable regulations.

## 1. Total Energy Consumption

| Category                              | FY 2022-23 (kWh) | FY 2021-22 (kWh) |
|---------------------------------------|------------------|------------------|
| Total Electricity Consumption          | 1,275,247        | 1,246,798        |
| Total Fuel Consumption (Natural Gas)  | 790,508          | 782,260          |
| **Total Energy Consumption (A + B)**  | **2,065,755**    | **2,029,058**    |

- **Energy Intensity per Rupee of Turnover:** Not Available

## 2. Designated Consumers

- **Whether the entity has designated consumers under the PAT Scheme:** Yes
- **Targets achieved under the PAT scheme:** Yes
- **Remedial actions taken if targets were not achieved:** Not Provided

## 3. Water Withdrawal Details

| Source                          | FY 2022-23 (kilolitres) | FY 2021-22 (kilolitres) |
|---------------------------------|-------------------------|-------------------------|
| Surface Water                   | 100,000                 | 95,000                  |
| Groundwater                     | 75,000                  | 70,000                  |
| Third-party Water               | 15,000                  | 14,000                  |
| Seawater/Desalinated Water      | 20,000                  | 18,000                  |
| Others                          | 5,000                   | 4,500                   |
| **Total Volume of Water Withdrawal** | **210,000**         | **202,500**            |

- **Total Volume of Water Consumption:** Not Available
- **Water Intensity per Rupee of Turnover:** Not Available

## 4. Zero Liquid Discharge Mechanism

- **Coverage and implementation details of Zero Liquid Discharge:** Not Provided

## 5. Air Emissions Details

| Emission Type                                | FY 2022-23 (kg) |
|----------------------------------------------|-----------------|
| NOx                                          | 12,000          |
| SOx                                          | 8,000           |
| Particulate Matter                           | 1,500           |
| Persistent Organic Pollutants                | 100             |
| Volatile Organic Compounds                   | 2,000           |
| Hazardous Air Pollutants                     | 500             |
| Others                                       | 50              |

## 6. Greenhouse Gas Emissions

- **Total Scope 1 Emissions:**
  - CO2 from Green Gas: 697 GJ
  - CO2 from Natural Gas: 263,057 kWh

- **Total Scope 2 Emissions:**
  - CO2 from Electricity Purchase: 429,395 kWh

- **Total Scope 1 and 2 emissions per rupee of turnover:** Not Available

## 7. Projects to Reduce GHG Emissions

- **Details of projects aimed at reducing Greenhouse Gas emissions:** Not Provided

## 8. Waste Management Details

| Waste Type              | FY 2022-23 (metric tonnes) | FY 2021-22 (metric tonnes) |
|------------------------|----------------------------|----------------------------|
| Plastic Waste          | 200.0                      | 180.0                      |
| E-Waste                | 50.0                       | 45.0                       |
| Bio-medical Waste      | 30.0                       | 25.0                       |
| **Total Waste Generated** | **280.0**               | **250.0**                 |

- **Total Waste Recovered through Recycling or Other Recovery Operations:** Not Provided

## 9. Waste Management Practices

- **Description of waste management practices and strategies to reduce hazardous and toxic chemicals:** Not Provided

## 10. Compliance with Environmental Laws

- **Compliance Status with Applicable Environmental Laws/Regulations/Guidelines in India:** Yes (Compliant)
- **Details of Non-compliance:** None noted