In [None]:
# Run this in command prompt to start running ElasticSearch locally
!docker run -p 9200:9200 -e "discovery.type=single-node" -e "xpack.security.enabled=false" -e "xpack.security.http.ssl.enabled=false" docker.elastic.co/elasticsearch/elasticsearch:8.12.1

In [1]:
import pandas as pd

In [2]:
SAMPLE_CSV_FILEPATH = r'C:\Zhenjie\University\Y3S2\dsa3101-ay2425s2-team6\data-pipelines\data\esg-csv\2023_BOQ_ESG_Report.csv'

df = pd.read_csv(SAMPLE_CSV_FILEPATH)
df

Unnamed: 0,id,text_chunk,tags
0,C:/Zhenjie/University/Y3S2/dsa3101-ay2425s2-te...,SUSTAINABILITY \nSUPPLEMENT,"sustainability supplement, sustainability supp..."
1,C:/Zhenjie/University/Y3S2/dsa3101-ay2425s2-te...,,
2,C:/Zhenjie/University/Y3S2/dsa3101-ay2425s2-te...,Contents.\nBOQ Group approach to sustainabilit...,"boq group, sustainability supplement, sustaina..."
3,C:/Zhenjie/University/Y3S2/dsa3101-ay2425s2-te...,Important information and disclaimer \nThis do...,"statements may, future events, financial posit..."
4,C:/Zhenjie/University/Y3S2/dsa3101-ay2425s2-te...,This report provides BOQ Group stakeholders wi...,"boq group, boq group, boq group, boq group, bo..."
5,C:/Zhenjie/University/Y3S2/dsa3101-ay2425s2-te...,Our purpose.\nBuilding social capital through ...,"building social, social capital, building soci..."
6,C:/Zhenjie/University/Y3S2/dsa3101-ay2425s2-te...,Spirited Be outrageously courageous\nIn bankin...,"create value, create value, different demands ..."
7,C:/Zhenjie/University/Y3S2/dsa3101-ay2425s2-te...,Sustainability balanced scorecard.\nThe table ...,"boq retail, boq retail, people customer custom..."
8,C:/Zhenjie/University/Y3S2/dsa3101-ay2425s2-te...,2023 highlights.\nEmployee\nEngagement Score (...,"climate active, sustainability supplement, emp..."
9,C:/Zhenjie/University/Y3S2/dsa3101-ay2425s2-te...,Sustainable \ndevelopment \ngoals\nLocal grass...,"community partnerships, community partnerships..."


In [2]:
SAMPLE_CSV_FILEPATH = './data/esg-csv/2022_Citigroup_ESG_Report.csv'

# Load the CSV file
df = pd.read_csv(SAMPLE_CSV_FILEPATH)
df.head()

Unnamed: 0,id,text_chunk,tags
0,C:/Zhenjie/University/Y3S2/dsa3101-ay2425s2-te...,What’s Inside\n2 A bout This Report\n3 L etter...,"o ur, o ur, climate risk, o ur, o ur, climate ..."
1,C:/Zhenjie/University/Y3S2/dsa3101-ay2425s2-te...,Citi 2022 ESG Report\nPage 2\nESG at Citi Sust...,"esg report, financial disclosures, financial d..."
2,C:/Zhenjie/University/Y3S2/dsa3101-ay2425s2-te...,Citi 2022 ESG Report\nPage 3\nESG at Citi Sust...,"sustainable finance, sustainable finance, sust..."
3,C:/Zhenjie/University/Y3S2/dsa3101-ay2425s2-te...,2022 Highlights\nCiti 2022 ESG Report\nPage 4\...,"sustainable finance, sustainable finance, citi..."
4,C:/Zhenjie/University/Y3S2/dsa3101-ay2425s2-te...,Citi 2022 ESG Report\nPage 5\nSustainable Fina...,"across citi, citi esg, esg priorities, esg pri..."


In [3]:
# For `tags` we use `text`` data type insted of 'keyword' data type
# because according to official Elasticsearch documentation,
# 'keyword' datatype is used when you require and exact value search like zip codes etc
# But in our case, our tags are a string of keywords separated by commas.
# The embeddings field is a dense vector of 384 dimensions because we
# are using 'all-MiniLM-L6-v2' emebedding model which produces vectors of 384 dimensions
# The embedding is obtained by passing the `text_chunk` through the embedding model

index_mapping = {
    "mappings": {
        "properties": {
        "text_chunk" : {"type" : "text", "analyzer": "standard"},
        "tags": {"type": "keyword"},
        "chunk_embedding": {
            "type": "dense_vector",
            "dims": 384,
            "similarity": "cosine"},
        "company_name": {"type": "keyword"},
        "report_year": {"type": "integer"},
        "source_path": {"type": "keyword"},
        }
    }
}

In [3]:
from elasticsearch import Elasticsearch

# Initialize local Elasticsearch on port 9200
# Can also check manually that service is running by typing http://localhost:9200/ in browser
try:
    es = Elasticsearch('http://localhost:9200/')
except Exception as e:
    raise Exception(
        status_code=500, detail=f"Failed to connect to Elasticsearch: {str(e)}"
    )

In [4]:
from sentence_transformers import SentenceTransformer

# Load a pretrained embeddings model for RAG
# Used this instead of the OLlama embeddings as this one produces dense embeddings
# which are more suitable for the task of semantic search
# Initialize S-BERT model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to get embeddings using S-BERT model
def get_embeddings(text):
    try:
        # Create embeddings and convert to list from as needed by Elasticsearch
        return embedding_model.encode(text).tolist()
    except Exception as e:
        print(f"Error fetching embeddings for text: {text}. Error: {str(e)}")
        return None

In [6]:
# Sample text to encode
sample_text = "This is a sample text"

# Encode the sample text
sample_text_embedding = get_embeddings(sample_text)

# Print the shape of vector embedding
# Remember this as it will be used to create index mapping for ElasticSearch later on
print(sample_text_embedding) # can see that the vector is of shape (384,)

[0.010029466822743416, 0.09550520032644272, -0.02096635289490223, 0.02516229636967182, 0.04209551587700844, 0.025395972654223442, 0.032659776508808136, 0.05784359201788902, 0.04338082671165466, 0.007828210480511189, 0.0398300439119339, -0.012325815856456757, 0.011496727354824543, -0.06146809458732605, 0.02351446822285652, 0.050312191247940063, 0.04488016664981842, -0.03876474127173424, -0.014595787040889263, 0.007335928734391928, 0.04716607183218002, 0.08016200363636017, 0.03173895180225372, 0.01579270139336586, -0.003150620497763157, 0.044689055532217026, -0.02329280599951744, 0.08901121467351913, 0.11823387444019318, -0.007691595237702131, -0.04131339490413666, 0.0796300396323204, 0.1785944104194641, 0.026123933494091034, 0.06597475707530975, 0.021756784990429878, -0.0512271411716938, 0.06648346036672592, 0.039052173495292664, 0.04873555153608322, 0.008930841460824013, -0.08045849949121475, 0.04347360134124756, 0.014143365435302258, 0.023846838623285294, -0.03677743300795555, -0.0919

In [6]:
index_name = 'esg_reports_demo'

# Create Elasticsearch index with mappings
def create_index():
    try:
        # Delete index if it already exists
        if es.indices.exists(index=index_name):
            es.indices.delete(index=index_name)
        # Create index with mapping
        es.indices.create(index=index_name, mappings=index_mapping)
        print(f"Index '{index_name}' created successfully!")
    except Exception as e:
        print(f"Error creating index '{index_name}': {str(e)}")

In [8]:
'''




'''

import os
# Get the filename from the CSV file path
# Split the filename by `_` and the first element is the year
# The second element is the company name and we remove the hypen from the name
csv_filename = os.path.basename(SAMPLE_CSV_FILEPATH)
report_year = csv_filename.split('_')[0]
company_name = csv_filename.split('_')[1].replace('-', ' ')

print(f"Report Year: {report_year}")
print(f"Company Name: {company_name}")

Report Year: 2022
Company Name: Citigroup


In [9]:
# Convert data into Elasticsearch format

actions = [
    {
        "_index": index_name,
        "id": row['id'],
        "_source": {
            "text_chunk": row['text_chunk'],
            "tags": row['tags'],
            "chunk_embedding": get_embeddings(row['text_chunk']),
            "company_name": company_name,
            "report_year": report_year,
            "source_path": row['id'],
        }
    }
    for _, row in df.iterrows()
]

print(actions)

[{'_index': 'esg_reports_demo', 'id': 'C:/Zhenjie/University/Y3S2/dsa3101-ay2425s2-team6/data-pipelines/data/esg-pdf/2022_Citigroup_ESG_Report.pdf_0', '_source': {'text_chunk': 'What’s Inside\n2 A bout This Report\n3 L etter from Our CEO\n4 2 022 Highlights\nESG at Citi\n5 E SG Across Citi\n6 E SG Governance at Citi\n8 O ur Material ESG Issues\n10 S takeholder Engagement at Citi\nSustainable Finance\n12 O ur $1 Trillion Goal\n22 F inancing the Low-Carbon Transition\n23 F inancing Social Impact\nClimate Risk  \nand Net Zero\n25 O ur Net Zero Commitment\n30 O ur Approach to Managing  \nClimate Risk\n32 R educing Climate Risk in  \nOur Financing.\nSustainable Operations\n34 O perational Footprint Goals\n36 Sus tainable and Healthy Buildings\n37 E fficient Travel\n37 M anaging Climate Risk in  \nOur Operations\n38 En vironmental Performance  \nfor Operations\nBuilding Equitable and \nResilient Communities\n43 Action for Racial Equity\n45 C iti Impact Fund\n46 S trategic Philanthropy:  \nTh

In [10]:
from elasticsearch import helpers

# Bulk index the data into Elasticsearch
helpers.bulk(es, actions)
print("Data indexed successfully!") 

Data indexed successfully!


In [9]:
# ============= SEARCHING =============

# Lexical Search Function
def lexical_search(query: str, top_k: int, input_company_name: str = None):
    '''Returns the top-k lexical search results for the given query'''

    # Base query: full-text search on `text_chunk` field
    query_body = {
    "query": {
        "bool": {
            "must": [
                {"match": {"text_chunk": query}}  # Main search in text_chunk
            ],
            "should": [
                {"match": {"tags": query}}  # Boost relevance if query appears in tags
            ],
            "filter": []  # Filtering conditions
            }
        },
    "size": top_k,
    "_source": ["text_chunk", "tags", "source_path", "company_name", "report_year"]
    }

    # Add company_name filter if provided
    if input_company_name:
        # Convert `input_company_name` to lowercase because the `company_name` field is automatically converted to lowercase in ES
        input_company_name = input_company_name.lower()
        query_body["query"]["bool"]["filter"].append({"term": {"company_name": input_company_name}})


    # Execute lexical search
    lexical_results = es.search(index=index_name, body=query_body)

    lexical_hits = lexical_results['hits']['hits']
    max_bm25_score = max([hit["_score"] for hit in lexical_hits], default=1.0)

    # Normalize lexical scores
    for hit in lexical_hits:
        hit["_normalized_score"] = hit["_score"] / max_bm25_score

    return lexical_hits

In [12]:
input_query = "What is Citigroup's GHG emissions?"

In [10]:
company_name

'Citigroup'

In [13]:
results = lexical_search(input_query, top_k=5, input_company_name=company_name)

In [26]:
# Base query: full-text search on `text_chunk` field
query_body = {
"query": {
    "bool": {
        "must": [
            {"match": {"text_chunk": input_query}}  # Main search in text_chunk
        ],
        "should": [
            {"match": {"tags": input_query}}  # Boost relevance if query appears in tags
        ],
        "filter": [{"term": {"company_name": "citigroup"}}]  # Filtering conditions
        }
    },
"size": 3,
"_source": ["text_chunk", "tags", "source_path", "company_name", "report_year"]
}

# Match all queries
# query_body = {
#     "query": {
#         "match_all": {}
#     }
# }

results = es.search(index=index_name, body=query_body)

In [27]:
results['hits']['hits']

[]

In [30]:

# Base query: full-text search on `text_chunk` field
query_body = {
"query": {
    "bool": {
        "must": [
            {"match": {"text_chunk": input_query}}  # Main search in text_chunk
        ],
        "should": [
            {"match": {"tags": input_query}}  # Boost relevance if query appears in tags
        ],
        "filter": [{"term": {"company_name": "Citigroup"}}]  # Filtering conditions
        }
    },
"size": 3,
"_source": ["text_chunk", "tags", "source_path", "company_name", "report_year"]
}

# # Add company_name filter if provided
# if input_company_name:
#     # Convert `input_company_name` to lowercase because the `company_name` field is automatically converted to lowercase in ES
#     input_company_name = input_company_name.lower()
#     query_body["query"]["bool"]["filter"].append({"term": {"company_name": input_company_name}})


# Execute lexical search
lexical_results = es.search(index=index_name, body=query_body)


In [31]:
lexical_results['hits']['hits']

[{'_index': 'esg_reports_demo',
  '_id': 'plI9TZUBHidLzRCZWfw6',
  '_score': 8.802879,
  '_source': {'text_chunk': 'Citi 2022 ESG Report\nPage 26\nESG at Citi Sustainable Finance Sustainable Operations Building Equitable & Resilient Communities Talent & DEI Responsible Business Appendices\nClimate Risk & Net Zero\nOur Net Zero Commitment Our Approach to Managing Climate Risk Reducing Climate Risk in Our Financing\nOur climate risk and net zero work are related and \nreinforce each other. Whereas our climate risk \nwork focuses on the identification, measurement \nand management of key risks arising from climate \nchange, our net zero work focuses on our impacts \non the climate and achieving our net zero \nemissions reduction targets, which also help \nreduce risk. There are common links between the \ntwo workstreams. For example, both rely on \ncommon data elements such as GHG emissions \nand a better understanding of our clients’ \nclimate change mitigation and/or adaptation \nplans.

In [None]:
# Semantic Search Function
def semantic_search(query: str, top_k: int, input_company_name: str = None):
    # Generate embeddings for the query using S-BERT
    query_embedding = get_embeddings(query)

    # Perform a cosine similarity search using the query embedding
    script_query = {
        "script_score": {
            "query": {"bool": {"filter": []}}, # filters will be added below
            "script": {
                "source": "cosineSimilarity(params.query_embedding, 'chunk_embedding') + 1.0", # Cosine similarity calculation
                "params": {"query_embedding": query_embedding}, # Pass the query embedding as a parameter
            },
        }
    }

    # Add filters for company_name
    if input_company_name:
        # Convert `input_company_name` to lowercase because the `company_name` field is automatically converted to lowercase in ES
        input_company_name = input_company_name.lower()
        script_query["script_score"]["query"]["bool"]["filter"].append({"term": {"company_name": input_company_name}})

    # Execute semantic search
    semantic_results = es.search(
        index=index_name,
        body={
            "query": script_query,
            "_source": {"excludes": ["chunk_embedding"]},
            "size": top_k,
        },
        source_excludes=["chunk_embedding"],
    )

    semantic_hits = semantic_results['hits']['hits']
    max_semantic_score = max([hit["_score"] for hit in semantic_hits], default=1.0)

    # Normalize semantic scores
    for hit in semantic_hits:
        hit["_normalized_score"] = hit["_score"] / max_semantic_score

    return semantic_hits

In [43]:
# Combine lexical and semantic search results using Reciprocal Rank Fusion (RRF)
def reciprocal_rank_fusion(lexical_hits, semantic_hits, k=60):
    '''
    k: The rank bias parameter (higher values reduce the impact of rank).
    '''
    rrf_scores = {}

    # Process lexical search results
    for rank, hit, in enumerate(lexical_hits, start=1):
        doc_id = hit["_id"]
        score = 1 / (k + rank) # Reciprocal Rank Fusion (RRF) score
        if doc_id in rrf_scores:
            rrf_scores[doc_id]["rrf_score"] += score
        else:
            rrf_scores[doc_id] = {
                "text_chunk": hit["_source"]["text_chunk"],
                "tags": hit["_source"]["tags"],
                "source_path": hit["_source"]["source_path"],
                "report_year": hit["_source"]["report_year"],
                "company_name": hit["_source"]["company_name"],
                "lexical_score": hit["_normalized_score"],
                "semantic_score": 0,
                "rrf_score": score,
            }

    # Process semantic search results
    for rank, hit in enumerate(semantic_hits, start=1):
        doc_id = hit["_id"]
        score = 1 / (k + rank) # RRF formula
        if doc_id in rrf_scores:
            rrf_scores[doc_id]["rrf_score"] += score
            rrf_scores[doc_id]["semantic_score"] = hit["_normalized_score"]
        else:
            rrf_scores[doc_id] = {
                "text_chunk": hit["_source"]["text_chunk"],
                "tags": hit["_source"]["tags"],
                "source_path": hit["_source"]["source_path"],
                "report_year": hit["_source"]["report_year"],
                "company_name": hit["_source"]["company_name"],
                "lexical_score": 0,
                "semantic_score": hit["_normalized_score"],
                "rrf_score": score,
            }

    # Sort by the RRF score in descending order
    sorted_results = sorted(
        rrf_scores.values(), key=lambda x: x["rrf_score"], reverse=True
    )

    return sorted_results

In [None]:
# Hybrid Search Function
def hybrid_search(query: str, lexical_top_k: int, semantic_top_k: int, input_company_name: str = None):
    # Get lexical and semantic search results
    lexical_hits = lexical_search(query, lexical_top_k, input_company_name)
    semantic_hits = semantic_search(query, semantic_top_k, input_company_name)
    # Combine using RRF
    combined_results = reciprocal_rank_fusion(lexical_hits, semantic_hits, k=60)
    return combined_results

In [None]:
esg_metrics = ['GHG emissions', 'Electricity consumption', 'Water consumption', 'Gender ratio', 'Turnover rate', 'Board of Director gender ratio', 'Number of Corruption cases']
esg_metric = esg_metrics[0]

company_name = "Citigroup"

In [None]:
lexical_search(esg_metric, 3, company_name)

[{'_index': 'esg_reports_demo',
  '_id': 'f6nWS5UBjTprI_kMLTzn',
  '_score': 17.534065,
  '_ignored': ['tags.keyword', 'text_chunk.keyword'],
  '_source': {'text_chunk': 'Citi 2022 ESG Report\nPage 35\nESG at Citi Sustainable Finance Climate Risk & Net Zero Building Equitable & Resilient Communities Talent & DEI Responsible Business Appendices\nSustainable Operations\nOperational Footprint Goals Sustainable and Healthy Buildings Managing Climate Risk in Our Operations Environmental Performance for OperationsEfficient Travel\n2025 OPERATIONAL FOOTPRINT GOALS\n(measured against a 2010 baseline) \nGoals Progress through 2022\nGHG emissions\n45% reduction in location-based GHG emissions 48%\nEnergy\n40% reduction in energy consumption  36%\nMaintain 100% renewable electricity sourcing  100%*\nWater\n30% reduction in total water consumption  37%\n25% of water consumed to come from  \nreclaimed/reused sources\n 7%\nSustainable buildings\n40% of floor area to be LEED-, WELL- or  \nequivalent 

In [None]:
semantic_search(esg_metric, 3, company_name)

[{'_index': 'esg_reports_demo',
  '_id': 'gqnWS5UBjTprI_kMLTzn',
  '_score': 1.593087,
  '_ignored': ['tags.keyword', 'text_chunk.keyword'],
  '_source': {'text_chunk': 'Citi 2022 ESG Report\nPage 38\nESG at Citi Sustainable Finance Climate Risk & Net Zero Building Equitable & Resilient Communities Talent & DEI Responsible Business Appendices\nSustainable Operations\nOperational Footprint Goals Sustainable and Healthy Buildings Managing Climate Risk in Our Operations Environmental Performance for OperationsEfficient Travel\nEnvironmental Performance for Operations\nREGIONAL OPERATIONAL ENVIRONMENTAL PERFORMANCE \nRegion GHG Emissions CO2e\nLocation-Based (mt)\nGHG Emissions CO2e \nMarket-Based (mt)\nCarbon Credits CO 2e\n(mt)\nNet CO2e\nMarket-Based (mt)\nTotal Energy \nConsumption \n(GWh)\nTotal Water \nConsumption \n(m3)\nTotal Waste\n(mt)\nAsia Pacific 122,652 26,403 631 25,772 213 641,543 3,411 \nEurope, Middle East and Africa 44,768 10,682 3,614 7, 0 69  175 314,000 2,510 \nLatin 

In [None]:
retrieved_chunk = hybrid_search(esg_metric, 3, 3, company_name)
retrieved_context = retrieved_chunk[0]['text_chunk']
print(retrieved_context)

Citi 2022 ESG Report
Page 38
ESG at Citi Sustainable Finance Climate Risk & Net Zero Building Equitable & Resilient Communities Talent & DEI Responsible Business Appendices
Sustainable Operations
Operational Footprint Goals Sustainable and Healthy Buildings Managing Climate Risk in Our Operations Environmental Performance for OperationsEfficient Travel
Environmental Performance for Operations
REGIONAL OPERATIONAL ENVIRONMENTAL PERFORMANCE 
Region GHG Emissions CO2e
Location-Based (mt)
GHG Emissions CO2e 
Market-Based (mt)
Carbon Credits CO 2e
(mt)
Net CO2e
Market-Based (mt)
Total Energy 
Consumption 
(GWh)
Total Water 
Consumption 
(m3)
Total Waste
(mt)
Asia Pacific 122,652 26,403 631 25,772 213 641,543 3,411 
Europe, Middle East and Africa 44,768 10,682 3,614 7, 0 69  175 314,000 2,510 
Latin America and Mexico 84,593 3,500 1,763 1,737 212 8 37, 5 4 4  8,357 
North America 258,540 38,921 37, 9 0 5  1,016 763 1,866,117 13,504 
GHG EMISSIONS (SCOPE 1 & 2) BY REGION IN 2022 
Region Scope

In [49]:
source_path = retrieved_chunk[0]['source_path']
print(source_path)

C:/Zhenjie/University/Y3S2/dsa3101-ay2425s2-team6/data-pipelines/data/esg-pdf/2022_Citigroup_ESG_Report.pdf_37


In [None]:
from langchain_core.prompts import PromptTemplate

template = """
You are an AI assistant that helps users find information in ESG reports and answer questions about them.
Given the following context retrieved from an ESG report, evaluate based on the ESG metric provided.
If the answer is not in the context, say "I don't know".

Context: {context}
ESG Metric: {esg_metric}
Answer: 
"""

prompt = PromptTemplate(template=template, input_variables=["context", "esg_metric"])

In [36]:
from langchain_ollama.llms import OllamaLLM

llm = OllamaLLM(model='llama3.2')

In [None]:
from langchain_core.output_parsers import StrOutputParser

rag_chain = prompt | llm | StrOutputParser()

response = rag_chain.invoke({"context": retrieved_context, "esg_metric": esg_metric})
print(response)

Based on the context provided from Citi's 2022 ESG Report, the answer is:

GHG (Greenhouse Gas) emissions refer to the total amount of CO2e (carbon dioxide equivalent) emitted by Citi's operations in each region. The data is presented in two formats: Location-Based and Market-Based.

Here are some key findings from the table:

- Asia Pacific: 122,652 mt (mt = metric tons) CO2e emissions in Location-Based and 26,403 mt CO2e emissions in Market-Based.
- Europe, Middle East and Africa: 44,768 mt CO2e emissions in Location-Based and 10,682 mt CO2e emissions in Market-Based.
- Latin America and Mexico: 84,593 mt CO2e emissions in Location-Based and 3,500 mt CO2e emissions in Market-Based.
- North America: 258,540 mt CO2e emissions in Location-Based and 38,921 mt CO2e emissions in Market-Based.

Please note that the total figure for each region's GHG emissions may not add up to the individual regional totals due to rounding.


In [50]:
retrieved_chunk

[{'text_chunk': 'Citi 2022 ESG Report\nPage 38\nESG at Citi Sustainable Finance Climate Risk & Net Zero Building Equitable & Resilient Communities Talent & DEI Responsible Business Appendices\nSustainable Operations\nOperational Footprint Goals Sustainable and Healthy Buildings Managing Climate Risk in Our Operations Environmental Performance for OperationsEfficient Travel\nEnvironmental Performance for Operations\nREGIONAL OPERATIONAL ENVIRONMENTAL PERFORMANCE \nRegion GHG Emissions CO2e\nLocation-Based (mt)\nGHG Emissions CO2e \nMarket-Based (mt)\nCarbon Credits CO 2e\n(mt)\nNet CO2e\nMarket-Based (mt)\nTotal Energy \nConsumption \n(GWh)\nTotal Water \nConsumption \n(m3)\nTotal Waste\n(mt)\nAsia Pacific 122,652 26,403 631 25,772 213 641,543 3,411 \nEurope, Middle East and Africa 44,768 10,682 3,614 7, 0 69  175 314,000 2,510 \nLatin America and Mexico 84,593 3,500 1,763 1,737 212 8 37, 5 4 4  8,357 \nNorth America 258,540 38,921 37, 9 0 5  1,016 763 1,866,117 13,504 \nGHG EMISSIONS (

In [54]:
# Save the LLM response together with the retrieved context and source path of the chunk into a dictionary
# which we will save as a JSON file

# Split the source_path by `.pdf` to get the actual path and the page number
source_path_parts = source_path.split('.pdf')
source_path_actual = source_path_parts[0] + '.pdf'
page_number = source_path_parts[1].replace('_', '')

response_dict = {
    "retrieved_context": retrieved_context,
    "source_path": source_path_actual,
    "page_number": page_number,
    "llm_response": response
}

In [55]:
response_dict

{'retrieved_context': 'Citi 2022 ESG Report\nPage 38\nESG at Citi Sustainable Finance Climate Risk & Net Zero Building Equitable & Resilient Communities Talent & DEI Responsible Business Appendices\nSustainable Operations\nOperational Footprint Goals Sustainable and Healthy Buildings Managing Climate Risk in Our Operations Environmental Performance for OperationsEfficient Travel\nEnvironmental Performance for Operations\nREGIONAL OPERATIONAL ENVIRONMENTAL PERFORMANCE \nRegion GHG Emissions CO2e\nLocation-Based (mt)\nGHG Emissions CO2e \nMarket-Based (mt)\nCarbon Credits CO 2e\n(mt)\nNet CO2e\nMarket-Based (mt)\nTotal Energy \nConsumption \n(GWh)\nTotal Water \nConsumption \n(m3)\nTotal Waste\n(mt)\nAsia Pacific 122,652 26,403 631 25,772 213 641,543 3,411 \nEurope, Middle East and Africa 44,768 10,682 3,614 7, 0 69  175 314,000 2,510 \nLatin America and Mexico 84,593 3,500 1,763 1,737 212 8 37, 5 4 4  8,357 \nNorth America 258,540 38,921 37, 9 0 5  1,016 763 1,866,117 13,504 \nGHG EMISS

In [59]:
output_dict = {}

# If the year and company name are not in the output dictionary, add them
if report_year not in output_dict:
    output_dict[report_year] = {}
    if company_name not in output_dict[report_year]:
        output_dict[report_year][company_name] = {}

# Add the response dictionary to the output dictionary
output_dict[report_year][company_name][esg_metric] = response_dict

In [60]:
output_dict

{'2022': {'Citigroup': {'GHG emissions': {'retrieved_context': 'Citi 2022 ESG Report\nPage 38\nESG at Citi Sustainable Finance Climate Risk & Net Zero Building Equitable & Resilient Communities Talent & DEI Responsible Business Appendices\nSustainable Operations\nOperational Footprint Goals Sustainable and Healthy Buildings Managing Climate Risk in Our Operations Environmental Performance for OperationsEfficient Travel\nEnvironmental Performance for Operations\nREGIONAL OPERATIONAL ENVIRONMENTAL PERFORMANCE \nRegion GHG Emissions CO2e\nLocation-Based (mt)\nGHG Emissions CO2e \nMarket-Based (mt)\nCarbon Credits CO 2e\n(mt)\nNet CO2e\nMarket-Based (mt)\nTotal Energy \nConsumption \n(GWh)\nTotal Water \nConsumption \n(m3)\nTotal Waste\n(mt)\nAsia Pacific 122,652 26,403 631 25,772 213 641,543 3,411 \nEurope, Middle East and Africa 44,768 10,682 3,614 7, 0 69  175 314,000 2,510 \nLatin America and Mexico 84,593 3,500 1,763 1,737 212 8 37, 5 4 4  8,357 \nNorth America 258,540 38,921 37, 9 0 