In [1]:
!pip -q install openai
# !pip -q install pypdf2
!pip -q install tiktoken
# !pip -q install pycryptodome

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.3/70.3 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.2/114.2 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m264.6/264.6 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Clone the ESG-Chat repository into the current working directory.
!git clone https://github.com/yohanesnuwara/ESG-Chat

Cloning into 'ESG-Chat'...
remote: Enumerating objects: 65, done.[K
remote: Counting objects: 100% (65/65), done.[K
remote: Compressing objects: 100% (57/57), done.[K
remote: Total 65 (delta 32), reused 11 (delta 6), pack-reused 0[K
Unpacking objects: 100% (65/65), 83.45 MiB | 10.28 MiB/s, done.


In [3]:
import numpy as np
import pandas as pd
import tiktoken
import openai
# import PyPDF2
import os
# import nltk
# nltk.download('punkt')

In [23]:
%%writefile utils.py
import numpy as np
import pandas as pd
import tiktoken
import openai
import os

#COMPLETIONS_MODEL = "text-davinci-003"
EMBEDDING_MODEL = "text-embedding-ada-002"

def load_embeddings(df):
    """
    Convert a wide embeddings table into a dictionary keyed by article ID.

    df is a DataFrame with an "Article_ID" column plus one column per embedding
    dimension, named "0", "1", ... up to the length of the embedding vectors.
    Columns whose name is not an integer (for example a leftover "Unnamed: 0.1"
    index column from a CSV round-trip) are ignored.

    Returns a dict mapping each Article_ID to its embedding, as a list of values
    ordered by dimension.

    Raises ValueError if df has no integer-named column, and KeyError if one of
    the dimensions between "0" and the largest one is missing.
    """
    # Collect the dimension numbers, skipping any column that is not an embedding dimension.
    dims = []
    for c in df.columns:
        if c == "Article_ID":
            continue
        try:
            dims.append(int(c))
        except (TypeError, ValueError):
            continue

    if not dims:
        raise ValueError("df has no embedding columns named '0', '1', ...")

    dim_columns = [str(i) for i in range(max(dims) + 1)]

    # Extract every vector in one pass instead of indexing each row label by label.
    vectors = df[dim_columns].to_numpy().tolist()
    return dict(zip(df["Article_ID"], vectors))

def get_embedding(text: str, model: str=EMBEDDING_MODEL) -> list[float]:
    """
    Request an embedding for `text` from the OpenAI Embeddings API using `model`,
    and return the embedding of the first item in the response.
    """
    response = openai.Embedding.create(input=text, model=model)
    first_item = response["data"][0]
    return first_item["embedding"]

def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Compute the dot product of two vectors and use it as their similarity.

    OpenAI embeddings are normalized to length 1, so for them the dot product equals the cosine similarity.
    """
    left = np.array(x)
    right = np.array(y)
    return np.dot(left, right)

def order_document_sections_by_query_similarity(query: str, contexts: dict[(str, str), np.array]) -> list[(float, (str, str))]:
    """
    Embed the query and score it against every pre-computed document embedding in `contexts`.

    Returns a list of (similarity, document index) pairs, most similar first.
    """
    query_vector = get_embedding(query)

    scored_sections = []
    for section_key, section_vector in contexts.items():
        score = vector_similarity(query_vector, section_vector)
        scored_sections.append((score, section_key))

    # Highest similarity first.
    scored_sections.sort(reverse=True)
    return scored_sections

def construct_prompt(
    question: str,
    context_embeddings: dict,
    df: pd.DataFrame,
    max_section_len: int = 500,
    section_token_estimate=70,
) -> str:
    """
    Build the completion prompt: an instruction header, the most relevant report
    sections, then the question.

    Sections are taken in descending order of similarity to the question until the
    token budget would be exceeded.

    question: the user question; it is embedded to rank the sections and appended to the prompt.
    context_embeddings: mapping from section index (a label of df's index) to its embedding.
    df: section table indexed by those labels, with a "Text" column.
    max_section_len: token budget for the context part of the prompt.
    section_token_estimate: flat token cost charged for each section. The default of 70
        keeps the original behaviour, which does not measure the text. Pass None to
        charge each section its actual token count under the "gpt2" encoding instead.

    Returns the prompt string. Also prints how many sections were selected and their indexes.
    """
    most_relevant_document_sections = order_document_sections_by_query_similarity(question, context_embeddings)

    # Parameters for tokenization of completion
    SEPARATOR = "\n* "
    ENCODING = "gpt2"  # encoding for text-davinci-003

    encoding = tiktoken.get_encoding(ENCODING)
    separator_len = len(encoding.encode(SEPARATOR))

    chosen_sections = []
    chosen_sections_len = 0
    chosen_sections_indexes = []

    for _, section_index in most_relevant_document_sections:
        # Add contexts until we run out of space.
        document_section = df.loc[section_index]

        if section_token_estimate is None:
            # Measure the text exactly as it will appear in the prompt; report text is
            # plain data, so special-token markers in it are encoded as ordinary text.
            section_cost = len(
                encoding.encode(document_section.Text.replace("\n", " "), disallowed_special=())
            )
        else:
            section_cost = section_token_estimate

        chosen_sections_len += section_cost + separator_len
        if chosen_sections_len > max_section_len:
            break

        chosen_sections.append(SEPARATOR + document_section.Text.replace("\n", " "))
        chosen_sections_indexes.append(str(section_index))

    # Useful diagnostic information
    print(f"Selected {len(chosen_sections)} document sections:")
    print("\n".join(chosen_sections_indexes))

    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""

    return header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"

def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings: dict[(str, str), np.array],
    show_prompt: bool = False
) -> str:
    """
    Build a context prompt for `query` and return the completion model's answer.

    When `show_prompt` is True the prompt is printed before the request is sent.
    The returned text has leading and trailing spaces and newlines removed.
    """
    full_prompt = construct_prompt(query, document_embeddings, df)

    if show_prompt:
        print(full_prompt)

    completion_kwargs = {
        "temperature": 0.,
        "max_tokens": 1000,
        "model": "text-davinci-003",
    }
    completion = openai.Completion.create(prompt=full_prompt, **completion_kwargs)

    first_choice = completion["choices"][0]
    return first_choice["text"].strip(" \n")

Overwriting utils.py


In [26]:
%%writefile chat.py
import numpy as np
import pandas as pd
import tiktoken
import openai
import os
from utils import *

def chat_ESG(key_api, query, show_prompt=False):
    """
    Answer a question about the ESG reports using the hosted report database and embeddings.

    key_api: OpenAI API key; it is assigned to openai.api_key.
    query: the question to answer.
    show_prompt: when True, the constructed prompt is printed before the completion call.

    Returns the answer string produced by answer_query_with_context.
    """
    # OpenAI Key
    openai.api_key = key_api

    # Paths to report tabular and embedding
    report_url = "https://github.com/yohanesnuwara/ESG-Chat/blob/main/model/ESG_Report_Database.csv?raw=true"
    embed_url = "https://github.com/yohanesnuwara/ESG-Chat/blob/main/model/ESG_Report_Embedding.csv?raw=true"

    # Read report tabular and embedding
    embed_df = pd.read_csv(embed_url).rename(columns={'Unnamed: 0': 'Article_ID'})
    df = pd.read_csv(report_url).set_index("Article_ID")

    # Convert embedding dataframe to dictionary
    embeddings = load_embeddings(embed_df)

    # Hand the answer back to the caller; previously it was computed and then dropped.
    return answer_query_with_context(query, df, embeddings, show_prompt)

Overwriting chat.py


In [27]:
import os

from utils import *
from chat import *

# Read the key from the OPENAI_API_KEY environment variable so that it is never
# stored in the notebook source; a KeyError here means the variable is not set.
key_api = os.environ["OPENAI_API_KEY"]
query = "How to promote wellbeing? Give long and detail answer."

chat_ESG(key_api, query)

Selected 6 document sections:
Nestle_1315
General Electric_1615
General Electric_1625
General Electric_1620
Toyota_895
Toyota_1035


NameError: ignored

In [22]:
import numpy as np
import pandas as pd
import tiktoken
import openai
import os
from utils import *

# OpenAI Key
# Read from the OPENAI_API_KEY environment variable so the secret is never stored
# in the notebook source; a KeyError here means the variable is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

COMPLETIONS_MODEL = "text-davinci-003"
EMBEDDING_MODEL = "text-embedding-ada-002"

# Paths to report tabular and embedding 
report = "https://github.com/yohanesnuwara/ESG-Chat/blob/main/model/ESG_Report_Database.csv?raw=true"
embed = "https://github.com/yohanesnuwara/ESG-Chat/blob/main/model/ESG_Report_Embedding.csv?raw=true"

# Read report tabular and embedding
embed_df = pd.read_csv(embed)
embed_df.rename(columns={'Unnamed: 0':'Article_ID'}, inplace=True)

df = pd.read_csv(report)
df = df.set_index("Article_ID")

# Convert embedding dataframe to dictionary
embed = load_embeddings(embed_df)

# Parameters for tokenization of completion
MAX_SECTION_LEN = 500
SEPARATOR = "\n* "
ENCODING = "gpt2"  # encoding for text-davinci-003

encoding = tiktoken.get_encoding(ENCODING)
separator_len = len(encoding.encode(SEPARATOR))

COMPLETIONS_API_PARAMS = {
    # We use temperature of 0.0 because it gives the most predictable, factual answer.
    "temperature": 0.,
    "max_tokens": 1000,
    "model": COMPLETIONS_MODEL,
}

answer_query_with_context("How to promote wellbeing? Give long and detail answer.", 
                          df, embed, show_prompt=False)

Selected 6 document sections:
Nestle_1315
General Electric_1615
General Electric_1625
General Electric_1620
Toyota_895
Toyota_1035


'HealthAhead has been GE’s global wellbeing program for more than 10 years. Its mission is to support a culture that inspires and encourages GE employees and their families to optimize their health and wellbeing and live a well-balanced life. To promote wellbeing, HealthAhead has implemented a variety of initiatives, such as launching new branding and structure with a focus on holistic health, introducing four pillars—physical, social, emotional—and adding financial well-being as a new pillar, presenting two global emotional well-being campaigns (It’s OK not to feel OK) with leader support and emotional wellbeing stories from GE employees around the world, hosting more than 30 live webinars for employees throughout the year, covering a range of wellbeing topics, logging more than 1 million well-being minutes during their “Rise to the Challenge” campaign in May, providing access to well-being benefits such as Employee Assistance Programs (EAP), digital tools and mobile apps such as meQu

In [6]:
%%writefile chat.py
import numpy as np
import pandas as pd
import tiktoken
import openai
import os
from utils import *

# Read the key from the OPENAI_API_KEY environment variable so the secret is not
# written into chat.py; a KeyError here means the variable is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

COMPLETIONS_MODEL = "text-davinci-003"
EMBEDDING_MODEL = "text-embedding-ada-002"

# Paths to report tabular and embedding 
report = "https://github.com/yohanesnuwara/ESG-Chat/blob/main/model/ESG_Report_Database.csv?raw=true"
embed = "https://github.com/yohanesnuwara/ESG-Chat/blob/main/model/ESG_Report_Embedding.csv?raw=true"

# Read report tabular and embedding
embed_df = pd.read_csv(embed)
embed_df.rename(columns={'Unnamed: 0':'Article_ID'}, inplace=True)
df = pd.read_csv(report)

embed = load_embeddings(embed_df)

#print(embed_df.columns)

Overwriting chat.py


In [7]:
!python chat.py

In [None]:
pd.read_csv("https://github.com/yohanesnuwara/ESG-Chat/blob/main/model/ESG_Report_Embedding.csv?raw=true")



Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
0,Starbucks_0,-0.019942,-0.048955,0.005541,-0.044380,-0.006389,-0.001577,-0.004670,0.005945,-0.013869,...,0.002767,0.035546,0.000794,-0.018246,-0.018549,0.005860,-0.016182,-0.012015,-0.026002,-0.032233
1,Starbucks_5,-0.002771,-0.047556,-0.011181,-0.016389,-0.003793,0.004710,-0.001918,0.011358,-0.027433,...,0.006531,0.023149,-0.012216,-0.006148,-0.017607,-0.022297,0.010186,-0.011561,-0.015799,-0.021629
2,Starbucks_10,0.002730,-0.033203,-0.019511,-0.040392,-0.007406,0.014653,0.001151,0.010006,-0.001576,...,0.014456,0.027753,-0.004002,-0.001777,-0.024685,-0.008360,0.005342,-0.014956,-0.020275,-0.026449
3,Starbucks_15,-0.023969,-0.034607,0.005196,-0.021888,0.005945,-0.008244,-0.008869,0.012189,-0.005625,...,0.000655,0.026554,0.008740,-0.021738,-0.017562,0.008971,-0.012638,-0.016882,-0.011196,-0.031641
4,Starbucks_20,0.002771,-0.018533,-0.024239,-0.029290,-0.016716,0.009828,-0.000477,0.008699,-0.004914,...,0.001233,0.007710,0.004366,-0.019268,-0.023624,-0.005275,0.009193,-0.023437,-0.018039,-0.021526
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3106,Goldman Sachs_700,-0.007945,-0.025129,0.008143,-0.017448,0.004560,0.017210,-0.021460,0.005200,-0.004154,...,0.014848,0.015824,0.011614,0.001909,-0.027029,-0.012531,0.004900,0.008750,-0.012307,-0.042286
3107,Goldman Sachs_705,0.003380,-0.023107,-0.022084,-0.012623,0.006259,0.022881,-0.021074,0.000579,-0.010391,...,0.016357,0.018922,0.014019,-0.001457,-0.024157,-0.032900,0.010298,0.006308,-0.025871,-0.019121
3108,Goldman Sachs_710,-0.013269,-0.006742,-0.008667,-0.014987,-0.007383,0.016271,-0.030104,0.003291,-0.002206,...,0.013315,-0.000996,0.030804,0.007066,-0.009678,-0.019952,0.022688,-0.027952,0.014767,0.005776
3109,Goldman Sachs_715,-0.022932,-0.022263,-0.016829,-0.022057,0.005962,0.005012,-0.028044,0.000936,-0.019134,...,-0.004011,0.013063,0.018284,0.012573,-0.005888,-0.031443,-0.009181,0.001600,0.009451,-0.003454


In [None]:
import os

# Read the key from the OPENAI_API_KEY environment variable so the secret is never
# stored in the notebook source; a KeyError here means the variable is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [None]:
def trim(text, n_start, n_end):
  """
  Return the sentences of `text` from position n_start (inclusive) to n_end
  (exclusive), joined with single spaces.

  Sentences are found with nltk.sent_tokenize.
  """
  all_sentences = nltk.sent_tokenize(text)
  return " ".join(all_sentences[n_start:n_end])

# trim(training_data, 0, 2)

In [None]:
# !wget https://home.barclays/content/dam/home-barclays/documents/citizenship/ESG/Barclays-PLC-ESG-Report-2019.pdf
# !wget https://www.jpmorganchase.com/content/dam/jpmc/jpmorgan-chase-and-co/documents/jpmc-cr-esg-report-2019.pdf
# !wget https://www.morganstanley.com/pub/content/dam/msdotcom/sustainability/Morgan-Stanley_2019-Sustainability-Report_Final.pdf
# !wget https://www.goldmansachs.com/our-commitments/sustainability/sustainable-finance/documents/reports/2019-sustainability-report.pdf

In [None]:
pdf_files = [["Starbucks", "Coffee", 2020, "https://stories.starbucks.com/uploads/2021/04/Starbucks-2020-Global-Environmental-and-Social-Impact-Report.pdf"],
             ["Xerox", "Printing", 2022, "https://www.xerox.com/downloads/usa/en/x/Xerox_CSR_Report.pdf"],
             ["Nestle", "Food", 2022, "https://www.nestle.com/sites/default/files/2022-03/creating-shared-value-sustainability-report-2021-en.pdf"],
             ["Tupperware", "Appliances", 2020, "https://www.tupperwarebrands.com/assets/files/TWB_2020%20Sustainability%20Report_FINAL.pdf"],
             ["Halliburton", "Energy", 2022, "https://cdn.brandfolder.io/XG3NESCC/at/vk8fc5fxq88hwskjw6gf3mvb/halliburton-2022-annual-sustainability-report.pdf"],
             ["Pfizer", "Medicine", 2021, "https://www.pfizer.com/sites/default/files/investors/financial_reports/annual_reports/2021/files/Pfizer_ESG_Report.pdf"],
             ["Carrefour", "Retail", 2021, "https://www.carrefour.com/sites/default/files/2021-06/Carrefour%20-%20Sustainability-Linked%20Bond%20Framework%20%281%29.pdf"],
             ["Lenovo", "Electronic", 2022, "https://investor.lenovo.com/en/sustainability/reports/FY2022-lenovo-sustainability-report.pdf"],
             ["General Electric", "Machinery", 2021, "https://www.ge.com/sites/default/files/ge2021_sustainability_report.pdf"],
             ["Conoco Phillips", "Energy", 2021, "https://static.conocophillips.com/files/resources/conocophillips-2021-sustainability-report.pdf"],
             ["Toyota", "Automotive", 2021, "https://global.toyota/pages/global_toyota/sustainability/report/sdb/sdb21_en.pdf"],
             ["Samsung", "Electronic", 2021, "https://image-us.samsung.com/SamsungUS/home/pdf/Samsung-Electronics-Sustainability-Report-2021.pdf"],
             ["Barclays", "Bank", 2019, "https://home.barclays/content/dam/home-barclays/documents/citizenship/ESG/Barclays-PLC-ESG-Report-2019.pdf"],
             ["Morgan Stanley", "Bank", 2019, "https://www.morganstanley.com/pub/content/dam/msdotcom/sustainability/Morgan-Stanley_2019-Sustainability-Report_Final.pdf"],
             ["Goldman Sachs", "Bank", 2019, "https://www.goldmansachs.com/our-commitments/sustainability/sustainable-finance/documents/reports/2019-sustainability-report.pdf"]]

import subprocess

for f in pdf_files:
    # Download reports
    # Pass the URL as its own argument so the shell never interprets characters in it.
    result = subprocess.run(["wget", f[3]])
    # Keep going on failure (best effort), but say which report is missing
    # instead of silently ignoring wget's exit status.
    if result.returncode != 0:
        print(f"Download failed for {f[0]}: {f[3]}")

In [None]:
import pandas as pd
import urllib
import os

# Create an empty list to store DataFrames
dfs = []

# pdf_files = [["Starbucks", "Coffee", 2020, "/content/Starbucks-2020-Global-Environmental-and-Social-Impact-Report.pdf"],
#              ["Xerox", "Printing", 2022, "/content/Xerox_CSR_Report.pdf"],
#              ["Nestle", "Food", 2022, "/content/creating-shared-value-sustainability-report-2021-en.pdf"],
#              ["Tupperware", "Appliances", 2020, "/content/TWB_2020 Sustainability Report_FINAL.pdf"],
#              ["Halliburton", "Energy", 2022, "/content/halliburton-2022-annual-sustainability-report.pdf"],
#              ["Pfizer", "Medicine", 2021, "/content/Pfizer_ESG_Report.pdf"],
#              ["Carrefour", "Retail", 2021, "/content/Carrefour - Sustainability-Linked Bond Framework (1).pdf"],
#              ["Lenovo", "Electronic", 2022, "/content/FY2022-lenovo-sustainability-report.pdf"],
#              ["General Electric", "Machinery", 2021, "/content/ge2021_sustainability_report.pdf"],
#              ["Conoco Phillips", "Energy", 2021, "/content/conocophillips-2021-sustainability-report.pdf"],
#              ["Toyota", "Automotive", 2021, "/content/sdb21_en.pdf"],
#              ["Samsung", "Electronic", 2021, "/content/Samsung-Electronics-Sustainability-Report-2021.pdf"],
#              ["Barclays", "Bank", 2019, "/content/Barclays-PLC-ESG-Report-2019.pdf"],
#              ["Morgan Stanley", "Bank", 2019, "/content/Morgan-Stanley_2019-Sustainability-Report_Final.pdf"],
#              ["Goldman Sachs", "Bank", 2019, "/content/2019-sustainability-report.pdf"]]


pdf_files = [["Starbucks", "Coffee", 2020, "/content/Starbucks-2020-Global-Environmental-and-Social-Impact-Report.pdf"],
             ["Xerox", "Printing", 2022, "/content/Xerox_CSR_Report.pdf"],
             ["Nestle", "Food", 2022, "/content/creating-shared-value-sustainability-report-2021-en.pdf"],
             ["Tupperware", "Appliances", 2020, "/content/TWB_2020 Sustainability Report_FINAL.pdf"],
             ["Pfizer", "Medicine", 2021, "/content/Pfizer_ESG_Report.pdf"],
             ["Carrefour", "Retail", 2021, "/content/Carrefour - Sustainability-Linked Bond Framework (1).pdf"],
             ["Lenovo", "Electronic", 2022, "/content/FY2022-lenovo-sustainability-report.pdf"],
             ["General Electric", "Machinery", 2021, "/content/ge2021_sustainability_report.pdf"],
             ["Conoco Phillips", "Energy", 2021, "/content/conocophillips-2021-sustainability-report.pdf"],
             ["Toyota", "Automotive", 2021, "/content/sdb21_en.pdf"],
             ["Samsung", "Electronic", 2021, "/content/Samsung-Electronics-Sustainability-Report-2021.pdf"],
             ["Morgan Stanley", "Bank", 2019, "/content/Morgan-Stanley_2019-Sustainability-Report_Final.pdf"],
             ["Goldman Sachs", "Bank", 2019, "/content/2019-sustainability-report.pdf"]]



interval = 5  # number of consecutive sentences stored in each row

# NOTE(review): this cell uses PyPDF2 and nltk, but their imports in the import
# cell near the top of the notebook are commented out - confirm both are imported
# (and nltk's sentence tokenizer data is available) before running.
for f in pdf_files:
    # # Download reports
    # os.system(f"wget {f[3]}")
    company_name = f[0]
    # local_path = '/content/' + f[3].split("/")[-1]

    # Extract the text of every page inside a context manager so the file handle
    # is closed as soon as the report has been read.
    with open(f[3], "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        text = "".join(page.extract_text() for page in pdf_reader.pages)

    # Clean and format document 
    training_data = text.replace("\n", " ")

    # NLTK
    sentences = nltk.sent_tokenize(training_data)

    # Group the sentences into chunks of `interval`. The list tokenized above is
    # sliced directly; calling trim() here would re-tokenize the whole document
    # once per chunk and give the same text.
    for start in range(0, len(sentences), interval):
        trimmed_sentence = " ".join(sentences[start:start + interval])

        # Append a new DataFrame to the list with the company name and trimmed sentence
        dfs.append(pd.DataFrame({"Article_ID": [f"{company_name}_{start}"], "Text": [trimmed_sentence]}))

# Concatenate all DataFrames into a single DataFrame
df = pd.concat(dfs, ignore_index=True)
df = df.set_index("Article_ID")


In [None]:
df

Unnamed: 0_level_0,Text
Article_ID,Unnamed: 1_level_1
Starbucks_0,STARBUCKS 2020 REPORTEN V IRO NMENTAL & SOCIAL...
Starbucks_5,"Now, meeting the ambitious targets set out in ..."
Starbucks_10,Our goal to make a positive impact on the live...
Starbucks_15,STARBUCKS 2020 REPORTEN V IRO NMENTAL & SOCIAL...
Starbucks_20,"Inclusion in every way. We are, most certainl..."
...,...
Goldman Sachs_700,We have been carbon neutral across our global ...
Goldman Sachs_705,TCFD AppendixGoldman Sachs 2019 Sustainability...
Goldman Sachs_710,This document should not be used as a basis fo...
Goldman Sachs_715,This report contains “forward-looking statemen...


In [None]:
df.to_csv("ESG_Report_Database.csv")

In [None]:
COMPLETIONS_MODEL = "text-davinci-003"
EMBEDDING_MODEL = "text-embedding-ada-002"

In [10]:
def get_embedding(text: str, model: str=EMBEDDING_MODEL) -> list[float]:
    """
    Send `text` to the OpenAI Embeddings API with the given `model` and return
    the embedding of the first item in the response.
    """
    api_response = openai.Embedding.create(input=text, model=model)
    return api_response["data"][0]["embedding"]

def compute_doc_embeddings(df: pd.DataFrame) -> dict[str, list[float]]:
    """
    Create an embedding for each row in the dataframe using the OpenAI Embeddings API.

    df must have a "Text" column; get_embedding is called once per row on that text.

    Return a dictionary that maps each row's index label to the embedding of that row.
    (The annotation assumes string labels, such as the Article_ID index built earlier
    in this notebook - confirm if df is indexed differently.)
    """
    return {
        idx: get_embedding(r.Text) for idx, r in df.iterrows()
    }

def load_embeddings(df):
    """
    Build a lookup of embedding vectors keyed by (title, heading).

    df must have "title" and "heading" columns; every other column must be named
    with an integer string "0", "1", ... identifying the embedding dimension it holds.

    Returns a dict mapping (title, heading) to the list of values in dimension order.
    """
    key_columns = ("title", "heading")
    highest_dim = max([int(name) for name in df.columns if name not in key_columns])

    embeddings = {}
    for _, row in df.iterrows():
        key = (row.title, row.heading)
        embeddings[key] = [row[str(dim)] for dim in range(highest_dim + 1)]
    return embeddings

In [None]:
embed_df = compute_doc_embeddings(df)

# embed_df

In [None]:
# An example embedding:
example_entry = list(embed_df.items())[0]
print(f"{example_entry[0]} : {example_entry[1][:5]}... ({len(example_entry[1])} entries)")

Starbucks_0 : [-0.019710786640644073, -0.048967961221933365, 0.005664057098329067, -0.04426051303744316, -0.006367544177919626]... (1536 entries)


In [None]:
embedding_df = pd.DataFrame.from_dict(embed_df, orient='columns').T

embedding_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
Starbucks_0,-0.019942,-0.048955,0.005541,-0.044380,-0.006389,-0.001577,-0.004670,0.005945,-0.013869,-0.023478,...,0.002767,0.035546,0.000794,-0.018246,-0.018549,0.005860,-0.016182,-0.012015,-0.026002,-0.032233
Starbucks_5,-0.002771,-0.047556,-0.011181,-0.016389,-0.003793,0.004710,-0.001918,0.011358,-0.027433,-0.007572,...,0.006531,0.023149,-0.012216,-0.006148,-0.017607,-0.022297,0.010186,-0.011561,-0.015799,-0.021629
Starbucks_10,0.002730,-0.033203,-0.019511,-0.040392,-0.007406,0.014653,0.001151,0.010006,-0.001576,-0.017694,...,0.014456,0.027753,-0.004002,-0.001777,-0.024685,-0.008360,0.005342,-0.014956,-0.020275,-0.026449
Starbucks_15,-0.023969,-0.034607,0.005196,-0.021888,0.005945,-0.008244,-0.008869,0.012189,-0.005625,-0.013930,...,0.000655,0.026554,0.008740,-0.021738,-0.017562,0.008971,-0.012638,-0.016882,-0.011196,-0.031641
Starbucks_20,0.002771,-0.018533,-0.024239,-0.029290,-0.016716,0.009828,-0.000477,0.008699,-0.004914,-0.012714,...,0.001233,0.007710,0.004366,-0.019268,-0.023624,-0.005275,0.009193,-0.023437,-0.018039,-0.021526
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Goldman Sachs_700,-0.007945,-0.025129,0.008143,-0.017448,0.004560,0.017210,-0.021460,0.005200,-0.004154,-0.012729,...,0.014848,0.015824,0.011614,0.001909,-0.027029,-0.012531,0.004900,0.008750,-0.012307,-0.042286
Goldman Sachs_705,0.003380,-0.023107,-0.022084,-0.012623,0.006259,0.022881,-0.021074,0.000579,-0.010391,-0.014218,...,0.016357,0.018922,0.014019,-0.001457,-0.024157,-0.032900,0.010298,0.006308,-0.025871,-0.019121
Goldman Sachs_710,-0.013269,-0.006742,-0.008667,-0.014987,-0.007383,0.016271,-0.030104,0.003291,-0.002206,-0.024633,...,0.013315,-0.000996,0.030804,0.007066,-0.009678,-0.019952,0.022688,-0.027952,0.014767,0.005776
Goldman Sachs_715,-0.022932,-0.022263,-0.016829,-0.022057,0.005962,0.005012,-0.028044,0.000936,-0.019134,-0.005759,...,-0.004011,0.013063,0.018284,0.012573,-0.005888,-0.031443,-0.009181,0.001600,0.009451,-0.003454


In [None]:
embedding_df.to_csv("ESG_Report_Embedding.csv")

In [6]:
import numpy as np

def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Score how similar two vectors are by taking their dot product.

    OpenAI embeddings are normalized to length 1, so for them this equals the cosine similarity.
    """
    first = np.array(x)
    second = np.array(y)
    return np.dot(first, second)

def order_document_sections_by_query_similarity(query: str, contexts: dict[(str, str), np.array]) -> list[(float, (str, str))]:
    """
    Rank the pre-computed document embeddings in `contexts` by similarity to the query.

    The query is embedded with get_embedding and compared with each document
    embedding. Returns (similarity, document index) pairs, most similar first.
    """
    query_vector = get_embedding(query)

    ranking = []
    for doc_key, doc_vector in contexts.items():
        ranking.append((vector_similarity(query_vector, doc_vector), doc_key))

    # Descending order: best match first.
    ranking.sort(reverse=True)
    return ranking

In [None]:
order_document_sections_by_query_similarity("How to wellbeing?", embed_df)[:5]

[(0.8318255133968178, 'General Electric_1615'),
 (0.8314669996387144, 'Nestle_1315'),
 (0.8284146780346089, 'General Electric_1625'),
 (0.8274008532606205, 'General Electric_1620'),
 (0.8223406710868736, 'Toyota_890')]

In [16]:
import tiktoken

# Prompt-assembly settings read by construct_prompt.
SEPARATOR = "\n* "
MAX_SECTION_LEN = 500
ENCODING = "gpt2"  # encoding for text-davinci-003

encoding = tiktoken.get_encoding(ENCODING)
separator_len = len(encoding.encode(SEPARATOR))

# Keyword arguments for the completion request.
# We use temperature of 0.0 because it gives the most predictable, factual answer.
COMPLETIONS_API_PARAMS = dict(
    temperature=0.,
    max_tokens=1000,
    model=COMPLETIONS_MODEL,
)

In [7]:
def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame) -> str:
    """
    Assemble the completion prompt from the sections most similar to the question.

    Reads MAX_SECTION_LEN, SEPARATOR and separator_len from the notebook's globals.
    Each selected section is charged a flat 70 tokens plus the separator length, and
    selection stops once the running total exceeds MAX_SECTION_LEN.

    Prints the number and the indexes of the selected sections, then returns the
    instruction header, the selected sections and the question as one string.
    """
    ranked_sections = order_document_sections_by_query_similarity(question, context_embeddings)

    picked_texts = []
    picked_ids = []
    budget_used = 0

    for _, section_index in ranked_sections:
        row = df.loc[section_index]

        # Stop as soon as one more section would exceed the budget.
        budget_used += 70 + separator_len
        if budget_used > MAX_SECTION_LEN:
            break

        picked_texts.append(SEPARATOR + row.Text.replace("\n", " "))
        picked_ids.append(str(section_index))

    # Diagnostics: which sections made it into the prompt.
    print(f"Selected {len(picked_texts)} document sections:")
    print("\n".join(picked_ids))

    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""

    return "".join([header, *picked_texts, "\n\n Q: ", question, "\n A:"])



In [8]:
def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings: dict[(str, str), np.array],
    show_prompt: bool = False
) -> str:
    """
    Build a context prompt for `query` and return the completion model's answer.

    The request uses the settings in the global COMPLETIONS_API_PARAMS. When
    `show_prompt` is True the prompt is printed first. The returned text has
    leading and trailing spaces and newlines removed.
    """
    full_prompt = construct_prompt(query, document_embeddings, df)

    if show_prompt:
        print(full_prompt)

    completion = openai.Completion.create(prompt=full_prompt, **COMPLETIONS_API_PARAMS)

    first_choice = completion["choices"][0]
    return first_choice["text"].strip(" \n")

In [None]:
answer_query_with_context("How to promote wellbeing? Give long and detail answer.", 
                          df, embed_df, show_prompt=True)

Selected 6 document sections:
Nestle_1315
General Electric_1615
General Electric_1625
General Electric_1620
Toyota_895
Toyota_1035
Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."

Context:

* To support employees implementing  what they have learned, 94.4% of our canteens now offer   a healthy meal and 53.0% of our sites provide a nap room  or equivalent space for resting. We also launched a series of global webinars called  HealthTalks, which aim to raise awareness and provide  easy-to-apply tips on how to develop and maintain  healthier lifestyle habits. Six webinars were delivered:  –Mental health and well-being  –Sleep  –Physical activity and active breaks  –Emotional intelligence  –Maintaining good relationships in a hybrid world  –How to cope with the mental load of parenting By creating a respectful, supportive environment   in which our employees feel free to be their best se

'HealthAhead has been GE’s global wellbeing program for more than 10 years. Its mission is to support a culture that inspires and encourages GE employees and their families to optimize their health and wellbeing and live a well-balanced life. To promote wellbeing, HealthAhead has implemented a variety of initiatives, such as launching new branding and structure with a focus on holistic health, introducing four pillars—physical, social, emotional—and adding financial well-being as a new pillar, presenting two global emotional well-being campaigns (It’s OK not to feel OK) with leader support and emotional wellbeing stories from GE employees around the world, hosting more than 30 live webinars for employees throughout the year, covering a range of wellbeing topics, logging more than 1 million well-being minutes during their “Rise to the Challenge” campaign in May, providing access to well-being benefits such as Employee Assistance Programs (EAP), digital tools and mobile apps such as meQu

In [None]:
import numpy as np
int(778.87)

778

In [None]:
df.iloc[42,:]

Article_ID                                            Xerox_130
Text          A recent  example is the launch of the Xerox® ...
Name: 42, dtype: object

In [None]:
df.to_csv("ESG_Reports.csv")

In [None]:
df

Unnamed: 0,Article_ID,Text
0,Starbucks_0,STARBUCKS 2020 REPORTEN V IRO NMENTAL & SOCIAL...
1,Starbucks_10,Our goal to make a positive impact on the live...
2,Starbucks_20,"Inclusion in every way. We are, most certainl..."
3,Starbucks_30,"Recently, we extended COVID-19 benefits for U..."
4,Starbucks_40,"Since the start of the pandemic, Starbucks has..."
...,...,...
527,Goldman Sachs_680,"Accordingly, we have a well-established enterp..."
528,Goldman Sachs_690,"In addition, our ongoing risk monitoring from ..."
529,Goldman Sachs_700,We have been carbon neutral across our global ...
530,Goldman Sachs_710,This document should not be used as a basis fo...
