In [92]:
!pip install tensorflow
!pip install pytorch
!pip install torch
!pip install transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting pytorch
  Using cached pytorch-1.0.2.tar.gz (689 bytes)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: pytorch
  Building wheel for pytorch (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[6 lines of outpu

### ESG-BERT
- Domain Specific BERT Model for Text Mining in Sustainable Investing
- URL: https://huggingface.co/nbroad/ESG-BERT
- This pre-trained model is able to classify each text and returns a label number which correlates to a textual label: 
    * __label__Business_Ethics :  0 
    * __label__Data_Security :  1 
    * __label__Access_And_Affordability :  2 
    * __label__Business_Model_Resilience :  3 
    * __label__Competitive_Behavior :  4 
    * __label__Critical_Incident_Risk_Management :  5 
    * __label__Customer_Welfare :  6 
    * __label__Director_Removal :  7 
    * __label__Employee_Engagement_Inclusion_And_Diversity :  8 
    * __label__Employee_Health_And_Safety :  9 
    * __label__Human_Rights_And_Community_Relations :  10 
    * __label__Labor_Practices :  11 
    * __label__Management_Of_Legal_And_Regulatory_Framework :  12 
    * __label__Physical_Impacts_Of_Climate_Change :  13 
    * __label__Product_Quality_And_Safety :  14 
    * __label__Product_Design_And_Lifecycle_Management :  15 
    * __label__Selling_Practices_And_Product_Labeling :  16 
    * __label__Supply_Chain_Management :  17 
    * __label__Systemic_Risk_Management :  18 
    * __label__Waste_And_Hazardous_Materials_Management :  19 
    * __label__Water_And_Wastewater_Management :  20 
    * __label__Air_Quality :  21 
    * __label__Customer_Privacy :  22 
    * __label__Ecological_Impacts :  23 
    * __label__Energy_Management :  24 
    * __label__GHG_Emissions :  25


In [81]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import math

tokenizer = AutoTokenizer.from_pretrained("nbroad/ESG-BERT")
model = AutoModelForSequenceClassification.from_pretrained("nbroad/ESG-BERT")

### Step 1: Mapping ESG-BERT's topics to S&P ESG Criteria Topics
1. Using ESG-BERT to classify our S&P Global ESG Criteria Topics into its labels
2. We only take the predict label with highest probability.
3. For those labels without any S&P ESG Criteria Topics, we will manually map them ourselves based on relevancy

`labels_to_criteria`
- Key: ESG-BERT's ESG_Labels
- Value: S&P Global ESG Criteria Topics

In [82]:
import pandas as pd
import numpy as np
labels_to_criteria = {}
manual_mapping = []

# Criteria_Topics is a concatenated list of criteria topics from the industries that we are analyzing
with open("Scrapers/criteria_topics.txt", "r") as f:
    criteria_topics = eval(f.read())

for topic in criteria_topics:
    inputs = tokenizer(topic, return_tensors="pt")
    outputs = model(**inputs)
    probs = outputs.logits.softmax(dim=1)
    top_probs, top_labels = torch.topk(probs, k=1)
    label = model.config.id2label[top_labels[0][0].item()]
    prob = top_probs[0][0].item()

    if label not in labels_to_criteria:
        labels_to_criteria[label] = []
    labels_to_criteria[label].append(topic)

# KEY: ESG-BERT's ESG_Labels
# Value: List of S&P Global ESG Criteria Topics
labels_to_criteria

{'Physical_Impacts_Of_Climate_Change': ['Climate Strategy'],
 'Business_Model_Resilience': ['Human Capital Development',
  'Innovation Management'],
 'Human_Rights_And_Community_Relations': ['Human Rights',
  'Environmental Policy & Management Systems',
  'Social Impacts on Communities',
  'Stakeholder Engagement'],
 'Data_Security': ['Information Security Cybersecurity & System Availability',
  'Network Reliability'],
 'Energy_Management': ['Operational Eco-Efficiency',
  'Low Carbon Strategy',
  'Energy Mix',
  'Electricity Generation',
  'Fuel Efficiency'],
 'Product_Design_And_Lifecycle_Management': ['Product Stewardship',
  'Packaging',
  'Sustainable Marketing & Brand Perception',
  'Sustainable Finance',
  'Circular Fashion'],
 'Supply_Chain_Management': ['Supply Chain Management'],
 'Employee_Engagement_Inclusion_And_Diversity': ['Talent Attraction & Retention',
  'Financial Inclusion'],
 'Labor_Practices': ['Labor Practice Indicators'],
 'Systemic_Risk_Management': ['Risk & Cr

For those ESG_labels that are not mapped to any ESG criteria topics, we would map them manually based on relevancy

In [83]:
labels_to_criteria = {
    'Physical_Impacts_Of_Climate_Change':['Climate Strategy'],
    'Business_Model_Resilience': ['Human Capital Development', 'Innovation Management'],
    'Human_Rights_And_Community_Relations': ['Human Rights', 'Environmental Policy & Management Systems', 'Social Impacts on Communities', 'Stakeholder Engagement'],
    'Data_Security': ['Information Security Cybersecurity & System Availability', 'Network Reliability'],
    'Energy_Management': ['Operational Eco-Efficiency', 'Low Carbon Strategy', 'Energy Mix', 'Electricity Generation', 'Fuel Efficiency'],
    'Product_Design_And_Lifecycle_Management': ['Product Stewardship', 'Packaging', 'Sustainable Marketing & Brand Perception', 'Sustainable Finance', 'Circular Fashion'],
    'Supply_Chain_Management': ['Supply Chain Management'],
    'Employee_Engagement_Inclusion_And_Diversity': ['Talent Attraction & Retention', 'Financial Inclusion'],
    'Labor_Practices': ['Labor Practice Indicators'],
    'Systemic_Risk_Management': ['Risk & Crisis Management', 'Decarbonization Strategy'],
    'Product_Quality_And_Safety': ['Product Quality & Recall Management'],
    'Management_Of_Legal_And_Regulatory_Framework': ['Responsibility of Content', 'Compliance with Applicable Export Control Regimes'],
    'Competitive_Behavior': ['Corporate Governance', 'Market Opportunities'],
    'Employee_Health_And_Safety': ['Occupational Health & Safety', 'Health Outcome Contribution'],
    'Access_And_Affordability': ['Access to Healthcare'],
    'Business_Ethics': ['Business Ethics'],
    'Ecological_Impacts': ['Biodiversity', 'Sustainable Agricultural Practices'],
    'Waste_And_Hazardous_Materials_Management': ['Food Loss & Waste'],
    'Water_And_Wastewater_Management': ['Water Related Risks'],
    "Air_Quality": ['Low Carbon Strategy'], # manual mapping
    "Critical_Incident_Risk_Management": ['Risk & Crisis Management'], # manual mapping
    "Customer_Privacy": ['Information Security Cybersecurity & System Availability'], # manual mapping
    "Customer_Welfare": ['Social Impacts on Communities', 'Human Rights', 'Stakeholder Engagement'], # manual mapping
    "Director_Removal": ['Business Ethics'], # manual mapping
    "GHG_Emissions": ['Operational Eco-Efficiency', 'Low Carbon Strategy', 'Decarbonization Strategy'], # manual mapping
    "Selling_Practices_And_Product_Labeling": ['Business Ethics', 'Product Stewardship', 'Packaging', 'Sustainable Marketing & Brand Perception'], # manual mapping
}

### Step 2: Preparing all the mappings collected from webscrapping

Import Industry to Criteria Topics Mapping as `industry_criteria_mapping`
- Key: Industry
- Value: List of most relevant criteria topics

In [84]:
industry_criteria_df = pd.read_csv("Scrapers/Industry Criteria Topics.csv").transpose()
industry_criteria_mapping = {}

for industry, topics in industry_criteria_df.iterrows():
    if industry not in industry_criteria_mapping:
        industry_criteria_mapping[industry] = []
    topics = topics.dropna()
    industry_criteria_mapping[industry].extend(topics.to_list())
industry_criteria_mapping

{'Industry: THQ Computers & Peripherals and Office Electronics': ['Climate Strategy',
  'Human Capital Development',
  'Human Rights',
  'Information Security Cybersecurity & System Availability',
  'Innovation Management',
  'Operational Eco-Efficiency',
  'Product Stewardship',
  'Supply Chain Management'],
 'Industry: SOF Software': ['Climate Strategy',
  'Environmental Policy & Management Systems',
  'Human Capital Development',
  'Information Security Cybersecurity & System Availability',
  'Innovation Management',
  'Operational Eco-Efficiency',
  'Talent Attraction & Retention'],
 'Industry: RTS Retailing': ['Climate Strategy',
  'Human Rights',
  'Labor Practice Indicators',
  'Operational Eco-Efficiency',
  'Packaging',
  'Risk & Crisis Management',
  'Supply Chain Management',
  'Sustainable Marketing & Brand Perception'],
 'Industry: SEM Semiconductors & Semiconductor Equipment': ['Climate Strategy',
  'Environmental Policy & Management Systems',
  'Human Capital Development

Import the company to industry mappings as `company_industry_mapping` dictionary
- key: TICKER
- Value: Industry

In [85]:
with open("Scrapers/company_industry_mapping.txt", "r") as f:
    company_industry_mapping = eval(f.read())

company_industry_mapping

{'AAPL': 'Industry: THQ Computers & Peripherals and Office Electronics',
 'MSFT': 'Industry: SOF Software',
 'ADBE': 'Industry: SOF Software',
 'AMZN': 'Industry: RTS Retailing',
 'HD': 'Industry: RTS Retailing',
 'NVDA': 'Industry: SEM Semiconductors & Semiconductor Equipment',
 'AVGO': 'Industry: SEM Semiconductors & Semiconductor Equipment',
 'TXN': 'Industry: SEM Semiconductors & Semiconductor Equipment',
 'AMD': 'Industry: SEM Semiconductors & Semiconductor Equipment',
 'QCOM': 'Industry: SEM Semiconductors & Semiconductor Equipment',
 'INTC': 'Industry: SEM Semiconductors & Semiconductor Equipment',
 'GOOGL': 'Industry: IMS Interactive Media, Services & Home Entertainment',
 'META': 'Industry: IMS Interactive Media, Services & Home Entertainment',
 'TSLA': 'Industry: AUT Automobiles',
 'UNH': 'Industry: HEA Health Care Providers & Services',
 'XOM': 'Industry: OGX Oil & Gas Upstream & Integrated',
 'CVX': 'Industry: OGX Oil & Gas Upstream & Integrated',
 'JPM': 'Industry: BNK Ban

### Step 3: Run report into ESG-BERT model to classify each sentence

In [86]:
import torch
import math

def run_model(input_file, model):
    counter = 1
    max_seq_length = tokenizer.model_max_length
    # with open(input_file, "r") as f:
    input_df = pd.read_csv(input_file)
    output = {}
    for line in input_df['sentences']:
        if len(line) <= max_seq_length:
            output['Line ' + str(counter)] = {}
            inputs = tokenizer(line, return_tensors="pt")
            outputs = model(**inputs)
            probs = outputs.logits.softmax(dim=1)

            # Extract the top 3 probabilities and labels
            top_probs, top_labels = torch.topk(probs, k=3)
            # store output
            output['Line ' + str(counter)]['Sentence'] = line
            for i in range(3):
                label = model.config.id2label[top_labels[0][i].item()]
                prob = top_probs[0][i].item()
                output['Line ' + str(counter)][f'ESG BERT Topic {i+1}'] = (label, prob)
            counter += 1
        else:
            # split input text into chunks
            text_chunks = []
            for i in range(math.ceil(len(line)/max_seq_length)):
                start = i * max_seq_length
                end = min((i+1)*max_seq_length, len(line))
                text_chunks.append(line[start:end])
            for chunk in text_chunks:
                output['Line ' + str(counter)] = {}
                inputs = tokenizer(chunk, return_tensors="pt", padding= True, truncation=True, max_length=max_seq_length)
                outputs = model(**inputs)
                probs = outputs.logits.softmax(dim=1)
                top_probs, top_labels = torch.topk(probs, k=3)
                
                output['Line ' + str(counter)]['Sentence'] = line
                for i in range(3):
                    label = model.config.id2label[top_labels[0][i].item()]
                    prob = top_probs[0][i].item()
                    output['Line ' + str(counter)][f'ESG BERT Topic {i+1}'] = (label, prob)
                counter += 1
    return output

### Step 4: Mapping output labels to our criteria topics
- We will map the output labels generated by ESG-BERT with the mapping, `mapped_to_criteria`
- We also have decided to classify sentences whose label with the highest probability is less than 0.50 as `Non-ESG`

In [87]:
''' [How it works]
    1. Get the list of most relevant criteria topics for the company based on its industry
    2. Based on the ESG_label with the highest probability, we get the list of potential criteria topics that can be mapped to the target ESG_label
    3. We check the list of potential criteria topics using the list of most relevant criteria topics
    4. If it exists, we just map the label to that criteria topic.
'''
def process_output(result, industry_criteria_mapping, industry):
    df = pd.DataFrame.from_dict(result, orient= 'index')
    # Get the list of criteria topics most relevant to the company based on its industry
    industry_criteria_topics = industry_criteria_mapping[industry]
    def mapperFunction(row, industry_criteria_topics):
        value = row[1]
        if value < 0.5:
            return ("NON-ESG", value)
        else:
            # get the list of potential criteria topics that can be mapped based on the target ESG-label
            mapped_criteria_topics = labels_to_criteria[row[0]]
            result = []
            # for each criteria topic that is relevant to the company
            for topic in mapped_criteria_topics:
                # check if the topic exist in the list of potential criteria topic
                if topic in industry_criteria_topics:
                    # if it exist, then we would just map that ESG-Label to that criteria topic
                    result.append(topic)
            return (result, value)
    df['Mapped Criteria Topic 1'] = df['ESG BERT Topic 1'].apply(lambda x: mapperFunction(x,industry_criteria_topics))
    df['Mapped Criteria Topic 2'] = df['ESG BERT Topic 2'].apply(lambda x: mapperFunction(x,industry_criteria_topics))
    df['Mapped Criteria Topic 3'] = df['ESG BERT Topic 3'].apply(lambda x: mapperFunction(x,industry_criteria_topics))
    return df

### Step 5: Export as CSV

In [88]:
def export_output_to_csv(df, output_filename):
    df.to_csv(output_filename)

### Step 6: Run all reports into the ESG-BERT Model

In [89]:
# sample_input_file = "PDF Text Data/preprocessed/AAPL_2022.csv"
# output = "sample_output.csv"

# output_results = run_model(sample_input_file, model)
# output_df = process_output(output_results, industry_criteria_mapping, industry)


In [90]:
# output_df.iloc[12]

### NOTE: This will take about 2 hours to classify all 50 ESG reports
- For sample output, please run the commented code instead and comment out this chunk.

In [91]:
import os
import pandas as pd

'''
Note: This will take about 2 hours to classify all 25 ESG reports.

For a sample output please run the code chunk below
'''
folder_path = 'pdf_text'
output_path = "output/classified_sentences"

if not os.path.exists(output_path):
    os.makedirs(output_path)

for filename in os.listdir(folder_path):
    print(f"processing {filename}")
    if filename == ".DS_Store":
        continue

    # Get industry mapping
    ticker = filename.split('_')[0]
    industry = company_industry_mapping[ticker]

    input_file = folder_path + '/' + filename
    output_file = output_path + '/' + filename.replace(".csv", "_results.csv")
    output_result = run_model(input_file, model)
    output_df = process_output(output_result, industry_criteria_mapping, industry)
    export_output_to_csv(output_df, output_file)
    print(f"Exported to {output_file}")

######## SAMPLE OUTPUT RUN THIS ########
# filename = "AMZN_2021.csv"
# ticker = filename.split("_")[0]
# industry = company_industry_mapping[ticker]
# input_file = "pdf_text" + "/" + filename
# ouptut_file = "output/classified_sentences/" + filename.replace(".csv", "_results.csv")
# output_result = run_model(input_file, model)
# output_df = process_output(output_result, industry_criteria_mapping, industry)
# export_output_to_csv(output_df, output_file)


processing NEE_2022.csv
Exported to output/classified_sentences/NEE_2022_results.csv
processing NFLX_2021.csv
Exported to output/classified_sentences/NFLX_2021_results.csv
processing LIN_2021.csv
Exported to output/classified_sentences/LIN_2021_results.csv
processing AMGN_2021.csv
Exported to output/classified_sentences/AMGN_2021_results.csv
processing INTC_2022.csv
Exported to output/classified_sentences/INTC_2022_results.csv
processing .DS_Store
processing VZ_2022.csv
Exported to output/classified_sentences/VZ_2022_results.csv
processing ADBE_2021.csv
Exported to output/classified_sentences/ADBE_2021_results.csv
processing CSCO_2022.csv
Exported to output/classified_sentences/CSCO_2022_results.csv
processing TXN_2021.csv
Exported to output/classified_sentences/TXN_2021_results.csv
processing ACN_2022.csv
Exported to output/classified_sentences/ACN_2022_results.csv
processing RTX_2021.csv
Exported to output/classified_sentences/RTX_2021_results.csv
processing BAC_2022.csv
Exported to 

In [57]:
test = pd.read_csv("output/classified_sentences/AAPL_2022_results.csv")
test

Unnamed: 0.1,Unnamed: 0,Sentence,ESG BERT Topic 1,ESG BERT Topic 2,ESG BERT Topic 3,Mapped Criteria Topic 1,Mapped Criteria Topic 2,Mapped Criteria Topic 3
0,Line 1,Apples 2022 ESG ReportEnvironmental Social Gov...,('Employee_Engagement_Inclusion_And_Diversity'...,"('Human_Rights_And_Community_Relations', 0.042...","('Employee_Health_And_Safety', 0.0128874722868...","([], 0.8622750043869019)","('NON-ESG', 0.04276227578520775)","('NON-ESG', 0.012887472286820412)"
1,Line 2,Apples 2022 ESG ReportEnvironmental Social Gov...,"('Human_Rights_And_Community_Relations', 0.562...","('Labor_Practices', 0.11484292894601822)",('Employee_Engagement_Inclusion_And_Diversity'...,"(['Human Rights'], 0.562854528427124)","('NON-ESG', 0.11484292894601822)","('NON-ESG', 0.059670183807611465)"
2,Line 3,Apples 2022 ESG ReportEnvironmental Social Gov...,('Management_Of_Legal_And_Regulatory_Framework...,"('Human_Rights_And_Community_Relations', 0.011...","('Business_Ethics', 0.010208303108811378)","([], 0.8992740511894226)","('NON-ESG', 0.01192495971918106)","('NON-ESG', 0.010208303108811378)"
3,Line 4,Numbers and percentages in this report include...,"('Customer_Privacy', 0.1519385725259781)",('Management_Of_Legal_And_Regulatory_Framework...,('Employee_Engagement_Inclusion_And_Diversity'...,"('NON-ESG', 0.1519385725259781)","('NON-ESG', 0.10689134150743484)","('NON-ESG', 0.10238881409168243)"
4,Line 5,"For more information, see About the report.Con...","('Human_Rights_And_Community_Relations', 0.882...","('Business_Model_Resilience', 0.01581687480211...","('Ecological_Impacts', 0.012996220029890537)","(['Human Rights'], 0.882281482219696)","('NON-ESG', 0.01581687480211258)","('NON-ESG', 0.012996220029890537)"
...,...,...,...,...,...,...,...,...
1558,Line 1559,24 As of our latest alumni survey in December ...,('Employee_Engagement_Inclusion_And_Diversity'...,"('Director_Removal', 0.034573838114738464)","('Data_Security', 0.03089836798608303)","([], 0.6831240057945251)","('NON-ESG', 0.034573838114738464)","('NON-ESG', 0.03089836798608303)"
1559,Line 1560,25 As of the end of December 2021.,('Management_Of_Legal_And_Regulatory_Framework...,"('Director_Removal', 0.11516179889440536)","('Systemic_Risk_Management', 0.10996365547180176)","('NON-ESG', 0.13106544315814972)","('NON-ESG', 0.11516179889440536)","('NON-ESG', 0.10996365547180176)"
1560,Line 1561,( https://www.,('Management_Of_Legal_And_Regulatory_Framework...,"('Competitive_Behavior', 0.10344302654266357)","('Energy_Management', 0.06293976306915283)","('NON-ESG', 0.21969163417816162)","('NON-ESG', 0.10344302654266357)","('NON-ESG', 0.06293976306915283)"
1561,Line 1562,apple.com/newsroom/2021/12/apple-marks-a- year...,('Management_Of_Legal_And_Regulatory_Framework...,"('Energy_Management', 0.005543217528611422)","('Access_And_Affordability', 0.003056949470192...","([], 0.9603014588356018)","('NON-ESG', 0.005543217528611422)","('NON-ESG', 0.003056949470192194)"
