In [1]:
!pip install tensorflow
!pip install pytorch

Collecting tensorflow
  Downloading tensorflow-2.12.0-cp39-cp39-macosx_10_15_x86_64.whl (230.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m230.1/230.1 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting astunparse>=1.6.0
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting numpy<1.24,>=1.22
  Downloading numpy-1.23.5-cp39-cp39-macosx_10_9_x86_64.whl (18.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.1/18.1 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting jax>=0.3.15
  Downloading jax-0.4.7.tar.gz (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting tensorboard<2.13,>=2.12
  Downloading tensorboard-2.12.0-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m 

### ESG-BERT
- Domain Specific BERT Model for Text Mining in Sustainable Investing
- URL: https://huggingface.co/nbroad/ESG-BERT
- This pre-trained model classifies a piece of text and returns a label ID; each ID corresponds to one of the following textual labels:
    * __label__Business_Ethics :  0 
    * __label__Data_Security :  1 
    * __label__Access_And_Affordability :  2 
    * __label__Business_Model_Resilience :  3 
    * __label__Competitive_Behavior :  4 
    * __label__Critical_Incident_Risk_Management :  5 
    * __label__Customer_Welfare :  6 
    * __label__Director_Removal :  7 
    * __label__Employee_Engagement_Inclusion_And_Diversity :  8 
    * __label__Employee_Health_And_Safety :  9 
    * __label__Human_Rights_And_Community_Relations :  10 
    * __label__Labor_Practices :  11 
    * __label__Management_Of_Legal_And_Regulatory_Framework :  12 
    * __label__Physical_Impacts_Of_Climate_Change :  13 
    * __label__Product_Quality_And_Safety :  14 
    * __label__Product_Design_And_Lifecycle_Management :  15 
    * __label__Selling_Practices_And_Product_Labeling :  16 
    * __label__Supply_Chain_Management :  17 
    * __label__Systemic_Risk_Management :  18 
    * __label__Waste_And_Hazardous_Materials_Management :  19 
    * __label__Water_And_Wastewater_Management :  20 
    * __label__Air_Quality :  21 
    * __label__Customer_Privacy :  22 
    * __label__Ecological_Impacts :  23 
    * __label__Energy_Management :  24 
    * __label__GHG_Emissions :  25


In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Hugging Face Hub checkpoint shared by the tokenizer and the classifier. Keeping the
# identifier in one place ensures both are always loaded from the same model.
ESG_BERT_CHECKPOINT = "nbroad/ESG-BERT"

# Tokenizer that turns raw text into the tensors the classifier expects.
tokenizer = AutoTokenizer.from_pretrained(ESG_BERT_CHECKPOINT)

# Sequence-classification model; its config.id2label maps class ids to topic labels.
model = AutoModelForSequenceClassification.from_pretrained(ESG_BERT_CHECKPOINT)

### Step 1: Mapping ESG-BERT's topics to S&P ESG Criteria Topics
1. Map each of ESG-BERT's 26 possible topics to the S&P criteria topic that is most relevant to the "Health Care Equipment and Supplies" industry.
2. For the remaining ESG-BERT topics that cannot be mapped to a "Health Care Equipment and Supplies" industry criteria topic, we use the other criteria topics provided by S&P Global.

In [2]:
mapped_to_criteria = {
    "Business_Ethics": ["Business Ethics"],
    "Data_Security": ["Information Security/Cybersecurity & System Availability"], # self added
    "Access_And_Affordability": ["Health Outcome Contribution"], # self added
    "Business_Model_Resilience": ["Codes of Business Conduct"], # self added
    "Competitive_Behavior": ["Business Ethics"], # self added
    "Critical_Incident_Risk_Management": ["Risk & Crisis Management"], # self added
    "Customer_Welfare": ["Customer Relationship Management"], # self added
    "Director_Removal": ["Anti-Crime Policy & Measures"], # self added
    "Employee_Engagement_Inclusion_And_Diversity": ["Human Capital Development"],
    "Employee_Health_And_Safety": ["Operational Eco-Efficiency"],
    "Human_Rights_And_Community_Relations": ["Community Relations"],
    "Labor_Practices": ["Labor Practice Indicators"], # self added
    "Management_Of_Legal_And_Regulatory_Framework": ["Corporate Governance"], # self added
    "Physical_Impacts_Of_Climate_Change": ["Climate Change"],
    "Product_Quality_And_Safety": ["Product Quality & Recall Management"],
    "Product_Design_And_Lifecycle_Management": ["Product Stewardship"], # self added
    "Selling_Practices_And_Product_Labeling": ["Marketing Practices"], # self added
    "Supply_Chain_Management": ["Supply Chain Management"],
    "Systemic_Risk_Management": ["Risk & Crisis Management"], # self added
    "Waste_And_Hazardous_Materials_Management": ["Operational Eco-Efficiency"],
    "Water_And_Wastewater_Management": ["Operational Eco-Efficiency"],
    "Air_Quality": ["Operational Eco-Efficiency"],
    "Customer_Privacy": ["Privacy Protection"], # self added
    "Ecological_Impacts": ["Natural Capital"],
    "Energy_Management": ["Operational Eco-Efficiency"],
    "GHG_Emissions": ["Climate Change"]
}

### Step 2: Run each report through the pre-trained ESG-BERT model to classify each sentence

In [7]:
import torch
import math

def _top_esg_topics(text, model, k=3):
    """Classify ``text`` and return its ``k`` most probable topics.

    Uses the module-level ``tokenizer`` to encode the text.

    Args:
        text: The string to classify.
        model: Sequence-classification model whose ``config.id2label`` maps
            class ids to label names.
        k: Number of top-ranked labels to return.

    Returns:
        A list of ``(label, probability)`` tuples, most probable first.
    """
    # truncation=True caps the encoded input at the tokenizer's maximum length, so
    # text that is short in characters but long in tokens cannot overflow the model.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = logits.softmax(dim=1)
    top_probs, top_labels = torch.topk(probs, k=k)
    return [
        (model.config.id2label[label_id.item()], prob.item())
        for prob, label_id in zip(top_probs[0], top_labels[0])
    ]


def run_model(input_file, model):
    """Classify every line of a text file with ESG-BERT.

    Each line of ``input_file`` is classified on its own. A line longer than
    ``tokenizer.model_max_length`` characters is split into consecutive chunks
    of at most that many characters, and each chunk is classified separately.

    Args:
        input_file: Path of the text file to read.
        model: Sequence-classification model used for the predictions.

    Returns:
        A dict keyed ``'Line 1'``, ``'Line 2'``, ... (one entry per classified
        line or chunk, numbered consecutively). Each value is a dict with the
        classified text under ``'Sentence'`` and ``(label, probability)``
        tuples under ``'ESG BERT Topic 1'`` .. ``'ESG BERT Topic 3'``.
    """
    # The limit is expressed in tokens but applied to characters here; the helper's
    # truncation guards any chunk that still tokenizes to more than the limit.
    max_seq_length = tokenizer.model_max_length
    output = {}
    counter = 1
    with open(input_file, "r") as f:
        for line in f:
            # A line within the limit yields exactly one chunk: the line itself.
            chunks = [
                line[start:start + max_seq_length]
                for start in range(0, len(line), max_seq_length)
            ]
            for chunk in chunks:
                # Record the text that was actually classified, i.e. the chunk,
                # not the full line it was cut from.
                entry = {'Sentence': chunk}
                for rank, topic in enumerate(_top_esg_topics(chunk, model), start=1):
                    entry[f'ESG BERT Topic {rank}'] = topic
                output['Line ' + str(counter)] = entry
                counter += 1
    return output

In [8]:
def process_output(result, threshold=0.5, mapping=None):
    """Turn classification results into a DataFrame with mapped criteria topics.

    For every ``'ESG BERT Topic N'`` column (N = 1, 2, ...) a matching
    ``'Mapped Criteria Topic N'`` column is added.

    Note that the threshold is applied to each ranked topic independently.
    When the probabilities of one row come from a single softmax, ranks 2 and
    above can only reach 0.5 in an exact tie, so with the default threshold
    they are almost always labelled ``"NON-ESG"``.

    Args:
        result: Dict of ``{row_label: {column: value}}``; topic columns hold
            ``(label, probability)`` tuples.
        threshold: Minimum probability for a topic to be mapped; anything
            below it becomes ``"NON-ESG"``.
        mapping: Dict from ESG-BERT label to criteria topics. Defaults to the
            module-level ``mapped_to_criteria``.

    Returns:
        A DataFrame indexed by the keys of ``result``. An empty ``result``
        gives an empty DataFrame.

    Raises:
        KeyError: If a label at or above the threshold is not in the mapping.
    """
    df = pd.DataFrame.from_dict(result, orient='index')

    def map_topic(topic):
        label, prob = topic
        if prob < threshold:
            return ("NON-ESG", prob)
        # Resolve the default lazily so the global is only needed when used.
        lookup = mapped_to_criteria if mapping is None else mapping
        return (lookup[label], prob)

    # Walk the ranked topic columns in order; an empty result has none, so the
    # loop is skipped instead of raising KeyError.
    rank = 1
    while f'ESG BERT Topic {rank}' in df.columns:
        df[f'Mapped Criteria Topic {rank}'] = df[f'ESG BERT Topic {rank}'].apply(map_topic)
        rank += 1
    return df

In [9]:
def export_output_to_csv(df, output_filename):
    """Write ``df`` to ``output_filename`` as CSV using pandas' default settings.

    The frame's index is written as the first, unlabelled column.
    """
    df.to_csv(path_or_buf=output_filename)

In [15]:
import os
import pandas as pd

# Root folder holding one sub-folder of report files per company.
folder_path = 'esg_reports'

# sorted() makes the processing order reproducible; os.listdir order is arbitrary.
for foldername in sorted(os.listdir(folder_path)):
    company_folder_path = os.path.join(folder_path, foldername)

    # Skip hidden entries (.DS_Store, .ipynb_checkpoints, ...) and stray files
    # sitting next to the company folders.
    if foldername.startswith('.') or not os.path.isdir(company_folder_path):
        continue

    # Results mirror the input tree under data/; create the folder once per company.
    output_folder_path = os.path.join('data', company_folder_path)
    os.makedirs(output_folder_path, exist_ok=True)

    for filename in sorted(os.listdir(company_folder_path)):
        input_file = os.path.join(company_folder_path, filename)
        if filename.startswith('.') or not os.path.isfile(input_file):
            continue

        # Swap only the file's own extension, so a '.txt' occurring inside a
        # directory name can never be rewritten.
        output_file = os.path.join(output_folder_path, os.path.splitext(filename)[0] + '.csv')
        output_result = run_model(input_file, model)
        output_df = process_output(output_result)
        export_output_to_csv(output_df, output_file)

### Step 3: Mapping output labels to our criteria topics
- We map the output labels generated by ESG-BERT to our criteria topics using the mapping `mapped_to_criteria`.
- Any predicted topic whose probability is below 0.50 is labelled `NON-ESG`, so a sentence whose highest-probability label is below 0.50 is treated as non-ESG.

In [18]:
# Load one processed report back from disk and display it.
# NOTE(review): the processing loop names its output files with a `.csv` extension,
# yet this reads a `.txt` path under data/ -- confirm which file is intended.
# The displayed frame has `Unnamed: 0` and `Unnamed: 0.1` columns, which suggests
# the file was saved with its index more than once -- verify.
a = pd.read_csv("data/esg_reports/ABT_Abbott/ABT_2018.txt")
a

Unnamed: 0.1,Unnamed: 0,Sentence,ESG BERT Topic 1,ESG BERT Topic 2,ESG BERT Topic 3,Mapped Criteria Topic 1,Mapped Criteria Topic 2,Mapped Criteria Topic 3
0,Line 1,GLOBAL SUSTAINABILITY REPORT 2018CHANGING ...,"('Business_Model_Resilience', 0.3318382799625397)","('Product_Design_And_Lifecycle_Management', 0....","('Supply_Chain_Management', 0.0887330025434494)","('NON-ESG', 0.3318382799625397)","('NON-ESG', 0.28791943192481995)","('NON-ESG', 0.0887330025434494)"
1,Line 2,Our Business .\n,"('Business_Model_Resilience', 0.28659284114837...","('Customer_Privacy', 0.1499490737915039)","('Director_Removal', 0.07373366504907608)","('NON-ESG', 0.28659284114837646)","('NON-ESG', 0.1499490737915039)","('NON-ESG', 0.07373366504907608)"
2,Line 3,Corporate Governance .\n,"('Competitive_Behavior', 0.2577420771121979)","('Systemic_Risk_Management', 0.16401955485343933)","('Business_Ethics', 0.09896515309810638)","('NON-ESG', 0.2577420771121979)","('NON-ESG', 0.16401955485343933)","('NON-ESG', 0.09896515309810638)"
3,Line 4,Sustainability at Abbott .\n,"('Product_Design_And_Lifecycle_Management', 0....","('Supply_Chain_Management', 0.08550751954317093)","('Business_Model_Resilience', 0.06462757289409...","('NON-ESG', 0.4124208986759186)","('NON-ESG', 0.08550751954317093)","('NON-ESG', 0.06462757289409637)"
4,Line 5,Supporting the Sustainable Development Goals .\n,"('Human_Rights_And_Community_Relations', 0.603...","('Ecological_Impacts', 0.07986337691545486)","('Business_Model_Resilience', 0.05616339296102...","(['Community Relations'], 0.6033405065536499)","('NON-ESG', 0.07986337691545486)","('NON-ESG', 0.05616339296102524)"
...,...,...,...,...,...,...,...,...
1246,Line 1247,Presented at the American Diabetes Associati...,"('Access_And_Affordability', 0.15332642197608948)","('Employee_Health_And_Safety', 0.1416211277246...","('Customer_Welfare', 0.10249901562929153)","('NON-ESG', 0.15332642197608948)","('NON-ESG', 0.14162112772464752)","('NON-ESG', 0.10249901562929153)"
1247,Line 1248,https://plan.\n,('Management_Of_Legal_And_Regulatory_Framework...,"('Business_Model_Resilience', 0.09156531095504...","('Energy_Management', 0.08322495222091675)","('NON-ESG', 0.1623869389295578)","('NON-ESG', 0.09156531095504761)","('NON-ESG', 0.08322495222091675)"
1248,Line 1249,core-apps.com/tristar_ada18/abstract/518844674...,"('Data_Security', 0.20764286816120148)",('Management_Of_Legal_And_Regulatory_Framework...,"('Competitive_Behavior', 0.09675122052431107)","('NON-ESG', 0.20764286816120148)","('NON-ESG', 0.17650355398654938)","('NON-ESG', 0.09675122052431107)"
1249,Line 1250,Data on file: Abbott Diabetes CareABOUT THE CO...,"('Business_Model_Resilience', 0.21484653651714...","('Product_Design_And_Lifecycle_Management', 0....","('Supply_Chain_Management', 0.06737373024225235)","('NON-ESG', 0.21484653651714325)","('NON-ESG', 0.2018822878599167)","('NON-ESG', 0.06737373024225235)"
