In [1]:
!pip install tensorflow
!pip install pytorch

Collecting pytorch
  Using cached pytorch-1.0.2.tar.gz (689 bytes)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: pytorch
  Building wheel for pytorch (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[6 lines of output][0m
  [31m   [0m Traceback (most recent call last):
  [31m   [0m   File "<string>", line 2, in <module>
  [31m   [0m   File "<pip-setuptools-caller>", line 34, in <module>
  [31m   [0m   File "/private/var/folders/27/xh3tm0h964n5gjzv1brx5vrc0000gn/T/pip-install-juw2wdqd/pytorch_c87e77a8236e4432af4e47b81150a314/setup.py", line 15, in <module>
  [31m   [0m     raise Exception(message)
  [31m   [0m Exception: You tried to install "pytorch". The package named for PyTorch is "torch"
  [31m   [0m [31m[end of output][0m
  
  [1;35mnote[0m: This e

### ESG-BERT
- Domain Specific BERT Model for Text Mining in Sustainable Investing
- URL: https://huggingface.co/nbroad/ESG-BERT
- This pre-trained model is able to classify each text and returns a label number which correlates to a textual label: 
    * __label__Business_Ethics :  0 
    * __label__Data_Security :  1 
    * __label__Access_And_Affordability :  2 
    * __label__Business_Model_Resilience :  3 
    * __label__Competitive_Behavior :  4 
    * __label__Critical_Incident_Risk_Management :  5 
    * __label__Customer_Welfare :  6 
    * __label__Director_Removal :  7 
    * __label__Employee_Engagement_Inclusion_And_Diversity :  8 
    * __label__Employee_Health_And_Safety :  9 
    * __label__Human_Rights_And_Community_Relations :  10 
    * __label__Labor_Practices :  11 
    * __label__Management_Of_Legal_And_Regulatory_Framework :  12 
    * __label__Physical_Impacts_Of_Climate_Change :  13 
    * __label__Product_Quality_And_Safety :  14 
    * __label__Product_Design_And_Lifecycle_Management :  15 
    * __label__Selling_Practices_And_Product_Labeling :  16 
    * __label__Supply_Chain_Management :  17 
    * __label__Systemic_Risk_Management :  18 
    * __label__Waste_And_Hazardous_Materials_Management :  19 
    * __label__Water_And_Wastewater_Management :  20 
    * __label__Air_Quality :  21 
    * __label__Customer_Privacy :  22 
    * __label__Ecological_Impacts :  23 
    * __label__Energy_Management :  24 
    * __label__GHG_Emissions :  25


In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("nbroad/ESG-BERT")
model = AutoModelForSequenceClassification.from_pretrained("nbroad/ESG-BERT")

### Step 1: Mapping ESG-BERT's topics to S&P ESG Criteria Topics
1. Map each criteria topic that is most relevant to the "Health Care Equipments and Supplies" industry to the 26 possible topics
2. The remaining topics that cannot be mapped with the "Health Care Equipments and Supplies" Industry criteria topics, we will use the other criteria topics provided by S&P Global

In [5]:
mapped_to_criteria = {
    "Business_Ethics": ["Business Ethics"],
    "Data_Security": ["Information Security/Cybersecurity & System Availability"], # self added
    "Access_And_Affordability": ["Health Outcome Contribution"], # self added
    "Business_Model_Resilience": ["Codes of Business Conduct"], # self added
    "Competitive_Behavior": ["Business Ethics"], # self added
    "Critical_Incident_Risk_Management": ["Risk & Crisis Management"], # self added
    "Customer_Welfare": ["Customer Relationship Management"], # self added
    "Director_Removal": ["Anti-Crime Policy & Measures"], # self added
    "Employee_Engagement_Inclusion_And_Diversity": ["Human Capital Development"],
    "Employee_Health_And_Safety": ["Operational Eco-Efficiency"],
    "Human_Rights_And_Community_Relations": ["Community Relations"], # self added
    "Labor_Practices": ["Labor Practice Indicators"], # self added
    "Management_Of_Legal_And_Regulatory_Framework": ["Corporate Governance"], # self added
    "Physical_Impacts_Of_Climate_Change": ["Climate Change"],
    "Product_Quality_And_Safety": ["Product Quality & Recall Management"],
    "Product_Design_And_Lifecycle_Management": ["Product Stewardship"], # self added
    "Selling_Practices_And_Product_Labeling": ["Marketing Practices"], # self added
    "Supply_Chain_Management": ["Supply Chain Management"],
    "Systemic_Risk_Management": ["Risk & Crisis Management"], # self added
    "Waste_And_Hazardous_Materials_Management": ["Operational Eco-Efficiency"],
    "Water_And_Wastewater_Management": ["Operational Eco-Efficiency"],
    "Air_Quality": ["Operational Eco-Efficiency"],
    "Customer_Privacy": ["Privacy Protection"], # self added
    "Ecological_Impacts": ["Natural Capital"],
    "Energy_Management": ["Operational Eco-Efficiency"],
    "GHG_Emissions": ["Climate Change"]
}

### Step 2: Run report into ESG-BERT model to classify each sentence

In [6]:
import torch
import math

def run_model(input_file, model):
    counter = 1
    max_seq_length = tokenizer.model_max_length
    with open(input_file, "r") as f:
        output = {}
        for line in f:
            if len(line) <= max_seq_length:
                output['Line ' + str(counter)] = {}
                inputs = tokenizer(line, return_tensors="pt")
                outputs = model(**inputs)
                probs = outputs.logits.softmax(dim=1)

                # Extract the top 3 probabilities and labels
                top_probs, top_labels = torch.topk(probs, k=3)
                # store output
                output['Line ' + str(counter)]['Sentence'] = line
                for i in range(3):
                    label = model.config.id2label[top_labels[0][i].item()]
                    prob = top_probs[0][i].item()
                    output['Line ' + str(counter)][f'ESG BERT Topic {i+1}'] = (label, prob)
                    
                counter += 1
            else:
                # split input text into chunks
                text_chunks = []
                for i in range(math.ceil(len(line)/max_seq_length)):
                    start = i * max_seq_length
                    end = min((i+1)*max_seq_length, len(line))
                    text_chunks.append(line[start:end])
                for chunk in text_chunks:
                    output['Line ' + str(counter)] = {}
                    inputs = tokenizer(chunk, return_tensors="pt", padding= True, truncation=True, max_length=max_seq_length)
                    outputs = model(**inputs)
                    probs = outputs.logits.softmax(dim=1)
                    top_probs, top_labels = torch.topk(probs, k=3)
                    
                    output['Line ' + str(counter)]['Sentence'] = line
                    for i in range(3):
                        label = model.config.id2label[top_labels[0][i].item()]
                        prob = top_probs[0][i].item()
                        output['Line ' + str(counter)][f'ESG BERT Topic {i+1}'] = (label, prob)
                    counter += 1
    return output

### Step 3: Mapping output labels to our criteria topics
- We will map the output labels generated by ESG-BERT with the mapping, `mapped_to_criteria`
- We also have decided to classify sentences whose label with the highest probability is less than 0.50 as `Non-ESG`

In [7]:
def process_output(result):
    df = pd.DataFrame.from_dict(result, orient= 'index')
    def mapperFunction(row):
        value = row[1]
        if value < 0.5:
            return ("NON-ESG", value)
        else:
            return (mapped_to_criteria[row[0]], value)
    df['Mapped Criteria Topic 1'] = df['ESG BERT Topic 1'].apply(lambda x: mapperFunction(x))
    df['Mapped Criteria Topic 2'] = df['ESG BERT Topic 2'].apply(lambda x: mapperFunction(x))
    df['Mapped Criteria Topic 3'] = df['ESG BERT Topic 3'].apply(lambda x: mapperFunction(x))
    return df

### Step 4: Export as CSV

In [8]:
def export_output_to_csv(df, output_filename):
    df.to_csv(output_filename)

### Step 5: Run all reports into the ESG-BERT Model

In [9]:
import os
import pandas as pd

folder_path = 'esg_reports'

for foldername in os.listdir(folder_path):
    company_folder_path = folder_path + '/' + foldername

    if foldername == ".DS_Store":
        continue

    for filename in os.listdir(company_folder_path):
        if filename == ".DS_Store":
            continue
        if not os.path.exists('data/' + company_folder_path):
            os.makedirs('data/' + company_folder_path)

        input_file = company_folder_path + '/' + filename
        output_file = 'data/' + input_file.replace('.txt','.csv')
        output_result = run_model(input_file, model)
        output_df = process_output(output_result)
        export_output_to_csv(output_df, output_file)

In [7]:
import pandas as pd


In [8]:
df = pd.read_csv("data/esg_reports/ABT_Abbott/ABT_2021.txt")
df = df[['Unnamed: 0', 'Sentence']]
df = df.rename(columns={"Unnamed: 0" : "Line"})
df.to_csv("Result")

In [15]:
pd.set_option('display.width', 5)
df

Unnamed: 0,Line,Sentence
0,Line 1,"GLOBAL SUSTAINABILITY REPORT 2021For Abbott,..."
1,Line 2,"In this report, we detail our progress agains..."
2,Line 3,The data presented reflect 2021 performance u...
3,Line 4,We have aligned our reporting with the requir...
4,Line 5,We have aligned our reporting with the requir...
...,...,...
2019,Line 2020,"The goal of our three-year, $5 million effort..."
2020,Line 2021,The pilot seeks to better understand and addr...
2021,Line 2022,"Focusing initial efforts in Columbus, Ohio, we..."
2022,Line 2023,In collaboration with the National Center for ...
