In [None]:
# Acknowledge
# This is a Group project with ZhiWei Zhan, Boyangzhang, Jiayou Qu. The pulished code may have very similar content with their published code.
# Some codes are inspired by open source project and ChatGPT
# The FIN-BERT using in the code are from https://huggingface.co/yiyanghkust/finbert-esg

# This step 2 code has done:
# 1, Generate sentence embeddings for the sentences of 10K reports and aggregate them to report representations

In [1]:
# import necessary packages
import glob
import pandas as pd
import os
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import torch.nn.functional as F
import concurrent.futures

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load 10k rpeort data from step1
folder_path = 'D:/UCL/workspace/final/test'
csv_files = glob.glob(os.path.join(folder_path, '*.csv'))

In [3]:
print(csv_files)

['D:/UCL/workspace/final/test\\ACN-2010.csv', 'D:/UCL/workspace/final/test\\ACN-2011.csv', 'D:/UCL/workspace/final/test\\ACN-2012.csv', 'D:/UCL/workspace/final/test\\ACN-2013.csv', 'D:/UCL/workspace/final/test\\ACN-2015.csv', 'D:/UCL/workspace/final/test\\ACN-2016.csv', 'D:/UCL/workspace/final/test\\ACN-2017.csv', 'D:/UCL/workspace/final/test\\ACN-2018.csv', 'D:/UCL/workspace/final/test\\ACN-2019.csv', 'D:/UCL/workspace/final/test\\ACN-2020.csv', 'D:/UCL/workspace/final/test\\ACN-2021.csv', 'D:/UCL/workspace/final/test\\ACN-2022.csv', 'D:/UCL/workspace/final/test\\ACN-2023.csv', 'D:/UCL/workspace/final/test\\ADBE-2009.csv', 'D:/UCL/workspace/final/test\\ADBE-2010.csv', 'D:/UCL/workspace/final/test\\ADBE-2011.csv', 'D:/UCL/workspace/final/test\\ADBE-2012.csv', 'D:/UCL/workspace/final/test\\ADBE-2013.csv', 'D:/UCL/workspace/final/test\\ADBE-2014.csv', 'D:/UCL/workspace/final/test\\ADBE-2015.csv', 'D:/UCL/workspace/final/test\\ADBE-2016.csv', 'D:/UCL/workspace/final/test\\ADBE-2017.csv', 

In [4]:
def split_long_sentence(sentence, tokenizer, max_length=512):
    tokens = tokenizer.tokenize(sentence)
    if len(tokens) <= max_length:
        return [sentence]
    
    chunks = []
    for i in range(0, len(tokens), max_length):
        chunk_tokens = tokens[i:i + max_length]
        chunk_sentence = tokenizer.convert_tokens_to_string(chunk_tokens)
        chunks.append(chunk_sentence)
    
    return chunks

In [5]:
def loadFinbertESG(df, tokenizer, model, labels, device, batch_size=32):
    all_sen = []
    for i, row in df.iterrows():
        print(i)
        sen = row['Sentences']
        all_sen.extend(split_long_sentence(sen, tokenizer, 512))
    
    results_list = []
    for i in range(0, len(all_sen), batch_size):
        print(i)
        batch = all_sen[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', truncation=True, max_length=512, padding='max_length')
        inputs = {key: val.to(device) for key, val in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            probs = F.softmax(outputs.logits, dim=-1)
        results_list.append(probs.cpu())

    all_probs = torch.cat(results_list, dim=0)

    idx = 0
    for i, row in df.iterrows():
        sen_len = len(split_long_sentence(row['Sentences'], tokenizer, 512))
        avg_probs = torch.mean(all_probs[idx:idx + sen_len], dim=0)
        results = {label: prob.item() for label, prob in zip(labels, avg_probs)}
        df.loc[i, 'Env'] = results['Environmental']
        df.loc[i, 'Soc'] = results['Social']
        df.loc[i, 'Gov'] = results['Governance']
        df.loc[i, 'None'] = results['None']
        idx += sen_len
    return df

In [6]:
def process_file(file, tokenizer, model, labels, device):
    df = pd.read_csv(file)
    df = df.rename(columns={'0': 'Sentences'})
    if len(df) > 50:
        print(f"{file}: {len(df)}")
        df = loadFinbertESG(df, tokenizer, model, labels, device)
        processed_file_path = os.path.join(str(folder_path)+"/result", os.path.basename(file))
        print("I am writing to " + str(processed_file_path))
        df.to_csv(processed_file_path, index=False)

In [7]:
def main():
    # put it on GPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    #introduce Fin-BERT
    finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-esg', num_labels=4)
    finbert.to(device)
    tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-esg')
    labels = ['None', 'Environmental', 'Social', 'Governance']
    print("I am coming")

    #Let FinBERT give representation on every sentence
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        futures = [executor.submit(process_file, file, tokenizer, finbert, labels, device) for file in csv_files]
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as e:
                print(f"An error occurred: {e}")

if __name__ == "__main__":
    main()

I am coming
D:/UCL/workspace/final/test\ACN-2010.csv: 1190
0
32
64
96
128
160
192
224
256
288
320
352
384
416
448
480
512
544
576
608
640
672
704
736
768
800
832
864
896
928
960
992
1024
1056
1088
1120
1152
1184
I am writing to D:/UCL/workspace/final/test/result\ACN-2010.csv
D:/UCL/workspace/final/test\ACN-2011.csv: 1102
0
32
64
96
128
160
192
224
256
288
320
352
384
416
448
480
512
544
576
608
640
672
704
736
768
800
832
864
896
928
960
992
1024
1056
1088
I am writing to D:/UCL/workspace/final/test/result\ACN-2011.csv
D:/UCL/workspace/final/test\ACN-2012.csv: 684
0
32
64
96
128
160
192
224
256
288
320
352
384
416
448
480
512
544
576
608
640
672
I am writing to D:/UCL/workspace/final/test/result\ACN-2012.csv
D:/UCL/workspace/final/test\ACN-2013.csv: 711
0
32
64
96
128
160
192
224
256
288
320
352
384
416
448
480
512
544
576
608
640
672
704
I am writing to D:/UCL/workspace/final/test/result\ACN-2013.csv
D:/UCL/workspace/final/test\ACN-2015.csv: 599
0
32
64
96
128
160
192
224
256
288
320
