In [None]:
# Acknowledgements
# This is a group project with ZhiWei Zhan, Boyangzhang, and Jiayou Qu. The published code may have very similar content to their published code.
# Some of the code is inspired by open-source projects and ChatGPT.
# The RNN, LSTM, and GRU models are referenced from the official PyTorch website: https://pytorch.org/

# This step 3 code does the following:
# 1. trains the mapping model with the step 3 data and makes predictions
# 2. tests the mapping model

In [1]:
#import package
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils import data as ds
from model import RNN,LSTM,GRU
import torch.nn.functional as F
import ast
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
#regression dataset
class MyDataset1(ds.Dataset):
    """Map-style dataset pairing each sample with one regression target.

    ``X`` and ``Y`` are stored exactly as given and indexed in parallel.
    Conversion to tensors happens per item inside ``__getitem__``.
    """

    def __init__(self, X, Y):
        self.samples, self.labels = X, Y

    def __len__(self):
        # The dataset size is taken from the samples container alone.
        return len(self.samples)

    def __getitem__(self, index):
        """Return ``(sample, label)`` for ``index`` as two float tensors."""
        features = torch.Tensor(self.samples[index]).float()
        # The target is wrapped in a one-element list before conversion, so the
        # label tensor gains one leading axis of size 1 on top of whatever
        # shape ``self.labels[index]`` already has.
        target = torch.Tensor([self.labels[index]]).float()
        return features, target

In [3]:
# Load the step 3 report-representation data, drop rows with missing values,
# order the rows by 'Year', and keep only rows whose 'BloombergOverall' is non-zero.
report_repre = pd.read_csv('D:/UCL/workspace/final/ESGOverallData.csv').dropna()
sorted_df = report_repre.sort_values(by='Year')
filtered_df = sorted_df.loc[sorted_df['BloombergOverall'] != 0]
report_repre = filtered_df

In [7]:
# Display the filtered, year-sorted table (the rendered output elides the middle rows).
report_repre

Unnamed: 0.1,Unnamed: 0,CompanyTicker,Year,CSV.Length,ESGData,BloombergOverall
285,285,GEN,2015,438,"[[0.0055219298228621, 0.0078070522285997, 0.00...",3.31
488,488,QLYS,2015,262,"[[0.0129457972943782, 0.0162092745304107, 0.01...",1.70
191,191,EFX,2015,1044,"[[0.0027446697931736, 0.0058759758248925, 0.00...",1.66
57,57,ANSS,2015,837,"[[0.0023921008687466, 0.0108177633956074, 0.00...",1.48
496,496,QTWO,2015,1245,"[[0.0087282583117485, 0.0131738986819982, 0.00...",1.33
...,...,...,...,...,...,...
306,306,HUBS,2022,1297,"[[0.0087282583117485, 0.0131738986819982, 0.00...",3.66
291,291,GEN,2022,785,"[[0.0055219298228621, 0.0078070522285997, 0.00...",4.69
474,474,PRO,2022,273,"[[0.0129457972943782, 0.0162092745304107, 0.01...",2.97
464,464,PCTY,2022,789,"[[0.0032370127737522, 0.0083294827491045, 0.00...",2.21


In [8]:
def generate_data(df):
    """Turn the report table into model inputs and min-max scaled targets.

    Each "ESGData" cell is parsed from its string form with
    ``ast.literal_eval``; each "BloombergOverall" cell is cast to ``float``.
    The targets are then rescaled by a ``MinMaxScaler`` fitted on all of them.
    The scaler itself is local to this function and is not returned.

    Returns:
        (x_train, y_train): the list of parsed "ESGData" values, and the
        scaled targets as an array reshaped to one column (``(-1, 1)``).
    """
    x_train = [ast.literal_eval(cell) for cell in df["ESGData"]]
    raw_targets = [float(score) for score in df["BloombergOverall"]]
    target_column = np.array(raw_targets).reshape(-1, 1)
    y_train = MinMaxScaler().fit_transform(target_column)
    return x_train, y_train
# Build the model inputs (x) and scaled targets (y) from the filtered table.
x, y = generate_data(df=report_repre)

In [10]:
# add padding for too short samples and trim too long samples
def pad_time_series(data, target_length=None, padding_value=0.0):
    """Bring every sample in ``data`` to exactly ``target_length`` rows.

    * Samples with fewer rows are extended at the end with rows filled with
      ``padding_value``.
    * Samples with more rows keep only the ``target_length`` rows whose value
      at index 3 is smallest; the kept rows stay in their original order.
    * Samples already at the target length are passed through.

    The input is not modified: each returned sample is a new outer list (the
    row objects of the original samples are shared, not copied).

    Args:
        data: Sequence of samples; each sample is a list of rows and each row
            is a list of values.
        target_length: Number of rows every output sample must have. When
            ``None``, the length of the longest sample is used (0 for empty
            ``data``).
        padding_value: Value used to fill every cell of an added row.

    Returns:
        A list with one entry per input sample, each ``target_length`` rows long.
    """
    score_index = 3  # column whose smallest values decide which rows survive trimming

    if target_length is None:
        # ``default=0`` keeps empty input from raising inside max().
        target_length = max((len(sample) for sample in data), default=0)

    # Padding rows must be as wide as the real rows. Take the width from the
    # first available row and fall back to 4 only when no row exists at all.
    row_width = next((len(sample[0]) for sample in data if len(sample) > 0), 4)

    final_result = []
    for sample in data:
        missing = target_length - len(sample)
        if missing > 0:
            # Build a new list instead of appending to ``sample`` so the
            # caller's data is left untouched.
            padding = [[padding_value] * row_width for _ in range(missing)]
            final_result.append(list(sample) + padding)
        elif missing < 0:
            # Rank row positions by their score (stable sort), keep the lowest
            # ``target_length`` of them, then restore chronological order.
            ranked = sorted(range(len(sample)), key=lambda i: sample[i][score_index])
            keep = sorted(ranked[:target_length])
            final_result.append([sample[i] for i in keep])
        else:
            final_result.append(list(sample))

    return final_result
            
# Force every sample to 500 rows; rows added as padding are filled with 0.25.
x = pad_time_series(x, target_length=500, padding_value=0.25)
len(x[0])

500

In [11]:
#training loop
from scipy.stats import pearsonr
from scipy.stats import spearmanr
def loop(model,train_loader,test_loader,optimizer,criterion,device,E=20):
    """Train ``model`` for ``E`` epochs and evaluate it after every epoch.

    Each epoch prints the epoch index, the mean training loss over its
    batches, and then the mean test loss together with the Pearson and
    Spearman correlations between predictions and targets on ``test_loader``.

    Args:
        model: ``torch.nn.Module`` already placed on ``device``.
        train_loader: Iterable of ``(samples, labels)`` batches for training.
        test_loader: Iterable of ``(samples, labels)`` batches for evaluation;
            any batch size is accepted.
        optimizer: Optimizer bound to ``model``'s parameters.
        criterion: Callable ``criterion(outputs, targets)`` returning a scalar loss.
        device: Device the batches are moved to.
        E: Number of epochs.

    Returns:
        ``(result, loss_epoch, model)`` where ``result`` is the flat list of
        predictions on ``test_loader`` from the last epoch, ``loss_epoch`` is
        the list of per-batch training losses of the last epoch (not a
        per-epoch history), and ``model`` is the trained model. Both lists are
        empty when ``E`` is 0.
    """
    # Defined up front so that E == 0 returns empty lists instead of raising.
    result = []
    loss_epoch = []

    for epoch in range(E):
        print(epoch)

        # ---- training ----
        model.train()
        loss_epoch = []
        for samples, labels in train_loader:
            samples = samples.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(samples)
            # Targets may carry extra singleton axes (e.g. (B, 1, 1) against
            # outputs of (B, 1)). Aligning them to the output shape stops the
            # criterion from silently broadcasting the two against each other.
            loss = criterion(outputs, labels.reshape(outputs.shape))
            loss.backward()
            optimizer.step()
            loss_epoch.append(loss.item())

        train_loss = sum(loss_epoch) / len(loss_epoch) if loss_epoch else float('nan')
        print(train_loss)

        # ---- evaluation after every epoch ----
        model.eval()
        result = []
        true_a = []
        test_loss = 0
        with torch.no_grad():  # no autograd graph is needed for evaluation
            for samples, true in test_loader:
                samples = samples.to(device)
                true = true.to(device)
                outputs = model(samples)
                test_loss += criterion(outputs, true.reshape(outputs.shape)).item()
                # Flatten so batches of any size contribute one float per prediction.
                result.extend(outputs.reshape(-1).cpu().tolist())
                true_a.extend(true.reshape(-1).cpu().tolist())

        test_loss /= len(test_loader)
        ic, _ = pearsonr(result, true_a)
        rank_ic, _ = spearmanr(result, true_a)
        print('test loss:', test_loss, 'pearson:', ic, 'spearman:', rank_ic)

    return result,loss_epoch,model

In [13]:
# split the train and test set
# A fixed random_state makes the shuffled split, and therefore every metric
# computed from it, reproducible across kernel restarts.
SPLIT_SEED = 42
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.2, random_state=SPLIT_SEED)

In [23]:
#parameters
batch_size = 64        # mini-batch size used for the training DataLoader
learning_rate = 0.01   # learning rate handed to the Adam optimizer
epoch = 50             # number of epochs passed to loop() as E
hidden_num = 256       # second argument forwarded to the model constructor
output_dim = 1         # third argument forwarded to the model constructor; original note: if None put ''
# Prefer the GPU when CUDA is available, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [24]:
#make dataset
train_dataset = MyDataset1(train_x,train_y)
test_dataset = MyDataset1(test_x,test_y)
train_loader=ds.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
# Evaluate on the held-out split: the loader previously wrapped train_dataset,
# so the reported "test" metrics were measured on training data.
# shuffle=False keeps predictions in the same order as test_y.
test_loader=ds.DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)
criterion = nn.MSELoss()

In [None]:
#train and test the model
# The model's input width is the size of the last axis of the padded samples.
features_dim = np.array(x).shape[2]

# Despite the "RNN" suffix in the variable names, the model built here is the GRU.
modelRNN = GRU.GRU_Model(features_dim, hidden_num, output_dim)
modelRNN = modelRNN.to(device)
optimizerRNN = optim.Adam(params=modelRNN.parameters(), lr=learning_rate)

resultRNN, lossRNN, modelRNN = loop(
    modelRNN,
    train_loader,
    test_loader,
    optimizerRNN,
    criterion,
    device,
    E=epoch,
)