#### 1. Loading required libraries

In [1]:
import numpy as np               # Linear Algebra
import pandas as pd              # Data Handling
import seaborn as sns            # Visualization
import matplotlib.pyplot as plt  # Visualization

#### 2. Importing datasets into DataFrames

In [2]:
# Read the reputation-risk scoring workbook and preview its first ten rows.
df = pd.read_excel(io="../datasets/Reputation Risk Scoring w ESG 2.0.xlsx")
df.head(n=10)

Unnamed: 0,E1,E2,Words,Weight,Unnamed: 4,ESG Weight (1-10),ESG Category
0,Information Security,Data Privacy,AML,2,,8,Social
1,Fraud / Financial Crime,AML,AML CFT,2,,8,Governance
2,Financial Performance,Hostile takeovers,Acquisition,1,,2,Governance
3,Financial Exposure,Financial Exposure,Activist Actiivty,5,,2,Social
4,Management,Activist Pressures,Activist Pressures,5,,2,Social
5,Fraud / Financial Crime,Bribery,Anti-bribery acts,7,,8,Governance
6,Management,Leadership Changes,CEO hired,1,,0,
7,Management,Leadership Changes,CFO hired,1,,0,
8,Management,Leadership Changes,COO hired,1,,0,
9,Management,Social Responsibility,CSR,1,,2,Environmental


In [3]:
# Read the training-set workbook and preview its first rows.
train = pd.read_excel(io="../datasets/training_set.xlsx")
train.head()

Unnamed: 0,title,cleaned text,relevance
0,Blackstone to buy EagleClaw Midstream for abou...,"EagleClaw Midstream Ventures LLC, the largest ...",0
1,Worldwide Mobile Crusher and Screener Industry...,WireThe report has been added to offering. Acc...,0
2,"In a First, BP Offers Employees Shares in Rall...",Oil major BP is launching its first share awar...,1
3,SHAREHOLDER ALERT: Purcell Julie & Lefkowitz L...,TipRanks We’ve got a full month of 2021 behind...,0
4,Komatsu Australia - Komatsu Australia,Komatsu Australia Corporate Finance Pty Ltd Cr...,0


In [4]:
# Read the testing-set workbook and preview its first rows.
test = pd.read_excel(io="../datasets/testing_set.xlsx")
test.head()

Unnamed: 0,title,cleaned text,relevance
0,As Blackjewel bankruptcy case enters final str...,On June 11 the court ruled it would lift the a...,0
1,FG Wilson (Engineering) Ltd v John Holt & Comp...,1. This is the hearing of an application by th...,0
2,Personnel Policy and Performance Appraisal Sys...,Here at Komatsu we consider many aspects of ou...,0
3,Caterpillar planning 700 job cuts in the North,US manufacturing firm Caterpillar has announce...,1
4,Hardman & Co Research : Tritax EuroBox present...,Hardman & Co Research 03-Feb-2021 / 12:45 GMT/...,0


In [5]:
import spacy

In [6]:
# spaCy pipeline shared by the text-preprocessing helper (spacy_process).
nlp = spacy.load("en_core_web_sm")

In [7]:
def spacy_process(text, pipeline=None):
    """Lemmatize ``text`` and drop stop words and punctuation tokens.

    Parameters
    ----------
    text : str
        Raw text to process.
    pipeline : callable, optional
        spaCy-style language pipeline. Calling it on ``text`` must yield
        tokens exposing ``lemma_``, and ``pipeline.vocab[word].is_stop`` must
        flag stop words. Defaults to the module-level ``nlp``.

    Returns
    -------
    str
        The surviving lemmas, in order, joined by single spaces.
    """
    if pipeline is None:
        pipeline = nlp

    # A lemma is treated as punctuation when it occurs inside this string
    # (substring containment, so every single listed character matches).
    punctuations = "?:!.,;$\'-_"

    # Build the result in one pass instead of calling list.remove() on the
    # list being iterated: removing during iteration skips the element that
    # follows each removed one, so runs of punctuation were only half removed.
    kept_lemmas = []
    for token in pipeline(text):
        lemma = token.lemma_
        if pipeline.vocab[lemma].is_stop:
            continue
        if lemma in punctuations:
            continue
        kept_lemmas.append(lemma)

    return " ".join(kept_lemmas)

In [8]:
def clean_text(text):
    """Return ``text`` with surrounding whitespace removed, in lower case."""
    trimmed = text.strip()
    return trimmed.lower()

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
# TF-IDF over unigrams only, with the vocabulary capped at 100 features.
cv = TfidfVectorizer(
    max_features=100,
    ngram_range=(1, 1),
)

def term_freq(temp_df, cv, flag):
    """Replace the ``'cleaned text'`` column with vectorizer feature columns.

    Parameters
    ----------
    temp_df : pandas.DataFrame
        Frame holding a ``'cleaned text'`` column of strings. It is not
        modified.
    cv : vectorizer
        Object providing ``fit_transform``, ``transform`` and
        ``get_feature_names_out``.
    flag : str
        ``"train"`` fits ``cv`` on the text before transforming it; any other
        value only transforms with the already fitted ``cv``.

    Returns
    -------
    pandas.DataFrame
        One column per vectorizer feature, followed by every column of
        ``temp_df`` other than ``'cleaned text'``, on ``temp_df``'s index.
    """
    # Normalise into a new Series rather than overwriting the caller's column.
    normalised = temp_df['cleaned text'].apply(clean_text)

    if flag == "train":
        matrix = cv.fit_transform(normalised)
    else:
        matrix = cv.transform(normalised)

    # concat(axis=1) aligns rows on index labels, so the feature frame must
    # carry the input's index; a fresh RangeIndex would pair rows wrongly (or
    # produce NaN rows) whenever temp_df is not indexed 0..n-1.
    features = pd.DataFrame(
        matrix.toarray(),
        columns=cv.get_feature_names_out(),
        index=temp_df.index,
    )

    remaining = temp_df.drop(columns=['cleaned text'])

    return pd.concat([features, remaining], axis=1)

In [12]:
def final_df(df, cv, flag):
    """Preprocess the article text and turn it into vectorizer features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``'title'`` and ``'cleaned text'`` columns. It is not
        modified.
    cv : vectorizer
        Vectorizer handed through to ``term_freq``.
    flag : str
        Handed through to ``term_freq``; ``"train"`` makes it fit ``cv``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``term_freq`` for the preprocessed data.
    """
    # Work on a copy so the caller keeps the raw text and the title column.
    prepared = df.copy()

    # Address the text column by name, the same way term_freq does, instead
    # of by position, so a changed column order cannot redirect the step.
    prepared['cleaned text'] = prepared['cleaned text'].apply(spacy_process)

    prepared = prepared.drop(columns=['title'])

    return term_freq(prepared, cv, flag)

In [13]:
# Fit the vectorizer on the training text and build the training feature table.
train = final_df(df=train, cv=cv, flag="train")
train

Unnamed: 0,000,10,12,2019,2020,america,analysis,bank,base,billion,...,system,technology,term,time,total,use,value,work,year,relevance
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.129117,0.465281,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.105907,0
1,0.000000,0.000000,0.000000,0.187392,0.052574,0.060468,0.000000,0.0,0.097328,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.059441,0.000000,0.000000,0
2,0.283978,0.096466,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.084711,0.203509,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.051736,0.000000,0.416903,1
3,0.000000,0.000000,0.124320,0.000000,0.069547,0.000000,0.172642,0.0,0.064374,0.231977,...,0.120331,0.075864,0.000000,0.000000,0.087454,0.057556,0.000000,0.000000,0.184809,0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.025631,0.0,0.000000,0.000000,...,0.047639,0.000000,0.022897,0.037056,0.000000,0.187989,0.000000,0.019646,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
570,0.000000,0.000000,0.000000,0.000000,0.453192,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.135537,0.000000,0.000000,0.170795,0.000000,0.000000,0
571,0.000000,0.000000,0.000000,0.000000,0.231676,0.000000,0.000000,0.0,0.000000,0.257587,...,0.000000,0.126359,0.128439,0.000000,0.000000,0.095865,0.000000,0.000000,0.000000,0
572,0.038511,0.065410,0.000000,0.036864,0.031028,0.000000,0.000000,0.0,0.028720,0.034498,...,0.000000,0.067692,0.000000,0.000000,0.000000,0.000000,0.035080,0.059037,0.000000,1
573,0.364049,0.154583,0.087385,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.197370,0.000000,0.060684,0.000000,0.139522,0.222689,0


In [None]:
# Features are every column but the last; the last column is the target.
X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]

In [None]:
from sklearn.ensemble import RandomForestClassifier

# random_state pins the forest's bootstrap samples and feature draws, so the
# fitted model and the metrics computed from it are reproducible across runs.
rfc = RandomForestClassifier(
    n_jobs=3,
    oob_score=True,
    n_estimators=100,
    criterion="gini",
    random_state=42,
)
model = rfc.fit(X_train, y_train)

In [None]:
# Transform the test text with the vectorizer fitted on the training set.
test = final_df(df=test, cv=cv, flag="test")
test.head()

Unnamed: 0,000,10,12,2019,2020,america,analysis,bank,base,billion,business,case,caterpillar,change,claim,come,company,construction,continue,cost,court,customer,datum,day,demand,development,employee,end,energy,equipment,expect,financial,follow,future,gas,global,good,government,group,growth,help,high,impact,inc,include,increase,industry,information,issue,key,komatsu,large,law,lead,machine,management,market,million,mining,month,need,net,new,news,oil,operate,operation,period,plan,policy,power,price,product,production,provide,quarter,rate,report,research,result,revenue,risk,sale,section,segment,service,share,state,statement,stock,support,system,technology,term,time,total,use,value,work,year,relevance
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142723,0.0,0.2235,0.0,0.106798,0.0,0.797374,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.132266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.118975,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.129561,0.0,0.0,0.0,0.0,0.0,0.0,0.11451,0.418378,0.123502,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.118052,0.0,0.0,0.0,0.129185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.030506,0.045987,0.0,0.0,0.0,0.0,0.0,0.00893,0.0,0.062449,0.159983,0.011193,0.0,0.45487,0.0,0.051338,0.0,0.009199,0.0,0.109894,0.164071,0.0,0.14466,0.0,0.010302,0.0,0.028732,0.0,0.0,0.009766,0.0,0.12759,0.0,0.0,0.0,0.208227,0.0,0.020496,0.0,0.009742,0.019062,0.0,0.009988,0.007184,0.0,0.0,0.0,0.1644,0.0,0.0,0.019436,0.12608,0.0,0.0,0.032907,0.0,0.078909,0.0,0.095743,0.0,0.014211,0.0,0.0,0.0,0.0,0.0,0.045167,0.019629,0.0,0.0,0.523368,0.053699,0.0,0.185924,0.0,0.033875,0.007951,0.0,0.047089,0.0,0.0,0.258908,0.086023,0.0,0.147145,0.0,0.050844,0.010969,0.025364,0.009913,0.022255,0.0,0.385078,0.051933,0.048524,0.055887,0.010907,0.036712,0.029298,0
2,0.0,0.074629,0.084375,0.0,0.0,0.0,0.0,0.0,0.032768,0.0,0.028645,0.0,0.0,0.0,0.0,0.0,0.047097,0.0,0.033757,0.0,0.0,0.0,0.0,0.0,0.0,0.037804,0.723975,0.035144,0.0,0.0,0.035838,0.0,0.0,0.0,0.0,0.032842,0.07641,0.0,0.075213,0.039469,0.0,0.0,0.0,0.0,0.026361,0.0,0.03262,0.033915,0.0,0.0,0.532382,0.0,0.0,0.0,0.0,0.241512,0.0,0.0,0.0,0.0,0.035401,0.0,0.0,0.0,0.0,0.040952,0.040252,0.0,0.0,0.04284,0.0,0.0,0.032842,0.0,0.118654,0.0,0.041436,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.204169,0.0,0.039252,0.031762,0.089032,0.0,0.040025,0.134717,0.0,0
3,0.0,0.0,0.084765,0.0,0.0,0.0,0.0,0.0,0.065839,0.0,0.172664,0.0,0.742716,0.074019,0.0,0.0,0.236575,0.077381,0.067827,0.068623,0.0,0.0,0.0,0.082045,0.0,0.0,0.229681,0.0,0.0,0.071829,0.072007,0.0,0.0,0.071129,0.0,0.065987,0.076764,0.0,0.0,0.0,0.071829,0.0,0.07759,0.0,0.0,0.071829,0.065543,0.068143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.080876,0.0,0.0,0.08857,0.078436,0.142258,0.0,0.0,0.0,0.0,0.0,0.080876,0.083255,0.217092,0.0,0.077799,0.0,0.0,0.155598,0.119203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076358,0.0,0.0,0.0,0.0,0.074975,0.0,0.0,0.14618,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.20301,0.0,1
4,0.0,0.0,0.148521,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.147626,0.0,0.0,0.0,0.0,0.497414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.132739,0.253583,0.0,0.0,0.0,0.0,0.0,0.132393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.11484,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.159356,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.145874,0.0,0.714633,0.0,0.0,0.0,0.0,0.0,0.0,0.111818,0.0,0.131366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.094622,0


In [None]:
# Features are every column but the last; the last column is the target.
X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Predict on the test features and compute the accuracy against the labels.
pred = rfc.predict(X_test)
score = accuracy_score(y_true=y_test, y_pred=pred)
score

0.8704453441295547

In [None]:
# Heading and per-class precision/recall/F1 table, one per line.
print("Classification Report", classification_report(y_test, pred), sep="\n")

Classification Report
              precision    recall  f1-score   support

           0       0.86      0.96      0.91       168
           1       0.90      0.67      0.77        79

    accuracy                           0.87       247
   macro avg       0.88      0.82      0.84       247
weighted avg       0.87      0.87      0.86       247



In [None]:
# Heading followed by the confusion matrix of true vs. predicted labels.
print("Confusion Matrix", confusion_matrix(y_test, pred), sep="\n")

Confusion Matrix
[[162   6]
 [ 26  53]]
