### List of Companies to Test

In [67]:
import pandas as pd

from sklearn.model_selection import train_test_split

from xml.sax.handler import feature_namespace_prefixes
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report

In [68]:
comp_list = ["EliLilly", "Merck", "BristolMyersSquibb", "johnsonandjohnson", "Abbott", "Boeing",
             "UPS", "3M", "Walmart", "Tesla"]

### Train Test Split

In [69]:
import os
os.getcwd()

'/Users/tylerryoo/t3/summarizer_test_result/test_all_sentences'

In [73]:
# all_sentences = pd.read_csv("../../relevant_irrelevant_sentences_labeled_final/all_sentences.csv", index_col = 0)

In [120]:
rel_sentences = pd.read_csv("../../extracted_sentences/notebooks/final_extracted_statistics_notebooks/rel_with_index.csv", index_col = 0, dtype = str)

In [121]:
irr_sentences = pd.read_csv("../../extracted_sentences/notebooks/final_extracted_statistics_notebooks/irr_with_index.csv", index_col = 0, dtype = str)

In [122]:
# rel = all_sentences[all_sentences.label == 'rel']

In [123]:
# irr = all_sentences[all_sentences.label == 'irr']

In [124]:
irrelevant = irr_sentences
relevant = rel_sentences

In [125]:
relevant = relevant.rename(columns = {'relevant_sentences': 'sentences'})

In [126]:
irrelevant = irrelevant.rename(columns = {'all_sentences': 'sentences'})

In [127]:
relevant['key'] = relevant["company_index"] + relevant['sent_index']

In [128]:
irrelevant['key'] = irrelevant["company_index"] + irrelevant['sent_index']

In [129]:
relevant.head()

Unnamed: 0,sentences,sent_count_x,rel_match_all,sent_count_y,company_label,company_index,sent_index,label,key
0,"In 2021, 9.6% of our purchased electricity cam...",11,"Looking toward the future, we have set climate...",40,EliLilly,1,20,rel,10020
1,A large portion of this renewable electricity ...,24,A large portion of this renewable electricity ...,24,EliLilly,1,21,rel,10021
2,"From 2012 to 2020, we achieved a 26% reduction...",12,"From 2012 to 2020, we achieved a 26% reduction...",12,EliLilly,1,25,rel,10025
3,"In 2021, we achieved a 9% absolute emissions r...",11,"In 2021, we achieved a 9% absolute emissions r...",11,EliLilly,1,26,rel,10026
4,This reduction was partially driven by energy ...,27,This reduction was partially driven by energy ...,27,EliLilly,1,27,rel,10027


In [130]:
irrelevant.head()

Unnamed: 0,sentences,company_label,company_index,sent_index,sent_count,label,key
0,"7/7/22, 10:29 AM Environmental | 2021 ESG Repo...",EliLilly,1,1,30,irr,10001
1,Making medicines requires the use of valuable ...,EliLilly,1,2,14,irr,10002
2,We’re committed to reducing our environmental ...,EliLilly,1,3,18,irr,10003
3,"To track our progress, we measure and manage e...",EliLilly,1,4,27,irr,10004
4,"Lilly manages health, safety and the environme...",EliLilly,1,5,13,irr,10005


In [131]:
# pull in the csv data
# irrelevant = pd.read_csv("../../relevant_irrelevant_sentences_labeled/extracted_irrelevant_sentences.csv")
# relevant = pd.read_csv("../../relevant_irrelevant_sentences_labeled/extracted_relevant_sentences.csv")

print("total_irrelevant:", len(irrelevant))
print("total_relevant:", len(relevant))
irrelevant["class"] = 0
relevant["class"] = 1

irrelevant = irrelevant[['key','sentences', 'company_label', 'class']]
relevant = relevant[['key','sentences', 'company_label', 'class']]
relevant.sample(5)
irrelevant.sample(5)

total_irrelevant: 76410
total_relevant: 912


Unnamed: 0,key,sentences,company_label,class
55839,530004,"Against the backdrop of loss this year, we saw...",VIsa,0
23549,220975,Embedding sustainability In this section Susta...,BP,0
74999,710330,This network collaborates to provide results o...,AEP,0
73961,700276,ENVIRONMENTAL Seizing the opportunity to del...,Duke_Energy,0
61954,591391,"Linde is dependent upon its highly skilled, ex...",Linde,0


In [132]:
for comp_name in comp_list:
    comp_irrelevant = irrelevant[irrelevant['company_label'] == comp_name]
    comp_relevant = relevant[relevant['company_label'] == comp_name]
    comp_all = pd.concat([comp_relevant,comp_irrelevant])
    
    comp_all.to_csv(comp_name + '_comp_all_data.csv', encoding = 'utf-8-sig')
    
    rest_irrelevant = irrelevant[irrelevant['company_label'] != comp_name]
    rest_relevant = relevant[relevant['company_label'] != comp_name]
    
    print(comp_name, "\n", len(comp_irrelevant), len(comp_relevant),len(rest_irrelevant), len(rest_relevant))
    
    comp_sample_irr = comp_irrelevant.sample(n = len(comp_relevant), random_state = 1)
    rest_sample_irr = rest_irrelevant.sample(n = len(rest_relevant), random_state = 1)
    
    comp_balanced_set = pd.concat([comp_relevant, comp_sample_irr], ignore_index = True)
    rest_balanced_set = pd.concat([rest_relevant, rest_sample_irr], ignore_index = True)
    
    # Train Test Split on comp_balanced_set
    X_train, X_test, y_train, y_test = train_test_split(comp_balanced_set['sentences'], 
                                                        comp_balanced_set['class'], test_size=0.1, random_state=100)

    dfbalanced = pd.concat([comp_balanced_set], ignore_index=True)
    dfbalanced.to_csv(comp_name + '_comp_balanced_data.csv', encoding = 'utf-8-sig')

    dfbalanced_train = pd.concat([X_train.to_frame(), y_train.to_frame()], axis = 1, ignore_index = True)
    dfbalanced_train.columns = ['sentences', 'class']
    dfbalanced_train.to_csv(comp_name + '_comp_balanced_data_train.csv', encoding = 'utf-8-sig')

    dfbalanced_test = pd.concat([X_test.to_frame(), y_test.to_frame()], axis = 1, ignore_index = True)
    dfbalanced_test.columns = ['sentences', 'class']
    dfbalanced_test.to_csv(comp_name + '_comp_balanced_data_test.csv', encoding = 'utf-8-sig')
    
    # Train Test Split on rest_balanced_set
    X_train, X_test, y_train, y_test = train_test_split(rest_balanced_set['sentences'], 
                                                        rest_balanced_set['class'], test_size=0.1, random_state=100)

    dfbalanced = pd.concat([rest_balanced_set], ignore_index=True)
    dfbalanced.to_csv(comp_name + '_rest_balanced_data.csv', encoding = 'utf-8-sig')

    dfbalanced_train = pd.concat([X_train.to_frame(), y_train.to_frame()], axis = 1, ignore_index = True)
    dfbalanced_train.columns = ['sentences', 'class']
    dfbalanced_train.to_csv(comp_name + '_rest_balanced_data_train.csv', encoding = 'utf-8-sig')

    dfbalanced_test = pd.concat([X_test.to_frame(), y_test.to_frame()], axis = 1, ignore_index = True)
    dfbalanced_test.columns = ['sentences', 'class']
    dfbalanced_test.to_csv(comp_name + '_rest_balanced_data_test.csv', encoding = 'utf-8-sig')

EliLilly 
 84 15 76326 897
Merck 
 2384 11 74026 901
BristolMyersSquibb 
 1173 5 75237 907
johnsonandjohnson 
 1917 13 74493 899
Abbott 
 2030 8 74380 904
Boeing 
 1134 8 75276 904
UPS 
 74 12 76336 900
3M 
 3202 6 73208 906
Walmart 
 560 3 75850 909
Tesla 
 1363 4 75047 908


### Move Files to a Separate Folder

In [133]:
import os
import shutil
import glob
import pathlib

In [134]:
os.getcwd()

'/Users/tylerryoo/t3/summarizer_test_result/test_all_sentences'

In [135]:
new_dir_name = 'balanced_data'
new_dir = pathlib.Path('/Users/tylerryoo/t3/summarizer_test_result/test_all_sentences/', new_dir_name)
new_dir.mkdir(parents=True, exist_ok=True)

In [136]:
path = '/Users/tylerryoo/t3/summarizer_test_result/test_all_sentences'
files = glob.glob(path + "/*.csv")

In [137]:
for file in files:
    
    if file.startswith('/Users/tylerryoo/t3/summarizer_test_result/test_all_sentences/test_sentences_'):
        continue
        
    filename = file.split('/')[-1]
    
    target = (r'/Users/tylerryoo/t3/summarizer_test_result/test_all_sentences/' + new_dir_name + '/' + filename)

    shutil.move(file, target)

### Logistics Regression Classifier

In [138]:
comp_list = ["EliLilly", "Merck", "BristolMyersSquibb", "johnsonandjohnson", "Abbott", "Boeing",
             "UPS", "3M", "Walmart", "Tesla"]

In [139]:
pd.read_csv('balanced_data/EliLilly_rest_balanced_data.csv', index_col = 0)

Unnamed: 0,key,sentences,company_label,class
0,31178,"In 2020, a new solar array was installed at on...",Merck,1
1,31179,These projects will address approximately 35 p...,Merck,1
2,31180,"These agreements follow a 2018 U.S. wind VPPA,...",Merck,1
3,31181,Approximately nine percent of our total Scope ...,Merck,1
4,31185,40 percent of our U.S. fleet are now partial-z...,Merck,1
...,...,...,...,...
1789,210497,"Occasional, or non-routine, flaring connected ...",Total,0
1790,700357,"For example, we have partnered with TerraPowe...",Duke_Energy,0
1791,610584,• “Managers treat our employees with dignity a...,SherwinWilliams,0
1792,682718,Discussion of long-term and short-term strateg...,Dow,0


In [140]:
# Train on rest data and fit on comp data 
crLR_reports = []

for comp in comp_list:
    train = pd.read_csv('balanced_data/' + comp + "_rest_balanced_data_train.csv", index_col = 0)
    test = pd.read_csv('balanced_data/' + comp + "_rest_balanced_data_test.csv", index_col = 0)
    
    X_train = train['sentences']
    X_test = test['sentences']

    y_train = train['class']
    y_test = test['class']
    
    comp_X_test = pd.read_csv('balanced_data/' + comp + "_comp_balanced_data.csv")['sentences']
    comp_Y_test = pd.read_csv('balanced_data/' + comp + "_comp_balanced_data.csv")['class']
    
    
    vectorizer_tfidf = TfidfVectorizer(stop_words='english', max_df=0.7)

    train_tfIdf = vectorizer_tfidf.fit_transform(X_train.values.astype('U'))

    # testing on balanced comp sentences 
    test_tfIdf_comp = vectorizer_tfidf.transform(comp_X_test.values.astype('U'))
    
    # testing on all comp sentences
    comp_all_X_test = pd.read_csv('balanced_data/' + comp + "_comp_all_data.csv")['sentences']
    comp_all_Y_test = pd.read_csv('balanced_data/' + comp + "_comp_all_data.csv")['class']
    comp_key = pd.read_csv('balanced_data/' + comp + "_comp_all_data.csv")['key']
    
    test_all_tfIdf_comp = vectorizer_tfidf.transform(comp_all_X_test.values.astype('U'))
    
    # logistic regression prediction and evaluation
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import MultinomialNB
    
    lr = LogisticRegression()
    lr.fit(train_tfIdf, y_train)
    predLR =  lr.predict(test_all_tfIdf_comp)

#     nb_classifier = MultinomialNB()
#     nb_classifier.fit(train_tfIdf, y_train)
#     predLR = nb_classifier.predict(test_all_tfIdf_comp) 

    crLR = classification_report(predLR, comp_all_Y_test)
    print(comp)
    print(crLR)
    print("\n")
    crLR = classification_report(predLR, comp_all_Y_test, output_dict=True)
    
    # update the classification report as it's printed
    crLR.update({"accuracy": {"precision": None, "recall": None, "f1-score": crLR["accuracy"], "support": crLR['macro avg']['support']}})
    df = pd.DataFrame(crLR).transpose()
    df.index.name = comp
    crLR_reports.append(df)
    
    df = pd.DataFrame([comp_key.to_list(), comp_all_X_test.to_list(), comp_all_Y_test.to_list(), list(predLR)]).transpose()
    df = df.rename(columns = { 0: 'key', 1: 'sentences', 2: 'class', 3: 'predicted'})
    df.to_csv('test_sentences_' + comp + '.csv', encoding = 'utf-8-sig')


EliLilly
              precision    recall  f1-score   support

           0       0.46      1.00      0.63        39
           1       1.00      0.25      0.40        60

    accuracy                           0.55        99
   macro avg       0.73      0.62      0.52        99
weighted avg       0.79      0.55      0.49        99



Merck
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      2243
           1       1.00      0.07      0.13       152

    accuracy                           0.94      2395
   macro avg       0.97      0.54      0.55      2395
weighted avg       0.94      0.94      0.92      2395



BristolMyersSquibb
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1112
           1       0.80      0.06      0.11        66

    accuracy                           0.95      1178
   macro avg       0.87      0.53      0.54      1178
weighted avg       0.94      0.95   

In [112]:
test_3M = pd.read_csv('test_sentences_3M.csv', index_col = 0)

In [113]:
test_3M.groupby('class').count()[['sentences']]

Unnamed: 0_level_0,sentences
class,Unnamed: 1_level_1
0,3202
1,6


In [114]:
test_3M.groupby('predicted').count()[['sentences']]

Unnamed: 0_level_0,sentences
predicted,Unnamed: 1_level_1
0,2993
1,215


### Printing the Result

In [115]:
df = pd.concat(crLR_reports, keys=map(lambda d: d.index.name, crLR_reports))

In [116]:
df.index.names = [None, None]

In [117]:
df = df.round(2)

In [118]:
df.support = df.support.apply(lambda x: int(x))

In [119]:
df

Unnamed: 0,Unnamed: 1,precision,recall,f1-score,support
EliLilly,0,0.46,1.0,0.63,39
EliLilly,1,1.0,0.25,0.4,60
EliLilly,accuracy,,,0.55,99
EliLilly,macro avg,0.73,0.62,0.52,99
EliLilly,weighted avg,0.79,0.55,0.49,99
Merck,0,0.94,1.0,0.97,2243
Merck,1,1.0,0.07,0.13,152
Merck,accuracy,,,0.94,2395
Merck,macro avg,0.97,0.54,0.55,2395
Merck,weighted avg,0.94,0.94,0.92,2395


In [35]:
df.to_csv("summarizer_test_result.csv")