### List of Companies to Test

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split

from xml.sax.handler import feature_namespace_prefixes
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report

In [2]:
comp_list = ["EliLilly", "Merck", "BristolMyersSquibb", "johnsonandjohnson", "Abbott", "Boeing",
             "UPS", "3M", "Walmart", "Tesla"]

### Train Test Split

In [3]:
import os
os.getcwd()

'/Users/tylerryoo/t3/summarizer_test_result/test_all_sentences'

In [4]:
all_sentences = pd.read_csv("../../relevant_irrelevant_sentences_labeled_final/all_sentences.csv", index_col = 0)

In [5]:
rel = all_sentences[all_sentences.label == 'rel']

In [6]:
irr = all_sentences[all_sentences.label == 'irr']

In [7]:
irrelevant = irr
relevant = rel

In [8]:
relevant

Unnamed: 0,key,sentences,company_label,label
1,1,"In 2021, 9.6% of our purchased electricity cam...",EliLilly,rel
2,2,A large portion of this renewable electricity ...,EliLilly,rel
3,3,"From 2012 to 2020, we achieved a 26% reduction...",EliLilly,rel
4,4,"In 2021, we achieved a 9% absolute emissions r...",EliLilly,rel
5,5,This reduction was partially driven by energy ...,EliLilly,rel
...,...,...,...,...
867,867,"2017 Retired and demolished 250 MW of coal, re...",NextEraEnergyZeroCarbonBlueprint,rel
868,868,2018 Retired and demolished 636 MW of coal and...,NextEraEnergyZeroCarbonBlueprint,rel
869,869,"2019 Aquired Gulf Power, which added 1,750 MW ...",NextEraEnergyZeroCarbonBlueprint,rel
870,870,2020 Retired 615 MW of nuclear and 330 MW of c...,NextEraEnergyZeroCarbonBlueprint,rel


In [9]:
# pull in the csv data
# irrelevant = pd.read_csv("../../relevant_irrelevant_sentences_labeled/extracted_irrelevant_sentences.csv")
# relevant = pd.read_csv("../../relevant_irrelevant_sentences_labeled/extracted_relevant_sentences.csv")
irrelevant = irr
relevant = rel

print("total_irrelevant:", len(irrelevant))
print("total_relevant:", len(relevant))
irrelevant["class"] = 0
relevant["class"] = 1

irrelevant = irrelevant[['key','sentences', 'company_label', 'class']]
relevant = relevant[['key','sentences', 'company_label', 'class']]
relevant.sample(5)

total_irrelevant: 76406
total_relevant: 871


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  irrelevant["class"] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant["class"] = 1


Unnamed: 0,key,sentences,company_label,class
571,571,ACHIEVED: 60% reduction Use electricity genera...,Cisco,1
489,489,Amazon has contracted 82 MW of the new 120- MW...,Amazon,1
204,204,"• At the Zeeland refinery, the Company plans t...",Total,1
623,623,"Our commitment is that by 2030, 100 percent of...",Microsoft,1
840,840,The strategy proposes adding approximately 16 ...,AEP,1


In [10]:
for comp_name in comp_list:
    comp_irrelevant = irrelevant[irrelevant['company_label'] == comp_name]
    comp_relevant = relevant[relevant['company_label'] == comp_name]
    comp_all = pd.concat([comp_relevant,comp_irrelevant])
    
    comp_all.to_csv(comp_name + '_comp_all_data.csv', encoding = 'utf-8-sig')
    
    rest_irrelevant = irrelevant[irrelevant['company_label'] != comp_name]
    rest_relevant = relevant[relevant['company_label'] != comp_name]
    
    print(comp_name, "\n", len(comp_irrelevant), len(comp_relevant),len(rest_irrelevant), len(rest_relevant))
    
    comp_sample_irr = comp_irrelevant.sample(n = len(comp_relevant), random_state = 1)
    rest_sample_irr = rest_irrelevant.sample(n = len(rest_relevant), random_state = 1)
    
    comp_balanced_set = pd.concat([comp_relevant, comp_sample_irr], ignore_index = True)
    rest_balanced_set = pd.concat([rest_relevant, rest_sample_irr], ignore_index = True)
    
    # Train Test Split on comp_balanced_set
    X_train, X_test, y_train, y_test = train_test_split(comp_balanced_set['sentences'], 
                                                        comp_balanced_set['class'], test_size=0.1, random_state=100)

    dfbalanced = pd.concat([comp_balanced_set], ignore_index=True)
    dfbalanced.to_csv(comp_name + '_comp_balanced_data.csv', encoding = 'utf-8-sig')

    dfbalanced_train = pd.concat([X_train.to_frame(), y_train.to_frame()], axis = 1, ignore_index = True)
    dfbalanced_train.columns = ['sentences', 'class']
    dfbalanced_train.to_csv(comp_name + '_comp_balanced_data_train.csv', encoding = 'utf-8-sig')

    dfbalanced_test = pd.concat([X_test.to_frame(), y_test.to_frame()], axis = 1, ignore_index = True)
    dfbalanced_test.columns = ['sentences', 'class']
    dfbalanced_test.to_csv(comp_name + '_comp_balanced_data_test.csv', encoding = 'utf-8-sig')
    
    # Train Test Split on rest_balanced_set
    X_train, X_test, y_train, y_test = train_test_split(rest_balanced_set['sentences'], 
                                                        rest_balanced_set['class'], test_size=0.1, random_state=100)

    dfbalanced = pd.concat([rest_balanced_set], ignore_index=True)
    dfbalanced.to_csv(comp_name + '_rest_balanced_data.csv', encoding = 'utf-8-sig')

    dfbalanced_train = pd.concat([X_train.to_frame(), y_train.to_frame()], axis = 1, ignore_index = True)
    dfbalanced_train.columns = ['sentences', 'class']
    dfbalanced_train.to_csv(comp_name + '_rest_balanced_data_train.csv', encoding = 'utf-8-sig')

    dfbalanced_test = pd.concat([X_test.to_frame(), y_test.to_frame()], axis = 1, ignore_index = True)
    dfbalanced_test.columns = ['sentences', 'class']
    dfbalanced_test.to_csv(comp_name + '_rest_balanced_data_test.csv', encoding = 'utf-8-sig')

EliLilly 
 84 15 76322 856
Merck 
 2384 10 74022 861
BristolMyersSquibb 
 1172 4 75234 867
johnsonandjohnson 
 1917 13 74489 858
Abbott 
 2030 8 74376 863
Boeing 
 1134 7 75272 864
UPS 
 75 11 76331 860
3M 
 3202 5 73204 866
Walmart 
 560 3 75846 868
Tesla 
 1364 3 75042 868


### Move Files to a Separate Folder

In [11]:
import os
import shutil
import glob
import pathlib

In [12]:
os.getcwd()

'/Users/tylerryoo/t3/summarizer_test_result/test_all_sentences'

In [13]:
new_dir_name = 'balanced_data'
new_dir = pathlib.Path('/Users/tylerryoo/t3/summarizer_test_result/test_all_sentences/', new_dir_name)
new_dir.mkdir(parents=True, exist_ok=True)

In [14]:
path = '/Users/tylerryoo/t3/summarizer_test_result/test_all_sentences'
files = glob.glob(path + "/*.csv")

In [15]:
for file in files:
    
    if file.startswith('/Users/tylerryoo/t3/summarizer_test_result/test_all_sentences/test_sentences_'):
        continue
        
    filename = file.split('/')[-1]
    
    target = (r'/Users/tylerryoo/t3/summarizer_test_result/test_all_sentences/' + new_dir_name + '/' + filename)

    shutil.move(file, target)

### Logistics Regression Classifier

In [24]:
comp_list = ["EliLilly", "Merck", "BristolMyersSquibb", "johnsonandjohnson", "Abbott", "Boeing",
             "UPS", "3M", "Walmart", "Tesla"]

In [25]:
pd.read_csv('balanced_data/EliLilly_rest_balanced_data.csv', index_col = 0)

Unnamed: 0,key,sentences,company_label,class
0,16,These projects will address approximately 35 p...,Merck,1
1,17,"These agreements follow a 2018 U.S. wind VPPA,...",Merck,1
2,18,Approximately nine percent of our total Scope ...,Merck,1
3,19,Our company recently signed three virtual powe...,Merck,1
4,20,Over 50 percent of the vehicles being utilized...,Merck,1
...,...,...,...,...
1707,62636,Carbon dioxide and other plants caused approxi...,Linde,0
1708,60835,We describe the rights and responsibilities of...,Prologis,0
1709,32208,"3TGs – Tin, tantalum, tungsten, and gold Aeros...",Philip_Morris_Intl,0
1710,44196,People and Culture Employee Engagement,Tesla,0


In [26]:
# Train on rest data and fit on comp data 
crLR_reports = []

for comp in comp_list:
    train = pd.read_csv('balanced_data/' + comp + "_rest_balanced_data_train.csv", index_col = 0)
    test = pd.read_csv('balanced_data/' + comp + "_rest_balanced_data_test.csv", index_col = 0)
    
    X_train = train['sentences']
    X_test = test['sentences']

    y_train = train['class']
    y_test = test['class']
    
    comp_X_test = pd.read_csv('balanced_data/' + comp + "_comp_balanced_data.csv")['sentences']
    comp_Y_test = pd.read_csv('balanced_data/' + comp + "_comp_balanced_data.csv")['class']
    
    
    vectorizer_tfidf = TfidfVectorizer(stop_words='english', max_df=0.7)

    train_tfIdf = vectorizer_tfidf.fit_transform(X_train.values.astype('U'))

    # testing on balanced comp sentences 
    test_tfIdf_comp = vectorizer_tfidf.transform(comp_X_test.values.astype('U'))
    
    # testing on all comp sentences
    comp_all_X_test = pd.read_csv('balanced_data/' + comp + "_comp_all_data.csv")['sentences']
    comp_all_Y_test = pd.read_csv('balanced_data/' + comp + "_comp_all_data.csv")['class']
    comp_key = pd.read_csv('balanced_data/' + comp + "_comp_all_data.csv")['key']
    
    test_all_tfIdf_comp = vectorizer_tfidf.transform(comp_all_X_test.values.astype('U'))
    
    # logistic regression prediction and evaluation
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import MultinomialNB
    
    lr = LogisticRegression()
    lr.fit(train_tfIdf, y_train)
    predLR =  lr.predict(test_all_tfIdf_comp)

#     nb_classifier = MultinomialNB()
#     nb_classifier.fit(train_tfIdf, y_train)
#     predLR = nb_classifier.predict(test_all_tfIdf_comp) 

    crLR = classification_report(predLR, comp_all_Y_test)
    print(comp)
    print(crLR)
    print("\n")
    crLR = classification_report(predLR, comp_all_Y_test, output_dict=True)
    
    # update the classification report as it's printed
    crLR.update({"accuracy": {"precision": None, "recall": None, "f1-score": crLR["accuracy"], "support": crLR['macro avg']['support']}})
    df = pd.DataFrame(crLR).transpose()
    df.index.name = comp
    crLR_reports.append(df)
    
    df = pd.DataFrame([comp_key.to_list(), comp_all_X_test.to_list(), comp_all_Y_test.to_list(), list(predLR)]).transpose()
    df = df.rename(columns = { 0: 'key', 1: 'sentences', 2: 'class', 3: 'predicted'})
    df.to_csv('test_sentences_' + comp + '.csv', encoding = 'utf-8-sig')


EliLilly
              precision    recall  f1-score   support

           0       0.49      1.00      0.66        41
           1       1.00      0.26      0.41        58

    accuracy                           0.57        99
   macro avg       0.74      0.63      0.53        99
weighted avg       0.79      0.57      0.51        99



Merck
              precision    recall  f1-score   support

           0       0.93      1.00      0.97      2227
           1       1.00      0.06      0.11       167

    accuracy                           0.93      2394
   macro avg       0.97      0.53      0.54      2394
weighted avg       0.94      0.93      0.91      2394



BristolMyersSquibb
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1115
           1       0.75      0.05      0.09        61

    accuracy                           0.95      1176
   macro avg       0.85      0.52      0.53      1176
weighted avg       0.94      0.95   

In [27]:
test_3M = pd.read_csv('test_sentences_3M.csv', index_col = 0)

In [28]:
test_3M.groupby('class').count()[['sentences']]

Unnamed: 0_level_0,sentences
class,Unnamed: 1_level_1
0,3202
1,5


In [29]:
test_3M.groupby('predicted').count()[['sentences']]

Unnamed: 0_level_0,sentences
predicted,Unnamed: 1_level_1
0,3006
1,201


### Printing the Result

In [30]:
df = pd.concat(crLR_reports, keys=map(lambda d: d.index.name, crLR_reports))

In [31]:
df.index.names = [None, None]

In [32]:
df = df.round(2)

In [33]:
df.support = df.support.apply(lambda x: int(x))

In [34]:
df

Unnamed: 0,Unnamed: 1,precision,recall,f1-score,support
EliLilly,0,0.49,1.0,0.66,41
EliLilly,1,1.0,0.26,0.41,58
EliLilly,accuracy,,,0.57,99
EliLilly,macro avg,0.74,0.63,0.53,99
EliLilly,weighted avg,0.79,0.57,0.51,99
Merck,0,0.93,1.0,0.97,2227
Merck,1,1.0,0.06,0.11,167
Merck,accuracy,,,0.93,2394
Merck,macro avg,0.97,0.53,0.54,2394
Merck,weighted avg,0.94,0.93,0.91,2394


In [35]:
df.to_csv("summarizer_test_result.csv")