### List of Companies to Test

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split

from xml.sax.handler import feature_namespace_prefixes
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report

In [2]:
comp_list = ["EliLilly", "Merck", "BristolMyersSquibb", "johnsonandjohnson", "Abbott", "Boeing",
             "UPS", "3M", "Walmart", "Tesla"]

### Train Test Split

In [3]:
import os
os.getcwd()

'/Users/tylerryoo/t3/summarizer_test_result/test_all_sentences'

In [18]:
all_sentences = pd.read_csv("../../extracted_sentences/notebooks/final_extracted_statistics_notebooks/all_sentences.csv", index_col = 0)

In [19]:
rel = all_sentences[all_sentences.label == 'rel']

In [20]:
irr = all_sentences[all_sentences.label == 'irr']

In [21]:
irrelevant = irr
relevant = rel

In [24]:
# pull in the csv data
# irrelevant = pd.read_csv("../../relevant_irrelevant_sentences_labeled/extracted_irrelevant_sentences.csv")
# relevant = pd.read_csv("../../relevant_irrelevant_sentences_labeled/extracted_relevant_sentences.csv")
irrelevant = irr
relevant = rel

print("total_irrelevant:", len(irrelevant))
print("total_relevant:", len(relevant))
irrelevant["class"] = 0
relevant["class"] = 1

irrelevant = irrelevant[['sentences', 'company_label', 'class']]
relevant = relevant[['sentences', 'company_label', 'class']]
relevant.sample(5)

total_irrelevant: 77258
total_relevant: 853


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  irrelevant["class"] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant["class"] = 1


Unnamed: 0,sentences,company_label,class
438,Here’s how that helped the planet: Avoided 2....,HomeDepot,1
217,These data include the commissioning of two co...,Total,1
230,"From 2010 to 2013, TotalEnergies developed a ...",Total,1
734,305-1 Direct greenhouse gas (GHG) emissions (S...,Linde,1
471,Nearly half of our carbon intensity improveme...,Amazon,1


In [26]:
for comp_name in comp_list:
    comp_irrelevant = irrelevant[irrelevant['company_label'] == comp_name]
    comp_relevant = relevant[relevant['company_label'] == comp_name]
    comp_all = pd.concat([comp_relevant,comp_irrelevant])
    
    comp_all.to_csv(comp_name + '_comp_all_data.csv', encoding = 'utf-8-sig')
    
    rest_irrelevant = irrelevant[irrelevant['company_label'] != comp_name]
    rest_relevant = relevant[relevant['company_label'] != comp_name]
    
    print(comp_name, "\n", len(comp_irrelevant), len(comp_relevant),len(rest_irrelevant), len(rest_relevant))
    
    comp_sample_irr = comp_irrelevant.sample(n = len(comp_relevant), random_state = 1)
    rest_sample_irr = rest_irrelevant.sample(n = len(rest_relevant), random_state = 1)
    
    comp_balanced_set = pd.concat([comp_relevant, comp_sample_irr], ignore_index = True)
    rest_balanced_set = pd.concat([rest_relevant, rest_sample_irr], ignore_index = True)
    
    # Train Test Split on comp_balanced_set
    X_train, X_test, y_train, y_test = train_test_split(comp_balanced_set['sentences'], 
                                                        comp_balanced_set['class'], test_size=0.1, random_state=100)

    dfbalanced = pd.concat([comp_balanced_set], ignore_index=True)
    dfbalanced.to_csv(comp_name + '_comp_balanced_data.csv', encoding = 'utf-8-sig')

    dfbalanced_train = pd.concat([X_train.to_frame(), y_train.to_frame()], axis = 1, ignore_index = True)
    dfbalanced_train.columns = ['sentences', 'class']
    dfbalanced_train.to_csv(comp_name + '_comp_balanced_data_train.csv', encoding = 'utf-8-sig')

    dfbalanced_test = pd.concat([X_test.to_frame(), y_test.to_frame()], axis = 1, ignore_index = True)
    dfbalanced_test.columns = ['sentences', 'class']
    dfbalanced_test.to_csv(comp_name + '_comp_balanced_data_test.csv', encoding = 'utf-8-sig')
    
    # Train Test Split on rest_balanced_set
    X_train, X_test, y_train, y_test = train_test_split(rest_balanced_set['sentences'], 
                                                        rest_balanced_set['class'], test_size=0.1, random_state=100)

    dfbalanced = pd.concat([rest_balanced_set], ignore_index=True)
    dfbalanced.to_csv(comp_name + '_rest_balanced_data.csv', encoding = 'utf-8-sig')

    dfbalanced_train = pd.concat([X_train.to_frame(), y_train.to_frame()], axis = 1, ignore_index = True)
    dfbalanced_train.columns = ['sentences', 'class']
    dfbalanced_train.to_csv(comp_name + '_rest_balanced_data_train.csv', encoding = 'utf-8-sig')

    dfbalanced_test = pd.concat([X_test.to_frame(), y_test.to_frame()], axis = 1, ignore_index = True)
    dfbalanced_test.columns = ['sentences', 'class']
    dfbalanced_test.to_csv(comp_name + '_rest_balanced_data_test.csv', encoding = 'utf-8-sig')

EliLilly 
 84 15 77174 838
Merck 
 2403 10 74855 843
BristolMyersSquibb 
 1183 5 76075 848
johnsonandjohnson 
 1937 12 75321 841
Abbott 
 2073 7 75185 846
Boeing 
 1147 8 76111 845
UPS 
 76 10 77182 843
3M 
 3246 5 74012 848
Walmart 
 571 3 76687 850
Tesla 
 1378 4 75880 849


### Logistics Regression Classifier

In [27]:
comp_list = ["EliLilly", "Merck", "BristolMyersSquibb", "johnsonandjohnson", "Abbott", "Boeing",
             "UPS", "3M", "Walmart", "Tesla"]

In [28]:
pd.read_csv('EliLilly_rest_balanced_data.csv')

Unnamed: 0.1,Unnamed: 0,sentences,company_label,class
0,0,These projects will address approximately 35 p...,Merck,1
1,1,These agreements follow a 2018 U.S. wind VPP...,Merck,1
2,2,Vehicle fleet Approximately nine percent of ou...,Merck,1
3,3,• 40 percent of our U.S. fleet are now partial...,Merck,1
4,4,• Over 50 percent of the vehicles being utiliz...,Merck,1
...,...,...,...,...
1671,1671,"The data in this table excludes ONE Brands, L...",Hershey,0
1672,1672,It is a value that matters deeply to 3M emplo...,3M,0
1673,1673,The PSI assessment process involved detailed r...,Dominion_Energy,0
1674,1674,"Once onboarded, our new colleagues are suppo...",ThermoFisherScientifiic,0


In [30]:
# Train on rest data and fit on comp data 
crLR_reports = []

for comp in comp_list: 
    train = pd.read_csv(comp + "_rest_balanced_data_train.csv")
    test = pd.read_csv(comp + "_rest_balanced_data_test.csv")
    
    train.drop(columns = ['Unnamed: 0'])
    test.drop(columns = ['Unnamed: 0'])
    
    X_train = train['sentences']
    X_test = test['sentences']

    y_train = train['class']
    y_test = test['class']
    
    comp_X_test = pd.read_csv(comp + "_comp_balanced_data.csv")['sentences']
    comp_Y_test = pd.read_csv(comp + "_comp_balanced_data.csv")['class']
    
    
    vectorizer_tfidf = TfidfVectorizer(stop_words='english', max_df=0.7)

    train_tfIdf = vectorizer_tfidf.fit_transform(X_train.values.astype('U'))

    # testing on balanced comp sentences 
    test_tfIdf_comp = vectorizer_tfidf.transform(comp_X_test.values.astype('U'))
    
    # testing on all comp sentences
    comp_all_X_test = pd.read_csv(comp + "_comp_all_data.csv")['sentences']
    comp_all_Y_test = pd.read_csv(comp + "_comp_all_data.csv")['class']
    test_all_tfIdf_comp = vectorizer_tfidf.transform(comp_all_X_test.values.astype('U'))
    
    # logistic regression prediction and evaluation
    from sklearn.linear_model import LogisticRegression

    lr = LogisticRegression()
    lr.fit(train_tfIdf, y_train)


    predLR =  lr.predict(test_all_tfIdf_comp)

    crLR = classification_report(predLR, comp_all_Y_test)
    print(comp)
    print(crLR)
    print("\n")
    crLR = classification_report(predLR, comp_all_Y_test, output_dict=True)
    
    df = pd.DataFrame(crLR).transpose()
    df.index.name = comp
    crLR_reports.append(df)
    
    df = pd.DataFrame([comp_all_X_test.to_list(), comp_all_Y_test.to_list(), list(predLR)]).transpose()
    df = df.rename(columns = { 0: 'sentences', 1: 'class', 2: 'predicted'})
    df.to_csv('test_sentences_' + comp + '.csv', encoding = 'utf-8-sig')


EliLilly
              precision    recall  f1-score   support

           0       0.48      1.00      0.65        40
           1       1.00      0.25      0.41        59

    accuracy                           0.56        99
   macro avg       0.74      0.63      0.53        99
weighted avg       0.79      0.56      0.50        99



Merck
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      2248
           1       1.00      0.06      0.11       165

    accuracy                           0.94      2413
   macro avg       0.97      0.53      0.54      2413
weighted avg       0.94      0.94      0.91      2413



BristolMyersSquibb
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1122
           1       0.80      0.06      0.11        66

    accuracy                           0.95      1188
   macro avg       0.87      0.53      0.54      1188
weighted avg       0.94      0.95   

### Printing the Result

In [20]:
df = pd.concat(crLR_reports, keys=map(lambda d: d.index.name, crLR_reports))

In [21]:
df.index.names = [None, None]

In [22]:
df = df.round(2)

In [23]:
df

Unnamed: 0,Unnamed: 1,precision,recall,f1-score,support
EliLilly,0,0.48,0.98,0.64,41.0
EliLilly,1,0.93,0.24,0.38,58.0
EliLilly,accuracy,0.55,0.55,0.55,0.55
EliLilly,macro avg,0.7,0.61,0.51,99.0
EliLilly,weighted avg,0.74,0.55,0.49,99.0
Merck,0,0.94,1.0,0.97,2288.0
Merck,1,0.91,0.06,0.12,160.0
Merck,accuracy,0.94,0.94,0.94,0.94
Merck,macro avg,0.92,0.53,0.54,2448.0
Merck,weighted avg,0.94,0.94,0.91,2448.0


In [24]:
df.to_csv("summarizer_test_result.csv")