### List of Companies to Test

In [17]:
import pandas as pd

from sklearn.model_selection import train_test_split

from xml.sax.handler import feature_namespace_prefixes
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report

In [4]:
comp_list = ["EliLilly", "Merck", "BristolMyersSquibb", "johnsonandjohnson", "Abbott", "Boeing",
             "UPS", "3M", "Walmart", "Tesla"]

### Train Test Split

In [5]:
# pull in the csv data
irrelevant = pd.read_csv("../relevant_irrelevant_sentences_labeled/extracted_irrelevant_sentences.csv")
relevant = pd.read_csv("../relevant_irrelevant_sentences_labeled/extracted_relevant_sentences.csv")

print("total_irrelevant:", len(irrelevant))
print("total_relevant:", len(relevant))
irrelevant["class"] = 0
relevant["class"] = 1

irrelevant = irrelevant[['relevant_sentences', 'company_label', 'class']]
relevant = relevant[['relevant_sentences', 'company_label', 'class']]
relevant.sample(5)

total_irrelevant: 78258
total_relevant: 891


Unnamed: 0,relevant_sentences,company_label,class
609,While our combined Scope 1 and 2 emissions dec...,Microsoft,1
333,We also implemented a more accurate method for...,Shell,1
235,It halved its operated methane emissions betwe...,Total,1
143,13.5 13.0 2009 2010 2011 2012 2013 2014 2015 2...,Delta,1
232,"All In Europe, electricity will be provided by...",Total,1


In [11]:
for comp_name in comp_list:
    comp_irrelevant = irrelevant[irrelevant['company_label'] == comp_name]
    comp_relevant = relevant[relevant['company_label'] == comp_name]
    
    rest_irrelevant = irrelevant[irrelevant['company_label'] != comp_name]
    rest_relevant = relevant[relevant['company_label'] != comp_name]
    
    print(comp_name, "\n", len(comp_irrelevant), len(comp_relevant),len(rest_irrelevant), len(rest_relevant))
    
    comp_sample_irr = comp_irrelevant.sample(n = len(comp_relevant), random_state = 1)
    rest_sample_irr = rest_irrelevant.sample(n = len(rest_relevant), random_state = 1)
    
    comp_balanced_set = pd.concat([comp_relevant, comp_sample_irr], ignore_index = True)
    rest_balanced_set = pd.concat([rest_relevant, rest_sample_irr], ignore_index = True)
    
    # Train Test Split on comp_balanced_set
    X_train, X_test, y_train, y_test = train_test_split(comp_balanced_set['relevant_sentences'], 
                                                        comp_balanced_set['class'], test_size=0.1, random_state=100)

    dfbalanced = pd.concat([comp_balanced_set], ignore_index=True)
    dfbalanced.to_csv(comp_name + '_comp_balanced_data.csv', encoding = 'utf-8-sig')

    dfbalanced_train = pd.concat([X_train.to_frame(), y_train.to_frame()], axis = 1, ignore_index = True)
    dfbalanced_train.columns = ['sentences', 'class']
    dfbalanced_train.to_csv(comp_name + '_comp_balanced_data_train.csv', encoding = 'utf-8-sig')

    dfbalanced_test = pd.concat([X_test.to_frame(), y_test.to_frame()], axis = 1, ignore_index = True)
    dfbalanced_test.columns = ['sentences', 'class']
    dfbalanced_test.to_csv(comp_name + '_comp_balanced_data_test.csv', encoding = 'utf-8-sig')
    
    # Train Test Split on rest_balanced_set
    X_train, X_test, y_train, y_test = train_test_split(rest_balanced_set['relevant_sentences'], 
                                                        rest_balanced_set['class'], test_size=0.1, random_state=100)

    dfbalanced = pd.concat([rest_balanced_set], ignore_index=True)
    dfbalanced.to_csv(comp_name + '_rest_balanced_data.csv', encoding = 'utf-8-sig')

    dfbalanced_train = pd.concat([X_train.to_frame(), y_train.to_frame()], axis = 1, ignore_index = True)
    dfbalanced_train.columns = ['sentences', 'class']
    dfbalanced_train.to_csv(comp_name + '_rest_balanced_data_train.csv', encoding = 'utf-8-sig')

    dfbalanced_test = pd.concat([X_test.to_frame(), y_test.to_frame()], axis = 1, ignore_index = True)
    dfbalanced_test.columns = ['sentences', 'class']
    dfbalanced_test.to_csv(comp_name + '_rest_balanced_data_test.csv', encoding = 'utf-8-sig')

EliLilly 
 84 15 78174 876
Merck 
 2437 11 75821 880
BristolMyersSquibb 
 1185 4 77073 887
johnsonandjohnson 
 2056 13 76202 878
Abbott 
 2123 8 76135 883
Boeing 
 1158 7 77100 884
UPS 
 75 11 78183 880
3M 
 3272 8 74986 883
Walmart 
 588 3 77670 888
Tesla 
 1396 3 76862 888


### Logistics Regression Classifier

In [12]:
comp_list = ["EliLilly", "Merck", "BristolMyersSquibb", "johnsonandjohnson", "Abbott", "Boeing",
             "UPS", "3M", "Walmart", "Tesla"]

In [13]:
pd.read_csv('EliLilly_rest_balanced_data.csv')

Unnamed: 0.1,Unnamed: 0,relevant_sentences,company_label,class
0,0,These projects will address approximately 35 p...,Merck,1
1,1,"These agreements follow a 2018 U.S. wind VPPA,...",Merck,1
2,2,Our company recently signed three virtual powe...,Merck,1
3,3,Over 50 percent of the vehicles being utilized...,Merck,1
4,4,Approximately nine percent of our total Scope ...,Merck,1
...,...,...,...,...
1747,1747,"2 Reported operating income grew 35%, declin...",CocaCola,0
1748,1748,Regular Training and Reviews Providing ongoin...,Mondelez_Intl,0
1749,1749,"After a rigorous review and vetting process, ...",ThermoFisherScientifiic,0
1750,1750,We collect genetic and genomic samples in our ...,Merck,0


In [79]:
# Train on rest data and fit on comp data 
crLR_reports = []

for comp in comp_list: 
    train = pd.read_csv(comp + "_rest_balanced_data_train.csv")
    test = pd.read_csv(comp + "_rest_balanced_data_test.csv")
    
    train.drop(columns = ['Unnamed: 0'])
    test.drop(columns = ['Unnamed: 0'])
    
    X_train = train['sentences']
    X_test = test['sentences']

    y_train = train['class']
    y_test = test['class']
    
    comp_X_test = pd.read_csv(comp + "_comp_balanced_data.csv")['relevant_sentences']
    comp_Y_test = pd.read_csv(comp + "_comp_balanced_data.csv")['class']
    
    
    vectorizer_tfidf = TfidfVectorizer(stop_words='english', max_df=0.7)

    train_tfIdf = vectorizer_tfidf.fit_transform(X_train.values.astype('U'))

    test_tfIdf_comp = vectorizer_tfidf.transform(comp_X_test.values.astype('U'))
    
    # logistic regression prediction and evaluation
    from sklearn.linear_model import LogisticRegression

    lr = LogisticRegression()
    lr.fit(train_tfIdf, y_train)


    predLR =  lr.predict(test_tfIdf_comp)

    crLR = classification_report(predLR, comp_Y_test)
    print(comp)
    print(crLR)
    print("\n")
    crLR = classification_report(predLR, comp_Y_test, output_dict=True)
    
    df = pd.DataFrame(crLR).transpose()
    df.index.name = comp
    crLR_reports.append(df)
    
    df = pd.DataFrame([comp_X_test.to_list(), comp_Y_test.to_list(), list(predLR)]).transpose()
    df = df.rename(columns = { 0: 'sentences', 1: 'class', 2: 'predicted'})
    df.to_csv('test_sentences_' + comp + '.csv', encoding = 'utf-8-sig')


EliLilly
              precision    recall  f1-score   support

           0       0.60      0.90      0.72        10
           1       0.93      0.70      0.80        20

    accuracy                           0.77        30
   macro avg       0.77      0.80      0.76        30
weighted avg       0.82      0.77      0.77        30



Merck
              precision    recall  f1-score   support

           0       1.00      0.92      0.96        12
           1       0.91      1.00      0.95        10

    accuracy                           0.95        22
   macro avg       0.95      0.96      0.95        22
weighted avg       0.96      0.95      0.95        22



BristolMyersSquibb
              precision    recall  f1-score   support

           0       1.00      0.67      0.80         6
           1       0.50      1.00      0.67         2

    accuracy                           0.75         8
   macro avg       0.75      0.83      0.73         8
weighted avg       0.88      0.75   

### Printing the Result

In [97]:
df = pd.concat(crLR_reports, keys=map(lambda d: d.index.name, crLR_reports))

In [102]:
df.index.names = [None, None]

In [104]:
df = df.round(2)

In [105]:
df

Unnamed: 0,Unnamed: 1,precision,recall,f1-score,support
EliLilly,0,0.6,0.9,0.72,10.0
EliLilly,1,0.93,0.7,0.8,20.0
EliLilly,accuracy,0.77,0.77,0.77,0.77
EliLilly,macro avg,0.77,0.8,0.76,30.0
EliLilly,weighted avg,0.82,0.77,0.77,30.0
Merck,0,1.0,0.92,0.96,12.0
Merck,1,0.91,1.0,0.95,10.0
Merck,accuracy,0.95,0.95,0.95,0.95
Merck,macro avg,0.95,0.96,0.95,22.0
Merck,weighted avg,0.96,0.95,0.95,22.0


In [106]:
df.to_csv("summarizer_test_result.csv")