In [9]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from transformers import pipeline

import warnings
warnings.filterwarnings("ignore")

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

pd.set_option('display.max_columns', None) 

In [10]:
df = pd.read_csv('Wonjoo/PenningtonFertilizer.csv', index_col = [0])

In [11]:
#replace NaN in label columns with 0
columns = ['cost','ease of use', 'effective', 'efficient']

for column in columns:
    df[column] = df[column].replace(np.nan, int(0))

In [12]:
classifier  = pipeline("zero-shot-classification",  model = "facebook/bart-large-mnli")

In [13]:
# insert the labels you identified from above section
type = ['cost', 'efficient', 'effective', 'ease of use']

In [14]:
#setting empty values for the columns
index = -1
for label in type:
    df[label + '_m'] = -1

for j in range(99):
    #counter for progress/debugging
    index+=1
    if(index%20 == 0): 
        print(index)
        
    #running the classifier on the column    
    res = classifier(
        df.iloc[j]['review_lower'],
        candidate_labels = type,
        multi_label = True
    )
    #setting the column values according to the output from the classifier ("_m" = multiclass)
    for i in range(len(res['labels'])):
        df[res['labels'][i]+ '_m'].iloc[j] = res['scores'][i]

0
20
40
60
80


In [15]:
from sklearn.metrics import classification_report

In [30]:
def imputation(list):
    i = []
    for prediction in list:
        if prediction < 0.7:
            i.append(0)
        else:
            i.append(1)
    
    return(i)

In [32]:
df['cost_m'] = imputation(df['cost_m'])
df['efficient_m'] = imputation(df['efficient_m'])
df['effective_m'] = imputation(df['effective_m'])
df['ease of use_m'] = imputation(df['ease of use_m'])

In [70]:
print(classification_report(df['cost'], df['cost_m']))

              precision    recall  f1-score   support

         0.0       1.00      0.97      0.99       395
         1.0       0.31      1.00      0.48         5

    accuracy                           0.97       400
   macro avg       0.66      0.99      0.73       400
weighted avg       0.99      0.97      0.98       400



In [53]:
from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(df['cost'], df['cost_m'])
print('ROC AUC : {:.4f}'.format(ROC_AUC))

ROC AUC : 0.9861


In [71]:
print(classification_report(df['efficient'], df['efficient_m']))

              precision    recall  f1-score   support

         0.0       0.98      0.86      0.92       375
         1.0       0.28      0.80      0.41        25

    accuracy                           0.86       400
   macro avg       0.63      0.83      0.67       400
weighted avg       0.94      0.86      0.89       400



In [54]:
from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(df['efficient'], df['efficient_m'])
print('ROC AUC : {:.4f}'.format(ROC_AUC))

ROC AUC : 0.8307


In [72]:
print(classification_report(df['effective'], df['effective_m']))

              precision    recall  f1-score   support

         0.0       0.96      0.96      0.96       316
         1.0       0.85      0.83      0.84        84

    accuracy                           0.94       400
   macro avg       0.90      0.90      0.90       400
weighted avg       0.93      0.94      0.93       400



In [55]:
from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(df['effective'], df['effective_m'])
print('ROC AUC : {:.4f}'.format(ROC_AUC))

ROC AUC : 0.8977


In [73]:
print(classification_report(df['ease of use'], df['ease of use_m']))

              precision    recall  f1-score   support

         0.0       0.97      0.95      0.96       376
         1.0       0.41      0.58      0.48        24

    accuracy                           0.93       400
   macro avg       0.69      0.77      0.72       400
weighted avg       0.94      0.93      0.93       400



In [56]:
from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(df['ease of use'], df['ease of use_m'])
print('ROC AUC : {:.4f}'.format(ROC_AUC))

ROC AUC : 0.7651


In [68]:
#df.to_csv('testing_evaluation.csv')

In [59]:
#df2 = pd.read_csv('testing_evaluation.csv', index_col = [0])

In [60]:
#evaluation(df2['cost'], df2['naive_cost'])

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       395
           1       0.00      0.00      0.00         5

    accuracy                           0.99       400
   macro avg       0.49      0.50      0.50       400
weighted avg       0.98      0.99      0.98       400



In [61]:
# from sklearn.metrics import roc_auc_score
# ROC_AUC = roc_auc_score(df2['cost'], df2['naive_cost'])
# print('ROC AUC : {:.4f}'.format(ROC_AUC))

ROC AUC : 0.5000


In [62]:
#evaluation(df2['efficient'], df2['naive_efficient'])

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       375
           1       0.00      0.00      0.00        25

    accuracy                           0.94       400
   macro avg       0.47      0.50      0.48       400
weighted avg       0.88      0.94      0.91       400



In [63]:
# from sklearn.metrics import roc_auc_score
# ROC_AUC = roc_auc_score(df2['efficient'], df2['naive_efficient'])
# print('ROC AUC : {:.4f}'.format(ROC_AUC))

ROC AUC : 0.5000


In [64]:
#evaluation(df2['effective'], df2['naive_effective'])

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       316
           1       0.21      1.00      0.35        84

    accuracy                           0.21       400
   macro avg       0.10      0.50      0.17       400
weighted avg       0.04      0.21      0.07       400



In [65]:
# from sklearn.metrics import roc_auc_score
# ROC_AUC = roc_auc_score(df2['effective'], df2['naive_effective'])
# print('ROC AUC : {:.4f}'.format(ROC_AUC))

ROC AUC : 0.5000


In [66]:
#evaluation(df2['ease of use'], df2['naive_easeofuse'])

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       376
           1       0.00      0.00      0.00        24

    accuracy                           0.94       400
   macro avg       0.47      0.50      0.48       400
weighted avg       0.88      0.94      0.91       400



In [67]:
# from sklearn.metrics import roc_auc_score
# ROC_AUC = roc_auc_score(df2['ease of use'], df2['naive_easeofuse'])
# print('ROC AUC : {:.4f}'.format(ROC_AUC))

ROC AUC : 0.5000
