# Accuracy Evaluation

## Garden Tech

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

!pip install transformers
from transformers import pipeline

import warnings
warnings.filterwarnings("ignore")

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

pd.set_option('display.max_columns', None) 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 5.2 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 39.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 40.0 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.24.0


In [2]:
from google.colab import files
uploaded = files.upload()

Saving GardenTech_handlabeled.csv to GardenTech_handlabeled.csv


In [3]:
df = pd.read_csv('GardenTech_handlabeled.csv', index_col = [0])

In [4]:
#replace NaN in label columns with 0
columns = ['cost','ease_of_use', 'effective', 'efficient']

for column in columns:
    df[column] = df[column].replace(np.nan, int(0))

## Sentiment Analysis

In [5]:
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest")

Downloading:   0%|          | 0.00/929 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [6]:
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest")

sentiment = [None] * len(df)
index = -1
for sentence in df['review_lower'][0:99]:
    index+=1
    if(index%20 == 0):
        print(index)

    result = sentiment_pipeline(sentence[:512])[0]
    sentiment[index] = result['label']
df['sentiment_m'] = sentiment

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0
20
40
60
80


## Zero-Shot Classifier

In [7]:
classifier  = pipeline("zero-shot-classification",  model = "facebook/bart-large-mnli")

Downloading:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [8]:
# insert the labels you identified from above section
type = ['cost', 'efficient', 'effective', 'ease_of_use']

In [9]:
#setting empty values for the columns
index = -1
for label in type:
    df[label + '_m'] = -1

for j in range(99):
    #counter for progress/debugging
    index+=1
    if(index%20 == 0): 
        print(index)
        
    #running the classifier on the column    
    res = classifier(
        df.iloc[j]['review_lower'],
        candidate_labels = type,
        multi_label = True
    )
    #setting the column values according to the output from the classifier ("_m" = multiclass)
    for i in range(len(res['labels'])):
        df[res['labels'][i]+ '_m'].iloc[j] = res['scores'][i]

0
20
40
60
80


## Sentiment Evaluation

In [10]:
def imputation1(list):
    i = []
    for prediction in list:
        if prediction == "Positive":
            i.append(1)
        elif prediction == "Neutral":
            i.append(0)
        else:
            i.append(-1)    
    return(i)

In [11]:
df['sentiment'] = imputation1(df['sentiment'])
df['sentiment_m'] = imputation1(df['sentiment_m'])

In [12]:
from sklearn.metrics import classification_report

In [13]:
print(classification_report(df['sentiment'], df['sentiment_m']))

              precision    recall  f1-score   support

          -1       1.00      0.80      0.89       400
           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0

    accuracy                           0.80       400
   macro avg       0.33      0.27      0.30       400
weighted avg       1.00      0.80      0.89       400



## Zero-Shot Evaluation

In [14]:
def imputation(list):
    i = []
    for prediction in list:
        if prediction < 0.7:
            i.append(0)
        else:
            i.append(1)
    
    return(i)

In [15]:
df['cost_m'] = imputation(df['cost_m'])
df['efficient_m'] = imputation(df['efficient_m'])
df['effective_m'] = imputation(df['effective_m'])
df['ease_of_use_m'] = imputation(df['ease_of_use_m'])

In [16]:
print(classification_report(df['cost'], df['cost_m']))

              precision    recall  f1-score   support

         0.0       0.98      0.94      0.96       388
         1.0       0.11      0.25      0.15        12

    accuracy                           0.92       400
   macro avg       0.54      0.59      0.56       400
weighted avg       0.95      0.92      0.93       400



In [17]:
from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(df['cost'], df['cost_m'])
print('ROC AUC : {:.4f}'.format(ROC_AUC))

ROC AUC : 0.5941


In [18]:
print(classification_report(df['efficient'], df['efficient_m']))

              precision    recall  f1-score   support

         0.0       0.86      0.80      0.83       339
         1.0       0.20      0.28      0.23        61

    accuracy                           0.72       400
   macro avg       0.53      0.54      0.53       400
weighted avg       0.76      0.72      0.74       400



In [19]:
from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(df['efficient'], df['efficient_m'])
print('ROC AUC : {:.4f}'.format(ROC_AUC))

ROC AUC : 0.5376


In [20]:
print(classification_report(df['effective'], df['effective_m']))

              precision    recall  f1-score   support

         0.0       0.08      0.80      0.14        30
         1.0       0.93      0.22      0.36       370

    accuracy                           0.27       400
   macro avg       0.50      0.51      0.25       400
weighted avg       0.87      0.27      0.35       400



In [21]:
from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(df['effective'], df['effective_m'])
print('ROC AUC : {:.4f}'.format(ROC_AUC))

ROC AUC : 0.5122


In [22]:
print(classification_report(df['ease_of_use'], df['ease_of_use_m']))

              precision    recall  f1-score   support

         0.0       0.78      0.86      0.81       299
         1.0       0.39      0.27      0.32       101

    accuracy                           0.71       400
   macro avg       0.58      0.56      0.56       400
weighted avg       0.68      0.71      0.69       400



In [23]:
from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(df['ease_of_use'], df['ease_of_use_m'])
print('ROC AUC : {:.4f}'.format(ROC_AUC))

ROC AUC : 0.5618


## Compare Zero-shot against a Naive Classifier

In [24]:
def naive(list):
    avg  = np.mean(list)

    if avg >= 0.5:
        i = 1
    else:
        i = 0

    return([i] * len(list))

In [26]:
df['naive_cost'] = naive(df['cost'])
df['naive_efficient'] = naive(df['efficient'])
df['naive_effective'] = naive(df['effective'])
df['naive_easeofuse'] = naive(df['ease_of_use'])

In [27]:
print(classification_report(df['cost'], df['naive_cost']))

              precision    recall  f1-score   support

         0.0       0.97      1.00      0.98       388
         1.0       0.00      0.00      0.00        12

    accuracy                           0.97       400
   macro avg       0.48      0.50      0.49       400
weighted avg       0.94      0.97      0.96       400



In [28]:
print(classification_report(df['efficient'], df['naive_efficient']))

              precision    recall  f1-score   support

         0.0       0.85      1.00      0.92       339
         1.0       0.00      0.00      0.00        61

    accuracy                           0.85       400
   macro avg       0.42      0.50      0.46       400
weighted avg       0.72      0.85      0.78       400



In [29]:
print(classification_report(df['effective'], df['naive_effective']))

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        30
         1.0       0.93      1.00      0.96       370

    accuracy                           0.93       400
   macro avg       0.46      0.50      0.48       400
weighted avg       0.86      0.93      0.89       400



In [30]:
print(classification_report(df['ease_of_use'], df['naive_easeofuse']))

              precision    recall  f1-score   support

         0.0       0.75      1.00      0.86       299
         1.0       0.00      0.00      0.00       101

    accuracy                           0.75       400
   macro avg       0.37      0.50      0.43       400
weighted avg       0.56      0.75      0.64       400



In [33]:
df.to_csv('GardenTech_evaluation.csv')

In [34]:
from google.colab import files
files.download("GardenTech_evaluation.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>