In [None]:
from src.data import make_dataset
from transformers import pipeline
from sklearn.metrics import classification_report
import torch
import pandas as pd
import gc

In [None]:
# Device index handed to the Hugging Face pipelines below:
# 0 when CUDA is available, otherwise -1 (CPU).
device = 0 if torch.cuda.is_available() else -1

In [None]:
# Disabled: appears to regenerate the processed train/test CSVs from the raw reviews
# via make_dataset.main (TODO confirm before re-enabling).
# input_file = "../../data/raw/reviews.csv"
# train_output_file = "../../data/processed/train_final_processed_reviews.csv"
# test_output_file = "../../data/processed/test_final_processed_reviews.csv"
# X_train, X_test, y_train, y_test = make_dataset.main(input_file, train_output_file, test_output_file)


In [None]:
# Read the pre-split review sets. The 'Unnamed: 0' column is used as the index
# (presumably the row index written when the CSVs were saved — TODO confirm).
train = pd.read_csv(
    "../../data/processed/train_final_processed_reviews.csv",
    index_col='Unnamed: 0',
)
test = pd.read_csv(
    "../../data/processed/test_final_processed_reviews.csv",
    index_col='Unnamed: 0',
)

# Split each frame into features (everything but 'sentiment') and the target column.
X_train, y_train = train.drop(columns='sentiment'), train['sentiment']
X_test, y_test = test.drop(columns='sentiment'), test['sentiment']

In [None]:
# Preview the first five feature rows of the test split.
X_test.head(n=5)

In [None]:
# Preview the first five target values of the test split.
y_test.head(n=5)

In [None]:
# Cap every review at 512 characters before inference; slicing leaves
# shorter strings unchanged, so no length check is needed.
data = [review[:512] for review in X_test.text.to_list()]

In [None]:
%%time
sentiment_pipeline = pipeline(model = "distilbert-base-uncased-finetuned-sst-2-english", device=device)
results0 = sentiment_pipeline(data)
results0[0:5]

In [None]:
%%time
specific_model = pipeline(model="nlptown/bert-base-multilingual-uncased-sentiment", device=device)
results1 = specific_model(data)
results1[0:5]

CPU times: total: 12.2 s
Wall time: 14.8 s


[{'label': '5 stars', 'score': 0.9336758852005005},
 {'label': '5 stars', 'score': 0.4628945291042328},
 {'label': '2 stars', 'score': 0.555162250995636},
 {'label': '4 stars', 'score': 0.5062201619148254},
 {'label': '4 stars', 'score': 0.6108953952789307}]

In [None]:
%%time
specific_model = pipeline(model="Seethal/sentiment_analysis_generic_dataset", device=device)
results2 = specific_model(data)
results2[0:5]

CPU times: total: 6.03 s
Wall time: 6.92 s


[{'label': 'LABEL_2', 'score': 0.9964131712913513},
 {'label': 'LABEL_2', 'score': 0.9589166045188904},
 {'label': 'LABEL_0', 'score': 0.7314499020576477},
 {'label': 'LABEL_2', 'score': 0.9967833757400513},
 {'label': 'LABEL_2', 'score': 0.9085060358047485}]

In [None]:
%%time
specific_model = pipeline('sentiment-analysis', model="siebert/sentiment-roberta-large-english", device=device)
results3 = specific_model(data)
results3[0:5]

CPU times: total: 19.8 s
Wall time: 19.6 s


[{'label': 'POSITIVE', 'score': 0.9989363551139832},
 {'label': 'POSITIVE', 'score': 0.9988914132118225},
 {'label': 'NEGATIVE', 'score': 0.9995133876800537},
 {'label': 'POSITIVE', 'score': 0.9988318085670471},
 {'label': 'POSITIVE', 'score': 0.9908561706542969}]

# Test Results

In [None]:
# Binary ground truth: 1 for 'positive', 0 for every other sentiment value.
y_true = [int(sentiment == 'positive') for sentiment in y_test]

## Results for distilbert-base-uncased-finetuned-sst-2-english

In [None]:
# Map the predicted labels to 1 ('POSITIVE') / 0 (anything else) and score them against y_true.
labels = [entry['label'] for entry in results0]
y_pred = [int(tag == 'POSITIVE') for tag in labels]
print(classification_report(y_true, y_pred))

NameError: name 'results0' is not defined

## Results for nlptown/bert-base-multilingual-uncased-sentiment

In [None]:
# Collapse the star ratings into a binary label: 3, 4 and 5 stars map to 1, every other label to 0.
# NOTE(review): '3 stars' is counted as positive here — confirm that is intended.
labels = [entry['label'] for entry in results1]
positive_ratings = ('5 stars', '4 stars', '3 stars')
y_pred = [int(rating in positive_ratings) for rating in labels]
print(classification_report(y_true, y_pred))

## Results for Seethal/sentiment_analysis_generic_dataset

In [None]:
# 'LABEL_2' is treated as the positive class; every other label maps to 0.
labels = [entry['label'] for entry in results2]
y_pred = [int(tag == 'LABEL_2') for tag in labels]
# Alternative mapping kept for reference (only 'LABEL_0' maps to 0):
# y_pred = [0 if label=='LABEL_0' else 1 for label in labels]
print(classification_report(y_true, y_pred))

## Results for siebert/sentiment-roberta-large-english

In [None]:
# Map the predicted labels to 1 ('POSITIVE') / 0 (anything else) and score them against y_true.
labels = [entry['label'] for entry in results3]
y_pred = [int(tag == 'POSITIVE') for tag in labels]
print(classification_report(y_true, y_pred))

In [None]:
# Display a bounded preview only: echoing the whole prediction list
# (one dict per test review) floods the notebook output.
results3[:5]

In [None]:
# Drop the references to the loaded pipelines, then run the garbage collector
# and ask PyTorch to release its cached CUDA memory.
sentiment_pipeline = specific_model = None
gc.collect()
torch.cuda.empty_cache()

In [15]:
from src.models.sentiment_analysis.pre_trained.seibert import Seibert

# Score the project's Seibert wrapper on the same test split:
# predict, pull out the `predicted_sentiment` attribute, and report against y_true.
model = Seibert()
seibert_output = model.predict(X_test)
pred = seibert_output.predicted_sentiment
print(classification_report(y_true, pred))

              precision    recall  f1-score   support

           0       0.89      0.92      0.91       283
           1       0.97      0.96      0.97       806

    accuracy                           0.95      1089
   macro avg       0.93      0.94      0.94      1089
weighted avg       0.95      0.95      0.95      1089



In [None]:
# Disabled: evaluation of the BertFineTuned wrapper on the same test split.
# NOTE(review): 'bert_state_dict.pt' is presumably a locally saved checkpoint — confirm it exists before re-enabling.
# from src.models.sentiment_analysis.pre_trained.bert_fine_tuned import BertFineTuned
# model = BertFineTuned('bert_state_dict.pt')
# pred = model.predict(X_test)
# print(classification_report(y_true, pred))