# Benchmark Model
We will be using a multilingual sentiment analysis from tabularisai
https://huggingface.co/tabularisai/multilingual-sentiment-analysis

In [43]:
import pandas as pd
from transformers import pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


### Testing imports and MSA model

In [44]:
pipe = pipeline("text-classification", model="tabularisai/multilingual-sentiment-analysis")

result = pipe("I really love using Hugging Face transformers!")
print(result)

Device set to use mps:0


[{'label': 'Very Positive', 'score': 0.47680380940437317}]


## Testing MSA model on Negative Amazon Review

In [45]:
pipe = pipeline("text-classification", model="tabularisai/multilingual-sentiment-analysis", top_k=None)

result = pipe("Poor Quality,This monitor worked well for the first three months. It became erratic so I changed the battery, it woked perfectly for two days, then stopped woking again. Not worth the money.")
print(result)


Device set to use mps:0


[[{'label': 'Very Negative', 'score': 0.3376656174659729}, {'label': 'Negative', 'score': 0.2424277365207672}, {'label': 'Neutral', 'score': 0.18288999795913696}, {'label': 'Positive', 'score': 0.13345637917518616}, {'label': 'Very Positive', 'score': 0.10356023907661438}]]


## Collapsing MSA Multi Category response into binary classification

In [46]:

pipe = pipeline("text-classification", model="tabularisai/multilingual-sentiment-analysis", top_k=None)

output  = pipe("Poor Quality,This monitor worked well for the first three months. It became erratic so I changed the battery, it woked perfectly for two days, then stopped woking again. Not worth the money.")[0]

scores = {entry['label']: entry['score'] for entry in output}

negative_score = scores['Very Negative'] + scores['Negative']
positive_score = scores['Very Positive'] + scores['Positive']

print("Negative Score", negative_score)
print("Positive Score", positive_score)


binary_label = "Negative" if negative_score > positive_score else "Positive"
confidence = max(negative_score, positive_score)

print("Original scores:", scores)
print("Collapsed binary sentiment:", binary_label)
print("Confidence:", confidence)

Device set to use mps:0


Negative Score 0.5800933539867401
Positive Score 0.23701661825180054
Original scores: {'Very Negative': 0.3376656174659729, 'Negative': 0.2424277365207672, 'Neutral': 0.18288999795913696, 'Positive': 0.13345637917518616, 'Very Positive': 0.10356023907661438}
Collapsed binary sentiment: Negative
Confidence: 0.5800933539867401


## Testing MSA on 1k data

In [47]:
df = pd.read_csv("test_1K_with_headers.csv")

pipe = pipeline("text-classification", model="tabularisai/multilingual-sentiment-analysis")

df['sentiment'] = df['review'].apply(lambda x: pipe(x)[0]['label'])
df['confidence'] = df['review'].apply(lambda x: pipe(x)[0]['score'])

df.to_csv("data_with_multiclass_sentiment.csv", index=False)
df

Device set to use mps:0


Unnamed: 0,label,review,review_title,sentiment,confidence
0,1,"So where's the ""fun""?",What kind of a cheap @$$ game is this?! It's n...,Very Negative,0.318136
1,1,product quality,"it is impossible to cram it down any drain, I ...",Neutral,0.398749
2,1,Not quite as advertised,"If you are expecting this to actually be a ""La...",Negative,0.637007
3,1,Avoid,"i actually gave this zero stars, but amazon fo...",Very Positive,0.366753
4,1,Another 'sound-alike' interpretation,This is as uninspiring and insipid as it can g...,Negative,0.398311
...,...,...,...,...,...
995,2,Not as delicious as some other Trollops',A humorous and wise exploration of human value...,Negative,0.460323
996,2,One of my favorites,"This is a fabulous book. If you like ""The Hous...",Very Positive,0.362985
997,2,The Good ol Boys have returned,"I saw this relased, and of course it is a must...",Neutral,0.263414
998,2,great stufff,I originally ordered this from my dentist for ...,Very Positive,0.488412


## Testing MSA on 1k data binary collapse

In [48]:

df = pd.read_csv("test_1K_with_headers.csv")

pipe = pipeline("text-classification", model="tabularisai/multilingual-sentiment-analysis", top_k=None)

def collapse_sentiment(text):
    try:
        results = pipe(text)[0]
        scores = {r['label']: r['score'] for r in results}
        neg = scores.get('Very Negative', 0) + scores.get('Negative', 0)
        pos = scores.get('Very Positive', 0) + scores.get('Positive', 0)
        label = "Negative" if neg > pos else "Positive"
        confidence = max(neg, pos)
        label_num = 1 if label == "Negative" else 2
        return pd.Series([label, confidence, label_num])
    except Exception as e:
        print(f"Error on text: {text} -> {e}")
        return pd.Series(["Unknown", 0, 0])

df[['binary_sentiment', 'binary_confidence', 'sentiment_code']] = df['review'].apply(collapse_sentiment)
df.to_csv("data_with_binary_sentiment.csv", index=False)

df

Device set to use mps:0


Unnamed: 0,label,review,review_title,binary_sentiment,binary_confidence,sentiment_code
0,1,"So where's the ""fun""?",What kind of a cheap @$$ game is this?! It's n...,Negative,0.583087,1
1,1,product quality,"it is impossible to cram it down any drain, I ...",Positive,0.453767,2
2,1,Not quite as advertised,"If you are expecting this to actually be a ""La...",Negative,0.714688,1
3,1,Avoid,"i actually gave this zero stars, but amazon fo...",Positive,0.671075,2
4,1,Another 'sound-alike' interpretation,This is as uninspiring and insipid as it can g...,Negative,0.573924,1
...,...,...,...,...,...,...
995,2,Not as delicious as some other Trollops',A humorous and wise exploration of human value...,Negative,0.509718,1
996,2,One of my favorites,"This is a fabulous book. If you like ""The Hous...",Positive,0.647358,2
997,2,The Good ol Boys have returned,"I saw this relased, and of course it is a must...",Positive,0.454918,2
998,2,great stufff,I originally ordered this from my dentist for ...,Positive,0.868806,2


## Testing MSA on 100k data binary collapse

In [49]:
df = pd.read_csv("test_100K_with_headers.csv")

pipe = pipeline("text-classification", model="tabularisai/multilingual-sentiment-analysis", top_k=None)

def collapse_sentiment(text):
    try:
        results = pipe(text)[0]
        scores = {r['label']: r['score'] for r in results}
        neg = scores.get('Very Negative', 0) + scores.get('Negative', 0)
        pos = scores.get('Very Positive', 0) + scores.get('Positive', 0)
        label = "Negative" if neg > pos else "Positive"
        confidence = max(neg, pos)
        label_num = 1 if label == "Negative" else 2
        return pd.Series([label, confidence, label_num])
    except Exception as e:
        print(f"Error on text: {text} -> {e}")
        return pd.Series(["Unknown", 0, 0])

df[['binary_sentiment', 'binary_confidence', 'sentiment_code']] = df['review'].apply(collapse_sentiment)
df.to_csv("data_100k_with_binary_sentiment.csv", index=False)

df

Device set to use mps:0


Error on text: nan -> text input must be of type `str` (single example), `list[str]` (batch or single pretokenized example) or `list[list[str]]` (batch of pretokenized examples).
Error on text: nan -> text input must be of type `str` (single example), `list[str]` (batch or single pretokenized example) or `list[list[str]]` (batch of pretokenized examples).


Unnamed: 0,label,review,review_title,binary_sentiment,binary_confidence,sentiment_code
0,1,What a disappointment!,John Grisham should stop churning out a book a...,Negative,0.878810,1
1,1,Wire pokes out,I am a full-figured woman and the underwire on...,Negative,0.770555,1
2,1,Don't be fooled by the low price; there's alwa...,The price for this product seems like a good d...,Positive,0.672777,2
3,1,"Wish My Alarm Clock Could Talk, Too!",You know from the moment the talking alarm clo...,Negative,0.622198,1
4,1,Not that swinging!,The Cd offers some nice melodies and some that...,Negative,0.810849,1
...,...,...,...,...,...,...
99995,2,perfect,I am never traveling without this again. great...,Positive,0.930244,2
99996,2,BACK IN THE DAY WITH THE WHA,REBEL LEAGUE BY ED WILLES IS A GREAT READ AND ...,Positive,0.483237,2
99997,2,"Perfect for PreK, not great for Kindergarten",This was on an amazing sale on Amazon (thanks!...,Positive,0.310754,2
99998,2,a great example of the struggle for zionism in...,"ok, the book may focus on the relationship bet...",Positive,0.573181,2


# Benchmarking

## MSA

In [50]:
df = pd.read_csv("data_with_binary_sentiment.csv")
y_true = df['label']
y_pred = df['sentiment_code']

print("Accuracy:", accuracy_score(y_true, y_pred))
print("Precision:", precision_score(y_true, y_pred, pos_label=2))
print("Recall:", recall_score(y_true, y_pred, pos_label=2))
print("F1 Score:", f1_score(y_true, y_pred, pos_label=2))

print(classification_report(y_true, y_pred, target_names=['Negative','Positive']))

Accuracy: 0.722
Precision: 0.7142857142857143
Recall: 0.74
F1 Score: 0.7269155206286837
              precision    recall  f1-score   support

    Negative       0.73      0.70      0.72       500
    Positive       0.71      0.74      0.73       500

    accuracy                           0.72      1000
   macro avg       0.72      0.72      0.72      1000
weighted avg       0.72      0.72      0.72      1000



In [51]:
df = pd.read_csv("data_100k_with_binary_sentiment.csv")
df = df[df['sentiment_code'].isin([1, 2])]
y_true = df['label']
y_pred = df['sentiment_code']

print("Accuracy:", accuracy_score(y_true, y_pred))
print("Precision:", precision_score(y_true, y_pred, pos_label=2))
print("Recall:", recall_score(y_true, y_pred, pos_label=2))
print("F1 Score:", f1_score(y_true, y_pred, pos_label=2))

print(classification_report(y_true, y_pred, target_names=['Negative','Positive']))

Accuracy: 0.7410348206964139
Precision: 0.7271681966409681
Recall: 0.7715554311086221
F1 Score: 0.7487045123726347
              precision    recall  f1-score   support

    Negative       0.76      0.71      0.73     49999
    Positive       0.73      0.77      0.75     49999

    accuracy                           0.74     99998
   macro avg       0.74      0.74      0.74     99998
weighted avg       0.74      0.74      0.74     99998

