# Hugging Face for sentiment analysis

In [1]:
import warnings

import pandas as pd
import torch
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from transformers import pipeline

warnings.filterwarnings('ignore')

In [2]:
# Load the labelled evaluation tweets. A forward slash is used as the path
# separator because it is accepted on Windows as well as Linux/macOS, whereas
# a hard-coded backslash only resolves on Windows.
df = pd.read_csv('data/test_set_twitter.csv')

In [None]:
def truncate_text(text, max_length):
    """Cap ``text`` at ``max_length`` leading characters.

    Parameters
    ----------
    text : str
        Text to shorten.
    max_length : int
        Largest length the returned text may have.

    Returns
    -------
    str
        ``text`` unchanged when it is not longer than ``max_length``,
        otherwise its first ``max_length`` characters.
    """
    if len(text) > max_length:
        return text[:max_length]
    return text

def classify_text(text, truncate_text, classifier, max_length):
    """Return the highest-scoring label the classifier assigns to ``text``.

    Parameters
    ----------
    text : str or None
        Raw text to classify. Values that are not strings (for example
        ``None`` or a float ``NaN`` standing in for a missing value) are
        treated the same way as blank text.
    truncate_text : callable
        Called as ``truncate_text(text, max_length)`` to shorten the input
        before it is handed to the classifier.
    classifier : callable
        Called with the shortened text. Its return value is indexed with
        ``[0]`` and that element must be an iterable of dicts carrying
        ``'label'`` and ``'score'`` keys.
    max_length : int
        Forwarded to ``truncate_text``.

    Returns
    -------
    str or None
        The ``'label'`` of the entry with the largest ``'score'``, or ``None``
        when ``text`` is not a string or contains only whitespace.
    """
    # Guard before calling .strip(): a non-string value would otherwise raise
    # AttributeError instead of being skipped like an empty string.
    if not isinstance(text, str) or not text.strip():
        return None

    shortened = truncate_text(text, max_length)
    label_scores = classifier(shortened)[0]
    best = max(label_scores, key=lambda entry: entry['score'])
    return best['label']

def classify_sentiments(texts, classifier, max_length=512, true_labels=None, label_mapping=None):
    """Classify every text and optionally score the predictions against gold labels.

    Parameters
    ----------
    texts : iterable
        Texts to classify, one per sample.
    classifier : callable
        Passed through to ``classify_text`` for every sample.
    max_length : int, default 512
        Passed through to ``classify_text`` (and from there to
        ``truncate_text``).
    true_labels : iterable, optional
        Gold labels aligned one-to-one with ``texts`` and encoded with the
        values of ``label_mapping``. When given, a report is produced.
    label_mapping : dict, optional
        Maps the classifier's label strings to the values used in
        ``true_labels``. Defaults to
        ``{'negative': 0, 'neutral': 1, 'positive': 2}``.

    Returns
    -------
    list
        When ``true_labels`` is ``None``: one entry per text holding whatever
        ``classify_text`` returned for it (a label, or ``None``).
    tuple
        When ``true_labels`` is given: ``(results, report)`` where ``report``
        is the value returned by ``classification_report``.

    Raises
    ------
    ValueError
        If ``true_labels`` and ``texts`` do not have the same number of items.
    KeyError
        If a predicted label is not a key of ``label_mapping``.
    """
    results = [
        classify_text(text, truncate_text, classifier, max_length)
        for text in texts
    ]

    if true_labels is None:
        return results

    if label_mapping is None:
        label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}

    gold = list(true_labels)
    # zip() below would silently drop the surplus of the longer input, so
    # reject misaligned inputs explicitly.
    if len(gold) != len(results):
        raise ValueError(
            f"texts and true_labels differ in length: {len(results)} vs {len(gold)}"
        )

    # A None result means classify_text produced no prediction for that text.
    # There is nothing to look up in label_mapping for it, so such samples are
    # left out of the report rather than failing with KeyError(None).
    scored = [
        (truth, label_mapping[label])
        for truth, label in zip(gold, results)
        if label is not None
    ]
    y_true = [truth for truth, _ in scored]
    y_pred = [prediction for _, prediction in scored]

    report = classification_report(y_true, y_pred)
    return results, report

### Hugging Face - model 1: cardiffnlp/twitter-roberta-base-sentiment-latest

In [3]:
# Build the pipeline for the first checkpoint.
sentiment_classifier = pipeline(
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
    # Use the first CUDA GPU when one is present; otherwise fall back to the
    # CPU (-1) so the cell also runs on machines without a GPU.
    device=0 if torch.cuda.is_available() else -1,
    # Ask for the score of every label on each call; classify_text picks the
    # highest-scoring entry out of that list.
    return_all_scores=True
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Classify every row of df['content'] with model 1 and compare the
# predictions against df['label'].
classification_results, classification_report_output = classify_sentiments(
    df['content'],
    sentiment_classifier,
    true_labels=df['label'],
)

print("Classification Report:\n", classification_report_output)


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Classification Report:
               precision    recall  f1-score   support

         0.0       0.52      0.66      0.58       667
         1.0       0.47      0.62      0.53       667
         2.0       0.70      0.29      0.41       667

    accuracy                           0.52      2001
   macro avg       0.56      0.52      0.51      2001
weighted avg       0.56      0.52      0.51      2001



### Hugging Face - model 2: lxyuan/distilbert-base-multilingual-cased-sentiments-student

In [6]:
# Build the pipeline for the second checkpoint.
distilbert_sentiment_classifier = pipeline(
    model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    # Use the first CUDA GPU when one is present; otherwise fall back to the
    # CPU (-1) so the cell also runs on machines without a GPU.
    device=0 if torch.cuda.is_available() else -1,
    # Ask for the score of every label on each call; classify_text picks the
    # highest-scoring entry out of that list.
    return_all_scores=True
)

In [7]:
# Classify every row of df['content'] with model 2 and compare the
# predictions against df['label'].
classification_results, classification_report_output = classify_sentiments(
    df['content'],
    distilbert_sentiment_classifier,
    true_labels=df['label'],
)

print("Classification Report:\n", classification_report_output)


Classification Report:
               precision    recall  f1-score   support

         0.0       0.45      0.87      0.59       667
         1.0       0.53      0.04      0.07       667
         2.0       0.48      0.48      0.48       667

    accuracy                           0.46      2001
   macro avg       0.49      0.46      0.38      2001
weighted avg       0.49      0.46      0.38      2001

