# Model validation

In [20]:
from scipy.special import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import torch
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def preprocess(text):
    """Mask user mentions and links with fixed placeholder tokens.

    The text is split on single spaces. A token that starts with ``@`` and is
    longer than one character becomes ``'@user'``; a token that starts with
    ``http`` becomes ``'http'``. All other tokens are kept as they are.

    Args:
        text: The raw input string.

    Returns:
        The tokens re-joined with single spaces, with leading and trailing
        whitespace stripped.
    """
    def _mask(token):
        # A lone '@' is not treated as a mention.
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        if token.startswith('http'):
            token = 'http'
        return token

    return ' '.join(_mask(token) for token in text.split(' ')).strip()

# Checkpoint identifier passed to both `from_pretrained` calls below.
MODEL = 'Cloudy1225/stackoverflow-roberta-base-sentiment'

tokenizer = AutoTokenizer.from_pretrained(MODEL)
# `.to()` returns the module itself, so loading and the device move can be chained.
# Ending the cell on an assignment also keeps it from echoing the model repr.
model = AutoModelForSequenceClassification.from_pretrained(MODEL).to(device)

## Original test dataset

In [4]:
!wget https://raw.githubusercontent.com/collab-uniba/Senti4SD/refs/heads/master/Senti4SD_GoldStandard_and_DSM/Senti4SD_Train_Test_Partitions/test1326itemPOLARITY.csv

--2025-07-05 18:54:43--  https://raw.githubusercontent.com/collab-uniba/Senti4SD/refs/heads/master/Senti4SD_GoldStandard_and_DSM/Senti4SD_Train_Test_Partitions/test1326itemPOLARITY.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 245886 (240K) [text/plain]
Saving to: ‘test1326itemPOLARITY.csv.3’


2025-07-05 18:54:43 (9.04 MB/s) - ‘test1326itemPOLARITY.csv.3’ saved [245886/245886]



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [5]:
import pandas as pd
import numpy as np

# The file is read as ';'-separated with no header row, so pandas numbers the
# columns 0..2; give them descriptive names in the same expression.
df = (
    pd.read_csv('/kaggle/working/test1326itemPOLARITY.csv', sep=';', header=None)
    .rename(columns={0: 'id', 1: 'sentiment', 2: 'text'})
)

def classify_sentiment(tokenizer, model, text):
    """Return the index of the highest-scoring class for a single text.

    Args:
        tokenizer: Callable tokenizer; invoked with ``return_tensors='pt'`` and
            truncation to at most 512 tokens. Must return a mapping of tensors.
        model: Classification model whose first output holds the logits and
            which exposes a ``device`` attribute (as Hugging Face models do).
        text: The text to classify.

    Returns:
        int: argmax over the class scores of the first item in the batch.
    """
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    # Follow the device of the model that was passed in rather than a notebook-level
    # global, so the inputs always land where the weights are.
    encoded_input = {k: v.to(model.device) for k, v in encoded_input.items()}
    # Inference only: skip building the autograd graph for every forward pass.
    with torch.no_grad():
        output = model(**encoded_input)
    scores = output[0][0].detach().cpu().numpy()
    scores = softmax(scores)
    return int(np.argmax(scores))

# Classify each text one at a time and store the predicted class index per row.
df['predicted'] = df['text'].map(lambda text: classify_sentiment(tokenizer, model, text))

Unnamed: 0,id,sentiment,text
0,t4,positive,@DrabJay: excellent suggestion! Code changed. :-)
1,t5,neutral,Any decent browser should protect against mali...
2,t8,negative,I swear - I don't put pseudo code I get told o...
3,t9,neutral,I have attached below
4,t13,negative,When I refactor the following line: using Resh...
...,...,...,...
1321,t4414,neutral,"@yaauie - sure, I could have put a `raise 'oh ..."
1322,t4415,positive,Excellent resource:
1323,t4418,positive,"+1 from me, I loved the leap from MFC to Qt ;)"
1324,t4420,positive,"Works great! And you can add ""desc"" after the ..."


In [17]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Integer class id for each gold-standard sentiment string.
# NOTE(review): presumably matches the model's output index order — confirm against
# the checkpoint's id2label config.
label_map = {'positive': 2, 'neutral': 1, 'negative': 0}

# An unknown sentiment string raises KeyError here instead of being silently dropped.
ground_truth = np.array([label_map[sentiment] for sentiment in df['sentiment']])
predicted = np.array(df['predicted'])

# Pin the class ids explicitly so the report rows and the confusion-matrix axes always
# line up with `target_names`, even if a class is missing from both arrays.
labels = [0, 1, 2]
target_names = ['Negative', 'Neutral', 'Positive']

print("--- Classification Report ---")
report = classification_report(ground_truth, predicted, labels=labels, target_names=target_names)
print(report)

print("\n--- Overall Accuracy ---")
accuracy = accuracy_score(ground_truth, predicted)
print(f"Accuracy Score: {accuracy:.2f}")

print("\n--- Confusion Matrix ---")
# Rows are the actual classes and columns the predicted classes, both in `labels` order.
cm = confusion_matrix(ground_truth, predicted, labels=labels)
print(cm)

--- Classification Report ---
              precision    recall  f1-score   support

    Negative       0.90      0.89      0.89       360
     Neutral       0.88      0.89      0.89       508
    Positive       0.96      0.95      0.95       458

    accuracy                           0.91      1326
   macro avg       0.91      0.91      0.91      1326
weighted avg       0.91      0.91      0.91      1326


--- Overall Accuracy ---
Accuracy Score: 0.91

--- Confusion Matrix ---
[[319  39   2]
 [ 36 454  18]
 [  1  23 434]]


## Our (small) test dataset

In [None]:
import json

# Parse the JSONL file once, then pull the two label fields out of every record.
with open('./data/full-human-labels.jsonl', 'r') as inp:
    records = [json.loads(line) for line in inp]

human = [record['human_label_sentiment'] for record in records]
labelled = [record['labeled_sentiment'] for record in records]

# The human annotations are the reference; the stored labels are what gets scored.
ground_truth = np.array(human)
predicted = np.array(labelled)

# Display names for the report rows.
# NOTE(review): assumes the sorted label values line up with this order — confirm.
target_names = ['Negative', 'Neutral', 'Positive']

# 1. Per-class precision / recall / F1
print("--- Classification Report ---")
report = classification_report(ground_truth, predicted, target_names=target_names)
print(report)

# 2. Overall accuracy
print("\n--- Overall Accuracy ---")
accuracy = accuracy_score(ground_truth, predicted)
print(f"Accuracy Score: {accuracy:.2f}")

# 3. Confusion matrix (rows: actual classes, columns: predicted classes)
print("\n--- Confusion Matrix ---")
cm = confusion_matrix(ground_truth, predicted)
print(cm)

--- Classification Report ---
              precision    recall  f1-score   support

    Negative       0.87      0.94      0.90       111
     Neutral       0.97      0.82      0.89       142
    Positive       0.88      0.98      0.93       107

    accuracy                           0.90       360
   macro avg       0.90      0.91      0.90       360
weighted avg       0.91      0.90      0.90       360


--- Overall Accuracy ---
Accuracy Score: 0.90

--- Confusion Matrix ---
[[104   3   4]
 [ 15 116  11]
 [  1   1 105]]
