In [1]:
import os

import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification

In [3]:
# Location of the saved checkpoint; both the tokenizer and the model are loaded from it.
model_path = "../Roberta/best_roberta_finetuned"

tokenizer = RobertaTokenizer.from_pretrained(model_path)

# .eval() returns the module itself, so loading and switching to evaluation
# mode can be chained into a single assignment.
model = RobertaForSequenceClassification.from_pretrained(model_path).eval()

# Trailing expression so the cell still displays the model architecture.
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [6]:
data_path = "../Data for analysis/data_for_further_analysis.csv"
df = pd.read_csv(data_path)

# Build the prediction input column "post".
# 'clean_text' is mandatory; 'title' is prepended (space-separated) only when
# that column is present. Missing values are treated as empty strings.
if "clean_text" not in df.columns:
    raise ValueError("Expected columns 'title' and/or 'clean_text' not found in dataset.")
elif "title" in df.columns:
    df["post"] = df["title"].fillna("") + " " + df["clean_text"].fillna("")
else:
    df["post"] = df["clean_text"].fillna("")

texts = df["post"].tolist()
print(f"✅ Loaded {len(texts)} posts for prediction.")

✅ Loaded 95250 posts for prediction.


In [8]:
import torch
import numpy as np
from tqdm import tqdm
from transformers import RobertaTokenizer, RobertaForSequenceClassification
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Move the model's weights onto the selected device.
model.to(device)

# --- GoEmotions label set ---
# A label's position in this list is the class id used to look it up from a
# predicted index, so the order must not be changed.
goemotions_labels = [
    # ids 0-6
    "admiration", "amusement", "anger", "annoyance", "approval", "caring", "confusion",
    # ids 7-13
    "curiosity", "desire", "disappointment", "disapproval", "disgust", "embarrassment", "excitement",
    # ids 14-20
    "fear", "gratitude", "grief", "joy", "love", "nervousness", "optimism",
    # ids 21-27
    "pride", "realization", "relief", "remorse", "sadness", "surprise", "neutral",
]

# --- Prediction function (returns probs + labels) ---
def predict_with_probs(text_list, model, tokenizer, batch_size=16):
    """Classify texts in mini-batches; return class ids and softmax probabilities.

    Args:
        text_list: Sequence of input strings to classify.
        model: Classification model. It is called as ``model(**inputs)`` and
            must return an object exposing a ``logits`` attribute. The caller
            is expected to have put it in evaluation mode already.
        tokenizer: Callable invoked with a list of strings plus
            ``return_tensors="pt"``, ``truncation=True``, ``padding=True`` and
            ``max_length=128``; its result must support ``.to(device)``.
        batch_size: Number of texts per forward pass. Must be >= 1.

    Returns:
        Tuple ``(preds, probs)`` of NumPy arrays. ``preds`` has shape ``(N,)``
        and holds the argmax class id per text; ``probs`` has shape ``(N, C)``
        and holds the softmax distribution over the ``C`` model outputs.
        For empty input the shapes are ``(0,)`` and ``(0, C)``, where ``C`` is
        read from ``model.config.num_labels`` (0 if that is unavailable).

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    # Materialise once so any sequence type can be sliced into plain lists.
    texts = list(text_list)

    # Send inputs to wherever the model's weights actually live instead of
    # relying on a notebook-level `device` variable being defined and in sync.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    if not texts:
        # Keep the probability matrix 2-D so column indexing still works downstream.
        num_labels = getattr(getattr(model, "config", None), "num_labels", 0)
        return (np.empty((0,), dtype=np.int64),
                np.empty((0, num_labels), dtype=np.float32))

    pred_chunks, prob_chunks = [], []
    for start in tqdm(range(0, len(texts), batch_size), desc="Predicting"):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True,
                           padding=True, max_length=128).to(target_device)
        with torch.no_grad():
            logits = model(**inputs).logits
            # NOTE(review): softmax treats the classes as mutually exclusive;
            # confirm the checkpoint was fine-tuned as single-label.
            probs = torch.nn.functional.softmax(logits, dim=-1)
            preds = torch.argmax(probs, dim=-1)

        # Collect one array per batch and join once at the end.
        pred_chunks.append(preds.cpu().numpy())
        prob_chunks.append(probs.cpu().numpy())

    return np.concatenate(pred_chunks), np.concatenate(prob_chunks)

# --- Fail fast before the multi-minute inference run ---
output_path = "../Roberta/Output/predictions_with_probs.csv"

# Both columns are needed for the final export; check now rather than after predicting.
missing_cols = [col for col in ("subreddit", "post") if col not in df.columns]
if missing_cols:
    raise KeyError(f"Input data is missing required column(s): {missing_cols}")

# to_csv does not create parent directories, so make sure the target exists.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# --- Run predictions ---
pred_labels, pred_probs = predict_with_probs(df["post"].tolist(), model, tokenizer)

# The label list is hard-coded, so verify it lines up with the model's output width.
# NOTE(review): this only checks the count; the label *order* must also match the
# class ids used during fine-tuning — confirm against model.config.id2label.
if pred_probs.ndim != 2 or pred_probs.shape[1] != len(goemotions_labels):
    raise ValueError(
        f"Model produced probabilities of shape {pred_probs.shape}, but "
        f"{len(goemotions_labels)} GoEmotions labels are defined."
    )

# --- Convert to readable format ---
df["predicted_label_id"] = pred_labels
df["predicted_emotion"] = [goemotions_labels[i] for i in pred_labels]

# Top 3 emotions per post: sort class ids by descending probability, keep the first three.
top3 = np.argsort(-pred_probs, axis=1)[:, :3]
df["top3_emotions"] = [
    ", ".join(goemotions_labels[j] for j in row) for row in top3
]

# --- Add probabilities as columns (optional, large but detailed) ---
for i, label in enumerate(goemotions_labels):
    df[f"prob_{label}"] = pred_probs[:, i]

# --- Keep only relevant columns for output ---
prob_columns = [f"prob_{label}" for label in goemotions_labels]
final_df = df[["subreddit", "post", "predicted_emotion", "top3_emotions"] + prob_columns]

# --- Save results ---
final_df.to_csv(output_path, index=False)
print(f"✅ Predictions with probabilities saved to: {output_path}")

Predicting: 100%|██████████| 5954/5954 [02:17<00:00, 43.28it/s]


✅ Predictions with probabilities saved to: ../Roberta/Output/predictions_with_probs.csv
