In [8]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm
import torch

In [9]:
# Load the posts and build one text field per row in the form
# "<title>. <clean_text>"; a missing title or body is treated as an
# empty string so the concatenation never yields NaN.
df = pd.read_csv("../Data for analysis/data_for_further_analysis.csv")
title_part = df["title"].fillna('')
body_part = df["clean_text"].fillna('')
df["text"] = title_part + ". " + body_part

# Plain Python list of the combined texts, in DataFrame row order.
texts = df["text"].to_list()

In [10]:
# Emotion classifier used for every post.
#   top_k=None      -> return a score for every label, not only the best one.
#   truncation=True -> forwarded to the tokenizer so over-long texts are cut
#                      instead of raising (presumably at the model's maximum
#                      input length — confirm against the tokenizer config).
#   device          -> first CUDA GPU when one is visible, otherwise CPU (-1).
#                      On a CPU-only machine this is identical to the previous
#                      hard-coded device=-1.
classifier = pipeline(
    "text-classification",
    model="bhadresh-savani/distilbert-base-uncased-emotion",
    top_k=None,
    truncation=True,
    device=0 if torch.cuda.is_available() else -1,
)

In [11]:
# Classify the texts in consecutive slices of `batch_size`, appending the
# per-text results to `all_outputs` in the same order as `texts`.
batch_size = 32
all_outputs = []
chunk_starts = range(0, len(texts), batch_size)
for start in tqdm(chunk_starts):
    # Slicing past the end of the list is safe; the last chunk is just shorter.
    all_outputs += classifier(texts[start:start + batch_size])

100%|██████████| 2977/2977 [56:21<00:00,  1.14s/it] 


In [15]:
from pathlib import Path

# Fail with a clear message instead of an IndexError on all_outputs[0].
if not all_outputs:
    raise ValueError(
        "all_outputs is empty; run the inference cell before building the emotion table."
    )

# Each row of all_outputs is a list of {"label", "score"} dicts, one per label.
# NOTE(review): with top_k=None the pipeline is understood to return labels
# ranked by score, so the first row's order depends on that particular text.
# Sorting the label names keeps the column layout stable across runs and data.
labels = sorted(d["label"] for d in all_outputs[0])
columns = ["dominant_emotion"] + [f"emotion_{label}" for label in labels]

records = []
for row in all_outputs:
    # label -> score lookup for this text
    probs = {item["label"]: item["score"] for item in row}
    # dominant emotion = label with the highest score
    top_label = max(probs, key=probs.get)
    records.append([top_label] + [probs[label] for label in labels])

df_emotions = pd.DataFrame(records, columns=columns)

output_path = Path("../Data for analysis/distilbert_emotion_scores.csv")
# Create the target directory if needed so to_csv does not fail on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)
df_emotions.to_csv(output_path, index=False)
print(f"Saved → {output_path}")

Saved → ../Data for analysis/distilbert_emotion_scores.csv


In [None]:
# The emotion rows are paired with the post texts purely by position.
# pd.concat(axis=1) aligns on the index and would silently pad the shorter
# frame with NaN, so refuse to continue if the two frames differ in length
# (e.g. df was reloaded or filtered after inference ran).
if len(df) != len(df_emotions):
    raise ValueError(
        f"Row count mismatch: df has {len(df)} rows but df_emotions has "
        f"{len(df_emotions)}; re-run inference on the current data."
    )

# Reset the index so both frames share a 0..n-1 index before the column-wise join.
df_post_text = df[["text"]].reset_index(drop=True)
df_combined = pd.concat([df_post_text, df_emotions], axis=1)

output_path_combined = Path("../DistillBert/Output/distilbert_posttext_emotions.csv")
# Create the output directory if it does not exist yet; to_csv will not create it.
output_path_combined.parent.mkdir(parents=True, exist_ok=True)
df_combined.to_csv(output_path_combined, index=False)
print(f"Saved → {output_path_combined}")


Saved → ../Data for analysis/distilbert_posttext_emotions.csv
