In [1]:
!pip install nltk pandas tqdm



In [4]:
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from tqdm import tqdm
# Tokenizer data used by word_tokenize. Recent NLTK releases load the
# 'punkt_tab' resource instead of the pickled 'punkt' models, so fetch both to
# keep the notebook runnable whichever NLTK version the install cell resolves.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tusharbansal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Load the per-post emotion / valence-arousal table that the lexicon features
# computed below are appended to.
input_path = "../Circumplex Model/Output/mentalbert_7labels_with_va.csv"
df = pd.read_csv(input_path)
df.head()

Unnamed: 0,subreddit,title,clean_text,url,anger,disgust,fear,joy,neutral,sadness,surprise,predicted_emotion,top3_emotions,valence,arousal,valence_norm,arousal_norm,quadrant
0,ADHD,Does Strattera work for anyone?,psychiatrist decided switch concerta strattera...,https://www.reddit.com/r/ADHD/comments/1hcl16z...,0.00099,0.00028,0.000411,0.001127,0.985838,0.000148,0.011205,neutral,"neutral, surprise, joy",-0.053029,-0.612822,-1.106059,-2.225643,Q3: Negative–Low Arousal
1,ADHD,How can I become a better student and stop fee...,’ trouble uni spoke favorite teacher another t...,https://www.reddit.com/r/ADHD/comments/1hc3g0n...,0.273112,0.005573,0.001122,0.002145,0.373796,0.311687,0.032565,neutral,"neutral, sadness, anger",-0.463809,-0.139381,-1.927618,-1.278761,Q3: Negative–Low Arousal
2,ADHD,able to read social cues fine but not know how...,socially shy growing usually quiet situation n...,https://www.reddit.com/r/ADHD/comments/1hc30d4...,0.007374,0.001338,0.004227,0.00966,0.906869,0.011406,0.059126,neutral,"neutral, surprise, sadness",-0.022548,-0.51838,-1.045096,-2.03676,Q3: Negative–Low Arousal
3,ADHD,Focussing and Concentrating on dull tasks is a...,please help 19 undiagnosed waiting list tested...,https://www.reddit.com/r/ADHD/comments/1hc346v...,0.011763,0.001967,0.550103,0.004271,0.412945,0.014081,0.004871,fear,"fear, neutral, sadness",-0.50985,0.123208,-2.0197,-0.753583,Q3: Negative–Low Arousal
4,ADHD,random - label maker recommendations,random request label maker adhd struggle organ...,https://www.reddit.com/r/ADHD/comments/1hc3axk...,0.000936,0.000246,0.000197,0.00253,0.995199,0.000197,0.000695,neutral,"neutral, joy, anger",-0.05994,-0.625936,-1.119881,-2.251872,Q3: Negative–Low Arousal


In [6]:
vad_path = "../Circumplex Model/NRC-VAD-Lexicon-v2.1/NRC-VAD-Lexicon.txt"

# The NRC VAD lexicon format is typically tab-separated:
# word | valence | arousal | dominance
vad_columns = ["word", "valence", "arousal", "dominance"]
score_columns = vad_columns[1:]

# Read every field as plain text with NA parsing disabled, so that
#   * lexicon entries such as "null", "nan" or "none" stay words instead of
#     being turned into missing values, and
#   * a header line can be detected below rather than assumed (an
#     unconditional skiprows=1 would drop the first word of a headerless file).
vad_df = pd.read_csv(
    vad_path,
    sep="\t",
    header=None,
    names=vad_columns,
    dtype=str,
    keep_default_na=False,
    engine="python"
)

# Convert the score columns; anything non-numeric becomes NaN.
vad_df[score_columns] = vad_df[score_columns].apply(pd.to_numeric, errors="coerce")

# Rows without numeric scores are the header line (if present) or malformed
# lines; neither belongs in the lookup table.
vad_df = vad_df.dropna(subset=score_columns)

# Ensure lowercase matching
vad_df["word"] = vad_df["word"].astype(str).str.lower()

# to_dict(orient="index") requires unique keys; lower-casing can merge entries,
# so keep the first occurrence of each word.
vad_df = vad_df.drop_duplicates(subset="word", keep="first")

# Convert to lookup dictionary: word -> {"valence": ..., "arousal": ..., "dominance": ...}
vad_dict = vad_df.set_index("word")[score_columns].to_dict(orient="index")

len(vad_dict)


19970

In [7]:
def _empty_vad_features():
    """Return a fresh all-zero feature dict (used when nothing can be scored)."""
    # NOTE(review): 0 is also a valid lexicon score, so "no information" is not
    # distinguishable from "extremely low" downstream -- confirm this default
    # is what the later analysis expects.
    return {
        "mean_valence_lex": 0.0,
        "mean_arousal_lex": 0.0,
        "mean_dominance_lex": 0.0,
        "valence_var_lex": 0.0,
        "arousal_var_lex": 0.0,
        "ratio_high_val": 0.0,
        "ratio_low_val": 0.0,
        "ratio_high_arousal": 0.0,
        "ratio_low_arousal": 0.0
    }


def compute_vad_features(text, lexicon=None, tokenizer=None,
                         high_threshold=0.66, low_threshold=0.33):
    """Summarise the lexicon valence/arousal/dominance scores of one text.

    The text is tokenized, each token is lower-cased and looked up in the
    lexicon, and the scores of the matched tokens are aggregated.

    Parameters
    ----------
    text : str or any
        Text to score. Anything that is not a ``str`` (None, NaN, lists, ...)
        yields the all-zero feature dict.
    lexicon : dict, optional
        Mapping ``word -> {"valence": v, "arousal": a, "dominance": d}``.
        Defaults to the module-level ``vad_dict``.
    tokenizer : callable, optional
        Function turning a string into an iterable of tokens. Defaults to
        ``word_tokenize``.
    high_threshold, low_threshold : float, optional
        A score strictly above ``high_threshold`` counts as "high" and one
        strictly below ``low_threshold`` counts as "low". The defaults
        presumably assume scores on a 0-1 scale -- confirm against the
        lexicon file in use.

    Returns
    -------
    dict
        Keys ``mean_valence_lex``, ``mean_arousal_lex``, ``mean_dominance_lex``,
        ``valence_var_lex``, ``arousal_var_lex`` (population variance),
        ``ratio_high_val``, ``ratio_low_val``, ``ratio_high_arousal`` and
        ``ratio_low_arousal``. All values are 0 when the text is not a string
        or contains no lexicon word.
    """
    # A plain isinstance check covers None/NaN as well, and unlike pd.isna() it
    # cannot raise "truth value of an array is ambiguous" for list-like input.
    if not isinstance(text, str):
        return _empty_vad_features()

    # Resolve the defaults at call time so the notebook-level objects are only
    # needed when the caller does not supply their own.
    if lexicon is None:
        lexicon = vad_dict
    if tokenizer is None:
        tokenizer = word_tokenize

    # One lexicon lookup per matched token.
    lowered = (tok.lower() for tok in tokenizer(text))
    matched = [lexicon[tok] for tok in lowered if tok in lexicon]
    if not matched:
        return _empty_vad_features()

    vals = np.array([entry["valence"] for entry in matched], dtype=float)
    ars = np.array([entry["arousal"] for entry in matched], dtype=float)
    doms = np.array([entry["dominance"] for entry in matched], dtype=float)

    return {
        "mean_valence_lex": vals.mean(),
        "mean_arousal_lex": ars.mean(),
        "mean_dominance_lex": doms.mean(),
        "valence_var_lex": vals.var(),
        "arousal_var_lex": ars.var(),
        # Mean of a boolean mask == share of matched tokens passing the test.
        "ratio_high_val": np.mean(vals > high_threshold),
        "ratio_low_val": np.mean(vals < low_threshold),
        "ratio_high_arousal": np.mean(ars > high_threshold),
        "ratio_low_arousal": np.mean(ars < low_threshold)
    }

In [9]:
# Score every post's cleaned text against the VAD lexicon (one dict per row).
psy_features = df["clean_text"].apply(compute_vad_features)

# Reuse df's index so each feature row stays attached to its source row even if
# df does not carry a default 0..n-1 index (concat aligns on index labels).
psy_df = pd.DataFrame(list(psy_features), index=df.index)

# Drop feature columns left over from an earlier run of this cell, so that
# re-running it replaces them instead of appending duplicate columns.
df = pd.concat([df.drop(columns=psy_df.columns, errors="ignore"), psy_df], axis=1)

df.head()

Unnamed: 0,subreddit,title,clean_text,url,anger,disgust,fear,joy,neutral,sadness,...,quadrant,mean_valence_lex,mean_arousal_lex,mean_dominance_lex,valence_var_lex,arousal_var_lex,ratio_high_val,ratio_low_val,ratio_high_arousal,ratio_low_arousal
0,ADHD,Does Strattera work for anyone?,psychiatrist decided switch concerta strattera...,https://www.reddit.com/r/ADHD/comments/1hcl16z...,0.00099,0.00028,0.000411,0.001127,0.985838,0.000148,...,Q3: Negative–Low Arousal,0.584529,0.507824,0.598,0.057399,0.018579,0.411765,0.117647,0.117647,0.176471
1,ADHD,How can I become a better student and stop fee...,’ trouble uni spoke favorite teacher another t...,https://www.reddit.com/r/ADHD/comments/1hc3g0n...,0.273112,0.005573,0.001122,0.002145,0.373796,0.311687,...,Q3: Negative–Low Arousal,0.544526,0.473013,0.574718,0.05754,0.020952,0.320513,0.217949,0.128205,0.192308
2,ADHD,able to read social cues fine but not know how...,socially shy growing usually quiet situation n...,https://www.reddit.com/r/ADHD/comments/1hc30d4...,0.007374,0.001338,0.004227,0.00966,0.906869,0.011406,...,Q3: Negative–Low Arousal,0.614486,0.445444,0.534986,0.044741,0.026835,0.527778,0.152778,0.083333,0.208333
3,ADHD,Focussing and Concentrating on dull tasks is a...,please help 19 undiagnosed waiting list tested...,https://www.reddit.com/r/ADHD/comments/1hc346v...,0.011763,0.001967,0.550103,0.004271,0.412945,0.014081,...,Q3: Negative–Low Arousal,0.559349,0.461965,0.525756,0.055838,0.025637,0.44186,0.22093,0.127907,0.174419
4,ADHD,random - label maker recommendations,random request label maker adhd struggle organ...,https://www.reddit.com/r/ADHD/comments/1hc3axk...,0.000936,0.000246,0.000197,0.00253,0.995199,0.000197,...,Q3: Negative–Low Arousal,0.616357,0.417714,0.544786,0.024425,0.027654,0.285714,0.071429,0.071429,0.285714


In [12]:
output_path = "./Output/mentalbert_7labels_with_va_psycholinguistic.csv"

# to_csv does not create missing directories; make sure the target folder
# exists so a fresh checkout can run the notebook end to end.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

print("Saved:", output_path)

Saved: ./Output/mentalbert_7labels_with_va_psycholinguistic.csv
