In [1]:
# merge_eda4_sentiment_circumplex_risk.ipynb
# Purpose: Add sentiment (VADER), valence/arousal (Circumplex via VAD lexicon), and 5-level danger categories.

import os
import re
import math
import string
import warnings
from collections import Counter, defaultdict

import pandas as pd
import numpy as np

# Sentiment (rule-based, robust for social text)
from nltk.sentiment import SentimentIntensityAnalyzer

# If running first time in a new environment:
# import nltk; nltk.download('vader_lexicon')

# --- Paths ---
INPUT_CSV = "../EDA Code/Merge EDA/all_subreddits_merged_clean.csv"  # update if needed
OUT_DIR   = "Output"
os.makedirs(OUT_DIR, exist_ok=True)

# Optional paths for lexicons (put the files here if you have them)
# Option 1: NRC VAD lexicon (preferred): one token per row with V,A,D in [0,1]
NRC_VAD_PATH = "NRC-VAD-Lexicon.txt"      # expected columns: word, valence, arousal, dominance
# Option 2: Warriner norms: has valence/arousal on a 1–9 scale
WARRINER_PATH = "Warriner_et_al_2013.csv" # expected columns: Word, V.Mean.Sum, A.Mean.Sum

# --- Config ---
# Candidate text columns to auto-detect
TEXT_COLUMNS_CANDIDATES = ["clean_text", "text", "body", "selftext", "post", "content", "title"]

# Emo/keyword flags for danger detection (non-exhaustive baseline; refine as needed)
SELF_HARM_TERMS = [
    "suicide","kill myself","end my life","take my life","self harm","self-harm","cut myself",
    "overdose","od","hanging","noose","jump off","die by suicide","suicidal","ideation","plan",
    "kill me","want to die","wish i were dead","i want to die","i dont want to live","i don't want to live"
]
CRISIS_TERMS = [
    "urgent","immediately","right now","cant cope","can't cope","panic","panic attack","desperate",
    "hopeless","worthless","no way out","end it","final note"
]
PROTECTIVE_TERMS = [
    "help line","helpline","hotline","therapist","counsellor","counselor","doctor","friend","family","support"
]

# VADER thresholds (typical)
VADER_NEG = -0.35
VADER_POS =  0.35

# Circumplex thresholds (valence/arousal scaled to 0–1; adjust later if using Warriner 1–9)
VALENCE_LOW  = 0.4
VALENCE_HIGH = 0.6
AROUSAL_LOW  = 0.4
AROUSAL_HIGH = 0.6

pd.set_option("display.max_colwidth", 120)

In [2]:
df = pd.read_csv(INPUT_CSV)

# Try to find a text column
text_col = None
for c in TEXT_COLUMNS_CANDIDATES:
    if c in df.columns:
        text_col = c
        break

if text_col is None:
    raise ValueError(f"Could not find a text column. Looked for: {TEXT_COLUMNS_CANDIDATES}")

print(f"Using text column: {text_col}")
print(f"Rows: {len(df):,}")


Using text column: clean_text
Rows: 95,250


In [3]:
# If you already have a 'tokens' column, we’ll use it when computing lexicon matches.
# Otherwise we’ll tokenise here in a simple way (lowercase, split on non-letters).

TOKEN_COL = "tokens" if "tokens" in df.columns else None

_word_re = re.compile(r"[a-z']+")

def simple_tokenise(text: str):
    if not isinstance(text, str):
        return []
    text = text.lower()
    return _word_re.findall(text)

if TOKEN_COL is None:
    df["_tokens_for_vad"] = df[text_col].apply(simple_tokenise)
    TOKEN_COL = "_tokens_for_vad"
else:
    # ensure lower-cased tokens
    df["_tokens_for_vad"] = df[TOKEN_COL].apply(lambda xs: [x.lower() for x in xs] if isinstance(xs, (list, tuple)) else simple_tokenise(str(xs)))
    TOKEN_COL = "_tokens_for_vad"

print("Token column used for VAD:", TOKEN_COL)

Token column used for VAD: _tokens_for_vad


In [4]:
# Ensures the VADER lexicon is available for NLTK wherever you're running.
import os, nltk

# Put NLTK data in your home folder so there are no permission issues
NLTK_HOME = os.path.join(os.path.expanduser("~"), "nltk_data")
nltk.data.path.append(NLTK_HOME)

try:
    # This will raise LookupError if the lexicon is missing
    from nltk.sentiment import SentimentIntensityAnalyzer
    _ = SentimentIntensityAnalyzer()
except LookupError:
    print("Downloading VADER lexicon to", NLTK_HOME)
    nltk.download("vader_lexicon", download_dir=NLTK_HOME)
    from nltk.sentiment import SentimentIntensityAnalyzer
    _ = SentimentIntensityAnalyzer()

print("VADER is ready.")

VADER is ready.


In [5]:
sia = SentimentIntensityAnalyzer()

def vader_compound(text):
    if not isinstance(text, str) or not text.strip():
        return 0.0
    return sia.polarity_scores(text).get("compound", 0.0)

df["sentiment_vader"] = df[text_col].apply(vader_compound)

# Optional quick label
def vader_label(c):
    if c <= VADER_NEG:
        return "negative"
    if c >= VADER_POS:
        return "positive"
    return "neutral"

df["sentiment_label"] = df["sentiment_vader"].apply(vader_label)
df[["sentiment_vader","sentiment_label"]].head()

Unnamed: 0,sentiment_vader,sentiment_label
0,0.3612,positive
1,-0.989,negative
2,0.9565,positive
3,0.7859,positive
4,0.5719,positive


In [9]:
import os, nltk
NLTK_HOME = os.path.join(os.path.expanduser("~"), "nltk_data")
nltk.data.path.append(NLTK_HOME)

# Ensure both punkt and punkt_tab are present
for resource in ["punkt", "punkt_tab"]:
    try:
        nltk.data.find(f"tokenizers/{resource}")
        print(f"NLTK {resource} already present.")
    except LookupError:
        print(f"Downloading NLTK {resource} to {NLTK_HOME}")
        nltk.download(resource, download_dir=NLTK_HOME)
        nltk.data.find(f"tokenizers/{resource}")
        print("Done.")

NLTK punkt already present.
Downloading NLTK punkt_tab to /Users/tusharbansal/nltk_data


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/tusharbansal/nltk_data...


Done.


[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [10]:
# Cell 4.5 — NRC Emotion → Circumplex proxy (normalised, robustly scaled)
import sys, subprocess
try:
    from nrclex import NRCLex
except Exception:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "nrclex"])
    from nrclex import NRCLex

# Include positive/negative alongside the 8 basic emotions
EMOS_ALL = ["anger","anticipation","disgust","fear","joy","sadness","surprise","trust","positive","negative"]

def nrc_freqs(text: str):
    if not isinstance(text, str) or not text.strip():
        d = {e: 0.0 for e in EMOS_ALL}
        d["emo_total"] = 0
        return d
    emo = NRCLex(text.lower())
    raw = {e: float(emo.raw_emotion_scores.get(e, 0)) for e in EMOS_ALL}
    total = sum(raw.values())
    if total == 0:
        freq = {e: 0.0 for e in EMOS_ALL}
    else:
        freq = {e: raw[e] / total for e in EMOS_ALL}
    freq["emo_total"] = total
    return freq

emo_df = df[text_col].apply(nrc_freqs).apply(pd.Series).fillna(0.0)

# Heuristic valence/arousal from emotion FREQUENCIES (sum to ~1)
val_raw = (
    1.00*emo_df["positive"] + 0.80*emo_df["joy"] + 0.60*emo_df["trust"] + 0.35*emo_df["anticipation"] + 0.20*emo_df["surprise"]
    - (1.00*emo_df["negative"] + 0.85*emo_df["sadness"] + 0.85*emo_df["disgust"] + 0.75*emo_df["fear"] + 0.60*emo_df["anger"])
)

aro_raw = (
    0.95*emo_df["anger"] + 0.95*emo_df["fear"] + 0.80*emo_df["surprise"] + 0.60*emo_df["anticipation"] + 0.45*emo_df["joy"]
    - (0.45*emo_df["sadness"] + 0.35*emo_df["disgust"] + 0.25*emo_df["trust"])
)

# Robust quantile scaling to 0..1 (avoid a single outlier collapsing spread)
def robust_minmax01(s, q_low=0.02, q_high=0.98):
    lo, hi = s.quantile(q_low), s.quantile(q_high)
    if hi - lo < 1e-9:
        return pd.Series(np.zeros(len(s)), index=s.index)
    return ((s - lo) / (hi - lo)).clip(0, 1)

df["valence_proxy"] = robust_minmax01(val_raw)
df["arousal_proxy"] = robust_minmax01(aro_raw)

# Quick peek
df[["valence_proxy","arousal_proxy"]].describe().round(3)

Unnamed: 0,valence_proxy,arousal_proxy
count,95250.0,95250.0
mean,0.44,0.546
std,0.213,0.201
min,0.0,0.0
25%,0.302,0.445
50%,0.425,0.563
75%,0.559,0.666
max,1.0,1.0


In [11]:
vad_lex = {}

def load_nrc_vad(path):
    # Format assumption: "word\tvalence\tarousal\tdominance"
    local = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) < 3:
                continue
            w = parts[0].lower()
            try:
                v = float(parts[1])
                a = float(parts[2])
            except ValueError:
                continue
            # Already in [0,1]
            local[w] = (v, a)
    return local

def load_warriner(path):
    # Warriner valence/arousal are on 1–9; we’ll scale to [0,1]
    local = {}
    tab = pd.read_csv(path)
    # Common column names: "Word","V.Mean.Sum","A.Mean.Sum"
    wcol = "Word"
    vcol = "V.Mean.Sum"
    acol = "A.Mean.Sum"
    for _, row in tab.iterrows():
        w = str(row[wcol]).lower()
        v = float(row[vcol])
        a = float(row[acol])
        v01 = (v - 1.0) / 8.0
        a01 = (a - 1.0) / 8.0
        local[w] = (v01, a01)
    return local

try:
    if os.path.exists(NRC_VAD_PATH):
        vad_lex = load_nrc_vad(NRC_VAD_PATH)
        source = "NRC-VAD"
    elif os.path.exists(WARRINER_PATH):
        vad_lex = load_warriner(WARRINER_PATH)
        source = "Warriner (scaled)"
    else:
        source = None
        warnings.warn(
            "No VAD lexicon found. Place NRC-VAD-Lexicon.txt or Warriner_et_al_2013.csv in the working directory."
        )
    print("VAD source:", source, f"({len(vad_lex):,} entries)" if vad_lex else "")
except Exception as e:
    vad_lex = {}
    source = None
    warnings.warn(f"Failed to load VAD lexicon: {e}")

VAD source: None 




In [12]:
# Cell 6 — Circumplex: prefer true VAD; otherwise use proxy (already robust-scaled)
def post_vad(tokens):
    vals, aros = [], []
    for t in tokens:
        if t in vad_lex:
            v, a = vad_lex[t]; vals.append(v); aros.append(a)
    if not vals:
        return (np.nan, np.nan)
    return (float(np.mean(vals)), float(np.mean(aros)))

if vad_lex:
    vads = df[TOKEN_COL].apply(post_vad)
    df["valence"] = vads.apply(lambda x: x[0])
    df["arousal"] = vads.apply(lambda x: x[1])
else:
    df["valence"] = df["valence_proxy"]
    df["arousal"] = df["arousal_proxy"]

df[["valence","arousal"]].describe().round(3)

Unnamed: 0,valence,arousal
count,95250.0,95250.0
mean,0.44,0.546
std,0.213,0.201
min,0.0,0.0
25%,0.302,0.445
50%,0.425,0.563
75%,0.559,0.666
max,1.0,1.0


In [13]:
# Cell 7 — Circumplex zones with data-driven cut-offs (35th/65th percentiles)
VALENCE_LOW  = float(df["valence"].quantile(0.35))
VALENCE_HIGH = float(df["valence"].quantile(0.65))
AROUSAL_LOW  = float(df["arousal"].quantile(0.35))
AROUSAL_HIGH = float(df["arousal"].quantile(0.65))

def circumplex_zone(v, a):
    if np.isnan(v) or np.isnan(a):
        return "unknown"
    if v >= VALENCE_HIGH and a >= AROUSAL_HIGH:
        return "excited/enthusiastic"
    if v >= VALENCE_HIGH and a <= AROUSAL_LOW:
        return "calm/content"
    if v <= VALENCE_LOW and a >= AROUSAL_HIGH:
        return "anxious/panicked"
    if v <= VALENCE_LOW and a <= AROUSAL_LOW:
        return "depressed/lethargic"
    # middles
    if v < (VALENCE_LOW + VALENCE_HIGH)/2 and a > (AROUSAL_LOW + AROUSAL_HIGH)/2:
        return "distressed"
    if v < (VALENCE_LOW + VALENCE_HIGH)/2 and a <= (AROUSAL_LOW + AROUSAL_HIGH)/2:
        return "sad"
    if v >= (VALENCE_LOW + VALENCE_HIGH)/2 and a > (AROUSAL_LOW + AROUSAL_HIGH)/2:
        return "energetic"
    return "relaxed"

df["circumplex_zone"] = df.apply(lambda r: circumplex_zone(r["valence"], r["arousal"]), axis=1)

print("Cut-offs:",
      f"valence LOW={VALENCE_LOW:.3f}, HIGH={VALENCE_HIGH:.3f};",
      f"arousal LOW={AROUSAL_LOW:.3f}, HIGH={AROUSAL_HIGH:.3f}")
df["circumplex_zone"].value_counts()


Cut-offs: valence LOW=0.354, HIGH=0.502; arousal LOW=0.498, HIGH=0.620


circumplex_zone
calm/content            14880
anxious/panicked        14140
distressed              13189
energetic               11912
relaxed                 11293
sad                     10748
depressed/lethargic     10134
excited/enthusiastic     8954
Name: count, dtype: int64

In [14]:
def make_phrase_regex(phrases):
    escaped = [re.escape(p.lower()) for p in phrases]
    # Word boundary on ends when safe; allow spaces
    return re.compile(r"(" + "|".join(escaped) + r")", flags=re.IGNORECASE)

re_selfharm = make_phrase_regex(SELF_HARM_TERMS)
re_crisis    = make_phrase_regex(CRISIS_TERMS)
re_protect   = make_phrase_regex(PROTECTIVE_TERMS)

def flag_any(pattern, text):
    if not isinstance(text, str) or not text:
        return False
    return bool(pattern.search(text))

df["flag_selfharm"] = df[text_col].apply(lambda t: flag_any(re_selfharm, t))
df["flag_crisis"]    = df[text_col].apply(lambda t: flag_any(re_crisis, t))
df["flag_protect"]   = df[text_col].apply(lambda t: flag_any(re_protect, t))

df[["flag_selfharm","flag_crisis","flag_protect"]].mean().round(3)


flag_selfharm    0.415
flag_crisis      0.113
flag_protect     0.255
dtype: float64

In [15]:
# Heuristic combiner:
# Inputs: VADER, valence/arousal (if available), keyword flags
# Output: danger_level: 1..5

def danger_level(row):
    comp = row["sentiment_vader"]
    v    = row.get("valence", np.nan)
    a    = row.get("arousal", np.nan)
    sh   = bool(row["flag_selfharm"])
    cr   = bool(row["flag_crisis"])

    # Step 1: Critical indicators (Level 5)
    if sh and ("plan" in row[text_col].lower() or "how to" in row[text_col].lower() or "goodbye" in row[text_col].lower()):
        return 5
    if sh and cr:
        return 5

    # Step 2: High risk (Level 4)
    if sh:
        # explicit ideation/self-harm phrases
        return 4
    if (comp <= -0.7) and (not np.isnan(a) and a >= 0.6):
        return 4

    # Step 3: Moderate (Level 3)
    if (comp <= -0.35 and comp > -0.7):
        # Negative sentiment
        return 3
    if (not np.isnan(v) and v < 0.4) and (not np.isnan(a) and a >= 0.5):
        return 3
    if cr and comp < 0:
        return 3

    # Step 4: Mild concern (Level 2)
    if comp < 0:
        return 2
    if (not np.isnan(v) and v < 0.5):
        return 2

    # Step 5: Low (Level 1)
    return 1

df["danger_level"] = df.apply(danger_level, axis=1).astype(int)

# Human-friendly label (optional)
DANGER_LABELS = {
    1: "Low",
    2: "Mild concern",
    3: "Moderate",
    4: "High",
    5: "Critical",
}
df["danger_label"] = df["danger_level"].map(DANGER_LABELS)

df["danger_label"].value_counts().sort_index()

danger_label
Critical        12156
High            36954
Low             14011
Mild concern    15986
Moderate        16143
Name: count, dtype: int64

In [16]:
print("VADER compound summary:")
print(df["sentiment_vader"].describe(percentiles=[.05,.25,.5,.75,.95]).round(3), "\n")

if "valence" in df and df["valence"].notna().any():
    print("Valence summary:")
    print(df["valence"].describe(percentiles=[.05,.25,.5,.75,.95]).round(3), "\n")

if "arousal" in df and df["arousal"].notna().any():
    print("Arousal summary:")
    print(df["arousal"].describe(percentiles=[.05,.25,.5,.75,.95]).round(3), "\n")

print("Danger level counts:")
print(df["danger_level"].value_counts().sort_index())

VADER compound summary:
count    95250.000
mean        -0.316
std          0.699
min         -1.000
5%          -0.990
25%         -0.933
50%         -0.648
75%          0.318
95%          0.940
max          1.000
Name: sentiment_vader, dtype: float64 

Valence summary:
count    95250.000
mean         0.440
std          0.213
min          0.000
5%           0.098
25%          0.302
50%          0.425
75%          0.559
95%          0.845
max          1.000
Name: valence, dtype: float64 

Arousal summary:
count    95250.000
mean         0.546
std          0.201
min          0.000
5%           0.126
25%          0.445
50%          0.563
75%          0.666
95%          0.866
max          1.000
Name: arousal, dtype: float64 

Danger level counts:
danger_level
1    14011
2    15986
3    16143
4    36954
5    12156
Name: count, dtype: int64


In [17]:
print("Circumplex availability:")
print("  Using VAD source?:", "YES" if len(vad_lex) else "NO (proxy)")
print(df[["sentiment_vader","valence","arousal","danger_level"]].head(8))

Circumplex availability:
  Using VAD source?: NO (proxy)
   sentiment_vader   valence   arousal  danger_level
0           0.3612  0.492477  0.247432             5
1          -0.9890  0.543547  0.340282             2
2           0.9565  0.508208  0.530547             5
3           0.7859  0.550010  0.482530             5
4           0.5719  0.672975  0.687875             1
5          -0.3832  0.372398  0.797697             5
6          -0.9844  0.384092  0.296919             4
7          -0.1280  0.405346  0.573950             2
