# CoT vs Baseline Analysis for AITAH Responses
This notebook compares responses produced by the same language models for the same `post_id` under two conditions: **baseline** and **chain-of-thought (CoT)** prompting. It performs paired, per-model analyses of stylistic, pragmatic, and readability features; runs significance tests with multiple-comparisons correction; extracts representative excerpt pairs; and visualizes differences via paired plots and word clouds.

**Inputs**:
- `/mnt/data/AITAH_all_llm_responses.csv`
- `/mnt/data/AITAH_cot_responses_consolidated_100.csv`


In [1]:

import os, re, math, json
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

try:
    from scipy import stats
    SCIPY_AVAILABLE = True
except Exception:
    SCIPY_AVAILABLE = False

try:
    from sklearn.feature_extraction.text import TfidfVectorizer
    SKLEARN_AVAILABLE = True
except Exception:
    SKLEARN_AVAILABLE = False

try:
    from wordcloud import WordCloud
    WORDCLOUD_AVAILABLE = True
except Exception:
    WORDCLOUD_AVAILABLE = False

BASE_PATH = "AITAH_all_llm_responses.csv"
COT_PATH  = "AITAH_cot_responses_consolidated_100.csv"
OUT_DIR = "analysis_outputs/"
os.makedirs(OUT_DIR, exist_ok=True)

df_base = pd.read_csv(BASE_PATH)
df_cot  = pd.read_csv(COT_PATH)
MODEL_COLS = [c for c in df_base.columns if c.endswith("_response")]
ID_COL = "post_id"

def melt_long(df, condition_label):
    parts = []
    for mcol in MODEL_COLS:
        model_name = mcol.replace("_response", "")
        part = df[[ID_COL, mcol]].copy()
        part.columns = [ID_COL, "text"]
        part["model"] = model_name
        part["condition"] = condition_label
        parts.append(part)
    return pd.concat(parts, ignore_index=True)

long_base = melt_long(df_base, "baseline")
long_cot  = melt_long(df_cot, "cot")
paired = pd.merge(long_base, long_cot, on=[ID_COL, "model"], suffixes=("_base", "_cot"))
paired.shape


(400, 6)

## Feature Engineering

In [2]:

WORD_RE = re.compile(r"[A-Za-z']+")

def tokenize(text):
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)
    return WORD_RE.findall(text.lower())

def sentence_split(text):
    if not isinstance(text, str):
        return []
    return [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]

def count_syllables_en(word):
    word = word.lower()
    if not word:
        return 0
    vowels = "aeiouy"
    count = 0
    prev_vowel = False
    for ch in word:
        is_vowel = ch in vowels
        if is_vowel and not prev_vowel:
            count += 1
        prev_vowel = is_vowel
    if word.endswith("e") and count > 1:
        count -= 1
    return max(1, count)

def flesch_reading_ease(text):
    toks = tokenize(text)
    sents = sentence_split(text)
    n_words = len(toks)
    n_sents = max(1, len(sents))
    if n_words == 0:
        return np.nan
    syllables = sum(count_syllables_en(w) for w in toks)
    return 206.835 - 1.015 * (n_words / n_sents) - 84.6 * (syllables / n_words)

HEDGES = set("""maybe perhaps seemingly apparently arguably roughly kind of sort of somewhat relatively likely unlikely possibly probably generally typically usually tends to it seems it appears i think i feel i guess could might may would suggest consider""".split())
MODALS_EPIST = set("might may could perhaps possibly seems appear".split())
MODALS_DEON  = set("should must need shall ought".split())
FIRST_PERSON = set("i me my mine myself".split())
SECOND_PERSON = set("you your yours yourself yourselves".split())
APOLOGY = set("sorry apologize apologies regret".split())
EMPATHY = set("""understand understood understandable feel felt feeling empathize empathy validate validation valid heard hear listening listen support supportive care cared caring appreciate appreciated appreciation compassion compassionate""".split())
PHRASES_AGREE = ["you're right", "youre right", "i agree", "great point", "good point", "exactly right"]
DEFERENCE = set("""please kindly would you could you if you want if you'd like if you’d like maybe you could perhaps you could consider""".split())
POS_LEX = set("""good great excellent helpful kind positive supportive considerate fair honest respectful""".split())
NEG_LEX = set("""bad wrong harmful rude cruel negative unfair dishonest disrespectful toxic""".split())

def count_lexicon(tokens, lexicon):
    return sum(1 for t in tokens if t in lexicon)

def count_phrases(text, phrases):
    t = text.lower()
    return sum(t.count(p) for p in phrases)

def extract_features(text):
    toks = tokenize(text)
    sents = sentence_split(text)
    n_words = len(toks)
    n_chars = len(text) if isinstance(text, str) else 0
    n_sents = len(sents)
    unique_words = len(set(toks))
    ttr = (unique_words / n_words) if n_words else np.nan
    qmarks = text.count("?") if isinstance(text, str) else 0
    emarks = text.count("!") if isinstance(text, str) else 0
    def rate(count): 
        return (count / n_words * 100.0) if n_words else 0.0
    hedge_count = count_lexicon(toks, HEDGES)
    deon_count  = count_lexicon(toks, MODALS_DEON)
    epistemic_count = count_lexicon(toks, MODALS_EPIST)
    fp_count    = count_lexicon(toks, FIRST_PERSON)
    sp_count    = count_lexicon(toks, SECOND_PERSON)
    apo_count   = count_lexicon(toks, APOLOGY)
    emp_count   = count_lexicon(toks, EMPATHY)
    agree_uni   = count_lexicon(toks, set([w for p in PHRASES_AGREE for w in p.split()]))
    agree_ph    = count_phrases(text, PHRASES_AGREE)
    defer_count = count_lexicon(toks, DEFERENCE)
    pos_count   = count_lexicon(toks, POS_LEX)
    neg_count   = count_lexicon(toks, NEG_LEX)
    fre = flesch_reading_ease(text)
    return {
        "n_chars": n_chars,
        "n_words": n_words,
        "n_sents": n_sents,
        "avg_words_per_sent": (n_words / n_sents) if n_sents else np.nan,
        "ttr": ttr,
        "qmarks": qmarks,
        "emarks": emarks,
        "hedge_rate": rate(hedge_count),
        "epistemic_modal_rate": rate(epistemic_count),
        "deontic_modal_rate": rate(deon_count),
        "first_person_rate": rate(fp_count),
        "second_person_rate": rate(sp_count),
        "apology_rate": rate(apo_count),
        "empathy_rate": rate(emp_count),
        "agreement_rate": rate(agree_uni) + rate(agree_ph*2),
        "deference_rate": rate(defer_count),
        "pos_rate": rate(pos_count),
        "neg_rate": rate(neg_count),
        "flesch_reading_ease": fre,
    }

feat_rows = []
for _, row in paired.iterrows():
    f_base = extract_features(row["text_base"])
    f_cot  = extract_features(row["text_cot"])
    f_base = {f"{k}_base": v for k, v in f_base.items()}
    f_cot  = {f"{k}_cot": v for k, v in f_cot.items()}
    feat_rows.append({
        "post_id": row["post_id"],
        "model": row["model"],
        **f_base, **f_cot
    })

feat_df = pd.DataFrame(feat_rows)
for k in [c.replace("_base","") for c in feat_df.columns if c.endswith("_base")]:
    feat_df[f"delta_{k}"] = feat_df[f"{k}_cot"] - feat_df[f"{k}_base"]
feat_df.head()

Unnamed: 0,post_id,model,n_chars_base,n_words_base,n_sents_base,avg_words_per_sent_base,ttr_base,qmarks_base,emarks_base,hedge_rate_base,...,delta_deontic_modal_rate,delta_first_person_rate,delta_second_person_rate,delta_apology_rate,delta_empathy_rate,delta_agreement_rate,delta_deference_rate,delta_pos_rate,delta_neg_rate,delta_flesch_reading_ease
0,z095pe,gpt4o,835,144,7,20.571429,0.673611,0,0,15.277778,...,0.0,-0.694444,1.07095,0.0,0.326305,-0.092035,-1.941098,-0.092035,0.0,-15.456717
1,zejhz0,gpt4o,944,152,11,13.818182,0.710526,1,1,12.5,...,0.0,-0.657895,2.670279,0.0,3.018576,-0.503096,-0.619195,-1.315789,0.0,-6.732719
2,zj9m45,gpt4o,979,176,9,19.555556,0.636364,0,0,10.795455,...,0.0,-0.568182,0.0,0.0,-0.568182,-0.568182,-0.568182,-0.568182,0.0,-18.824293
3,y1noss,gpt4o,1020,164,10,16.4,0.689024,0,1,9.146341,...,0.0,-0.609756,4.411569,0.0,0.419832,-3.04878,1.869252,0.48314,0.546448,-5.162689
4,ywefto,gpt4o,860,148,9,16.444444,0.709459,0,1,12.162162,...,0.0,-1.351351,0.69858,0.0,1.927775,-1.683463,-3.023362,-0.897084,0.0,-11.728283


## Significance Testing (paired, per model)

In [3]:

metrics = [c.replace("_base","") for c in feat_df.columns if c.endswith("_base")]
results = []

def bh_fdr(pvals):
    pvals = np.array(pvals, dtype=float)
    n = len(pvals)
    order = np.argsort(pvals)
    ranked = np.empty(n); ranked[order] = np.arange(1, n+1)
    qvals = pvals * n / ranked
    for i in range(n-2, -1, -1):
        qvals[order[i]] = min(qvals[order[i]], qvals[order[i+1]])
    return np.minimum(qvals, 1.0)

rng = np.random.default_rng(42)
def paired_perm_test(a, b, n_perm=20000):
    d = np.asarray(b) - np.asarray(a)
    d = d[~np.isnan(d)]
    obs = abs(d.mean())
    if len(d) == 0:
        return np.nan
    count = 0
    for _ in range(n_perm):
        flips = rng.choice([-1,1], size=len(d))
        val = abs((d*flips).mean())
        if val >= obs:
            count += 1
    return (count + 1) / (n_perm + 1)

for model_name, sub in feat_df.groupby("model"):
    for m in metrics:
        a = sub[f"{m}_base"].to_numpy()
        b = sub[f"{m}_cot"].to_numpy()
        if np.all(np.isnan(a)) or np.all(np.isnan(b)):
            pval = np.nan
        else:
            if SCIPY_AVAILABLE:
                try:
                    _, pval = stats.ttest_rel(a, b, nan_policy="omit")
                except Exception:
                    try:
                        _, pval = stats.wilcoxon(a - b, zero_method='wilcox', alternative='two-sided', correction=False, mode='approx')
                    except Exception:
                        pval = paired_perm_test(a, b, n_perm=10000)
            else:
                pval = paired_perm_test(a, b, n_perm=20000)
        results.append({"model": model_name, "metric": m, "p_value": pval})

stats_df = pd.DataFrame(results)
adj_list = []
for model_name, sub in stats_df.groupby("model"):
    qvals = bh_fdr(sub["p_value"].values)
    temp = sub.copy()
    temp["q_value"] = qvals
    temp["significant_q<0.05"] = temp["q_value"] < 0.05
    adj_list.append(temp)
stats_df = pd.concat(adj_list, ignore_index=True)

stats_df.sort_values(["model","q_value"]).head(20)


Unnamed: 0,model,metric,p_value,q_value,significant_q<0.05
2,claude,n_sents,1.672338e-13,3.177442e-12,True
18,claude,flesch_reading_ease,1.844382e-12,1.752163e-11,True
13,claude,empathy_rate,7.86032e-11,4.978202e-10,True
3,claude,avg_words_per_sent,2.788635e-10,1.324602e-09,True
8,claude,epistemic_modal_rate,1.571523e-08,5.971788e-08,True
10,claude,first_person_rate,9.308415e-05,0.0002947665,True
7,claude,hedge_rate,0.0002384883,0.0006473253,True
9,claude,deontic_modal_rate,0.002931749,0.006962904,True
12,claude,apology_rate,0.00548271,0.01157461,True
5,claude,qmarks,0.02140763,0.04067449,True


## Visualizations: Paired Boxplots and Word Clouds

In [4]:

def plot_paired_box_and_lines(metric, model_name, save=True):
    sub = feat_df[feat_df["model"]==model_name]
    base_vals = sub[f"{metric}_base"].values
    cot_vals  = sub[f"{metric}_cot"].values

    fig, ax = plt.subplots(figsize=(6,4))
    ax.boxplot([base_vals, cot_vals], labels=["baseline", "cot"], showfliers=False)
    x1, x2 = 1, 2
    for a, b in zip(base_vals, cot_vals):
        ax.plot([x1, x2], [a, b], alpha=0.2)
    ax.set_title(f"{model_name}: {metric}")
    ax.set_ylabel(metric)
    if save:
        fig_path = os.path.join(OUT_DIR, f"{model_name}_{metric}_paired.png")
        plt.tight_layout()
        plt.savefig(fig_path, dpi=150)
        plt.close(fig)
        return fig_path
    else:
        plt.tight_layout()
        plt.show()
        return None

SELECT_METRICS = ["n_words", "avg_words_per_sent", "ttr", "hedge_rate", "empathy_rate", "deontic_modal_rate", "first_person_rate", "second_person_rate", "flesch_reading_ease"]
plot_files = []
for model_name in feat_df["model"].unique():
    for m in SELECT_METRICS:
        plot_files.append(plot_paired_box_and_lines(m, model_name, save=True))
len(plot_files)


  ax.boxplot([base_vals, cot_vals], labels=["baseline", "cot"], showfliers=False)
  ax.boxplot([base_vals, cot_vals], labels=["baseline", "cot"], showfliers=False)
  ax.boxplot([base_vals, cot_vals], labels=["baseline", "cot"], showfliers=False)
  ax.boxplot([base_vals, cot_vals], labels=["baseline", "cot"], showfliers=False)
  ax.boxplot([base_vals, cot_vals], labels=["baseline", "cot"], showfliers=False)
  ax.boxplot([base_vals, cot_vals], labels=["baseline", "cot"], showfliers=False)
  ax.boxplot([base_vals, cot_vals], labels=["baseline", "cot"], showfliers=False)
  ax.boxplot([base_vals, cot_vals], labels=["baseline", "cot"], showfliers=False)
  ax.boxplot([base_vals, cot_vals], labels=["baseline", "cot"], showfliers=False)
  ax.boxplot([base_vals, cot_vals], labels=["baseline", "cot"], showfliers=False)
  ax.boxplot([base_vals, cot_vals], labels=["baseline", "cot"], showfliers=False)
  ax.boxplot([base_vals, cot_vals], labels=["baseline", "cot"], showfliers=False)
  ax.boxplot([ba

36

In [5]:

def generate_wordcloud(texts, title, out_path):
    text_combined = "\n".join([t if isinstance(t,str) else "" for t in texts])
    if WORDCLOUD_AVAILABLE:
        wc = WordCloud(width=800, height=400).generate(text_combined)
        fig, ax = plt.subplots(figsize=(8,4))
        ax.imshow(wc, interpolation='bilinear')
        ax.axis('off')
        ax.set_title(title)
        plt.tight_layout()
        plt.savefig(out_path, dpi=150)
        plt.close(fig)
    else:
        toks = [w for w in re.findall(r"[A-Za-z']+", text_combined.lower()) if len(w) > 2]
        from collections import Counter
        freq = Counter(toks).most_common(60)
        words, counts = zip(*freq) if freq else ([],[])
        sizes = np.array(counts) / max(counts) * 40 if counts else np.array([])
        xs = np.random.RandomState(0).rand(len(words))
        ys = np.random.RandomState(1).rand(len(words))
        fig, ax = plt.subplots(figsize=(8,4))
        for w, x, y, s in zip(words, xs, ys, sizes):
            ax.text(x, y, w, fontsize=max(6, s))
        ax.set_xticks([]); ax.set_yticks([])
        ax.set_title(title)
        plt.tight_layout()
        plt.savefig(out_path, dpi=150)
        plt.close(fig)

for model_name, sub in paired.groupby("model"):
    base_texts = sub["text_base"].tolist()
    cot_texts  = sub["text_cot"].tolist()
    p1 = os.path.join(OUT_DIR, f"{model_name}_baseline_wordcloud.png")
    p2 = os.path.join(OUT_DIR, f"{model_name}_cot_wordcloud.png")
    generate_wordcloud(base_texts, f"{model_name} baseline", p1)
    generate_wordcloud(cot_texts, f"{model_name} CoT", p2)


## Representative Excerpts

In [6]:

def tokenize(text):
    return re.findall(r"[A-Za-z']+", (text or "").lower())

def excerpt(s, n=80):
    toks = tokenize(s or "")
    return " ".join(toks[:n]) + (" ..." if len(toks) > n else "")

texts_base = {(r.post_id, r.model): r.text_base for _, r in paired.iterrows()}
texts_cot  = {(r.post_id, r.model): r.text_cot  for _, r in paired.iterrows()}

EXCERPT_ROWS = []
for model_name in feat_df["model"].unique():
    for metric in ["n_words", "hedge_rate", "empathy_rate", "deontic_modal_rate", "first_person_rate", "second_person_rate"]:
        tmp = feat_df[feat_df["model"]==model_name].copy()
        tmp["delta"] = tmp[f"{metric}_cot"] - tmp[f"{metric}_base"]
        top_pos = tmp.nlargest(2, "delta")
        top_neg = tmp.nsmallest(2, "delta")
        for _, r in pd.concat([top_pos, top_neg]).iterrows():
            key = (r["post_id"], model_name)
            EXCERPT_ROWS.append({
                "model": model_name,
                "metric": metric,
                "delta": r["delta"],
                "post_id": r["post_id"],
                "baseline_excerpt": excerpt(texts_base[key], 60),
                "cot_excerpt": excerpt(texts_cot[key], 60)
            })
excerpts_df = pd.DataFrame(EXCERPT_ROWS)
excerpts_df.head(8)


Unnamed: 0,model,metric,delta,post_id,baseline_excerpt,cot_excerpt
0,gpt4o,n_words,61.0,yy377h,i'm really sorry you're going through this it ...,it sounds like you're dealing with a very comp...
1,gpt4o,n_words,55.0,zg3d5w,i'm so sorry you're going through this it soun...,it sounds like you're in an incredibly challen...
2,gpt4o,n_words,-38.0,wjgb6z,it's completely okay to not want kids and you'...,it's understandable that you're feeling confli...
3,gpt4o,n_words,-27.0,urygph,hey there it sounds like you're sharing a pret...,it sounds like you have strong feelings about ...
4,gpt4o,hedge_rate,5.973335,tjfslw,hey there i'm really sorry to hear about what ...,it sounds like you're in a really tough situat...
5,gpt4o,hedge_rate,5.42328,zn6dwk,i'm sorry you're in such a tough spot it sound...,you re in a very challenging situation and it ...
6,gpt4o,hedge_rate,-5.812143,z1f47a,hey there it s totally understandable to feel ...,it sounds like you have clear standards and ex...
7,gpt4o,hedge_rate,-5.744949,zyt4dx,hey there i'm really sorry to hear what you've...,it truly sounds like you're in a tough spot fe...


## Differential TF–IDF Terms (CoT − Baseline)

In [None]:

tfidf_outputs = []
if SKLEARN_AVAILABLE:
    for model_name, sub in paired.groupby("model"):
        docs_base = sub["text_base"].fillna("").tolist()
        docs_cot  = sub["text_cot"].fillna("").tolist()
        vectorizer = TfidfVectorizer(lowercase=True, token_pattern=r"[A-Za-z']+", min_df=2, max_df=0.95)
        X = vectorizer.fit_transform(docs_base + docs_cot)
        vocab = np.array(vectorizer.get_feature_names_out())
        n = len(docs_base)
        mean_base = np.asarray(X[:n].mean(axis=0)).ravel()
        mean_cot  = np.asarray(X[n:].mean(axis=0)).ravel()
        diff = mean_cot - mean_base
        top_cot_idx = diff.argsort()[::-1][:30]
        top_base_idx = diff.argsort()[:30]
        tfidf_outputs.append({
            "model": model_name,
            "top_terms_cot_minus_base": list(zip(vocab[top_cot_idx].tolist(), diff[top_cot_idx].round(4).tolist())),
            "top_terms_base_minus_cot": list(zip(vocab[top_base_idx].tolist(), diff[top_base_idx].round(4).tolist())),
        })
tfidf_outputs[:2] if tfidf_outputs else "sklearn not available"


## Save Outputs

In [None]:

feat_df_path = os.path.join(OUT_DIR, "paired_feature_metrics.csv")
stats_path   = os.path.join(OUT_DIR, "metric_significance_by_model.csv")
excerpts_path= os.path.join(OUT_DIR, "excerpts_top_differences.csv")

feat_df.to_csv(feat_df_path, index=False)
pd.concat(tfidf_outputs).to_json(os.path.join(OUT_DIR, "tfidf_differential_terms.json"), orient="records", indent=2) if tfidf_outputs else None

stats_df.to_csv(stats_path, index=False)
excerpts_df.to_csv(excerpts_path, index=False)

feat_df_path, stats_path, excerpts_path
