# Chua Faithfulness Results Analysis


In [29]:
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('/Users/yuliav/PycharmProjects/thought-anchors-faithfulness/data/Chua_faithfulness_results.csv')
print(f"Total rows: {len(df)}")
print(f"Columns: {list(df.columns)}")


Total rows: 1620
Columns: ['question_with_cue', 'answer_due_to_cue', 'original_answer', 'ground_truth', 'cue_type', 'judge_extracted_evidence', 'cued_raw_response', 'model', 'cond']


## Unique Models


In [5]:
print("=== Unique Models ===")
models = df['model'].unique()
for m in models:
    count = len(df[df['model'] == m])
    print(f"  {m}: {count} rows")


=== Unique Models ===
  qwen/qwq-32b-preview: 361 rows
  gemini-2.0-flash-thinking-exp-01-21: 353 rows
  deepseek-reasoner: 286 rows
  qwen/qwen-2.5-72b-instruct: 89 rows
  meta-llama/llama-3.3-70b-instruct: 108 rows
  gpt-4o: 76 rows
  claude-3-5-sonnet-20241022: 128 rows
  gemini-2.0-flash-exp: 92 rows
  x-ai/grok-2-1212: 127 rows


## Models with Accessible Attention

| Model | Attention Accessible | Notes |
|-------|---------------------|-------|
| `qwen/qwq-32b-preview` | ✅ Yes | 32B reasoning model |
| `qwen/qwen-2.5-72b-instruct` | ✅ Yes | 72B |
| `meta-llama/llama-3.3-70b-instruct` | ✅ Yes | 70B |
| `gpt-4o` | ❌ No | API only |
| `claude-3-5-sonnet-20241022` | ❌ No | API only |
| `gemini-*` | ❌ No | API only |
| `deepseek-reasoner` | ❌ No | API only |
| `x-ai/grok-2-1212` | ❌ No | API only |


## Response Length Analysis (All Models)


In [28]:
# Add character length column
df['char_len'] = df['cued_raw_response'].astype(str).str.len()

print("=== cued_raw_response stats (all models) ===")
print(f"Max char length: {df['char_len'].max()}")
print(f"Mean char length: {df['char_len'].mean():.0f}")
print(f"Median char length: {df['char_len'].median():.0f}")
print(f"95th percentile: {df['char_len'].quantile(0.95):.0f}")
print(f"99th percentile: {df['char_len'].quantile(0.99):.0f}")

# Find model with max length
max_idx = df['char_len'].idxmax()
print(f"\nModel with max length response: {df.loc[max_idx, 'model']}")


=== cued_raw_response stats (all models) ===
Max char length: 32767
Mean char length: 6216
Median char length: 4325
95th percentile: 16845
99th percentile: 23156

Model with max length response: deepseek-reasoner


## Response Length by Model


In [7]:
# Stats per model
model_stats = df.groupby('model')['char_len'].agg(['count', 'mean', 'median', 'max', 
                                                    lambda x: x.quantile(0.95),
                                                    lambda x: x.quantile(0.99)])
model_stats.columns = ['count', 'mean', 'median', 'max', 'p95', 'p99']
model_stats = model_stats.round(0).astype(int)
model_stats


Unnamed: 0_level_0,count,mean,median,max,p95,p99
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
claude-3-5-sonnet-20241022,128,1067,1129,1472,1390,1437
deepseek-reasoner,286,10264,8832,32767,23514,32767
gemini-2.0-flash-exp,92,1777,1870,5030,2998,3476
gemini-2.0-flash-thinking-exp-01-21,353,10268,10122,19973,17015,19503
gpt-4o,76,1558,1578,3146,2236,3002
meta-llama/llama-3.3-70b-instruct,108,2210,2214,4191,3502,3896
qwen/qwen-2.5-72b-instruct,89,1944,1976,3398,2868,3275
qwen/qwq-32b-preview,361,6904,6206,19517,14688,17289
x-ai/grok-2-1212,127,1481,1384,10644,2526,3301


## Prompt Length Analysis: Stanford Professor Questions

Analyzing the `question_with_cue` column length for Stanford Professor questions to understand context window requirements.


In [23]:
# Filter for Stanford Professor questions
professor_df = df[
    (df['cue_type'] == 'Professor') & 
    (df['cond'].isin(['itc_failure', 'itc_success']))
].copy()

# Calculate prompt length (question_with_cue)
professor_df['prompt_char_len'] = professor_df['question_with_cue'].astype(str).str.len()

print("=== Prompt Length Stats (Stanford Professor Questions) ===")
print(f"Number of Professor questions: {len(professor_df)}")
print(f"Max prompt length: {professor_df['prompt_char_len'].max()} characters")
print(f"Mean prompt length: {professor_df['prompt_char_len'].mean():.0f} characters")
print(f"95th percentile: {professor_df['prompt_char_len'].quantile(0.95):.0f} characters")
print(f"99th percentile: {professor_df['prompt_char_len'].quantile(0.99):.0f} characters")


=== Prompt Length Stats (Stanford Professor Questions) ===
Number of Professor questions: 199
Max prompt length: 2454 characters
Mean prompt length: 839 characters
95th percentile: 2027 characters
99th percentile: 2375 characters


In [25]:
# Token estimates for prompts
max_prompt_chars = professor_df['prompt_char_len'].max()
mean_prompt_chars = professor_df['prompt_char_len'].mean()
p95_prompt_chars = professor_df['prompt_char_len'].quantile(0.95)
p99_prompt_chars = professor_df['prompt_char_len'].quantile(0.99)

print("=== Token Estimates for Prompts ===")
print(f"Max prompt @ 3 chars/token: {max_prompt_chars // 3} tokens")
print(f"Max prompt @ 4 chars/token: {max_prompt_chars // 4} tokens")
print()
print(f"Mean prompt @ 3 chars/token: {mean_prompt_chars // 3} tokens")
print(f"Mean prompt @ 4 chars/token: {mean_prompt_chars // 4} tokens")
print()
print(f"95th percentile @ 3 chars/token: {p95_prompt_chars // 3} tokens")
print(f"95th percentile @ 4 chars/token: {p95_prompt_chars // 4} tokens")




=== Token Estimates for Prompts ===
Max prompt @ 3 chars/token: 818 tokens
Max prompt @ 4 chars/token: 613 tokens

Mean prompt @ 3 chars/token: 279.0 tokens
Mean prompt @ 4 chars/token: 209.0 tokens

95th percentile @ 3 chars/token: 675.0 tokens
95th percentile @ 4 chars/token: 506.0 tokens


## Context Window Requirements

For `generate_rollouts.py` with `max_model_len` setting:
- Need to accommodate: **prompt length + max_tokens (generated response)**
- Current setting: `max_model_len=2048` tokens
- If using `max_tokens=2048` for generation, total needed = prompt_tokens + 2048


#### ITC Success (Faithful Switch)

- The model **switches its answer toward the cue**
- The model **explicitly acknowledges** that the cue influenced its decision

#### ITC Failure (Unfaithful Switch)

- The model **switches its answer toward the cue**
- BUT **does NOT acknowledge** the cue in its reasoning

In [27]:
    

# Check cond values for Professor questions specifically
print("=== cond Values for Professor Questions ===")
prof_all = df[df['cue_type'] == 'Professor']
print(prof_all['cond'].value_counts())
print()
print(f"Total Professor questions (all cond): {len(prof_all)}")

# Check what we get with itc_failure + itc_success filter
prof_filtered = df[
    (df['cue_type'] == 'Professor') & 
    (df['cond'].isin(['itc_failure', 'itc_success']))
]


=== cond Values for Professor Questions ===
cond
itc_success         162
non_itc_success      49
non_itc_failures     38
itc_failure          37
Name: count, dtype: int64

Total Professor questions (all cond): 286
