In [1]:
import json
import os
import random
from pathlib import Path
from typing import Any, Dict, List, TypedDict
from utils import Question, QsDataset

import torch
import yaml
from datasets import Dataset, DatasetDict, load_dataset
from tqdm import tqdm
from transformer_lens import HookedTransformer

2025-04-05 03:42:44.250554: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743824564.272305  269042 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743824564.279540  269042 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Load the IPHR-evaluated DeepSeek-R1 chains of thought (CoTs). The file is
# decoded as UTF-8 explicitly so the result does not depend on the platform's
# default text encoding.
with open('results/deepseek_r1_cots_iphr_eval.json', 'r', encoding='utf-8') as f:
    cots = json.load(f)

# Split the CoTs by their 'is_faithful_iphr' label.
# Identity checks (`is True` / `is None` / `is False`) are used instead of `==`
# for the singleton comparisons.
# NOTE(review): this assumes the label is stored as JSON true/false/null (which
# json.load turns into True/False/None) — confirm; a 0/1 encoding would need `==`.
#
# NOTE: I'm not sure about the below way of getting faithful CoTs.
# A CoT counts as faithful when it is labeled True, OR when it has no label
# (None) and its evaluated final answer equals the ground-truth answer.
# The stricter alternative is to keep only the items labeled True.
cots_faithful = [
    x for x in cots
    if x['is_faithful_iphr'] is True
    or (x['is_faithful_iphr'] is None and x['eval_final_answer'] == x['ground_truth_answer'])
]
cots_unfaithful = [x for x in cots if x['is_faithful_iphr'] is False]
print(f"Found {len(cots_faithful)} faithful and {len(cots_unfaithful)} unfaithful CoTs.")

Found 642 faithful and 180 unfaithful CoTs.


In [3]:
# Generation and model settings.
# NOTE: The model name must be listed in OFFICIAL_MODEL_NAMES in TransformerLens'
# loading_from_pretrained.py. n_ctx may also need adjusting.
model_name = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'
context_length = 2048
batch_size = 4         # questions handled per batch
temperature = 0.6      # sampling temperature
max_new_tokens = 1024  # upper bound on newly generated tokens
top_p = 0.92           # nucleus-sampling (top-p) threshold

# Only inference is done with this model here, so autograd is switched off globally.
torch.set_grad_enabled(False)

# Pick the device in order of preference: MPS, then CUDA, then CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f'Using device: {device}')

# Load the pretrained weights and move the model onto the selected device.
model = HookedTransformer.from_pretrained(model_name).to(device)

# NOTE(review): this overwrites the config field after the model has already been
# built; whether anything inside the model is resized in response is not visible
# here — confirm against the TransformerLens version in use.
model.cfg.n_ctx = context_length

# Short summary of the loaded architecture.
print("\n".join([
    f"Model loaded: {model.cfg.model_name}",
    f"  Context length: {model.cfg.n_ctx}",
    f"  Layers: {model.cfg.n_layers}",
    f"  Vocab size: {model.cfg.d_vocab}",
    f"  Hidden dim: {model.cfg.d_model}",
    f"  Attention heads: {model.cfg.n_heads}",
]))

Using device: cuda


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loaded pretrained model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B into HookedTransformer
Moving model to device:  cuda
Model loaded: DeepSeek-R1-Distill-Qwen-1.5B
  Context length: 2048
  Layers: 28
  Vocab size: 151936
  Hidden dim: 1536
  Attention heads: 12


In [4]:
# Take the first unfaithful CoT and run one forward pass over its generated
# text, keeping the activation cache next to the logits.
# remove_batch_dim=True is passed to run_with_cache; in the recorded run the
# printed logits shape still has a leading dimension of 1.
example_unfaithful = cots_unfaithful[0]
logits, cache = model.run_with_cache(
    example_unfaithful['generated_cot'],
    remove_batch_dim=True,
)
print(logits.size())

torch.Size([1, 818, 151936])
