In [1]:
import json
from pathlib import Path
import torch
import os
import gc
from collections import Counter
from torch.utils.data import Dataset, random_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from peft import PeftModel
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import pandas as pd

## Load Data

In [2]:
def extract_legitimate_features(responses):
    """
    Extract only legitimate, non-leaky features from the responses.
    """
    features = {}
    
    # Helper to extract response safely
    def extract_response(responses, code):
        for r in responses:
            if r["variable_code"] == code:
                ans = r.get("respondent_answer")
                if ans in ['Inapplicable', 'Refused', "Don't know", 'Error']:
                    return "NA"
                return str(ans)
        return "NA"
    
    # Demographic features
    features["political_interest"] = extract_response(responses, "V241004")  # Political interest
    features["campaign_interest"] = extract_response(responses, "V241005")   # Campaign interest
    
    # Economic views (if available)
    features["economic_views"] = extract_response(responses, "V241127")
    
    # State/region information
    features["state"] = extract_response(responses, "V241017")
    
    # Media consumption (example)
    features["media_consumption"] = extract_response(responses, "V241201")
    
    # Convert features to a single text representation
    input_text = (
        f"Political interest: {features['political_interest']}\n"
        f"Campaign interest: {features['campaign_interest']}\n"
        f"Economic views: {features['economic_views']}\n"
        f"State: {features['state']}\n"
        f"Media consumption: {features['media_consumption']}\n"
        f"Q: Who would this respondent vote for in a Harris vs Trump election?"
    )
    
    return input_text, features

def load_data(data_folder, variable_code, exclude_classes=None, include_classes=None):
    """
    Loads question-response pairs for a given ANES variable code.
    Uses only legitimate features that don't leak the outcome.
    """
    examples = []
    label_map = {}
    next_label_id = 0
    features_data = []

    excluded_count = 0
    included_count = 0
    missing_answer_count = 0
    not_included_count = 0
    matched_count = 0

    if exclude_classes is None:
        exclude_classes = ['Inapplicable', 'Refused', "Don't know", 'Error', "Don't know"]

    json_files = [f for f in os.listdir(data_folder) if f.endswith('.json')]
    print(f"Processing {len(json_files)} JSON files for variable {variable_code}")

    for i, fname in enumerate(json_files):
        if i % 500 == 0:
            print(f"Progress: {i}/{len(json_files)} files processed")

        """ try:
            with open(os.path.join(data_folder, fname)) as f:
                respondent = json.load(f)
        except (json.JSONDecodeError, FileNotFoundError):
            continue

        responses = respondent.get("responses", []) """
        with open(os.path.join(data_folder, fname)) as f:
            try:
                data = json.load(f)
                if isinstance(data, dict):
                    respondents = [data]
                elif isinstance(data, list):
                    respondents = data
                else:
                    continue
            except (json.JSONDecodeError, FileNotFoundError):
                continue

            for respondent in respondents:
                responses = respondent.get("responses", [])
                found = False
                for item in responses:
                    if item.get("variable_code") != variable_code:
                        continue

                    question = item.get("full_question_text", "")
                    possible_answers = [opt["text"] for opt in item.get("possible_answers", [])]
                    respondent_answer = item.get("respondent_answer", None)

                    if not respondent_answer:
                        missing_answer_count += 1
                        continue

                    if respondent_answer in exclude_classes:
                        excluded_count += 1
                        continue

                    if include_classes and respondent_answer not in include_classes:
                        not_included_count += 1
                        continue

                    included_count += 1

                    if respondent_answer not in label_map:
                        label_map[respondent_answer] = next_label_id
                        next_label_id += 1
                    label = label_map[respondent_answer]

                    input_text, features = extract_legitimate_features(responses)

                    examples.append((input_text, label))
                    features_data.append(features)
                    matched_count += 1
                    found = True
                    break  # Only use first match per respondent

        found = False
        for item in responses:
            if item.get("variable_code") != variable_code:
                continue

            question = item.get("full_question_text", "")
            possible_answers = [opt["text"] for opt in item.get("possible_answers", [])]
            respondent_answer = item.get("respondent_answer", None)

            if not respondent_answer:
                missing_answer_count += 1
                continue

            if respondent_answer in exclude_classes:
                excluded_count += 1
                continue

            if include_classes and respondent_answer not in include_classes:
                not_included_count += 1
                continue

            included_count += 1

            if respondent_answer not in label_map:
                label_map[respondent_answer] = next_label_id
                next_label_id += 1
            label = label_map[respondent_answer]

            # Extract legitimate features instead of leaky ones
            input_text, features = extract_legitimate_features(responses)
            
            examples.append((input_text, label))
            features_data.append(features)
            matched_count += 1
            found = True
            break  # Only use first match per respondent

    # Summary logging
    print(f"\n📊 Summary for variable {variable_code}:")
    print(f"  ➤ Total JSON files: {len(json_files)}")
    print(f"  ➤ Valid examples collected: {matched_count}")
    print(f"  ➤ Unique labels: {len(label_map)}")
    print(f"  ➤ Skipped due to missing answers: {missing_answer_count}")
    print(f"  ➤ Skipped due to exclusion list: {excluded_count}")
    print(f"  ➤ Skipped (not in include_classes): {not_included_count}")
    if include_classes:
        print(f"  ➤ Included only: {include_classes}")
    print(f"  ➤ Final label map: {label_map}")

    # Class distribution
    label_counts = Counter([label for _, label in examples])
    print("\n🔍 Class distribution (label IDs):", label_counts)
    for label, count in label_counts.items():
        for key, val in label_map.items():
            if val == label:
                print(f"  ➤ '{key}': {count} samples")

    return examples, label_map, features_data

def print_class_distribution(labels, label_map):
    """Print the distribution of classes in the dataset."""
    from collections import Counter
    reverse_label_map = {v: k for k, v in label_map.items()}
    
    label_counts = Counter(labels)
    print("\nClass Distribution:")
    print("-" * 50)
    for label_id, count in sorted(label_counts.items()):
        class_name = reverse_label_map.get(label_id, f"Unknown_{label_id}")
        percentage = (count / len(labels)) * 100
        print(f"{class_name}: {count} ({percentage:.1f}%)")

In [3]:
data_folder = "/home/tschuetz/shared/datasets/respondents"
variable_code="V241049"
include_classes = ['Donald Trump', 'Kamala Harris']
# Load data with legitimate features
examples, label_map, features_data = load_data(data_folder, variable_code, include_classes=include_classes)

# Split texts and labels
texts = [ex[0] for ex in examples]
labels = [ex[1] for ex in examples]

# Print class distribution
print_class_distribution(labels, label_map)

Processing 3350 JSON files for variable V241049
Progress: 0/3350 files processed
Progress: 500/3350 files processed
Progress: 1000/3350 files processed
Progress: 1500/3350 files processed
Progress: 2000/3350 files processed
Progress: 2500/3350 files processed
Progress: 3000/3350 files processed

📊 Summary for variable V241049:
  ➤ Total JSON files: 3350
  ➤ Valid examples collected: 7257
  ➤ Unique labels: 2
  ➤ Skipped due to missing answers: 0
  ➤ Skipped due to exclusion list: 87
  ➤ Skipped (not in include_classes): 872
  ➤ Included only: ['Donald Trump', 'Kamala Harris']
  ➤ Final label map: {'Donald Trump': 0, 'Kamala Harris': 1}

🔍 Class distribution (label IDs): Counter({1: 3987, 0: 3270})
  ➤ 'Donald Trump': 3270 samples
  ➤ 'Kamala Harris': 3987 samples

Class Distribution:
--------------------------------------------------
Donald Trump: 3270 (45.1%)
Kamala Harris: 3987 (54.9%)


In [4]:
class ANESDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )

        return {
            "input_ids": enc["input_ids"].squeeze(0).long(),
            "attention_mask": enc["attention_mask"].squeeze(0).float(),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }

In [5]:
def load_split_anes_dataset(texts, labels, tokenizer, max_len=256, seed=42):
    """
    Creates train/val/test splits from raw texts and labels, returning three Dataset objects.
    """
    # Wrap raw data in the ANESDataset
    full_dataset = ANESDataset(texts, labels, tokenizer, max_len=max_len)
    total = len(full_dataset)
    
    # Compute split sizes: 80% train, 10% val, 10% test
    train_size = int(0.8 * total)
    val_size = int(0.1 * total)
    test_size = total - train_size - val_size
    
    # Perform the split with a fixed random seed for reproducibility
    train_ds, val_ds, test_ds = random_split(
        full_dataset,
        [train_size, val_size, test_size],
        generator=torch.Generator().manual_seed(seed)
    )
    
    print(f"Dataset split: {train_size} train, {val_size} val, {test_size} test")
    return train_ds, val_ds, test_ds

# Usage example:
# texts = [ex[0] for ex in examples]
# labels = [ex[1] for ex in examples]
# train_ds, val_ds, test_ds = load_split_anes_dataset(texts, labels, tokenizer)


## Train new prediction heads

In [6]:
def get_subfolder_names(path='.'):
    """
    Returns a list of all subfolder names in the specified directory.

    Parameters:
        path (str): The directory path to search in. Defaults to the current directory.

    Returns:
        List[str]: A list of subfolder names.
    """
    return [name for name in os.listdir(path) if os.path.isdir(os.path.join(path, name))]

# Example usage:
fine_tuned_models = get_subfolder_names('./lora_finetuned_model/deepseek-llm-7b-base/')  # Replace '.' with your desired path
print(fine_tuned_models)

FileNotFoundError: [Errno 2] No such file or directory: './lora_finetuned_model/deepseek-llm-7b-base/'

In [8]:
def freeze_for_classification_head(model):
    """
    Freeze all parameters except those in the classification head,
    which in LlamaForSequenceClassification is under 'score'.
    """
    for name, param in model.named_parameters():
        # keep anything with "score" in its name
        param.requires_grad = "score" in name

    # Debug: print exactly which parameters remain trainable
    unfrozen = [n for n,p in model.named_parameters() if p.requires_grad]
    print(f"▶️  {len(unfrozen)} trainable params (head only):\n    " + "\n    ".join(unfrozen))


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds),
        "precision": precision_score(labels, preds),
        "recall": recall_score(labels, preds),
    }


def train_model(model, tokenizer, train_dataset, val_dataset, output_dir):
    if not torch.cuda.is_available():
        raise RuntimeError("No CUDA device found! Training requires a GPU.")
    
    # Check where the model currently lives
    first_param_device = next(model.parameters()).device
    if first_param_device.type != "cuda":
        print(f"⚠️  Model is on {first_param_device}, moving to CUDA")
        model.to("cuda")
    else:
        print("✅ Model is already on GPU")

    args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=8,
        num_train_epochs=3,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_steps=10,
        report_to="none",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        fp16=True,
        dataloader_num_workers=4,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
    return trainer

def evaluate_on_test_set(trainer, test_dataset):
    results = trainer.evaluate(eval_dataset=test_dataset)
    print("📊 Test Set Metrics:")
    for k, v in results.items():
        if not k.startswith("eval_"):
            continue
        print(f"{k.replace('eval_', '')}: {v:.4f}")
    return results

In [None]:
# name of used LLM
base_model_name = "deepseek-ai/deepseek-llm-7b-base"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token

# split dataset
train_ds, val_ds, test_ds = load_split_anes_dataset(texts, labels, tokenizer)

results_list = []

for model_ft in fine_tuned_models:
    
    model = AutoModelForSequenceClassification.from_pretrained(base_model_name, num_labels=2)
    model = PeftModel.from_pretrained(model, f"./lora_finetuned_model/deepseek-llm-7b-base/{model_ft}")
    model.config.pad_token_id = tokenizer.eos_token_id

    freeze_for_classification_head(model)

    trainer = train_model(model, tokenizer, train_ds, val_ds, f"./prediction_heads/{model_ft}")

    results = evaluate_on_test_set(trainer, test_ds)

    # Clean and append results
    cleaned_results = {k.replace("eval_", ""): v for k, v in results.items() if k.startswith("eval_")}
    cleaned_results["model_name"] = model_ft
    results_list.append(cleaned_results)

    # clear models from GPU, as the would accumulate and take up too much VRAM
    del model
    del trainer
    torch.cuda.empty_cache()
    gc.collect()

# Create a DataFrame
results_df = pd.DataFrame(results_list)

# Optional: save to CSV
results_df.to_csv("fine_tuned_model_test_metrics.csv", index=False)

# View results
print(results_df)

In [11]:
# name of used LLM
base_model_name = "deepseek-ai/deepseek-llm-7b-base"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token

# split dataset
train_ds, val_ds, test_ds = load_split_anes_dataset(texts, labels, tokenizer)
    
model = AutoModelForSequenceClassification.from_pretrained(base_model_name, num_labels=2)
model.config.pad_token_id = tokenizer.eos_token_id

freeze_for_classification_head(model)

trainer = train_model(model, tokenizer, train_ds, val_ds, f"./prediction_heads/{'baseline'}")

results = evaluate_on_test_set(trainer, test_ds)

# Clean and append results
""" cleaned_results = {k.replace("eval_", ""): v for k, v in results.items() if k.startswith("eval_")}
cleaned_results["model_name"] = model_ft
results_list.append(cleaned_results) """

# clear models from GPU, as the would accumulate and take up too much VRAM
del model
del trainer
torch.cuda.empty_cache()
gc.collect()

Dataset split: 5805 train, 725 val, 727 test


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at deepseek-ai/deepseek-llm-7b-base and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


▶️  1 trainable params (head only):
    score.weight
⚠️  Model is on cpu, moving to CUDA


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5733,0.60162,0.726897,0.80396,0.67893,0.985437
2,0.5594,0.570672,0.84,0.869369,0.810924,0.936893
3,0.5123,0.555795,0.846897,0.875978,0.811594,0.951456


📊 Test Set Metrics:
loss: 0.5626
accuracy: 0.8294
f1: 0.8631
precision: 0.7867
recall: 0.9560
runtime: 11.2238
samples_per_second: 64.7730
steps_per_second: 8.1080


6254