In [2]:
import os
import json
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import RobertaTokenizer, RobertaForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.utils.class_weight import compute_class_weight
import torch.nn as nn
import torch.nn.functional as F
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


In [4]:
def extract_legitimate_features(responses):
    """
    Extract only legitimate, non-leaky features from the responses.
    """
    features = {}
    
    # Helper to extract response safely
    def extract_response(responses, code):
        for r in responses:
            if r["variable_code"] == code:
                ans = r.get("respondent_answer")
                if ans in ['Inapplicable', 'Refused', "Don't know", 'Error']:
                    return "NA"
                return str(ans)
        return "NA"
    
    # Demographic features
    features["political_interest"] = extract_response(responses, "V241004")  # Political interest
    features["campaign_interest"] = extract_response(responses, "V241005")   # Campaign interest
    
    # Economic views (if available)
    features["economic_views"] = extract_response(responses, "V241127")
    
    # State/region information
    features["state"] = extract_response(responses, "V241017")
    
    # Media consumption (example)
    features["media_consumption"] = extract_response(responses, "V241201")
    
    # Convert features to a single text representation
    input_text = (
        f"Political interest: {features['political_interest']}\n"
        f"Campaign interest: {features['campaign_interest']}\n"
        f"Economic views: {features['economic_views']}\n"
        f"State: {features['state']}\n"
        f"Media consumption: {features['media_consumption']}\n"
        f"Q: Who would this respondent vote for in a Harris vs Trump election?"
    )
    
    return input_text, features

def load_data(data_folder, variable_code, exclude_classes=None, include_classes=None):
    """
    Loads question-response pairs for a given ANES variable code.
    Uses only legitimate features that don't leak the outcome.
    """
    examples = []
    label_map = {}
    next_label_id = 0
    features_data = []

    excluded_count = 0
    included_count = 0
    missing_answer_count = 0
    not_included_count = 0
    matched_count = 0

    if exclude_classes is None:
        exclude_classes = ['Inapplicable', 'Refused', "Don't know", 'Error', "Don't know"]

    json_files = [f for f in os.listdir(data_folder) if f.endswith('.json')]
    print(f"Processing {len(json_files)} JSON files for variable {variable_code}")

    for i, fname in enumerate(json_files):
        if i % 500 == 0:
            print(f"Progress: {i}/{len(json_files)} files processed")

        try:
            with open(os.path.join(data_folder, fname)) as f:
                respondent = json.load(f)
        except (json.JSONDecodeError, FileNotFoundError):
            continue

        responses = respondent.get("responses", [])
        found = False
        for item in responses:
            if item.get("variable_code") != variable_code:
                continue

            question = item.get("full_question_text", "")
            possible_answers = [opt["text"] for opt in item.get("possible_answers", [])]
            respondent_answer = item.get("respondent_answer", None)

            if not respondent_answer:
                missing_answer_count += 1
                continue

            if respondent_answer in exclude_classes:
                excluded_count += 1
                continue

            if include_classes and respondent_answer not in include_classes:
                not_included_count += 1
                continue

            included_count += 1

            if respondent_answer not in label_map:
                label_map[respondent_answer] = next_label_id
                next_label_id += 1
            label = label_map[respondent_answer]

            # Extract legitimate features instead of leaky ones
            input_text, features = extract_legitimate_features(responses)
            
            examples.append((input_text, label))
            features_data.append(features)
            matched_count += 1
            found = True
            break  # Only use first match per respondent

    # Summary logging
    print(f"\n📊 Summary for variable {variable_code}:")
    print(f"  ➤ Total JSON files: {len(json_files)}")
    print(f"  ➤ Valid examples collected: {matched_count}")
    print(f"  ➤ Unique labels: {len(label_map)}")
    print(f"  ➤ Skipped due to missing answers: {missing_answer_count}")
    print(f"  ➤ Skipped due to exclusion list: {excluded_count}")
    print(f"  ➤ Skipped (not in include_classes): {not_included_count}")
    if include_classes:
        print(f"  ➤ Included only: {include_classes}")
    print(f"  ➤ Final label map: {label_map}")

    # Class distribution
    label_counts = Counter([label for _, label in examples])
    print("\n🔍 Class distribution (label IDs):", label_counts)
    for label, count in label_counts.items():
        for key, val in label_map.items():
            if val == label:
                print(f"  ➤ '{key}': {count} samples")

    return examples, label_map, features_data


In [5]:
class ANESDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )

        return {
            "input_ids": enc["input_ids"].squeeze(0).long(),
            "attention_mask": enc["attention_mask"].squeeze(0).float(),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }


In [6]:

class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        """
        alpha: 1D tensor of shape [num_classes] or None
        gamma: focusing parameter
        """
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, logits, targets):
        """
        logits: Tensor[B, C]
        targets: Tensor[B] with class indices 0 ≤ targets[i] < C
        """
        # move class weights if provided
        if self.alpha is not None:
            self.alpha = self.alpha.to(logits.device)

        # standard CE with no reduction → [B]
        ce = F.cross_entropy(logits, targets, weight=self.alpha, reduction='none')
        pt = torch.exp(-ce)             # [B], pt = probability of the true class
        loss = (1 - pt) ** self.gamma * ce

        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        else:
            return loss  # [B]

In [7]:
def extract_legitimate_features(responses):
    """
    Extract only legitimate, non-leaky features from the responses.
    """
    features = {}
    
    # Helper to extract response safely
    def extract_response(responses, code):
        for r in responses:
            if r["variable_code"] == code:
                ans = r.get("respondent_answer")
                if ans in ['Inapplicable', 'Refused', "Don't know", 'Error']:
                    return "NA"
                return str(ans)
        return "NA"
    
    # Demographic features
    features["political_interest"] = extract_response(responses, "V241004")  # Political interest
    features["campaign_interest"] = extract_response(responses, "V241005")   # Campaign interest
    
    # Economic views (if available)
    features["economic_views"] = extract_response(responses, "V241127")
    
    # State/region information
    features["state"] = extract_response(responses, "V241017")
    
    # Media consumption (example)
    features["media_consumption"] = extract_response(responses, "V241201")
    
    # Convert features to a single text representation
    input_text = (
        f"Political interest: {features['political_interest']}\n"
        f"Campaign interest: {features['campaign_interest']}\n"
        f"Economic views: {features['economic_views']}\n"
        f"State: {features['state']}\n"
        f"Media consumption: {features['media_consumption']}\n"
        f"Q: Who would this respondent vote for in a Harris vs Trump election?"
    )
    
    return input_text, features

In [8]:
def train_epoch(model, loader, optimizer, scheduler, device, loss_fn):
    model.train()
    total_loss, correct, total = 0.0, 0, 0

    for batch in loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits  # [B, C]

        loss = loss_fn(logits, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        total_loss += loss.item() * labels.size(0)
        preds = torch.argmax(logits, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    avg_loss = total_loss / total
    acc = correct / total
    return avg_loss, acc

def eval_epoch(model, loader, device, loss_fn):
    model.eval()
    total_loss, correct, total = 0.0, 0, 0
    all_logits, all_preds, all_labels = [], [], []

    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            loss = loss_fn(logits, labels)
            total_loss += loss.item() * labels.size(0)

            preds = torch.argmax(logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

            all_logits.append(logits.cpu())
            all_preds.append(preds.cpu())
            all_labels.append(labels.cpu())

    avg_loss = total_loss / total
    acc = correct / total

    all_logits = torch.cat(all_logits)
    all_preds = torch.cat(all_preds)
    all_labels = torch.cat(all_labels)
    return avg_loss, acc, all_logits, all_preds, all_labels


In [9]:
def print_class_distribution(labels, label_map):
    """Print the distribution of classes in the dataset."""
    from collections import Counter
    reverse_label_map = {v: k for k, v in label_map.items()}
    
    label_counts = Counter(labels)
    print("\nClass Distribution:")
    print("-" * 50)
    for label_id, count in sorted(label_counts.items()):
        class_name = reverse_label_map.get(label_id, f"Unknown_{label_id}")
        percentage = (count / len(labels)) * 100
        print(f"{class_name}: {count} ({percentage:.1f}%)")

In [10]:
def apply_threshold_and_report(val_logits, val_true, label_map, threshold=0.5):
    """
    Apply threshold-based prediction for binary classification and print report + confusion matrix.
    """
    # Convert logits to probabilities
    probs = torch.softmax(val_logits, dim=-1).cpu().numpy()

    # Get class indices
    idx_to_label = {v: k for k, v in label_map.items()}
    class_names = [idx_to_label[i] for i in range(len(idx_to_label))]
    
    # For binary classification, we can use the probability of class 1
    # If we have more than 2 classes, this needs to be adjusted
    if len(class_names) == 2:
        # Get index for the second class (typically index 1)
        class_idx = 1
        class_probs = probs[:, class_idx]
        
        # Binary prediction based on threshold
        preds = (class_probs > threshold).astype(int)
        
        # Convert true labels to binary format matching our predictions
        binary_true = (val_true.numpy() == class_idx).astype(int)
        
        # Print classification report
        print(f"\n✅ Classification Report (Thresholded @ {threshold:.2f}):")
        print(classification_report(binary_true, preds, target_names=class_names, zero_division=0))
        
        # Plot confusion matrix
        cm = confusion_matrix(binary_true, preds)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', xticklabels=class_names, yticklabels=class_names)
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.title(f"Confusion Matrix (Threshold: {threshold:.2f})")
        plt.tight_layout()
        plt.savefig(f'confusion_matrix_threshold_{threshold:.2f}.png')
        plt.close()
        
        # Calculate AUC
        try:
            auc = roc_auc_score(binary_true, class_probs)
            print(f"AUC: {auc:.4f}")
        except:
            print("Could not calculate AUC")
    else:
        print("This function is designed for binary classification only.")

def apply_sampling_strategy(X_train, y_train, strategy='smote'):
    """
    Apply sampling strategy to address class imbalance.
    
    Parameters:
    - X_train: Training features
    - y_train: Training labels
    - strategy: Sampling strategy ('smote', 'undersample', 'none')
    
    Returns:
    - Resampled X_train, y_train
    """
    if strategy == 'none':
        return X_train, y_train
    
    if strategy == 'smote':
        print("Applying SMOTE oversampling...")
        smote = SMOTE(random_state=42)
        X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
        return X_resampled, y_resampled
    
    if strategy == 'undersample':
        print("Applying random undersampling...")
        rus = RandomUnderSampler(random_state=42)
        X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
        return X_resampled, y_resampled
    
    raise ValueError(f"Unknown sampling strategy: {strategy}")


In [11]:
def main(data_folder, variable_code="V241049", sampling_strategy='smote'):
    # Target variable and label filtering
    # V241049 is "WHO WOULD R VOTE FOR: HARRIS VS TRUMP"
    include_classes = ['Donald Trump', 'Kamala Harris']

    # Load data with legitimate features
    examples, label_map, features_data = load_data(data_folder, variable_code, include_classes=include_classes)

    # Split texts and labels
    texts = [ex[0] for ex in examples]
    labels = [ex[1] for ex in examples]

    # Print class distribution
    print_class_distribution(labels, label_map)

    # Initialize tokenizer
    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    
    # Implement k-fold cross-validation
    n_splits = 5
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    fold_results = []
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(texts, labels)):
        print(f"\n{'='*50}")
        print(f"Fold {fold+1}/{n_splits}")
        print(f"{'='*50}")
        
        # Split data
        train_texts = [texts[i] for i in train_idx]
        train_labels = [labels[i] for i in train_idx]
        val_texts = [texts[i] for i in val_idx]
        val_labels = [labels[i] for i in val_idx]
        
        # Apply sampling strategy if needed
        if sampling_strategy != 'none':
            # For text data, we need to create a simple numerical representation for SMOTE
            # This is just for sampling purposes
            from sklearn.feature_extraction.text import TfidfVectorizer
            vectorizer = TfidfVectorizer(max_features=100)
            X_train_vec = vectorizer.fit_transform(train_texts).toarray()
            
            # Apply sampling
            X_train_resampled, y_train_resampled = apply_sampling_strategy(
                X_train_vec, train_labels, strategy=sampling_strategy
            )
            
            # Map back to texts (for oversampling, we'll have duplicates)
            # For undersampling, we'll have fewer samples
            if sampling_strategy == 'undersample':
                # Get indices of remaining samples
                remaining_indices = []
                for i, label in enumerate(train_labels):
                    if label in y_train_resampled:
                        remaining_indices.append(i)
                        y_train_resampled.remove(label)
                
                train_texts = [train_texts[i] for i in remaining_indices]
                train_labels = [train_labels[i] for i in remaining_indices]
            else:
                # For SMOTE, we can't map back to original texts
                # Instead, we'll duplicate existing texts based on the resampled labels
                class_counts = Counter(y_train_resampled)
                original_counts = Counter(train_labels)
                
                # Calculate how many samples to add for each class
                to_add = {cls: count - original_counts.get(cls, 0) for cls, count in class_counts.items()}
                
                # Get indices for each class
                class_indices = {cls: [] for cls in to_add.keys()}
                for i, label in enumerate(train_labels):
                    if label in class_indices:
                        class_indices[label].append(i)
                
                # Add duplicates
                for cls, count in to_add.items():
                    if count <= 0:
                        continue
                    
                    indices = class_indices[cls]
                    if not indices:
                        continue
                    
                    # Randomly select indices to duplicate
                    import random
                    random.seed(42)
                    duplicate_indices = [random.choice(indices) for _ in range(count)]
                    
                    # Add duplicates
                    for idx in duplicate_indices:
                        train_texts.append(train_texts[idx])
                        train_labels.append(train_labels[idx])
        
        # Create datasets and dataloaders
        train_dataset = ANESDataset(train_texts, train_labels, tokenizer)
        val_dataset = ANESDataset(val_texts, val_labels, tokenizer)
        
        # Use a smaller batch size if memory is an issue
        batch_size = 16
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)

        # Initialize model
        num_labels = len(label_map)
        model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=num_labels)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)

        # Compute class weights for handling imbalance
        classes = np.unique(train_labels)
        weights = compute_class_weight(class_weight="balanced", classes=classes, y=train_labels)
        weights_tensor = torch.tensor(weights, dtype=torch.float)
        
        # Create loss function with class weights
        loss_fn = FocalLoss(alpha=weights_tensor, gamma=2.0)

        # Optimizer and scheduler
        optimizer = AdamW(model.parameters(), lr=2e-5)
        total_steps = len(train_loader) * 4  # num_epochs = 4
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=int(0.1 * total_steps), num_training_steps=total_steps
        )

        # Training loop
        best_val_acc = 0
        best_val_loss = float('inf')
        best_model_state = None
        
        for epoch in range(1, 5):
            print(f"\nEpoch {epoch}")
            train_loss, train_acc = train_epoch(model, train_loader, optimizer, scheduler, device, loss_fn)
            val_loss, val_acc, val_logits, val_preds, val_labels = eval_epoch(model, val_loader, device, loss_fn)
            print(f"  Train - Loss: {train_loss:.4f}, Acc: {train_acc:.4f}")
            print(f"  Val   - Loss: {val_loss:.4f}, Acc: {val_acc:.4f}")
            
            # Save best model
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                best_val_loss = val_loss
                best_model_state = model.state_dict().copy()
        
        # Load best model for evaluation
        model.load_state_dict(best_model_state)
        _, _, val_logits, val_preds, val_labels = eval_epoch(model, val_loader, device, loss_fn)
        
        # Evaluate with different thresholds
        print("\nEvaluating with different thresholds:")
        thresholds = [0.3, 0.4, 0.5, 0.6, 0.7]
        for threshold in thresholds:
            apply_threshold_and_report(val_logits, val_labels, label_map, threshold=threshold)
        
        # Save fold results
        fold_results.append({
            'val_acc': best_val_acc,
            'val_loss': best_val_loss,
            'val_logits': val_logits,
            'val_labels': val_labels
        })
    
    # Aggregate results across folds
    print("\n" + "="*50)
    print("Cross-validation results:")
    print("="*50)
    
    avg_acc = np.mean([res['val_acc'] for res in fold_results])
    avg_loss = np.mean([res['val_loss'] for res in fold_results])
    
    print(f"Average validation accuracy: {avg_acc:.4f}")
    print(f"Average validation loss: {avg_loss:.4f}")
    
    # Train final model on all data
    print("\n" + "="*50)
    print("Training final model on all data:")
    print("="*50)
    
    # Create datasets and dataloaders for full training
    full_dataset = ANESDataset(texts, labels, tokenizer)
    full_loader = DataLoader(full_dataset, batch_size=batch_size, shuffle=True)
    
    # Initialize final model
    final_model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=num_labels)
    final_model.to(device)
    
    # Compute class weights for handling imbalance
    classes = np.unique(labels)
    weights = compute_class_weight(class_weight="balanced", classes=classes, y=labels)
    weights_tensor = torch.tensor(weights, dtype=torch.float)
    
    # Create loss function with class weights
    loss_fn = FocalLoss(alpha=weights_tensor, gamma=2.0)
    
    # Optimizer and scheduler
    optimizer = AdamW(final_model.parameters(), lr=2e-5)
    total_steps = len(full_loader) * 4  # num_epochs = 4
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=int(0.1 * total_steps), num_training_steps=total_steps
    )
    
    # Training loop
    for epoch in range(1, 5):
        print(f"\nEpoch {epoch}")
        train_loss, train_acc = train_epoch(final_model, full_loader, optimizer, scheduler, device, loss_fn)
        print(f"  Train - Loss: {train_loss:.4f}, Acc: {train_acc:.4f}")
    
    print("\n✅ Training completed.")
    
    # Save the model
    torch.save(final_model.state_dict(), 'anes_classifier_model_legitimate_features.pt')
    print("\nModel saved to 'anes_classifier_model_legitimate_features.pt'")

if __name__ == '__main__':
    # Replace with your data folder path
    data_folder = "/home/tsultanov/shared/datasets/respondents"
    
    # Try different sampling strategies
    for strategy in ['none', 'smote']:
        print(f"\n{'='*70}")
        print(f"Running with sampling strategy: {strategy}")
        print(f"{'='*70}")
        main(data_folder, sampling_strategy=strategy)



Running with sampling strategy: none
Processing 3349 JSON files for variable V241049
Progress: 0/3349 files processed
Progress: 500/3349 files processed
Progress: 1000/3349 files processed
Progress: 1500/3349 files processed
Progress: 2000/3349 files processed
Progress: 2500/3349 files processed
Progress: 3000/3349 files processed

📊 Summary for variable V241049:
  ➤ Total JSON files: 3349
  ➤ Valid examples collected: 2959
  ➤ Unique labels: 2
  ➤ Skipped due to missing answers: 0
  ➤ Skipped due to exclusion list: 34
  ➤ Skipped (not in include_classes): 356
  ➤ Included only: ['Donald Trump', 'Kamala Harris']
  ➤ Final label map: {'Donald Trump': 0, 'Kamala Harris': 1}

🔍 Class distribution (label IDs): Counter({1: 1623, 0: 1336})
  ➤ 'Donald Trump': 1336 samples
  ➤ 'Kamala Harris': 1623 samples

Class Distribution:
--------------------------------------------------
Donald Trump: 1336 (45.2%)
Kamala Harris: 1623 (54.8%)

Fold 1/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1
  Train - Loss: 0.1223, Acc: 0.7440
  Val   - Loss: 0.1002, Acc: 0.8514

Epoch 2
  Train - Loss: 0.0956, Acc: 0.8496
  Val   - Loss: 0.0989, Acc: 0.8547

Epoch 3
  Train - Loss: 0.0912, Acc: 0.8568
  Val   - Loss: 0.0902, Acc: 0.8547

Epoch 4
  Train - Loss: 0.0886, Acc: 0.8559
  Val   - Loss: 0.0909, Acc: 0.8547

Evaluating with different thresholds:

✅ Classification Report (Thresholded @ 0.30):
               precision    recall  f1-score   support

 Donald Trump       0.96      0.70      0.81       268
Kamala Harris       0.80      0.98      0.88       324

     accuracy                           0.85       592
    macro avg       0.88      0.84      0.84       592
 weighted avg       0.87      0.85      0.85       592

AUC: 0.9368

✅ Classification Report (Thresholded @ 0.40):
               precision    recall  f1-score   support

 Donald Trump       0.89      0.82      0.85       268
Kamala Harris       0.86      0.92      0.89       324

     accuracy                  

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1
  Train - Loss: 0.1250, Acc: 0.7406
  Val   - Loss: 0.1106, Acc: 0.8007

Epoch 2
  Train - Loss: 0.0930, Acc: 0.8411
  Val   - Loss: 0.1138, Acc: 0.8429

Epoch 3
  Train - Loss: 0.0894, Acc: 0.8538
  Val   - Loss: 0.1139, Acc: 0.8429

Epoch 4
  Train - Loss: 0.0870, Acc: 0.8564
  Val   - Loss: 0.1024, Acc: 0.8429

Evaluating with different thresholds:

✅ Classification Report (Thresholded @ 0.30):
               precision    recall  f1-score   support

 Donald Trump       0.95      0.62      0.75       267
Kamala Harris       0.76      0.97      0.85       325

     accuracy                           0.81       592
    macro avg       0.85      0.80      0.80       592
 weighted avg       0.84      0.81      0.80       592

AUC: 0.9225

✅ Classification Report (Thresholded @ 0.40):
               precision    recall  f1-score   support

 Donald Trump       0.95      0.62      0.75       267
Kamala Harris       0.76      0.97      0.85       325

     accuracy                  

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1
  Train - Loss: 0.1242, Acc: 0.7410
  Val   - Loss: 0.1044, Acc: 0.8699

Epoch 2
  Train - Loss: 0.0926, Acc: 0.8373
  Val   - Loss: 0.0889, Acc: 0.8564

Epoch 3
  Train - Loss: 0.0909, Acc: 0.8526
  Val   - Loss: 0.0849, Acc: 0.8497

Epoch 4
  Train - Loss: 0.0901, Acc: 0.8555
  Val   - Loss: 0.0851, Acc: 0.8564

Evaluating with different thresholds:

✅ Classification Report (Thresholded @ 0.30):
               precision    recall  f1-score   support

 Donald Trump       0.97      0.65      0.78       267
Kamala Harris       0.77      0.98      0.87       325

     accuracy                           0.83       592
    macro avg       0.87      0.82      0.82       592
 weighted avg       0.86      0.83      0.83       592

AUC: 0.9395

✅ Classification Report (Thresholded @ 0.40):
               precision    recall  f1-score   support

 Donald Trump       0.93      0.69      0.79       267
Kamala Harris       0.79      0.96      0.87       325

     accuracy                  

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1
  Train - Loss: 0.1259, Acc: 0.7220
  Val   - Loss: 0.0849, Acc: 0.8750

Epoch 2
  Train - Loss: 0.0977, Acc: 0.8378
  Val   - Loss: 0.0751, Acc: 0.8750

Epoch 3
  Train - Loss: 0.0928, Acc: 0.8411
  Val   - Loss: 0.0799, Acc: 0.8750

Epoch 4
  Train - Loss: 0.0914, Acc: 0.8471
  Val   - Loss: 0.0784, Acc: 0.8699

Evaluating with different thresholds:

✅ Classification Report (Thresholded @ 0.30):
               precision    recall  f1-score   support

 Donald Trump       0.97      0.71      0.82       267
Kamala Harris       0.80      0.98      0.89       325

     accuracy                           0.86       592
    macro avg       0.89      0.85      0.85       592
 weighted avg       0.88      0.86      0.86       592

AUC: 0.9470

✅ Classification Report (Thresholded @ 0.40):
               precision    recall  f1-score   support

 Donald Trump       0.89      0.80      0.84       267
Kamala Harris       0.85      0.92      0.88       325

     accuracy                  

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1
  Train - Loss: 0.1261, Acc: 0.7373
  Val   - Loss: 0.1102, Acc: 0.8190

Epoch 2
  Train - Loss: 0.0950, Acc: 0.8573
  Val   - Loss: 0.0955, Acc: 0.8426

Epoch 3
  Train - Loss: 0.0888, Acc: 0.8619
  Val   - Loss: 0.0942, Acc: 0.8443

Epoch 4
  Train - Loss: 0.0881, Acc: 0.8615
  Val   - Loss: 0.0911, Acc: 0.8443

Evaluating with different thresholds:

✅ Classification Report (Thresholded @ 0.30):
               precision    recall  f1-score   support

 Donald Trump       0.95      0.63      0.76       267
Kamala Harris       0.76      0.97      0.85       324

     accuracy                           0.82       591
    macro avg       0.86      0.80      0.81       591
 weighted avg       0.85      0.82      0.81       591

AUC: 0.9278

✅ Classification Report (Thresholded @ 0.40):
               precision    recall  f1-score   support

 Donald Trump       0.95      0.63      0.76       267
Kamala Harris       0.76      0.97      0.85       324

     accuracy                  

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1
  Train - Loss: 0.1261, Acc: 0.7519

Epoch 2
  Train - Loss: 0.0952, Acc: 0.8469

Epoch 3
  Train - Loss: 0.0900, Acc: 0.8530

Epoch 4
  Train - Loss: 0.0892, Acc: 0.8547

✅ Training completed.

Model saved to 'anes_classifier_model_legitimate_features.pt'

Running with sampling strategy: smote
Processing 3349 JSON files for variable V241049
Progress: 0/3349 files processed
Progress: 500/3349 files processed
Progress: 1000/3349 files processed
Progress: 1500/3349 files processed
Progress: 2000/3349 files processed
Progress: 2500/3349 files processed
Progress: 3000/3349 files processed

📊 Summary for variable V241049:
  ➤ Total JSON files: 3349
  ➤ Valid examples collected: 2959
  ➤ Unique labels: 2
  ➤ Skipped due to missing answers: 0
  ➤ Skipped due to exclusion list: 34
  ➤ Skipped (not in include_classes): 356
  ➤ Included only: ['Donald Trump', 'Kamala Harris']
  ➤ Final label map: {'Donald Trump': 0, 'Kamala Harris': 1}

🔍 Class distribution (label IDs): Counter({1: 1623

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1
  Train - Loss: 0.1267, Acc: 0.7841
  Val   - Loss: 0.1281, Acc: 0.8547

Epoch 2
  Train - Loss: 0.0924, Acc: 0.8545
  Val   - Loss: 0.0935, Acc: 0.8547

Epoch 3
  Train - Loss: 0.0879, Acc: 0.8610
  Val   - Loss: 0.0982, Acc: 0.8598

Epoch 4
  Train - Loss: 0.0878, Acc: 0.8618
  Val   - Loss: 0.0908, Acc: 0.8547

Evaluating with different thresholds:

✅ Classification Report (Thresholded @ 0.30):
               precision    recall  f1-score   support

 Donald Trump       0.96      0.70      0.81       268
Kamala Harris       0.80      0.98      0.88       324

     accuracy                           0.85       592
    macro avg       0.88      0.84      0.84       592
 weighted avg       0.87      0.85      0.85       592

AUC: 0.9335

✅ Classification Report (Thresholded @ 0.40):
               precision    recall  f1-score   support

 Donald Trump       0.96      0.70      0.81       268
Kamala Harris       0.80      0.98      0.88       324

     accuracy                  

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1
  Train - Loss: 0.1193, Acc: 0.7581
  Val   - Loss: 0.1051, Acc: 0.8446

Epoch 2
  Train - Loss: 0.0920, Acc: 0.8463
  Val   - Loss: 0.0990, Acc: 0.8497

Epoch 3
  Train - Loss: 0.0863, Acc: 0.8613
  Val   - Loss: 0.1055, Acc: 0.8480

Epoch 4
  Train - Loss: 0.0829, Acc: 0.8586
  Val   - Loss: 0.1018, Acc: 0.8615

Evaluating with different thresholds:

✅ Classification Report (Thresholded @ 0.30):
               precision    recall  f1-score   support

 Donald Trump       0.95      0.62      0.75       267
Kamala Harris       0.76      0.97      0.85       325

     accuracy                           0.81       592
    macro avg       0.85      0.80      0.80       592
 weighted avg       0.84      0.81      0.80       592

AUC: 0.9233

✅ Classification Report (Thresholded @ 0.40):
               precision    recall  f1-score   support

 Donald Trump       0.95      0.62      0.75       267
Kamala Harris       0.76      0.97      0.85       325

     accuracy                  

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1
  Train - Loss: 0.1286, Acc: 0.7338
  Val   - Loss: 0.0822, Acc: 0.8699

Epoch 2
  Train - Loss: 0.0943, Acc: 0.8502
  Val   - Loss: 0.0866, Acc: 0.8564

Epoch 3
  Train - Loss: 0.0919, Acc: 0.8459
  Val   - Loss: 0.0885, Acc: 0.8564

Epoch 4
  Train - Loss: 0.0894, Acc: 0.8536
  Val   - Loss: 0.0868, Acc: 0.8564

Evaluating with different thresholds:

✅ Classification Report (Thresholded @ 0.30):
               precision    recall  f1-score   support

 Donald Trump       0.97      0.65      0.78       267
Kamala Harris       0.77      0.98      0.87       325

     accuracy                           0.83       592
    macro avg       0.87      0.82      0.82       592
 weighted avg       0.86      0.83      0.83       592

AUC: 0.9350

✅ Classification Report (Thresholded @ 0.40):
               precision    recall  f1-score   support

 Donald Trump       0.97      0.66      0.78       267
Kamala Harris       0.78      0.98      0.87       325

     accuracy                  

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1
  Train - Loss: 0.1316, Acc: 0.7219
  Val   - Loss: 0.0789, Acc: 0.8666

Epoch 2
  Train - Loss: 0.0991, Acc: 0.8467
  Val   - Loss: 0.0758, Acc: 0.8666

Epoch 3
  Train - Loss: 0.0925, Acc: 0.8405
  Val   - Loss: 0.0771, Acc: 0.8666

Epoch 4
  Train - Loss: 0.0886, Acc: 0.8567
  Val   - Loss: 0.0811, Acc: 0.8666

Evaluating with different thresholds:

✅ Classification Report (Thresholded @ 0.30):
               precision    recall  f1-score   support

 Donald Trump       0.97      0.71      0.82       267
Kamala Harris       0.81      0.98      0.88       325

     accuracy                           0.86       592
    macro avg       0.89      0.85      0.85       592
 weighted avg       0.88      0.86      0.86       592

AUC: 0.9465

✅ Classification Report (Thresholded @ 0.40):
               precision    recall  f1-score   support

 Donald Trump       0.89      0.79      0.83       267
Kamala Harris       0.84      0.92      0.88       325

     accuracy                  

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1
  Train - Loss: 0.1202, Acc: 0.7675
  Val   - Loss: 0.0997, Acc: 0.8460

Epoch 2
  Train - Loss: 0.0961, Acc: 0.8487
  Val   - Loss: 0.0953, Acc: 0.8443

Epoch 3
  Train - Loss: 0.0922, Acc: 0.8549
  Val   - Loss: 0.0983, Acc: 0.8443

Epoch 4
  Train - Loss: 0.0913, Acc: 0.8603
  Val   - Loss: 0.0940, Acc: 0.8443

Evaluating with different thresholds:

✅ Classification Report (Thresholded @ 0.30):
               precision    recall  f1-score   support

 Donald Trump       0.95      0.63      0.76       267
Kamala Harris       0.76      0.97      0.85       324

     accuracy                           0.82       591
    macro avg       0.86      0.80      0.81       591
 weighted avg       0.85      0.82      0.81       591

AUC: 0.9288

✅ Classification Report (Thresholded @ 0.40):
               precision    recall  f1-score   support

 Donald Trump       0.95      0.63      0.76       267
Kamala Harris       0.76      0.97      0.85       324

     accuracy                  

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1
  Train - Loss: 0.1236, Acc: 0.7472

Epoch 2
  Train - Loss: 0.0964, Acc: 0.8462

Epoch 3
  Train - Loss: 0.0928, Acc: 0.8466

Epoch 4
  Train - Loss: 0.0880, Acc: 0.8516

✅ Training completed.

Model saved to 'anes_classifier_model_legitimate_features.pt'
