In [5]:
import os
import json
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

In [6]:
def load_data(data_folder, variable_code):
    """Load ANES respondent JSONs and build examples for one survey variable.

    Every ``*.json`` file directly inside ``data_folder`` is parsed as one
    respondent. Within a respondent, only the first entry of ``responses``
    whose ``variable_code`` equals ``variable_code`` is considered. It yields
    an example when ``respondent_answer`` is one of the option texts listed in
    ``possible_answers``; otherwise the respondent is skipped.

    Args:
        data_folder: Directory holding the respondent JSON files.
        variable_code: Value matched against each response's ``variable_code``.

    Returns:
        A list of ``{'text': str, 'label': int}`` dicts in filename order,
        where ``text`` is the question followed by the comma-joined options
        and ``label`` is the index of the respondent's answer in the options.

    Raises:
        OSError: If ``data_folder`` or one of its files cannot be read.
        json.JSONDecodeError: If a ``.json`` file is not valid JSON.
        KeyError: If an entry of ``possible_answers`` has no ``'text'`` key.
    """
    examples = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # example order (and any seeded split of the result) reproducible.
    json_files = sorted(f for f in os.listdir(data_folder) if f.endswith('.json'))
    for filename in json_files:
        path = os.path.join(data_folder, filename)
        # Read as UTF-8 explicitly instead of relying on the platform default.
        with open(path, 'r', encoding='utf-8') as f:
            respondent = json.load(f)
        # `or []` also covers keys that are present but set to null.
        for item in respondent.get('responses') or []:
            if item.get('variable_code') == variable_code:
                question = item.get('full_question_text', '')
                options = [opt['text'] for opt in item.get('possible_answers') or []]
                answer = item.get('respondent_answer')
                if answer in options:
                    label = options.index(answer)
                    text_input = f"{question} Options: {', '.join(options)}"
                    examples.append({'text': text_input, 'label': label})
                # Only the first matching response per respondent is used.
                break
    return examples

In [7]:
def prepare_datasets(examples, test_size=0.2, random_state=42):
    """Separate examples into texts and labels, then split them for training.

    Args:
        examples: Iterable of dicts carrying a ``'text'`` and a ``'label'`` key.
        test_size: Passed through to ``train_test_split``.
        random_state: Seed passed through to ``train_test_split``.

    Returns:
        The result of ``train_test_split(texts, labels, ...)``, i.e.
        train texts, validation texts, train labels, validation labels.
    """
    texts, labels = [], []
    for example in examples:
        texts.append(example['text'])
        labels.append(example['label'])

    split = train_test_split(
        texts,
        labels,
        test_size=test_size,
        random_state=random_state,
    )
    return split

class ANESDataset(torch.utils.data.Dataset):
    """Dataset that pairs per-field encodings with a parallel sequence of labels."""

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the labels without copying them."""
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Return the number of examples, taken from the labels sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``'labels'``."""
        sample = {}
        # One tensor per encoding field, built from that field's idx-th entry.
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample





In [8]:
def main(data_folder='/home/tsultanov/shared/datasets/respondents_filtered',
         variable_code='V241039'):
    """Fine-tune ``bert-base-uncased`` to predict answers to one ANES question.

    Loads the examples for ``variable_code``, splits them into train and
    validation sets, fine-tunes a sequence classifier with ``Trainer``, saves
    the model and tokenizer to ``./bert_anes_model``, and prints the logits,
    probabilities and top-2 logit gap for the first validation text.

    Note: ``Trainer`` with PyTorch requires ``accelerate>=0.26.0``; without it
    construction fails with an ImportError.

    Args:
        data_folder: Folder containing all respondent JSON files.
        variable_code: Question code to train on.

    Returns:
        None. Returns early (after printing a message) when no examples exist.
    """
    # Load and split data
    examples = load_data(data_folder, variable_code)
    if not examples:
        print(f"No examples found for variable {variable_code}")
        return
    train_texts, val_texts, train_labels, val_labels = prepare_datasets(examples)

    # Tokenization
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    train_encodings = tokenizer(train_texts, truncation=True, padding=True)
    val_encodings = tokenizer(val_texts, truncation=True, padding=True)

    train_dataset = ANESDataset(train_encodings, train_labels)
    val_dataset = ANESDataset(val_encodings, val_labels)

    # Model initialization
    # Labels are option indices, so the head needs (largest index + 1) outputs.
    # Counting distinct training labels undercounts whenever an index is absent
    # from the training split, which puts some labels out of range.
    # The floor of 2 keeps the model in classification mode (num_labels == 1 is
    # treated as regression) and guarantees the top-2 lookup below is valid.
    num_labels = max(2, max(ex['label'] for ex in examples) + 1)
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased', num_labels=num_labels
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        logging_dir='./logs'
    )

    # Trainer setup
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    # Train!
    trainer.train()

    # Save the fine-tuned model
    model.save_pretrained('./bert_anes_model')
    tokenizer.save_pretrained('./bert_anes_model')

    # Example inference with logits
    model.eval()  # disable dropout so the printed numbers are deterministic
    test_text = val_texts[0]
    # Training may have moved the model to an accelerator; inputs must be on
    # the same device as the model's weights.
    enc = tokenizer(test_text, truncation=True, return_tensors='pt').to(model.device)
    with torch.no_grad():
        outputs = model(**enc)
    logits = outputs.logits
    probs = torch.softmax(logits, dim=-1)
    top2 = torch.topk(logits, 2).values
    confidence_gap = (top2[0][0] - top2[0][1]).item()

    print("Input:", test_text)
    print("Logits:", logits.detach().cpu().numpy())
    print("Probabilities:", probs.detach().cpu().numpy())
    print("Logit confidence gap:", confidence_gap)

# Entry point: run the full load / fine-tune / inference pipeline when this
# code is executed directly rather than imported as a module.
if __name__ == '__main__':
    main()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`