# Setup

In [None]:
# additional google colab setup
import sys


def colab_install():
    """Install the notebook's dependencies via ``%pip`` if a CUDA GPU is available.

    If ``torch.cuda.is_available()`` is false, a hint on how to select a GPU
    runtime is printed and the function returns without installing anything.
    """
    # torch is imported locally; it is only needed here to probe for a GPU.
    import torch
    if not torch.cuda.is_available():
      print("CUDA is not available. \nPick a GPU before running this notebook. \nGo to 'Runtime' -> 'Change runtime type' to do this.")
      return 
    # numpy is pinned to the 1.x series; all other packages are installed unpinned.
    %pip install numpy==1.* # ligtheval is not compatible with 2.0 TODO: check this
    %pip install lighteval
    %pip install transformers
    %pip install datasets
    %pip install peft
    %pip install bitsandbytes
    %pip install evaluate
    %pip install wandb
    return


if "google.colab" in sys.modules:
    print("Running in Google Colab")
    # Install required packages
    colab_install()
else:
    print("Not running in Google Colab")

In [None]:
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, TaskType, prepare_model_for_kbit_training, get_peft_model
import torch
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict

transformers.set_seed(24)

In [None]:
# TODO remove
DEBUG = True
if DEBUG:
    %cd survai-finetuning

# Data preparation

In [None]:
# download dataset
!curl -L -o 2016_anes_argyle.pkl https://github.com/tobihol/survai-finetuning/raw/main/2016_anes_argyle.pkl

In [None]:
df_survey = pd.read_pickle("2016_anes_argyle.pkl")
df_survey

In [None]:
# descriptive statistics
df_survey.info()

In [None]:
features = [
    "race",
    "discuss_politics",
    "ideology",
    "party",
    "church_goer",
    "age",
    "gender",
    "political_interest",
    "patriotism",
    "state",
]
label = "ground_truth"

In [None]:
# We treat missing values as a category of their own.
# The age column is rendered as text, and entries that were missing *before*
# the conversion are set to "missing" explicitly: a plain `.astype(str)` turns
# NaN into the literal string "nan", which the subsequent `fillna` can no
# longer detect.
df_survey_processed = (
    df_survey
    .assign(
        age=lambda frame: frame["age"]
        .astype(str)
        .where(frame["age"].notna(), "missing")
    )
    .fillna("missing")
)
df_survey_processed

### Train/Test split

In [None]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df_survey_processed, test_size=0.2, random_state=24)
dataset = DatasetDict({
    "train": Dataset.from_pandas(df_train, preserve_index=False),
    "test": Dataset.from_pandas(df_test, preserve_index=False),
})
dataset

## Prompt Design

In [None]:
instruction = (
    "Please perform a classification task. "
    + "Given the 2016 survey answers from the American National Election Studies, "
    + "return which candiate the person voted for. "
    + "Return a label from ['Trump', 'Clinton', 'Non-voter'] only without any other text.\n"
)
print(instruction)

In [None]:
column_name_map = {
    "race": "Race",
    "discuss_politics": "Discusses politics",
    "ideology": "Ideology",
    "party": "Party",
    "church_goer": "Church",
    "age": "Age",
    "gender": "Gender",
    "political_interest": "Political interest",
    "patriotism": "American Flag",
    "state": "State",
    "ground_truth": "Vote",
}

def create_prompt(row):
    """Build the full prompt for one survey row.

    Every key/value pair of ``row`` (the vote included, if present) is rendered
    as ``"<display name>: <value>"`` using ``column_name_map`` and the lines are
    appended, newline-separated, to the global ``instruction`` text.
    """
    rendered_fields = []
    for column, value in row.items():
        rendered_fields.append(f"{column_name_map[column]}: {value}")
    return instruction + "\n".join(rendered_fields)

def map_to_prompt(row):
    """Split one survey row into a model input and its expected answer.

    Returns a dict with
    - ``"text"``: the global ``instruction`` followed by one
      ``"<display name>: <value>"`` line per column, skipping the ``label`` column;
    - ``"label"``: the value stored under the ``label`` column.
    """
    feature_lines = (
        f"{column_name_map[column]}: {value}"
        for column, value in row.items()
        if column != label
    )
    return {
        "text": instruction + "\n".join(feature_lines),
        "label": row[label],
    }

map_to_prompt(dataset['train'][0])

In [None]:
dataset_llm = dataset.map(map_to_prompt).remove_columns(features+[label])
dataset_llm

# Loading the model

LLM training/inference is the wild west: there are a ton of different libraries/wrappers, and each one can implement details differently in ways that change your evaluation results. These libraries are also frequently fixed and updated in major ways, which can break your pipeline or change your results.

## Model Selection

### Which model should I fine-tune?
State-of-the-Art open-source model: **Llama 3 model family** *(Dubey et al., 2024)*
- Best performance; useful for testing the best possible performance on your data

Research model: **Pythia model family** *(Biderman et al., 2023)*
- Openly available training data
- Multiple smaller model sizes available
- Enables testing your finetuning pipeline more efficiently
- Enables comparing the effects of model size on performance
- Easy to test for data contamination
- Drawback: May not give a good representation of what is possible with SOTA models



### Which models currently perform best?
- https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard
- https://lmarena.ai/
- https://crfm.stanford.edu/helm/
    - Imputation Benchmark: https://crfm.stanford.edu/helm/classic/latest/#/groups/entity_data_imputation

In [None]:
# model_id = "EleutherAI/pythia-70m"
model_id = "unsloth/Llama-3.2-1B-Instruct"
# revision = "8d308458221c84f2b793d9b820d72e2c10159630"

# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    # revision=revision,
    padding_side="left",
    trust_remote_code=True,
)
if getattr(tokenizer, "pad_token_id") is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

### Tokenization

A problem I encountered while implementing my pipeline with: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/tree/main

In [None]:
# skip this if you don't have a huggingface account
from transformers import AutoTokenizer

chat = [
    {"role": "user", "content": "Hello world"},
    {"role": "assistant", "content": "Hello"},
]

tokenizer_mistral_old = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    revision="41b61a33a2483885c981aa79e0df6b32407ed873",
)

untokenized_output_mistral_old = tokenizer_mistral_old.apply_chat_template(
    chat,
    tokenize=False,
)
print(f"Untokenized output: {untokenized_output_mistral_old}")

tokenized_output_mistral_old = tokenizer_mistral_old.apply_chat_template(
    chat,
    tokenize=True,
)
print(f"Tokenized output: {tokenized_output_mistral_old}")

In [None]:
# skip this if you don't have a huggingface account
tokenizer_mistral_new = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2", revision="main"
)

untokenized_output_mistral_new = tokenizer_mistral_new.apply_chat_template(
    chat,
    tokenize=False,
)
print(f"Untokenized output: {untokenized_output_mistral_new}")

tokenized_output_mistral_new = tokenizer_mistral_new.apply_chat_template(
    chat,
    tokenize=True,
)
print(f"Tokenized output: {tokenized_output_mistral_new}")

-> The token for the `Hello` answer of the assistant is different!

Not only the dependency versions should be reported, but also the model version! Even small changes in the tokenizer can cause major changes in the output and make a finding non-reproducible.

In [None]:
def instruct_tokenize_function(examples):
    """Tokenize one example as a two-turn chat using the tokenizer's chat template.

    The user turn carries ``examples["text"]`` and the assistant turn carries
    ``examples["label"]``. No generation prompt is appended. The attention mask
    is all ones with the same shape as the returned token ids.
    """
    conversation = [
        {"role": "user", "content": examples["text"]},
        {"role": "assistant", "content": examples["label"]},
    ]
    token_ids = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=False,
    )
    return {
        "input_ids": token_ids,
        "attention_mask": np.ones_like(token_ids),
    }


def basic_tokenize_function(examples):
    """Tokenize one example as plain text, without a chat template.

    The text fed to the tokenizer is the prompt, a ``Vote:`` line holding the
    label, and the tokenizer's end-of-sequence token.
    """
    question, answer = examples["text"], examples["label"]
    return tokenizer(f"{question} \nVote: {answer} {tokenizer.eos_token}")


tokenized_dataset_llm = dataset_llm.map(basic_tokenize_function).remove_columns(
    ["text", "label"]
)
tokenized_dataset_llm

### Quantization
Quantization reduces the memory required to store the model (Dettmers et al., 2022). A model is typically stored in 16-bit precision, so a 70B model needs 16/8 bytes * 70 * 10^9 = 140 GB of VRAM. With 4-bit quantization all parameters are stored in 4 bits, so only 4/8 bytes * 70 * 10^9 = 35 GB of VRAM is needed.



In [None]:
# load model in 4bit
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto",
)

model = prepare_model_for_kbit_training(model)

if getattr(model.config, "pad_token_id") is None:
    model.config.pad_token_id = tokenizer.pad_token_id

### LoRA
Low-Rank Adapters (LoRA) are a parameter efficient fine-tuning method (Hu et al., 2021). Instead of finetuning all model weights, LoRA finetunes the weights of the adapter layers only. This requires less memory and allows for faster finetuning.

In [None]:
lora_rank = 8
lora_alpha = 8

lora_config = LoraConfig(
    r=lora_rank,
    lora_alpha=lora_alpha,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules="all-linear",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
model.config.use_cache = False

### The Answer Extraction Problem

<!-- - https://arxiv.org/pdf/2307.09702, https://github.com/dottxt-ai/outlines -->
Different approaches to answer extraction:
- https://blog.eleuther.ai/multiple-choice-normalization/
- https://github.com/huggingface/lighteval

Problem 1: How many tokens are needed to answer the question:
- One token solutions:
    - less compute intensive
    - do not require normalisation
    - only works if all first tokens are distinct
- Multi token solutions:
    - more compute intensive (multiplied by number of labels)
    - might require normalisation
    - does not require all first tokens to be distinct
    
Problem 2: How to evaluate multi token extraction (see code below)

In [None]:
from lighteval.metrics.metrics_sample import LoglikelihoodAcc
from lighteval.metrics.normalizations import (
    LogProbCharNorm,
    # LogProbTokenNorm,
    # LogProbPMINorm,
)
from lighteval.tasks.requests import Doc
import numpy as np

acc_metric = LoglikelihoodAcc(
    # LogProbCharNorm(ignore_first_space=False),
)

choices = ["Trump", "Clinton", "Non-voter"]
log_prob_predictions = np.log([0.34, 0.33, 0.32])
correct_choice = "Non-voter"

doc = Doc(query="...", choices=choices, gold_index=[choices.index(correct_choice)])

In [None]:
acc_without_normalisation = LoglikelihoodAcc(
    # LogProbCharNorm(ignore_first_space=False),
).compute(
    gold_ixs=doc.gold_index,
    choices_logprob=log_prob_predictions,
    unconditioned_logprob=None,
    choices_tokens=None,
    formatted_doc=doc,
)
print(f"Accuracy score without normalisation: {acc_without_normalisation}")

In [None]:
acc_with_normalisation = LoglikelihoodAcc(
    LogProbCharNorm(ignore_first_space=False),
).compute(
    gold_ixs=doc.gold_index,
    choices_logprob=log_prob_predictions,
    unconditioned_logprob=None,
    choices_tokens=None,
    formatted_doc=doc,
)
print(f"Accuracy score with normalisation: {acc_with_normalisation}")

### Metrics

In [None]:
import evaluate
from sklearn import metrics
from functools import partial

# TODO: make other metrics work
hf_metrics = [
    evaluate.load("accuracy"),
    # evaluate.load("f1"),
    # evaluate.load("precision"),
    # evaluate.load("recall"),
    # evaluate.load("confusion_matrix"),
]
sklearn_metrics = {
    # "accuracy": metrics.accuracy_score,
    # "balanced_accuracy": metrics.balanced_accuracy_score,
    # "f1_weighted": partial(metrics.f1_score, average="weighted"),
    # "confusion_matrix": metrics.confusion_matrix,
}

### Training helper functions

In [None]:
from typing import Tuple


def instruct_tokenization(
    data: DatasetDict,
    tokenizer: AutoTokenizer,
) -> Tuple[DatasetDict, DatasetDict, list]:
    """Tokenize a prompt/label dataset with the tokenizer's chat template.

    Two tokenized views of ``data`` are produced:

    - a *training* view: user turn (``"text"``) plus assistant turn
      (``"label"``), without a generation prompt;
    - an *inference* view: user turn only, with the generation prompt appended.

    Args:
        data: Dataset dict whose splits have a ``"text"`` and a ``"label"`` column.
        tokenizer: Tokenizer providing ``apply_chat_template``.

    Returns:
        ``(training_data, inference_data, answer_tokens)`` where
        ``answer_tokens`` is the sorted list of distinct token ids found at the
        first position after the inference prompt in the training sequences,
        i.e. the first token of each answer.

    Raises:
        AssertionError: If the number of distinct first answer tokens differs
            from the number of distinct labels, so that labels cannot be told
            apart by their first token.
    """
    from functools import partial

    def tokenize_function(examples, is_inference=False):
        prompt = [
            {"role": "user", "content": examples["text"]},
        ]
        if not is_inference:
            prompt.append(
                {
                    "role": "assistant",
                    "content": examples["label"],
                }
            )
        inputs_ids = tokenizer.apply_chat_template(
            prompt,
            add_generation_prompt=is_inference,
        )
        attention_mask = np.ones_like(inputs_ids)
        return {
            "input_ids": inputs_ids,
            "attention_mask": attention_mask,
        }

    # All original columns are dropped; only the tokenizer outputs remain.
    column_names = list(data.column_names.values())[0]
    training_data = data.map(tokenize_function, remove_columns=column_names)
    inference_data = data.map(
        partial(tokenize_function, is_inference=True), remove_columns=column_names
    )

    # Iterate split by split instead of concatenating columns with `+`, so the
    # function neither depends on the splits being called "train"/"test" nor on
    # the column objects supporting list concatenation.
    splits = list(data.keys())

    # The token at index len(inference_ids) of the training sequence is the
    # first token that follows the inference prompt. Sorting makes the order
    # of the returned ids reproducible.
    answer_tokens = sorted(
        {
            training_ids[len(inference_ids)]
            for split in splits
            for inference_ids, training_ids in zip(
                inference_data[split]["input_ids"],
                training_data[split]["input_ids"],
            )
        }
    )
    distinct_labels = {lbl for split in splits for lbl in data[split]["label"]}
    assert len(answer_tokens) == len(distinct_labels), (
        f"Found {len(answer_tokens)} distinct first answer tokens for "
        f"{len(distinct_labels)} distinct labels; single-token evaluation "
        "requires a unique first token per label."
    )

    return training_data, inference_data, answer_tokens


training_data, inference_data, answer_tokens = instruct_tokenization(
    dataset_llm, tokenizer
)

In [None]:
pred_slice_ids = [
    (len(inference_ids), len(training_ids) - 1)
    for inference_ids, training_ids in zip(
        inference_data["test"]["input_ids"], training_data["test"]["input_ids"]
    )
]  # NOTE: the -1 accounts for the eos token, which is not present for the generation data

def preprocess_logits_for_metrics(logits, labels):
    """Reduce full-vocabulary logits to the most likely answer token per position.

    Only the vocabulary entries listed in the global ``answer_tokens`` are
    compared. For every position the id of the highest-scoring one is returned,
    so the result has the logits' shape without the vocabulary axis.

    Args:
        logits: Logits tensor indexed as ``[:, :, vocab]``, or a tuple whose
            first element is that tensor.
        labels: Unused; part of the callback signature.

    Returns:
        Integer tensor of answer token ids, on the same device as ``logits``.
    """
    if isinstance(logits, tuple):
        # Depending on the model and config, logits may contain extra tensors,
        # like past_key_values, but logits always come first
        logits = logits[0]

    # Build the lookup table on the logits' own device instead of a hard-coded
    # "cuda", so this also works on CPU and on non-default devices.
    candidate_ids = torch.tensor(answer_tokens, device=logits.device)
    best_candidate = logits[:, :, candidate_ids].argmax(dim=-1)
    return candidate_ids[best_candidate]


def compute_metrics(eval_preds):
    """Score predictions on the first token of each answer.

    ``eval_preds`` is a ``(preds, labels)`` pair of 2-D arrays. Predictions are
    shifted one position to the right (and labels padded on the right to the
    same width) so that a prediction lines up with the label it predicts.
    Positions whose label is ``-100`` are dropped, the remaining tokens are cut
    to the per-example ``(start, end)`` window from the global
    ``pred_slice_ids``, and the first token of that window is compared.

    Returns a dict holding the merged results of every metric in ``hf_metrics``
    plus one entry per function in ``sklearn_metrics``.
    """
    raw_preds, raw_labels = eval_preds
    # preds have the same shape as the labels, after the argmax(-1) has been
    # calculated by preprocess_logits_for_metrics, but need shifting by one.
    shifted_preds = np.pad(
        raw_preds, ((0, 0), (1, 0)), mode="constant", constant_values=-100
    )
    padded_labels = np.pad(
        raw_labels, ((0, 0), (0, 1)), mode="constant", constant_values=-100
    )

    y_true = []
    y_pred = []
    for (start_id, end_id), pred_row, label_row in zip(
        pred_slice_ids, shifted_preds, padded_labels
    ):
        keep = label_row != -100
        # accuracy is based on the first token of the vote only
        y_true.append(label_row[keep][start_id:end_id][0])
        y_pred.append(pred_row[keep][start_id:end_id][0])

    results = {}
    for metric in hf_metrics:
        results |= metric.compute(predictions=y_pred, references=y_true)
    for metric_name, metric_func in sklearn_metrics.items():
        results[metric_name] = metric_func(y_true=y_true, y_pred=y_pred)
    return results

### Training the model

In [None]:
import wandb
from datetime import datetime

now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
run_name = f"{model_id}_{now}"


def finetune(
    model,
    tokenizer,
    training_data,
    run_name,
):
    """Fine-tune ``model`` for one epoch and log the run to Weights & Biases.

    The model is evaluated once before training, again after each third of the
    epoch (``eval_steps=1 / 3``), and once more after training.

    Args:
        model: Model handed to ``transformers.Trainer``.
        tokenizer: Tokenizer used by the language-modeling data collator.
        training_data: Mapping with a ``"train"`` and a ``"test"`` dataset; the
            latter is used as the evaluation set.
        run_name: Name of the W&B run and of the trainer run.

    Returns:
        None. Trainer outputs are written to ``./results``.
    """
    # To run without logging, call wandb.init(mode="disabled") here instead.
    wandb.init(
        project="survai-finetuning",
        name=run_name,
    )

    # Close the W&B run even if evaluation or training raises, so a failed
    # attempt does not leave a dangling run behind in the notebook session.
    try:
        trainer = transformers.Trainer(
            model=model,
            train_dataset=training_data["train"],
            eval_dataset=training_data["test"],
            args=transformers.TrainingArguments(
                output_dir="./results",
                gradient_checkpointing=True,
                gradient_checkpointing_kwargs={"use_reentrant": False},
                fp16=True,
                optim="paged_adamw_8bit",

                # train/eval settings
                num_train_epochs=1,
                do_eval=True,
                eval_strategy="steps",
                eval_steps=1 / 3,  # after each third

                # logging
                logging_steps=10,
                report_to="wandb",
                run_name=run_name,
            ),
            data_collator=transformers.DataCollatorForLanguageModeling(
                tokenizer, mlm=False
            ),
            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
            compute_metrics=compute_metrics,
        )

        trainer.evaluate()
        trainer.train()
        trainer.evaluate()
    finally:
        wandb.finish()

### Systematic Non-responses Experiment

Party affiliation is (obviously) a strong predictor of vote choice. In the Argyle et al. (2022) study, GPT-3 mainly used the party affiliation and ideology of a person to predict the vote choice.

In this experiment we remove Republican voters from the train set. We therefore only train on Democrats and independents and see if the model can still perform well.

In [None]:
df_train["party"].value_counts()

In [None]:
leans_republican = df_train["party"].apply(lambda x: "Republican" in x)
df_train_ex2 = df_train[~leans_republican]
df_train_ex2

In [None]:
# TODO probably rerunning the notebook would be the right way to go about this?

# Things we did not cover

Some parts of the pipeline we did not cover because of time constraints, but they should be done in practice:
- Hyperparameter search
- Cross validation
- Reporting multiple seeds