In [7]:
# Step up one directory so the relative paths used later ("preprocessed/...", "output/...") resolve.
# Caution: `%cd ..` is relative to the current directory, so re-running this cell moves up another level.
%cd ..
%pwd

/tmp/working


'/tmp/working'

In [8]:
import os
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Union

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from hydra import compose, initialize
from omegaconf import OmegaConf
from transformers import (
    AutoModel,
    AutoModelForMultipleChoice,
    AutoTokenizer,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)
from transformers.tokenization_utils_base import PaddingStrategy, PreTrainedTokenizerBase

import wandb

sys.path.append(os.pardir)

import utils



In [9]:
# Compose the Hydra config for experiment 006/001. `return_hydra_config=True` keeps the `hydra`
# node in the result so the runtime choices can be read below.
with initialize(version_base=None, config_path="../yamls"):
    c = compose(config_name="config", overrides=["exp=006/001"], return_hydra_config=True)
    cfg = c.exp
    runtime_choices = c.hydra.runtime.choices
    # The first path component comes from sys.argv[0]; the config dump printed by this cell shows
    # `output/ipykernel_launcher/000`, i.e. inside a notebook it is the kernel launcher's name.
    exp_name = f"{Path(sys.argv[0]).stem}/{runtime_choices.check}"
    output_path = Path(f"./output/{exp_name}")  # per-experiment output directory
    cfg.training_args.output_dir = str(output_path)
    print(OmegaConf.to_yaml(cfg))


# NOTE(review): `utils` is a project-local module; presumably this seeds the RNGs in use — confirm.
utils.seed_everything(cfg.seed)

wandb.init(
    project="kaggle-llm-science-holdout",
    name=exp_name,
    mode="disabled",  # logging is off for this run; previous setting: "online" if cfg.debug is False else "disabled"
    config=OmegaConf.to_container(cfg),
)

# Create the output directory up front; an already existing directory is not an error.
os.makedirs(output_path, exist_ok=True)

debug: ${debug}
seed: 7
early_stopping_patience: 3
training_args:
  fp16: true
  warmup_ratio: 0.8
  learning_rate: 5.0e-06
  weight_decay: 0.01
  dataloader_num_workers: 8
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  num_train_epochs: 30
  logging_strategy: epoch
  evaluation_strategy: epoch
  save_strategy: epoch
  metric_for_best_model: map@3
  save_total_limit: 1
  load_best_model_at_end: true
  report_to: wandb
  output_dir: output/ipykernel_launcher/000
  seed: ${..seed}
  gradient_accumulation_steps: 4
max_length: 100
model_name: microsoft/deberta-v3-large
sep_token:
- SEP
additional_data_paths:
- preprocessed/001/000/6000_train_examples.csv
- preprocessed/001/000/extra_train_set.csv
- preprocessed/001/000/15k_gpt3.5-turbo.csv
- preprocessed/001/000/5900_examples.csv
- preprocessed/001/000/test.csv
- preprocessed/001/000/stem_1k_v1.csv
valid_data_paths:
  preprocessed/001/000/train.csv: 200
  preprocessed/001/000/6000_all_categories_questions.csv: 200
  pre

In [10]:
# Validation CSVs mapped to the number of leading rows taken from each (`.head(num)` when loading).
# Defined here instead of being read from `cfg.valid_data_paths`. Insertion order is the order in
# which the files are concatenated.
valid_data_paths = {
    "preprocessed/001/000/train.csv": 200,
    # Left out of this run:
    # "preprocessed/001/000/6000_all_categories_questions.csv": 200,
    "preprocessed/001/000/6000_wiki_en_sci_questions.csv": 2000,
}

In [11]:
# Take the first `num` rows of every validation CSV and stack them in the order of
# `valid_data_paths`. `ignore_index=True` gives the combined frame a fresh 0..n-1 index; without
# it every file contributes its own 0-based labels, so index labels repeat across files and any
# later label-based lookup or alignment becomes ambiguous.
df_valid = pd.concat(
    [pd.read_csv(path).head(num) for path, num in valid_data_paths.items()],
    ignore_index=True,
)
df_valid = df_valid.drop(columns="id")
df_valid.shape

(2200, 8)

In [12]:
# Keep only the first `cfg.max_length` whitespace-separated words of each context.
context_head = df_valid["context"].apply(lambda text: " ".join(text.split()[: cfg.max_length]))

# Model input prefix: truncated context, an ellipsis, the separator, then the question prompt.
# NOTE(review): `cfg.sep_token` is interpolated as-is. The config dump lists it as a one-item
# sequence (`- SEP`), so the text inserted here is presumably that list's string form rather than
# a literal "[SEP]" — confirm it matches what the evaluated checkpoint was trained with.
df_valid["prompt_with_context"] = context_head + f"... {cfg.sep_token} " + df_valid["prompt"]

In [13]:
@dataclass
class DataCollatorForMultipleChoice:
    """Pad multiple-choice features and stack them into per-question tensors.

    Each incoming feature holds one question: every tokenizer field (e.g. ``input_ids``) is a
    list with one entry per answer choice, plus a ``label``/``labels`` entry with the index of
    the correct choice. The collator flattens all choices, pads them together through
    ``tokenizer.pad`` and reshapes every field to ``(batch_size, num_choices, -1)``.

    The input feature dicts are left untouched, so the same features can be collated again.
    """

    # Tokenizer whose `pad` method is used to pad the flattened choices.
    tokenizer: PreTrainedTokenizerBase
    # Forwarded unchanged to `tokenizer.pad(padding=...)`.
    padding: Union[bool, str, PaddingStrategy] = True
    # Forwarded unchanged to `tokenizer.pad(max_length=...)`.
    max_length: Optional[int] = None
    # Forwarded unchanged to `tokenizer.pad(pad_to_multiple_of=...)`.
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate a list of per-question feature dicts into one batch.

        Args:
            features: Non-empty list of dicts as described in the class docstring. The label
                key is taken from the first feature: ``"label"`` if present, else ``"labels"``.

        Returns:
            Dict of tensors shaped ``(batch_size, num_choices, -1)`` plus ``"labels"``, an
            int64 tensor built from the per-question labels.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the label instead of popping it so the caller's dicts are not mutated.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One dict per (question, choice) pair, in question-major order; the label is skipped
        # because it is a per-question value, not a per-choice list.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # Undo the flattening: regroup the choices belonging to each question.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [14]:
# The tokenizer is loaded from the base model name in the config, not from a checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)

# Wrap the validation frame as a `datasets.Dataset` so it can be tokenized with `.map`.
dataset_valid = Dataset.from_pandas(df_valid)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
# Answer letters "A".."E" <-> class ids 0..4, in both directions.
option_to_index = dict(zip("ABCDE", range(5)))
index_to_option = dict(enumerate("ABCDE"))


def preprocess(example):
    """Tokenize one question as five (context + prompt, option) sentence pairs.

    Args:
        example: Mapping with the keys ``"prompt_with_context"``, ``"A"``..``"E"`` and
            ``"answer"`` (the letter of the correct option).

    Returns:
        The tokenizer output for the five pairs (no truncation), extended with a ``"label"``
        entry holding the index of the correct option.
    """
    prefix = example["prompt_with_context"]
    options = [example[letter] for letter in "ABCDE"]

    # The same context + prompt is paired with every answer option.
    encoded = tokenizer([prefix] * len(options), options, truncation=False)
    encoded["label"] = option_to_index[example["answer"]]
    return encoded

In [16]:
# https://www.kaggle.com/code/philippsinger/h2ogpt-perplexity-ranking
def precision_at_k(r, k):
    """Return the fraction of hits among the first ``k`` entries of ``r``.

    Args:
        r: Sequence of relevance flags (1/True = hit), best-ranked first.
        k: Cut-off rank; must be non-zero and at most ``len(r)``.

    Raises:
        AssertionError: If ``k`` is zero or larger than ``len(r)``.
    """
    assert k <= len(r)
    assert k != 0
    return sum(int(x) for x in r[:k]) / k


def map_k(true_items, predictions, K=3):
    """Mean average precision at ``K`` with a single correct item per query.

    Args:
        true_items: Correct item for each query, indexable by position.
        predictions: Ranked predictions for each query, best first. Each entry is either a
            sequence of items or a whitespace-separated string such as ``"A B C"``; strings
            are split on whitespace so that every label occupies exactly one rank.
        K: Number of top-ranked predictions that are scored (default 3).

    Returns:
        The mean over all queries of the average precision at ``K``.

    Raises:
        ZeroDivisionError: If ``predictions`` is empty.
    """
    U = len(predictions)
    map_at_k = 0.0
    for u in range(U):
        user_preds = predictions[u]
        if isinstance(user_preds, str):
            # Iterating a string walks it character by character, so the separators would
            # take up ranks of their own and push later labels out of the top K.
            user_preds = user_preds.split()
        user_true = true_items[u]
        user_results = [1 if item == user_true else 0 for item in user_preds]
        for k in range(min(len(user_preds), K)):
            # Precision at rank k + 1 only counts when the item at that rank is a hit.
            map_at_k += precision_at_k(user_results, k + 1) * user_results[k]
    return map_at_k / U


import numpy as np


def predictions_to_map_output(predictions):
    """Turn per-option scores into ranked answer strings.

    Args:
        predictions: 2-D array of scores with one row per question and one column per option;
            a higher score ranks earlier.

    Returns:
        1-D array with one string per row: the option letters for that row, best first,
        joined by single spaces. Every option is kept, not only the top three.
    """
    # Negating the scores makes the ascending argsort yield a descending ranking.
    ranked_indices = np.argsort(-predictions)
    # Map each index to its option letter, i.e. 0 --> A.
    to_letter = np.vectorize(index_to_option.get)
    ranked_letters = to_letter(ranked_indices)
    return np.apply_along_axis(" ".join, 1, ranked_letters)


# Raw text columns that are dropped once each row has been tokenized by `preprocess`.
text_columns = ["prompt_with_context", "prompt", "A", "B", "C", "D", "E", "answer"]

# Tokenize row by row (not batched), since `preprocess` works on a single example.
tokenized_test_ds = dataset_valid.map(preprocess, remove_columns=text_columns, batched=False)

Map:   0%|          | 0/2200 [00:00<?, ? examples/s]

In [17]:
# Fine-tuned checkpoint to evaluate; the path is relative to the current working directory.
model_name = "output/006_add_valid/001/checkpoint-87040"
model = AutoModelForMultipleChoice.from_pretrained(model_name)

# The Trainer is built here only to run prediction: eval batch size 1, outputs go to the cwd.
args = TrainingArguments(output_dir=".", per_device_eval_batch_size=1)
collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)

trainer = Trainer(model=model, args=args, tokenizer=tokenizer, data_collator=collator)

In [None]:
# Run inference over the tokenized validation set and keep only the `.predictions` array.
test_predictions = trainer.predict(tokenized_test_ds).predictions

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
true_answers = df_valid["answer"].to_numpy()
ranked_answers = predictions_to_map_output(test_predictions)

# "old" scores only the first 200 rows (the rows taken from the first validation file);
# "new" scores the whole validation set.
# NOTE(review): `predictions_to_map_output` yields one space-joined string per row — confirm
# that `map_k` splits it into labels rather than walking it character by character.
result_dict = {
    "old_map@3": map_k(true_answers[:200], ranked_answers[:200]),
    "new_map@3": map_k(true_answers, ranked_answers),
}

print(result_dict)