In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration, AutoModelForMultipleChoice 

# Merge with deberta_v3_large

In [2]:
from typing import Optional, Union
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from dataclasses import dataclass
from transformers import AutoTokenizer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer

In [3]:
# Evaluation frame: train.csv from the kaggle-llm-science-exam folder, with its
# `id` column dropped right after loading.
df_val = pd.read_csv(
    "/home/clay/research/kaggle/kaggle_llm/data/kaggle-llm-science-exam/train.csv"
).drop(columns="id")
df_val.shape

(200, 7)

In [4]:
# Training frame: extra_train_set.csv only. (An earlier variant of this cell
# concatenated that file onto an existing df_train; that variant is disabled.)
df_train = pd.read_csv(
    '/home/clay/research/kaggle/kaggle_llm/data/additional-train-data-for-llm-science-exam/extra_train_set.csv'
).reset_index(drop=True)
df_train.shape

(500, 7)

In [5]:
# Show the first rows of the training frame as a quick check of the loaded columns.
df_train.head()

Unnamed: 0,prompt,C,E,D,B,A,answer
0,"In relation to Eunice Fay McKenzie's career, w...",McKenzie gained recognition for her role as a ...,McKenzie's successful career in sound films co...,McKenzie's collaborations with director Blake ...,McKenzie is primarily remembered for her starr...,McKenzie showcased her singing talents in nume...,B
1,How does Modified Newtonian Dynamics (MOND) im...,MOND is a theory that reduces the observed mis...,MOND's impact on the observed missing baryonic...,MOND is a theory that eliminates the observed ...,MOND explains the missing baryonic mass in gal...,MOND is a theory that increases the discrepanc...,E
2,Which of the following statements accurately d...,Ray Montgomerie is a former footballer who pla...,Ray Montgomerie is a former footballer who pla...,Ray Montgomerie is a former footballer who pla...,Ray Montgomerie is a former footballer who pla...,Ray Montgomerie is a former footballer who pla...,B
3,What is the significance of the Museum of the ...,The Museum of the Occupation of Latvia was est...,The Museum of the Occupation of Latvia is a mu...,The Museum of the Occupation of Latvia primari...,The Museum of the Occupation of Latvia showcas...,The Museum of the Occupation of Latvia is a me...,C
4,What was the previous name of the Christian Sc...,The Evangelical School and Chapel for the Deaf...,The Evangelical School for the Blind (ESB),The Evangelical School for the Deaf (ESD),The Christian School for the Blind (CSB),The Christian School for the Deaf (CSD),D


In [9]:
# Checkpoint directory passed to both AutoTokenizer and AutoModelForMultipleChoice below.
# NOTE(review): despite the variable name, the active path points at
# google-electra-base-discriminator; the DeBERTa-v3-large path is the disabled
# line that follows. Confirm which backbone is intended.
# deberta_v3_large = '/home/clay/research/kaggle/kaggle_llm/data/deberta-v3-large-notebook-weights'
deberta_v3_large = "/home/clay/research/kaggle/kaggle_llm/data/pretrained_models/google-electra-base-discriminator"

In [10]:
# Wrap the training frame as a Hugging Face Dataset, then load the tokenizer
# stored alongside the selected checkpoint.
dataset = Dataset.from_pandas(df_train)
tokenizer = AutoTokenizer.from_pretrained(deberta_v3_large)

In [None]:
# Answer letters and their integer class ids, available in both directions.
options = 'ABCDE'
indices = list(range(5))

option_to_index = dict(zip(options, indices))
index_to_option = dict(zip(indices, options))


def preprocess(example):
    """Tokenize one multiple-choice row as five (prompt, choice) pairs.

    `example` is expected to be a dictionary with keys 'prompt', 'A', 'B',
    'C', 'D', 'E', and 'answer'.

    Returns the tokenizer output for the five pairs, with an extra 'label'
    entry holding the integer index of `example['answer']`.
    """
    question = example['prompt']
    choices = [example[letter] for letter in options]
    # AutoModelForMultipleChoice works on question/answer pairs, so the
    # question is repeated once for each of the five choices.
    encoded = tokenizer([question] * 5, choices, truncation=True)
    encoded['label'] = option_to_index[example['answer']]

    return encoded



In [None]:

@dataclass
class DataCollatorForMultipleChoice:
    """Pad a batch of multiple-choice features and regroup them per example.

    Each incoming feature holds one entry per answer choice under every key
    (plus a 'label' or 'labels' entry). The collator flattens all choices into
    a single list, pads them together with the tokenizer, and reshapes every
    padded tensor to (batch_size, num_choices, -1).
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate `features` into a dict of tensors with a 'labels' entry.

        Note: the label entry is popped from each feature, so the input
        dictionaries are modified in place.
        """
        # The label may be stored under either key; the first feature decides.
        if 'label' in features[0].keys():
            label_key = 'label'
        else:
            label_key = 'labels'
        targets = [example.pop(label_key) for example in features]

        n_examples = len(features)
        n_choices = len(features[0]['input_ids'])

        # One dict per (example, choice), ordered example by example, so the
        # reshape below groups the choices of each example together again.
        rows = [
            {field: values[choice] for field, values in example.items()}
            for example in features
            for choice in range(n_choices)
        ]

        padded = self.tokenizer.pad(
            rows,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )

        batch = {
            field: tensor.view(n_examples, n_choices, -1)
            for field, tensor in padded.items()
        }
        batch['labels'] = torch.tensor(targets, dtype=torch.int64)
        return batch
        
# Tokenize every training row; the raw text columns are dropped once encoded.
tokenized_dataset = dataset.map(
    preprocess,
    remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'],
)

# NOTE(review): warmup_ratio=0.8 spends most of the run warming up the
# learning rate - confirm this is intentional.
training_args = TrainingArguments(
    output_dir='.',
    report_to='none',
    num_train_epochs=3,
    learning_rate=5e-6,
    warmup_ratio=0.8,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=2,
)

# Multiple-choice model loaded from the selected checkpoint directory.
model = AutoModelForMultipleChoice.from_pretrained(deberta_v3_large)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    tokenizer=tokenizer,
)

In [None]:
# Fine-tune the model on tokenized_dataset using the Trainer built in the previous cell.
trainer.train()

In [None]:
# df_val is loaded from the labelled train.csv, so it already has the true
# `answer` column. It must stay intact: preprocess() converts it into the
# `label` feature, and get_map3() later scores the predictions against it.
# Overwriting it with a dummy 'A' (a trick meant for unlabelled test data)
# would make the validation MAP@3 measure agreement with 'A' rather than
# with the real answers.
val_ds = Dataset.from_pandas(df_val)
tokenized_val_ds = val_ds.map(
    preprocess,
    batched=False,
    remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer']
)

# Run the fine-tuned model over both splits; the per-option scores are read
# from `.predictions` in the following cells.
train_predictions = trainer.predict(tokenized_dataset)
val_predictions = trainer.predict(tokenized_val_ds)

# Make Submission

In [None]:
# Sanity-check the shapes of the prediction arrays produced above.
# The previous version of this cell referenced `submit_preds` and
# `test_predictions`, which are not defined in the visible notebook, and called
# `normalize` before the cell that imports it, so it failed on a fresh kernel.
# Row-wise normalisation does not change an array's shape, so the raw shapes
# carry the same information.
print(train_predictions.predictions.shape)
print(val_predictions.predictions.shape)

In [None]:
from sklearn.preprocessing import normalize


# Negate the scores before normalising them (sklearn `normalize`, default
# settings), so that the ascending argsort used in the next cell lists the
# highest-scoring option first.
final_train_predictions = normalize(-train_predictions.predictions)
final_val_predictions = normalize(-val_predictions.predictions)

In [None]:
def _top3_letters(score_rows):
    """Return, for each row, the three letters with the smallest scores, smallest first."""
    letters = np.array(list("ABCDE"))
    return [letters[np.argsort(row)][:3].tolist() for row in score_rows]


pred_train_letters = _top3_letters(final_train_predictions)
pred_val_letters = _top3_letters(final_val_predictions)
pred_val_letters[:3]

In [None]:
# One space-separated string of ranked letters per row, indexed by the source
# frame's index under the name "id".
train_preds = pd.DataFrame({
    'id': df_train.index,
    'prediction': list(map(" ".join, pred_train_letters)),
}).set_index("id")
val_preds = pd.DataFrame({
    'id': df_val.index,
    'prediction': list(map(" ".join, pred_val_letters)),
}).set_index("id")
val_preds.head()

In [None]:
def get_map3(label_df, pred_df):
    """Compute mean average precision at 3 for ranked letter predictions.

    Parameters
    ----------
    label_df : DataFrame with an "answer" column holding the correct letter.
    pred_df : DataFrame with a "prediction" column of three space-separated
        letters (best guess first), sharing its index with `label_df`.

    Returns
    -------
    The average over rows of 1, 1/2 or 1/3 when the answer matches the first,
    second or third guess respectively, and 0 when none match.

    Raises
    ------
    AssertionError if any row of `label_df` has no first, second or third guess
    after the index join.
    """
    ranked = pred_df["prediction"].str.split(" ", expand=True)
    ranked = ranked.rename(columns={0: "pred0", 1: "pred1", 2: "pred2"})
    merged = label_df.join(ranked, how="left")

    # Every labelled row must have all three guesses after the join.
    for column in ("pred0", "pred1", "pred2"):
        assert not merged[column].isna().any()

    n_rows = len(merged)
    truth = merged["answer"]
    total = 0
    for rank, weight in enumerate([1.0, 1/2, 1/3]):
        hits = (merged[f"pred{rank}"] == truth).sum()
        total += weight * hits / n_rows
    return total

In [None]:
# Report MAP@3 on the validation frame first, then on the training frame.
# NOTE(review): the validation figure is only meaningful if df_val["answer"]
# still holds the true labels at this point - verify it has not been
# overwritten earlier in the notebook.
print(get_map3(df_val, val_preds))
print(get_map3(df_train, train_preds))