In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration, AutoModelForMultipleChoice 

In [2]:
# Let's import the public training set and take a look
import pandas as pd

df_valid = pd.read_csv("/home/clay/research/kaggle/kaggle_llm/data/kaggle-llm-science-exam/train.csv")
df_valid.head()

Unnamed: 0,id,prompt,A,B,C,D,E,answer
0,0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D
1,1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A
2,2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A
3,3,What is the significance of regularization in ...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,C
4,4,Which of the following statements accurately d...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,D


# Load T5-large model

In [None]:
model_path = "/home/clay/research/kaggle/kaggle_llm/data/pretrained_models/t5-large"
model = T5ForConditionalGeneration.from_pretrained(model_path).cuda()
tokenizer  = AutoTokenizer.from_pretrained(model_path)

# Check Model Baseline Score

In [None]:
valid_score = 0
model.eval()
for index in tqdm(range(df_valid.shape[0])):
    columns = df_valid.iloc[index].values
    scores = []
    input_ids = tokenizer(columns[1]+" <extra_id_0>", return_tensors="pt").input_ids.cuda()
    labels = tokenizer(["<extra_id_0> "+columns[2+p] for p in range(5)], return_tensors="pt", padding=True).input_ids
    minlen = np.min([len(l) for l in labels])
    for p in range(5):
        with torch.no_grad():
            loss = model(
                input_ids=input_ids, 
                labels=labels[p][:minlen].unsqueeze(0).cuda()
            ).loss.detach().cpu().numpy()
        scores.append(float(loss))
    predict = np.array(list("ABCDE"))[np.argsort(scores)][:3].tolist()
    if columns[7] in predict:
        valid_score += [1, 0.5, 0.333333333333][predict.index(columns[7])]
valid_score /= df_valid.shape[0]
print(f'score = {valid_score}')

# Sorting Answers from T5 loss value

In [None]:
df_test = pd.read_csv("/home/clay/research/kaggle/kaggle_llm/data/kaggle-llm-science-exam/test.csv")
model.eval()
submit_ids, submit_preds = [], []
for index in tqdm(range(df_test.shape[0])):
    columns = df_test.iloc[index].values
    scores = []
    input_ids = tokenizer(columns[1]+" <extra_id_0>", return_tensors="pt").input_ids.cuda()
    labels = tokenizer(["<extra_id_0> "+columns[2+p] for p in range(5)], return_tensors="pt", padding=True).input_ids
    minlen = np.min([len(l) for l in labels])
    for p in range(5):
        with torch.no_grad():
            loss = model(input_ids=input_ids, labels=labels[p][:minlen].unsqueeze(0).cuda()).loss.detach().cpu().numpy()
        scores.append(float(loss))
    submit_ids.append(columns[0])
    submit_preds.append(scores)

# Merge with deberta_v3_large

In [3]:
from typing import Optional, Union
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from dataclasses import dataclass
from transformers import AutoTokenizer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer

In [4]:
df_train = pd.read_csv("/home/clay/research/kaggle/kaggle_llm/data/kaggle-llm-science-exam/train.csv")
df_train = df_train.drop(columns="id")
df_train.shape

(200, 7)

In [5]:
df_train = pd.concat([
    df_train,
    pd.read_csv('/home/clay/research/kaggle/kaggle_llm/data/additional-train-data-for-llm-science-exam/extra_train_set.csv'),
])
df_train.reset_index(inplace=True, drop=True)
df_train.shape

(700, 7)

In [6]:
df_train.head()

Unnamed: 0,prompt,A,B,C,D,E,answer
0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D
1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A
2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A
3,What is the significance of regularization in ...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,C
4,Which of the following statements accurately d...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,D


In [11]:
# deberta_v3_large = '/home/clay/research/kaggle/kaggle_llm/data/deberta-v3-large-notebook-weights'
deberta_v3_large = "/home/clay/research/kaggle/kaggle_llm/data/pretrained_models/google-electra-base-discriminator"

In [12]:
tokenizer = AutoTokenizer.from_pretrained(deberta_v3_large)

dataset = Dataset.from_pandas(df_train)

In [13]:
options = 'ABCDE'
indices = list(range(5))

option_to_index = {option: index for option, index in zip(options, indices)}
index_to_option = {index: option for option, index in zip(options, indices)}


def preprocess(example):
    """The example is expected to be a dictionary with keys 'prompt', 'A', 'B', 'C', 'D', 'E', and 'answer'."""
    # The AutoModelForMultipleChoice class expects a set of question/answer pairs
    # so we'll copy our question 5 times before tokenizing
    first_sentence = [example['prompt']] * 5
    second_sentence = [example[option] for option in options]
    # Our tokenizer will turn our text into token IDs BERT can understand
    tokenized_example = tokenizer(first_sentence, second_sentence, truncation=True)
    tokenized_example['label'] = option_to_index[example['answer']]
    
    return tokenized_example



In [14]:

@dataclass
class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    
    def __call__(self, features):
        label_name = 'label' if 'label' in features[0].keys() else 'labels'
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])
        
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch   
        
tokenized_dataset = dataset.map(preprocess, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])

training_args = TrainingArguments(
    warmup_ratio=0.8,
    learning_rate=5e-6,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    report_to='none',
    output_dir='.'
)

model = AutoModelForMultipleChoice.from_pretrained(deberta_v3_large)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    train_dataset=tokenized_dataset,
)

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Some weights of ElectraForMultipleChoice were not initialized from the model checkpoint at /home/clay/research/kaggle/kaggle_llm/data/pretrained_models/google-electra-base-discriminator and are newly initialized: ['sequence_summary.summary.weight', 'classifier.weight', 'sequence_summary.summary.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
trainer.train()

You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,1.6054
1000,1.6016
1500,1.5723
2000,1.4878


TrainOutput(global_step=2100, training_loss=1.5599672299339657, metrics={'train_runtime': 75.1244, 'train_samples_per_second': 27.954, 'train_steps_per_second': 27.954, 'total_flos': 243627344258940.0, 'train_loss': 1.5599672299339657, 'epoch': 3.0})

In [16]:
test_df = pd.read_csv("/home/clay/research/kaggle/kaggle_llm/data/kaggle-llm-science-exam/train.csv")
test_df.head()

# There are more verbose/elegant ways of doing this, but if we give our test set a random `answer` column
# we can make predictions directly with our trainer.
test_df['answer'] = 'A'

# Other than that we'll preprocess it in the same way we preprocessed test.csv
test_ds = Dataset.from_pandas(test_df)
tokenized_test_ds = test_ds.map(
    preprocess, 
    batched=False, 
    remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer']
)

# Here we'll generate our "real" predictions on the test set
test_predictions = trainer.predict(tokenized_test_ds)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

# Make Submission

In [None]:
print(normalize(submit_preds).shape)
print(normalize(-test_predictions.predictions).shape)

In [17]:
from sklearn.preprocessing import normalize


# final_predictions = normalize(submit_preds)*0.2 + normalize(-test_predictions.predictions)*0.8
final_predictions = normalize(-test_predictions.predictions)

In [18]:
final_preds = [' '.join(np.array(list("ABCDE"))[np.argsort(s)][:3].tolist()) for s in final_predictions]

In [19]:
preds = pd.DataFrame({'id': submit_ids, 'prediction': final_preds})
preds.to_csv('submission.csv', index=False)
preds.head()

NameError: name 'submit_ids' is not defined

In [20]:
pred_letters = [np.array(list("ABCDE"))[np.argsort(s)][:3].tolist() for s in final_predictions]
pred_letters[:3]

[['D', 'B', 'C'], ['B', 'A', 'E'], ['A', 'C', 'E']]

In [21]:
soln = df_valid.copy()
for i in range(3):
    soln[f"pred{i}"] = [p[i] for p in pred_letters]
soln.head()

Unnamed: 0,id,prompt,A,B,C,D,E,answer,pred0,pred1,pred2
0,0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D,D,B,C
1,1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A,B,A,E
2,2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A,A,C,E
3,3,What is the significance of regularization in ...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,C,C,E,B
4,4,Which of the following statements accurately d...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,D,E,B,D


In [23]:
map3 = 0
ranks_to_scores = [1.0, 1/2, 1/3]


for k in range(3):
    map3 += ranks_to_scores[k] * (soln[f"pred{k}"] == soln[f"answer"]).sum() / len(soln)
print(map3)

0.7925000000000001
