In [1]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForMultipleChoice
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader



In [2]:
DEBUG = False
TrainSize = 8192
EvalSize = 2048

In [3]:
data_df = pd.read_csv('/kaggle/input/60k-data-with-context-v2/all_12_with_context2.csv')
data_df.drop(columns='source', inplace=True)
eval_df = data_df.sample(1024)
data_df = data_df.drop(eval_df.index)
if DEBUG:
    train_df = data_df.head(1024)
else:
    train_df = data_df.head(len(data_df) if TrainSize is None else TrainSize)

In [4]:
option_to_index = {k: v for v, k  in enumerate('ABCDE')}
index_to_option = {v:k for k, v in option_to_index.items()}

In [5]:
model_name = 'microsoft/deberta-v3-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
class QAData(Dataset):
    def __init__(self, data_csv, tokenizer):
        self.data_csv = data_csv
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.data_csv)
    
    def __getitem__(self, idx):
        sample = self.data_csv.iloc[idx]
        first_sentence =  [' [CLS] ' + sample['context']] * 5
        if type(sample['prompt']) is not str:
            print(sample['prompt'])
        second_sentence = [' [SEP] ' + sample['prompt'] + ' [SEP] ' + str(sample[option]) + ' [SEP] ' for option in 'ABCDE']
        tokenized_example = tokenizer(first_sentence, second_sentence, truncation='only_first', max_length=256, add_special_tokens=False, return_tensors='pt')
        tokenized_example['label'] = option_to_index[sample['answer']]
        return tokenized_example

In [7]:
qa_dataset_train = QAData(train_df, tokenizer)
qa_dataset_eval = QAData(eval_df, tokenizer)

In [8]:
for batch in qa_dataset_train:
    continue

In [9]:
for batch in qa_dataset_eval:
    continue

In [10]:
model = AutoModelForMultipleChoice.from_pretrained(model_name)

Downloading pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['pooler.dense.bias', 'classifier.bias', 'classifier.weight', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
for param in model.deberta.embeddings.parameters():
    param.requires_grad = False
for layer in model.deberta.encoder.layer[:18]:
    for param in layer.parameters():
        param.requires_grad = False

In [12]:
trainable_params = 0
non_trainable_params = 0

for name, param in model.named_parameters():
    if param.requires_grad:
        trainable_params += param.numel()
    else:
        non_trainable_params += param.numel()

# Calculate the total number of parameters
total_params = trainable_params + non_trainable_params

print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")
print(f"Non-trainable parameters: {non_trainable_params}")

Total parameters: 435062785
Trainable parameters: 77154305
Non-trainable parameters: 357908480


In [13]:
def map_at_3(predictions, labels):
    map_sum = 0
    pred = np.argsort(-1*np.array(predictions),axis=1)[:,:3]
    for x,y in zip(pred,labels):
        z = [1/i if y==j else 0 for i,j in zip([1,2,3],x)]
        map_sum += np.sum(z)
    return map_sum / len(predictions)

def compute_metrics(p):
    predictions = p.predictions.tolist()
    labels = p.label_ids.tolist()
    return {"map@3": map_at_3(predictions, labels)}

In [14]:
training_args = TrainingArguments(
    warmup_ratio=0.1, 
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=2,
    report_to='none',
    output_dir = f'./checkpoints_{2}',
    overwrite_output_dir=True,
    fp16=True,
    gradient_accumulation_steps=8,
    logging_steps=25,
    evaluation_strategy='steps',
    eval_steps=25,
    save_strategy="epoch",
    save_steps=25,
    load_best_model_at_end=False,
    metric_for_best_model='map@3',
    lr_scheduler_type='cosine',
    weight_decay=0.01,
    save_total_limit=2,
)

In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=qa_dataset_train,
    eval_dataset=qa_dataset_eval,
    compute_metrics = compute_metrics,
)

trainer.train()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Map@3
25,1.6175,1.609151,0.539062
50,1.6092,1.604808,0.664225
75,1.3385,1.25838,0.705241
100,1.0233,1.097789,0.736003
125,0.9784,1.019481,0.75
150,0.8897,0.993748,0.755046
175,0.8892,0.983152,0.752767
200,0.7952,0.919569,0.772949
225,0.8611,0.931651,0.771322
250,0.7521,0.907251,0.772461




TrainOutput(global_step=512, training_loss=0.9009698387235403, metrics={'train_runtime': 12700.8368, 'train_samples_per_second': 1.29, 'train_steps_per_second': 0.04, 'total_flos': 3.817203736510464e+16, 'train_loss': 0.9009698387235403, 'epoch': 2.0})

In [16]:
trainer.save_model(f'model_v{55}')