In [2]:
import os

from typing import Optional, Union
import pandas as pd, numpy as np, torch
from datasets import Dataset
from dataclasses import dataclass
from transformers import AutoTokenizer
from transformers import EarlyStoppingCallback
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer


In [2]:
os.environ['CUDA_VISIBLE_DEVICES'] = '14'

In [3]:
VER=1
# TRAIN WITH SUBSET OF 60K
NUM_TRAIN_SAMPLES = 1024
# PARAMETER EFFICIENT FINE TUNING
# PEFT REQUIRES 1XP100 GPU NOT 2XT4
USE_PEFT = False
# NUMBER OF LAYERS TO FREEZE 
# DEBERTA LARGE HAS TOTAL OF 24 LAYERS
FREEZE_LAYERS = 0
# BOOLEAN TO FREEZE EMBEDDINGS
FREEZE_EMBEDDINGS = True
# LENGTH OF CONTEXT PLUS QUESTION ANSWER
MAX_INPUT = 1496
# HUGGING FACE MODEL
MODEL = 'microsoft/deberta-v3-large'
model_name = MODEL

In [3]:
df_valid = pd.read_csv('input_data/validation_data/master_validation_data_article_context.csv')

In [4]:
eval_gpt = pd.read_csv('input_data/validation_data/eval300_gpt4.csv')

In [11]:
eval_gpt['prompt_answer'] = eval_gpt.apply(lambda row: ''.join(row['prompt'] + row['answer']), axis=1)

In [14]:
df_valid['prompt_answer'] = df_valid.apply(lambda row: ''.join(row['prompt'] + row['answer']), axis=1)

In [40]:
df_merge = df_valid.merge(eval_gpt[['answer','prompt_answer','gpt4']], on=['prompt_answer'], how='left')

In [42]:
# Conditionally update the gpt4 column
df_merge['answer'] = np.where(df_merge['gpt4'].isna(), df_merge['answer_x'], df_merge['gpt4'])

In [53]:
df_load = pd.read_csv('base/validation_data_mistral_LORA_predicted_2percent.csv')

In [55]:
df_load['deberta_choice_1'] = df_load['prediction'].str[0]
df_load['deberta_choice_2'] = df_load['prediction'].str[2]
df_load['deberta_choice_3'] = df_load['prediction'].str[4]


In [56]:
(df_load['deberta_choice_1'] ==df_load['answer']).value_counts()

True     1077
False     193
dtype: int64

In [57]:
df_false = df_load[ df_load.answer != df_load.deberta_choice_1]

In [60]:
df_false

Unnamed: 0,id,prompt,A,B,C,D,E,answer,context,instruction,prediction,logits,deberta_choice_1,deberta_choice_2,deberta_choice_3
0,0,What does a diffusion-limited enzyme represent...,"An intrinsic, physical constraint",A maximum peak height in the fitness landscape,An evolutionary limitation,A chemical limitation,A diffusion limitation,B,-A diffusion-limited enzyme catalyses a reacti...,What does a diffusion-limited enzyme represent...,A B E,"[(array(0.681964, dtype=float32), 'A'), (array...",A,B,E
4,4,What is the height at which mountaineers are m...,"6,000 feet","9,000 feet","12,000 feet","18,000 feet","15,000 feet",B,"-Altitude sickness can first occur at 1,500 me...",What is the height at which mountaineers are m...,D C B,"[(array(0.04697393, dtype=float32), 'A'), (arr...",D,C,B
8,8,Which eukaryotic cell cycle event is missing i...,cell growth,DNA duplication,karyokinesis,cytokinesis,cell growth,C,-Cytokinesis largely resembles the prokaryotic...,Which eukaryotic cell cycle event is missing i...,B C D,"[(array(0.02956403, dtype=float32), 'A'), (arr...",B,C,D
10,10,What is the significant characteristic of Gior...,Giordano Bruno (crater) displays an unusual pa...,Giordano Bruno (crater) has a higher albedo th...,The outer rim of Giordano Bruno (crater) is pa...,Some of the ejecta from Giordano Bruno (crater...,Giordano Bruno (crater) appears at the center ...,B,-Impact craters generally have a rim with ejec...,What is the significant characteristic of Gior...,E D B,"[(array(0.01559149, dtype=float32), 'A'), (arr...",E,D,B
12,12,Which circulation picks up oxygen for cellular...,pulmonary,interlobular,respiratory,bronchial,interlobular,C,"-In vertebrates, oxygen is taken into the body...",Which circulation picks up oxygen for cellular...,A C D,"[(array(0.47210026, dtype=float32), 'A'), (arr...",A,C,D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1241,1241,What are permutation-inversion groups?,Permutation-inversion groups are groups of sym...,Permutation-inversion groups are groups of sym...,Permutation-inversion groups are groups of sym...,Permutation-inversion groups are groups of sym...,Permutation-inversion groups are groups of sym...,E,"-Symmetry operations, point groups and permuta...",What are permutation-inversion groups? A: Perm...,B E A,"[(array(0.06476416, dtype=float32), 'A'), (arr...",B,E,A
1246,1246,What is the difference between redshift due to...,Redshift due to the expansion of the universe ...,Redshift due to the expansion of the universe ...,There is no difference between redshift due to...,Redshift due to the expansion of the universe ...,Redshift due to the expansion of the universe ...,D,-Redshift is also used to measure the expansio...,What is the difference between redshift due to...,E A D,"[(array(0.2822599, dtype=float32), 'A'), (arra...",E,A,D
1248,1248,What is the reason for heating metals to a tem...,To prevent the grains of solution from growing...,To increase the size of the grains of solution...,To prevent the grains of solution from growing...,To prevent the grains of solution from growing...,To increase the size of the grains of solution...,C,-Metallic materials consist of a microstructur...,What is the reason for heating metals to a tem...,A C B,"[(array(0.3446145, dtype=float32), 'A'), (arra...",A,C,B
1257,1257,What is the explanation for the effective supe...,Two different color charges close together app...,Two different color charges close together app...,Two different color charges close together app...,Two different color charges close together app...,Two different color charges close together app...,A,"-Nonetheless, color-charged particles may comb...",What is the explanation for the effective supe...,D A E,"[(array(0.23341925, dtype=float32), 'A'), (arr...",D,A,E


In [52]:
df_merge.to_csv('input_data/validation_data/master_validation_data_article_context_corrected.csv',index=False)

In [49]:
del df_merge['prompt_answer'], df_merge['answer_y'], df_merge['gpt4'],df_merge['answer_x']

In [43]:
eval_df = df_merge[~df_merge['answer_y'].isnull()]

In [45]:
(eval_df['answer']==eval_df['gpt4']).value_counts()

True    300
dtype: int64

In [41]:
eval_df

Unnamed: 0,id,prompt,A,B,C,D,E,answer_x,context,prompt_answer,answer_y,gpt4
771,771,What is the method of transcription in the lif...,RNA-templated transcription is the method of t...,Transcription occurs through a unique mechanis...,Reverse transcription is the method of transcr...,DNA-templated transcription is the method of t...,Transcription does not occur in the life cycle...,D,-There are three different replication systems...,What is the method of transcription in the lif...,D,D
772,772,What is the role of the viral fiber glycoprote...,The viral fiber glycoproteins are involved in ...,The viral fiber glycoproteins code for 40 prot...,The viral fiber glycoproteins are responsible ...,The viral fiber glycoproteins mediate endocyto...,The viral fiber glycoproteins are responsible ...,D,"-ASFV is a large (175–215 nm), icosahedral, do...",What is the role of the viral fiber glycoprote...,D,D
773,773,What is the significance of the faint Hα emiss...,The emission lines indicate that 3 Geminorum i...,The emission lines indicate that 3 Geminorum i...,The emission lines indicate that 3 Geminorum i...,The emission lines indicate that 3 Geminorum i...,The emission lines indicate that 3 Geminorum i...,A,-Single antenna detections Radio observations ...,What is the significance of the faint Hα emiss...,A,A
774,774,What is the significance of the pedicellariae ...,They are used for climbing on corals.,They resemble the traps of the Venus fly trap ...,They are covered by short and stout spines.,They are found on the central disc of the sea ...,They are a characteristic feature of the Gonia...,B,-Structure The three basic segments of the typ...,What is the significance of the pedicellariae ...,B,B
775,775,What is the role of the microprocessor complex...,The microprocessor complex is responsible for ...,The microprocessor complex is responsible for ...,The microprocessor complex is involved in the ...,The microprocessor complex is involved in the ...,The microprocessor complex is responsible for ...,A,-The microprocessor complex is a protein compl...,What is the role of the microprocessor complex...,A,A
...,...,...,...,...,...,...,...,...,...,...,...,...
1066,1066,What is the significance of the anti-de Sitter...,The AdS/CFT correspondence is a conjectured re...,The AdS/CFT correspondence provides a non-pert...,The AdS/CFT correspondence represents a major ...,The AdS/CFT correspondence is a strong-weak du...,The AdS/CFT correspondence was first proposed ...,C,"-In theoretical physics, anti-de Sitter/confor...",What is the significance of the anti-de Sitter...,C,C
1067,1067,What is the branch of physics that seeks to de...,String theory,Quantum gravity,AdS/CFT correspondence,General relativity,M-theory,B,-Quantum gravity and strings Current understan...,What is the branch of physics that seeks to de...,B,B
1068,1068,What is the AdS/CFT correspondence according t...,The AdS/CFT correspondence is a relationship b...,The AdS/CFT correspondence is the theory that ...,The AdS/CFT correspondence is a dictionary tha...,The AdS/CFT correspondence is the equivalence ...,The AdS/CFT correspondence is a mathematical c...,D,"-In theoretical physics, anti-de Sitter/confor...",What is the AdS/CFT correspondence according t...,D,D
1069,1069,What is the purpose of superstring theory acco...,To explain the behavior of fundamental particl...,To explain the behavior of large-scale structu...,To describe the four fundamental forces acting...,To harmonize the theory of general relativity ...,To eliminate the infinities in quantum field t...,A,-Superstring theory is an attempt to explain a...,What is the purpose of superstring theory acco...,A,A


In [71]:
df_valid = pd.read_csv('input_data/train_with_context2.csv')
print('Validation data size:', df_valid.shape )


Validation data size: (200, 8)


In [70]:
df_train = pd.read_csv('input_data/all_12_with_context2.csv')
df_train = df_train.drop(columns="source")
df_train = df_train.dropna(how='any', axis=0) # delete 4 choice question
print('Train data size:', df_train.shape )


Train data size: (46687, 8)


In [72]:
df_train.head()

Unnamed: 0,prompt,context,A,B,C,D,E,answer
0,"In relation to Eunice Fay McKenzie's career, w...","Eunice Fay McKenzie (February 19, 1918 – April...",McKenzie showcased her singing talents in nume...,McKenzie is primarily remembered for her starr...,McKenzie gained recognition for her role as a ...,McKenzie's collaborations with director Blake ...,McKenzie's successful career in sound films co...,B
1,How does Modified Newtonian Dynamics (MOND) im...,The presence of a clustered thick disk-like co...,MOND is a theory that increases the discrepanc...,MOND explains the missing baryonic mass in gal...,MOND is a theory that reduces the observed mis...,MOND is a theory that eliminates the observed ...,MOND's impact on the observed missing baryonic...,E
2,Which of the following statements accurately d...,Woody Hartman is a retired American soccer goa...,Ray Montgomerie is a former footballer who pla...,Ray Montgomerie is a former footballer who pla...,Ray Montgomerie is a former footballer who pla...,Ray Montgomerie is a former footballer who pla...,Ray Montgomerie is a former footballer who pla...,B
3,What is the significance of the Museum of the ...,The Museum of the Occupation of Latvia () is a...,The Museum of the Occupation of Latvia is a me...,The Museum of the Occupation of Latvia showcas...,The Museum of the Occupation of Latvia was est...,The Museum of the Occupation of Latvia primari...,The Museum of the Occupation of Latvia is a mu...,C
4,What was the previous name of the Christian Sc...,It was named the Evangelical School for the De...,The Christian School for the Deaf (CSD),The Christian School for the Blind (CSB),The Evangelical School and Chapel for the Deaf...,The Evangelical School for the Deaf (ESD),The Evangelical School for the Blind (ESB),D


In [5]:
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
index_to_option = {v: k for k,v in option_to_index.items()}

def preprocess(example):
    first_sentence = [ "[CLS] " + example['context'] ] * 5
    second_sentences = [" #### " + example['prompt'] + " [SEP] " + example[option] + " [SEP]" for option in 'ABCDE']
    tokenized_example = tokenizer(first_sentence, second_sentences, truncation='only_first', 
                                  max_length=MAX_INPUT, add_special_tokens=False)
    tokenized_example['label'] = option_to_index[example['answer']]
    
    return tokenized_example

@dataclass
class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    
    def __call__(self, features):
        label_name = 'label' if 'label' in features[0].keys() else 'labels'
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])
        
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [74]:
option_to_index

{'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}

In [75]:
index_to_option

{0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E'}

In [76]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)
dataset_valid = Dataset.from_pandas(df_valid)
dataset = Dataset.from_pandas(df_train)
dataset = dataset.remove_columns(["__index_level_0__"])


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [77]:
tokenized_dataset_valid = dataset_valid.map(preprocess, remove_columns=['prompt', 'context', 'A', 'B', 'C', 'D', 'E', 'answer'])
tokenized_dataset = dataset.map(preprocess, remove_columns=['prompt', 'context', 'A', 'B', 'C', 'D', 'E', 'answer'])


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/46687 [00:00<?, ? examples/s]

In [79]:
tokenized_dataset_valid

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
    num_rows: 200
})

In [35]:
tokenized_dataset_valid[0]

3

In [36]:
cntx = df_train.head(1)['context'].values[0]
pmpt = df_train.head(1)['prompt'].values[0]


In [38]:
pmpt

"In relation to Eunice Fay McKenzie's career, which statement accurately reflects her most notable work?"

In [26]:
check =  [ "[CLS] " + cntx ] * 5

In [40]:
first_sentence = [ "[CLS] " + cntx ] * 5
second_sentences = [" #### " + pmpt + " [SEP] " + df_train.head(1)[option].values[0] + " [SEP]" for option in 'ABCDE']

In [41]:
second_sentences

[" #### In relation to Eunice Fay McKenzie's career, which statement accurately reflects her most notable work? [SEP] McKenzie showcased her singing talents in numerous musical productions, garnering critical acclaim. [SEP]",
 " #### In relation to Eunice Fay McKenzie's career, which statement accurately reflects her most notable work? [SEP] McKenzie is primarily remembered for her starring roles opposite Gene Autry in popular Western films of the 1940s. [SEP]",
 " #### In relation to Eunice Fay McKenzie's career, which statement accurately reflects her most notable work? [SEP] McKenzie gained recognition for her role as a child actress in a series of iconic silent films. [SEP]",
 " #### In relation to Eunice Fay McKenzie's career, which statement accurately reflects her most notable work? [SEP] McKenzie's collaborations with director Blake Edwards were instrumental in her rise to fame. [SEP]",
 " #### In relation to Eunice Fay McKenzie's career, which statement accurately reflects her

In [42]:
tokenized_example = tokenizer(first_sentence, second_sentences, truncation='only_first', 
                                  max_length=MAX_INPUT, add_special_tokens=False)
# tokenized_example['label'] = option_to_index[example['answer']]

In [49]:
tokenizer.decode(tokenized_example['input_ids'][0])

'[CLS] Eunice Fay McKenzie (February 19, 1918 – April 16, 2019) was an American actress and singer. She also entertained the troops with her former screen partner, Gene Autry. ===Later career=== After World War II, McKenzie retired from films to raise her two children. She was briefly billed as Fay Shannon. ==Biography== ===Early life and silent film=== McKenzie was born on February 19, 1918, in Hollywood, California, to show business parents, film actor Eva (née Heazlitt) and Irish American actor/director Robert McKenzie.Mike Fitzgerald, "An Interview with... She starred in silent films as a child, and then sound films as an adult, but perhaps she is best known for her leading roles opposite Gene Autry in the early 1940s in five horse opera features. Fay\'s sister Ida Mae McKenzie, cousin Ella McKenzie, and brother-in-law Billy Gilbert, were also actors. McKenzie sang duets with Autry in each of these films. Ida Mae also played the character of Sarah Lincoln in The Dramatic Life of Ab

In [51]:
tokenizer.decode(tokenized_example['input_ids'][1])

'[CLS] Eunice Fay McKenzie (February 19, 1918 – April 16, 2019) was an American actress and singer. She also entertained the troops with her former screen partner, Gene Autry. ===Later career=== After World War II, McKenzie retired from films to raise her two children. She was briefly billed as Fay Shannon. ==Biography== ===Early life and silent film=== McKenzie was born on February 19, 1918, in Hollywood, California, to show business parents, film actor Eva (née Heazlitt) and Irish American actor/director Robert McKenzie.Mike Fitzgerald, "An Interview with... She starred in silent films as a child, and then sound films as an adult, but perhaps she is best known for her leading roles opposite Gene Autry in the early 1940s in five horse opera features. Fay\'s sister Ida Mae McKenzie, cousin Ella McKenzie, and brother-in-law Billy Gilbert, were also actors. McKenzie sang duets with Autry in each of these films. Ida Mae also played the character of Sarah Lincoln in The Dramatic Life of Ab

## Load model

In [80]:
def _get_model_tokeinzer(path, model_type):
    tokenizer = AutoTokenizer.from_pretrained(path, use_fast=False)
    if model_type == 'causal':
        model = AutoModelForCausalLM.from_pretrained(
            path,
            torch_dtype=torch.float32,
            trust_remote_code=True,
            # low_cpu_mem_usage=True,
            device_map="auto"
        )
    elif model_type == 'seq':
        model = AutoModelForSequenceClassification.from_pretrained(path,num_labels=1)
    elif model_type == 'multic':
        model = AutoModelForMultipleChoice.from_pretrained(path)
    
    return tokenizer, model
    

In [81]:
model_name = MODEL
model_type =  'multic' # ['seq','causal','multic']

In [82]:
directory = f"models/{model_name}"

if os.path.exists(directory):
    print(f"Loading Model from directory: '{directory}' ")
    tokenizer, model = _get_model_tokeinzer(directory,model_type)
else:
    print(f"Downloading Model from huggingface: '{model_name}' ")
    tokenizer, model = _get_model_tokeinzer(model_name,model_type)
    # Save the model weights to disk
    os.makedirs(directory, exist_ok=True)
    print(f"Saving the Model from directory: '{directory}' ")
    model.save_pretrained(directory)
    tokenizer.save_pretrained(directory)
    

Loading Model from directory: 'models/microsoft/deberta-v3-large' 


In [83]:
def map_at_3(predictions, labels):
    map_sum = 0
    pred = np.argsort(-1*np.array(predictions),axis=1)[:,:3]
    for x,y in zip(pred,labels):
        z = [1/i if y==j else 0 for i,j in zip([1,2,3],x)]
        map_sum += np.sum(z)
    return map_sum / len(predictions)

def compute_metrics(p):
    predictions = p.predictions.tolist()
    labels = p.label_ids.tolist()
    return {"map@3": map_at_3(predictions, labels)}

In [85]:
training_args = TrainingArguments(
    warmup_ratio=0.1, 
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    report_to='none',
    output_dir = f"output/{model_name}-{VER}",
    overwrite_output_dir=True,
    fp16=False,
    gradient_accumulation_steps=8,
    logging_steps=25,
    evaluation_strategy='steps',
    eval_steps=25,
    save_strategy="steps",
    save_steps=25,
    load_best_model_at_end=False,
    metric_for_best_model='map@3',
    lr_scheduler_type='cosine',
    weight_decay=0.01,
    save_total_limit=2,
)



In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset_valid,
    compute_metrics = compute_metrics,
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)

trainer.train()
trainer.save_model(f'model_v{VER}')

# Load ~Saved model

In [4]:

# if USE_PEFT:
#     model = AutoModelForMultipleChoice.from_pretrained(MODEL)
#     model = get_peft_model(model, peft_config)
#     checkpoint = torch.load(f'model_v{VER}/pytorch_model.bin')
#     model.load_state_dict(checkpoint)
# else:
model = AutoModelForMultipleChoice.from_pretrained(f'output/{model_name}-{VER}-saved',torch_dtype=torch.float16)
trainer = Trainer(model=model)

In [5]:
tokenizer = AutoTokenizer.from_pretrained(f'output/{model_name}-{VER}-saved')

In [6]:
MAX_INPUT = 512

In [7]:
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
index_to_option = {v: k for k,v in option_to_index.items()}


def preprocess(example):
    first_sentence = "[CLS] " + example['context']
    second_sentences = [" #### " + example['prompt'] + " [SEP] " + example[option] + " [SEP]" for option in 'ABCDE']
    
    tokenized_examples = tokenizer([first_sentence] * 5, second_sentences, truncation=True, padding='max_length', max_length=MAX_INPUT, add_special_tokens=False, return_tensors='pt')
    
    tokenized_examples['label'] = option_to_index[example['answer']]
    
    return tokenized_examples

In [8]:
test_df = pd.read_csv('input_data/train_with_context2.csv')
tokenized_test_dataset = Dataset.from_pandas(test_df).map(
        preprocess, remove_columns=['prompt', 'context', 'A', 'B', 'C', 'D', 'E'])



Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [7]:
from transformers import AutoConfig, AutoModelForCausalLM,AutoModelForMultipleChoice, AutoTokenizer, AutoModel,BitsAndBytesConfig
from accelerate import init_empty_weights
from accelerate.utils.modeling import set_module_tensor_to_device
from safetensors.torch import load_file

from peft import LoraConfig, get_peft_model 
from peft import PeftModel, PeftConfig

import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [2]:
model_path = 'models/mistral_c/mistralai/Mistral-7B-v0.1'

In [3]:
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
#     load_in_8bit=True,
    trust_remote_code=True,
#     low_cpu_mem_usage=True,
    device_map="auto"
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [37]:
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    # layers_to_transform = [25,26,27,28,29,30,31,32],
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj","up_proj", "down_proj","lm_head"],
    task_type="CAUSAL_LM"
)

In [25]:
# model.named_parameters

In [31]:
# model.children

In [32]:
def find_linear_layers(model):
    """ find linear layers in given transformer model """
    lora_module_names = set()
    for name, module in model.named_modules():
        # 4 bits for qlora
        # if isinstance(module, nn.Linear): 
        print(module)
        names = name.split('.')
        lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names:
        lora_module_names.remove('lm_head')
    print(f"LoRA module names: {list(lora_module_names)}")
    return list(lora_module_names)




In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [33]:


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


In [38]:

base_model = get_peft_model(model, lora_config)
print_trainable_parameters(model)

trainable params: 170082304 || all params: 7411814400 || trainable%: 2.2947458587198297


In [42]:
import torch
import torch.nn.functional as F

In [39]:
labels = torch.tensor([3, 0, 3, 3, 0, 4, 3, 0, 3, 4, 4, 3])
y_preds = torch.tensor([
    [ 0.0809,  0.1396, -0.0039,  0.1592,  0.0755],
    [ 0.0881,  0.1466, -0.0024,  0.1619,  0.0827],
    [ 0.0910,  0.1463,  0.0056,  0.1694,  0.0862],
    [ 0.0878,  0.1381, -0.0026,  0.1658,  0.0776],
    [ 0.0795,  0.1336, -0.0080,  0.1588,  0.0768],
    [ 0.0787,  0.1359, -0.0029,  0.1639,  0.0768],
    [ 0.0841,  0.1335, -0.0039,  0.1603,  0.0808],
    [ 0.0921,  0.1506,  0.0006,  0.1663,  0.0889],
    [ 0.0819,  0.1423, -0.0025,  0.1623,  0.0814],
    [ 0.0825,  0.1419, -0.0056,  0.1591,  0.0781],
    [ 0.0906,  0.1427, -0.0031,  0.1647,  0.0822],
    [ 0.0837,  0.1398, -0.0032,  0.1606,  0.0798]
])



In [51]:
def listnet_loss(y_pred, y_true):
    # Convert the ground truth to one-hot encoding
    y_true_onehot = F.one_hot(y_true, num_classes=y_pred.size(1)).float()

    # Ensure the predictions are in the same order as y_true.
    _, indices = torch.sort(y_true_onehot, descending=True, dim=1)
    y_pred = torch.gather(y_pred, 1, indices)

    # Compute softmax over raw scores
    y_pred_softmax = F.softmax(y_pred, dim=1)
    
    # Compute cross-entropy
    loss = -torch.mean(torch.sum(y_true_onehot * torch.log(y_pred_softmax), dim=1))
    
    return loss

In [52]:
l_loss = listnet_loss(y_preds, labels)
print(l_loss)

tensor(1.6430)


In [43]:
# Compute the cross-entropy loss for each sample
losses = F.cross_entropy(y_preds, labels, reduction='none')

# Multiply the losses by the corresponding weights
weighted_losses = losses * weight_matrix[torch.arange(weight_matrix.shape[0]), labels]

# Compute the average weighted loss
avg_weighted_loss = torch.mean(weighted_losses)


In [49]:
loss_fct = nn.CrossEntropyLoss()
loss =  loss_fct(y_preds, labels.view(-1)) 

In [50]:
avg_weighted_loss, loss

(tensor(1.5815), tensor(1.5815))

In [46]:
peft_model_id = "output/baseline_mistral_c"
config = PeftConfig.from_pretrained(peft_model_id)

In [47]:
lora_model = PeftModel.from_pretrained(model,peft_model_id)

In [1]:
# test_predictions = trainer.predict(tokenized_test_dataset).predictions

In [None]:
test_predictions = trainer.predict(tokenized_test_dataset).predictions
predictions_as_ids = np.argsort(-test_predictions, 1)
predictions_as_answer_letters = np.array(list('ABCDE'))[predictions_as_ids]
predictions_as_string = test_df['prediction'] = [
    ' '.join(row) for row in predictions_as_answer_letters[:, :3]
]


In [118]:
## load data 40k

In [129]:
df_openbook = pd.read_csv('input_data/validation_data/40k_dataset/MMLU_17k_with_context2.csv')

In [130]:
df_openbook[df_openbook.is_question].head()

Unnamed: 0,prompt,context,A,B,C,D,answer,is_question
2,In which of the following cases is a convictio...,Aggravated Robbery 4-15 years in prison. ===In...,"Johnson forced his way into a woman's home, bo...",A confederate of Brown pushed a man in order t...,Having induced a woman to enter his hotel room...,Hayes unbuttoned the vest of a man too drunk t...,D,True
4,In which of the following situations is Defend...,The prosecutor may decide not to prosecute a c...,Police arrested Thief and recovered goods he h...,Defendant misrepresented his identity to secur...,Believing that state law made it a crime to pu...,"Defendant, intending to kill Selma, shot at Se...",C,True
5,Potts sued Dobbs on a product liability claim....,"Bazley v Curry, [1999] 2 SCR 534 is a Supreme ...","""Isn't it a fact that you are Potts' close fri...","""Isn't it true that you are known in the commu...","""Didn't you fail to report some income on your...","""Weren't you convicted, seven years ago in thi...",B,True
6,In which of the following situations is Defend...,It is considered the most serious form of homi...,Angered because his neighbor is having a noisy...,"During an argument, Harry slaps Defendant. Ang...",Defendant drives his car through a red light a...,"Using his fist, Defendant punches Walter in th...",A,True
8,Redirect examination of a witness must be perm...,"Redirect examination, performed by the attorne...",To reply to any matter raised in crossexaminat...,Only to reply to significant new matter raised...,Only to reiterate the essential elements of th...,Only to supply significant information inadver...,B,True


In [141]:
df_openbook[df_openbook.is_question].iloc[20]['prompt']

"Which occurs as a result of Earth's tilt on its rotating axis?"

In [135]:
[print(df_openbook[df_openbook.is_question].head(1)[letter].values) for letter in ['A','B','C','D']]

["Johnson forced his way into a woman's home, bound her, and compelled her to tell him that her jewelry was in an adjoining room. Johnson went to the room, took the jewelry, and fled. "]
['A confederate of Brown pushed a man in order to cause him to lose his balance and drop his briefcase. Brown picked up the briefcase and ran off with it.']
['Having induced a woman to enter his hotel room, Ritter forced her to telephone her maid to tell the maid to bring certain jewelry to the hotel. Ritter locked the woman in the bathroom while he accepted the jewelry from the maid when she arrived. ']
['Hayes unbuttoned the vest of a man too drunk to notice and removed his wallet. A minute later, the victim missed his wallet and accused Hayes of taking it. Hayes pretended to be insulted, slapped the victim, and went off with the wallet.']


[None, None, None, None]

# validation score 

In [42]:
from sklearn.metrics import accuracy_score
from ast import literal_eval
import pandas as pd
import numpy as np

In [127]:
# https://www.kaggle.com/code/philippsinger/h2ogpt-perplexity-ranking
import numpy as np
def precision_at_k(r, k):
    """Precision at k"""
    assert k <= len(r)
    assert k != 0
    return sum(int(x) for x in r[:k]) / k

def MAP_at_3(predictions, true_items):
    """Score is mean average precision at 3"""
    U = len(predictions)
    map_at_3 = 0.0
    for u in range(U):
        user_preds = predictions[u].split()
        user_true = true_items[u]
        user_results = [1 if item == user_true else 0 for item in user_preds]
        for k in range(min(len(user_preds), 3)):
            map_at_3 += precision_at_k(user_results, k+1) * user_results[k]
    return map_at_3 / U

In [128]:
df_actual = pd.read_csv('input_data/validation_data/master_validation.csv')

In [129]:
df_deberta_context_1 = pd.read_csv('base/validation_data_fold1_768.csv')
df_deberta_context_2 = pd.read_csv('base/validation_data_fold2_768.csv')
df_deberta_context_0 = pd.read_csv('base/validation_data_fold0_768.csv')
df_deberta_context_all = pd.read_csv('base/validation_data_fold0_80k.csv')


In [130]:
# df_deberta_context_1['deberta_choice_1'] = df_deberta_context_1['prediction'].str[0]
# df_deberta_context_1['deberta_choice_2'] = df_deberta_context_1['prediction'].str[2]
# df_deberta_context_1['deberta_choice_3'] = df_deberta_context_1['prediction'].str[4]

dfs = [df_deberta_context_all,df_deberta_context_0,df_deberta_context_1,df_deberta_context_2]

    

In [131]:
# Function to extract probability by index
def extract_prob_by_index(row_str, index):
    try:
        # Convert the string back to a list
        row = literal_eval(row_str)
    except (ValueError, SyntaxError):
        return None
    
    if index < len(row):
        return row[index]
    return None

def calc_choice_accuracy(df):
    top1_deberta = accuracy_score(df['answer'].values,df['deberta_choice_1'].values )
    top2_deberta = accuracy_score(df['answer'].values,df['deberta_choice_2'].values )
    top3_deberta = accuracy_score(df['answer'].values,df['deberta_choice_3'].values )
    print(f"TOP 1 :{round(top1_deberta,2)} , TOP 2 :{round(top2_deberta,2)}, TOP 3 :{round(top3_deberta,2)}")

# Create new columns
labels = ['A', 'B', 'C', 'D', 'E']

In [132]:
for idx, df in enumerate(dfs):
    df['deberta_choice_1'] = df['prediction'].str[0]
    df['deberta_choice_2'] = df['prediction'].str[2]
    df['deberta_choice_3'] = df['prediction'].str[4]

    for i, label in enumerate(labels):
        df[f'{label}_prob'] = df['probabilities'].apply(lambda row: extract_prob_by_index(row, i))


In [133]:
m = MAP_at_3(df_deberta_context_all.prediction.values, df_deberta_context_all.answer.values)
print( 'CV MAP@3 =',m )

CV MAP@3 = 0.886876640419948


In [134]:
m = MAP_at_3(df_deberta_context_0.prediction.values, df_deberta_context_0.answer.values)
print( 'CV MAP@3 =',m )

CV MAP@3 = 0.8759842519685046


In [135]:
m = MAP_at_3(df_deberta_context_1.prediction.values, df_deberta_context_1.answer.values)
print( 'CV MAP@3 =',m )

CV MAP@3 = 0.8641732283464576


In [136]:
m = MAP_at_3(df_deberta_context_2.prediction.values, df_deberta_context_2.answer.values)
print( 'CV MAP@3 =',m )

CV MAP@3 = 0.8696850393700798


In [137]:
calc_choice_accuracy(df_deberta_context_all) , calc_choice_accuracy(df_deberta_context_0) , calc_choice_accuracy(df_deberta_context_1), calc_choice_accuracy(df_deberta_context_2)

TOP 1 :0.82 , TOP 2 :0.1, TOP 3 :0.04
TOP 1 :0.81 , TOP 2 :0.11, TOP 3 :0.04
TOP 1 :0.79 , TOP 2 :0.11, TOP 3 :0.06
TOP 1 :0.8 , TOP 2 :0.11, TOP 3 :0.05


(None, None, None, None)

In [138]:
# Assuming that the 'id', 'prompt', 'context', and 'answer' columns are consistent across the three dataframes:
df_weighted = df_deberta_context_0[['id', 'prompt', 'context', 'answer']].copy()

columns = ['A_prob', 'B_prob', 'C_prob', 'D_prob', 'E_prob']
# weight = 1/3

for col in columns:
    df_weighted[col.replace('_prob', '_weighted_prob')] = (
        0.25 * df_deberta_context_0[col] + 
        0.15 * df_deberta_context_1[col] + 
        0.20 * df_deberta_context_2[col] +
        0.40 * df_deberta_context_all[col]   
    )



In [139]:
df_weighted.columns

Index(['id', 'prompt', 'context', 'answer', 'A_weighted_prob',
       'B_weighted_prob', 'C_weighted_prob', 'D_weighted_prob',
       'E_weighted_prob'],
      dtype='object')

In [140]:
predictions_as_ids = np.argsort(-df_weighted[['A_weighted_prob',
       'B_weighted_prob', 'C_weighted_prob', 'D_weighted_prob',
       'E_weighted_prob']].values, 1)

predictions_as_answer_letters = np.array(list('ABCDE'))[predictions_as_ids]

predictions_as_string = df_weighted['prediction'] = [
    ' '.join(row) for row in predictions_as_answer_letters[:, :3]
]

In [141]:
## mean pred
dfs = [df_weighted]
for idx, df in enumerate(dfs):
    df['deberta_choice_1'] = df['prediction'].str[0]
    df['deberta_choice_2'] = df['prediction'].str[2]
    df['deberta_choice_3'] = df['prediction'].str[4]


In [142]:
calc_choice_accuracy(df_weighted)

TOP 1 :0.84 , TOP 2 :0.1, TOP 3 :0.04


In [143]:
df_weighted.shape

(1270, 13)

In [144]:
m = MAP_at_3(df_weighted.prediction.values, df_weighted.answer.values)
print( 'CV MAP@3 =',m )

CV MAP@3 = 0.8980314960629928


In [60]:
df_lora = pd.read_csv('base/validation_data_mistral_LORA_predicted_newarticle1.csv')

In [61]:
m = MAP_at_3(df_lora.prediction.values, df_lora.answer.values)
print( 'CV MAP@3 =',m )

CV MAP@3 = 0.8914698162729665


In [62]:
df_lora['lora_choice_1'] = df_lora['prediction'].str[0]
df_lora['lora_choice_2'] = df_lora['prediction'].str[2]
df_lora['lora_choice_3'] = df_lora['prediction'].str[4]

In [63]:
top1_lora = accuracy_score(df_lora['answer'].values,df_lora['lora_choice_1'].values )
top2_lora = accuracy_score(df_lora['answer'].values,df_lora['lora_choice_2'].values )
top3_lora = accuracy_score(df_lora['answer'].values,df_lora['lora_choice_3'].values )
print(f"TOP 1 :{round(top1_lora,2)} , TOP 2 :{round(top2_lora,2)}, TOP 3 :{round(top3_lora,2)}")

TOP 1 :0.82 , TOP 2 :0.11, TOP 3 :0.04


In [64]:
# Function to extract probability by label
def extract_prob(row_str, label):
    try:
        # Convert the string back to a list of tuples
        row = literal_eval(row_str.replace("array(", "").replace(", dtype=float32)", ""))
    except (ValueError, SyntaxError):
        return None
    
    for prob, lbl in row:
        if lbl == label:
            return float(prob)
    return None



# Create new columns
for label in ['A', 'B', 'C', 'D', 'E']:
    df_lora[f'{label}_prob'] = df_lora['logits'].apply(lambda row: extract_prob(row, label))


In [65]:
df_lora.shape, df_weighted.shape

((1270, 20), (1270, 13))

In [66]:
df_weighted_add_lora = df_weighted.merge(df_lora[['id','lora_choice_1','lora_choice_2','lora_choice_3']], on=['id'],how='left')

In [67]:
#Fixing the issue in the implementation to avoid duplication in the prediction string
for index, row in df_weighted_add_lora.iterrows():
    prediction_values = []
    added_choices = set()
    
    if row['deberta_choice_1'] not in added_choices:
        prediction_values.append(row['deberta_choice_1'])
        added_choices.add(row['deberta_choice_1'])
        
    if row['lora_choice_1'] not in added_choices and len(prediction_values) < 3:
        prediction_values.append(row['lora_choice_1'])
        added_choices.add(row['lora_choice_1'])
        
    if row['lora_choice_2'] not in added_choices and len(prediction_values) < 3:
        prediction_values.append(row['lora_choice_2'])
        added_choices.add(row['lora_choice_2'])
    
    if row['deberta_choice_2'] not in added_choices and len(prediction_values) < 3:
        prediction_values.append(row['deberta_choice_2'])
        added_choices.add(row['deberta_choice_2'])

    if row['deberta_choice_3'] not in added_choices and len(prediction_values) < 3:
        prediction_values.append(row['deberta_choice_3'])
        added_choices.add(row['deberta_choice_3'])
    
    # Join the top 3 non-unique values to form the prediction string
    df_weighted_add_lora.at[index, 'deberta_lora'] = ' '.join(prediction_values)

In [68]:
m = MAP_at_3(df_weighted_add_lora.deberta_lora.values, df_weighted_add_lora.answer.values)
print( 'CV MAP@3 =',m )

CV MAP@3 = 0.8972440944881896


In [69]:
calc_choice_accuracy(df_weighted_add_lora)

TOP 1 :0.84 , TOP 2 :0.1, TOP 3 :0.04


In [70]:
df_lora

Unnamed: 0,id,prompt,A,B,C,D,E,answer,context,instruction,prediction,logits,lora_choice_1,lora_choice_2,lora_choice_3,A_prob,B_prob,C_prob,D_prob,E_prob
0,0,What does a diffusion-limited enzyme represent...,"An intrinsic, physical constraint",A maximum peak height in the fitness landscape,An evolutionary limitation,A chemical limitation,A diffusion limitation,B,-A diffusion-limited enzyme catalyses a reacti...,What does a diffusion-limited enzyme represent...,A B C,"[(array(0.53893685, dtype=float32), 'A'), (arr...",A,B,C,0.538937,0.453830,0.004115,0.001016,0.002102
1,1,What are known clusters of neurons in the medu...,baroreceptors,angioreceptors,the cardiomotor mechanism,the cardiovascular center,angioreceptors,D,"-Regulation of blood pressure The endogenous, ...",What are known clusters of neurons in the medu...,D C A,"[(array(0.03887285, dtype=float32), 'A'), (arr...",D,C,A,0.038873,0.014987,0.095463,0.837657,0.013021
2,2,How was sulfur produced in the United States i...,Sulfur production in the United States was pri...,Sulfur production in the United States was pri...,Sulfur production in the United States was pri...,Sulfur production in the United States was pri...,Sulfur production in the United States was pri...,A,-Sulfur dioxide is primarily produced for sulf...,How was sulfur produced in the United States i...,A E B,"[(array(0.754821, dtype=float32), 'A'), (array...",A,E,B,0.754821,0.026441,0.006280,0.018604,0.193854
3,3,What is a fexpr in Lisp programming languages?,A fexpr is a function that requires the evalua...,A fexpr is a function that automatically evalu...,A fexpr is a function that does not require th...,A fexpr is a function that passes its operands...,A fexpr is a function that only evaluates the ...,C,"-In Lisp programming languages, a fexpr is a f...",What is a fexpr in Lisp programming languages?...,D C E,"[(array(0.00166235, dtype=float32), 'A'), (arr...",D,C,E,0.001662,0.001226,0.027681,0.945755,0.023677
4,4,What is the height at which mountaineers are m...,"6,000 feet","9,000 feet","12,000 feet","18,000 feet","15,000 feet",B,"-Altitude sickness can first occur at 1,500 me...",What is the height at which mountaineers are m...,D B C,"[(array(0.06164877, dtype=float32), 'A'), (arr...",D,B,C,0.061649,0.225502,0.208555,0.455526,0.048768
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1265,1265,What is the relation between the three moment ...,The three moment theorem expresses the relatio...,The three moment theorem is used to calculate ...,The three moment theorem describes the relatio...,The three moment theorem is used to calculate ...,The three moment theorem is used to derive the...,C,-In civil engineering and structural analysis ...,What is the relation between the three moment ...,C E A,"[(array(0.00655538, dtype=float32), 'A'), (arr...",C,E,A,0.006555,0.000228,0.935633,0.002265,0.055318
1266,1266,"What is the throttling process, and why is it ...",The throttling process is a steady flow of a f...,The throttling process is a steady adiabatic f...,The throttling process is a steady adiabatic f...,The throttling process is a steady flow of a f...,The throttling process is a steady adiabatic f...,B,-Throttling One of the simple applications of ...,"What is the throttling process, and why is it ...",E B C,"[(array(0.01305553, dtype=float32), 'A'), (arr...",E,B,C,0.013056,0.344691,0.130830,0.002002,0.509421
1267,1267,What happens to excess base metal as a solutio...,"The excess base metal will often solidify, bec...",The excess base metal will often crystallize-o...,"The excess base metal will often dissolve, bec...","The excess base metal will often liquefy, beco...","The excess base metal will often evaporate, be...",B,"-Similarly, a hypoeutectoid alloy has two crit...",What happens to excess base metal as a solutio...,B A C,"[(array(0.110943, dtype=float32), 'A'), (array...",B,A,C,0.110943,0.859105,0.020683,0.002911,0.006358
1268,1268,"What is the relationship between mass, force, ...",Mass is a property that determines the weight ...,Mass is an inertial property that determines a...,Mass is an inertial property that determines a...,Mass is an inertial property that determines a...,Mass is a property that determines the size of...,D,-Mass is (among other properties) an inertial ...,"What is the relationship between mass, force, ...",D C A,"[(array(0.06352976, dtype=float32), 'A'), (arr...",D,C,A,0.063530,0.027972,0.132407,0.773951,0.002140


In [37]:
df_weighted.columns

Index(['id', 'prompt', 'context', 'answer', 'A_weighted_prob',
       'B_weighted_prob', 'C_weighted_prob', 'D_weighted_prob',
       'E_weighted_prob', 'prediction', 'deberta_choice_1', 'deberta_choice_2',
       'deberta_choice_3'],
      dtype='object')

In [71]:
# Merge the DataFrames on 'id'
merged_df = pd.merge(df_lora, df_weighted[['id','deberta_choice_1', 'deberta_choice_2',
       'deberta_choice_3','A_weighted_prob',
       'B_weighted_prob', 'C_weighted_prob', 'D_weighted_prob',
       'E_weighted_prob']], on='id', suffixes=('_lora', '_deberta'))

# Define weights
weight_lora = 0.5
weight_deberta = 0.5

# Calculate weighted sum of probabilities
for label in ['A', 'B', 'C', 'D', 'E']:
    merged_df[f'{label}_prob_combined'] = weight_lora * merged_df[f'{label}_prob'] + weight_deberta * merged_df[f'{label}_weighted_prob']

# Keep only the 'id' and combined probability columns
final_df = merged_df[['id'] + ['answer']+[f'{label}_prob_combined' for label in ['A', 'B', 'C', 'D', 'E']]]


In [72]:
final_df.columns

Index(['id', 'answer', 'A_prob_combined', 'B_prob_combined', 'C_prob_combined',
       'D_prob_combined', 'E_prob_combined'],
      dtype='object')

In [73]:
predictions_as_ids = np.argsort(-final_df[['A_prob_combined', 'B_prob_combined', 'C_prob_combined',
       'D_prob_combined', 'E_prob_combined']].values, 1)

predictions_as_answer_letters = np.array(list('ABCDE'))[predictions_as_ids]

predictions_as_string = final_df['prediction'] = [
    ' '.join(row) for row in predictions_as_answer_letters[:, :3]
]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predictions_as_string = final_df['prediction'] = [


In [74]:
final_df['top_choice_1'] = final_df['prediction'].str[0]
final_df['top_choice_2'] = final_df['prediction'].str[2]
final_df['top_choice_3'] = final_df['prediction'].str[4]


top1_prob_combined = accuracy_score(final_df['answer'].values,final_df['top_choice_1'].values )
top2_prob_combined = accuracy_score(final_df['answer'].values,final_df['top_choice_2'].values )
top3_prob_combined = accuracy_score(final_df['answer'].values,final_df['top_choice_3'].values )
print(f"TOP 1 :{round(top1_prob_combined,2)} , TOP 2 :{round(top2_prob_combined,2)}, TOP 3 :{round(top3_prob_combined,2)}")

TOP 1 :0.85 , TOP 2 :0.09, TOP 3 :0.03


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['top_choice_1'] = final_df['prediction'].str[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['top_choice_2'] = final_df['prediction'].str[2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['top_choice_3'] = final_df['prediction'].str[4]


In [75]:
m = MAP_at_3(final_df.prediction.values, final_df.answer.values)
print( 'CV MAP@3 =',m )

CV MAP@3 = 0.9095800524934389


In [None]:
master_probs = df_lora[['id','A_prob', 'B_prob', 'C_prob', 'D_prob', 'E_prob']] 

In [29]:
master_match = df_lora.merge(df_deberta_context[['id','deberta_choice_1', 'deberta_choice_2',
       'deberta_choice_3','A_prob', 'B_prob', 'C_prob', 'D_prob', 'E_prob']], on=['id'],how = "left")

In [39]:
((master_match['deberta_choice_1'] == master_match['lora_choice_1']) & (master_match['lora_choice_1'] == master_match['answer'])).value_counts()


False    179
True     121
dtype: int64

In [34]:
(master_match['deberta_choice_1'] == master_match['lora_choice_1']).value_counts()

True     158
False    142
dtype: int64

In [33]:
master_match.shape, 

((300, 17), (300, 13))

In [54]:
del master_match["deberta_lora"] #= master_match.apply(lambda row: ' '.join(row['deberta_choice_1']+row['lora_choice_2']+row['lora_choice_3']), axis=1)

In [51]:
# master_match.to_csv('./master_match.csv',index=False)

In [56]:
# Fixing the issue in the implementation to avoid duplication in the prediction string
for index, row in master_match.iterrows():
    prediction_values = []
    added_choices = set()
    
    if row['deberta_choice_1'] not in added_choices:
        prediction_values.append(row['deberta_choice_1'])
        added_choices.add(row['deberta_choice_1'])
        
    if row['lora_choice_1'] not in added_choices and len(prediction_values) < 3:
        prediction_values.append(row['lora_choice_1'])
        added_choices.add(row['lora_choice_1'])
        
    if row['lora_choice_2'] not in added_choices and len(prediction_values) < 3:
        prediction_values.append(row['lora_choice_2'])
        added_choices.add(row['lora_choice_2'])
    
    if row['deberta_choice_2'] not in added_choices and len(prediction_values) < 3:
        prediction_values.append(row['deberta_choice_2'])
        added_choices.add(row['deberta_choice_2'])
        
    if row['lora_choice_3'] not in added_choices and len(prediction_values) < 3:
        prediction_values.append(row['lora_choice_3'])
        added_choices.add(row['lora_choice_3'])
        
    if row['deberta_choice_3'] not in added_choices and len(prediction_values) < 3:
        prediction_values.append(row['deberta_choice_3'])
        added_choices.add(row['deberta_choice_3'])
    
    # Join the top 3 non-unique values to form the prediction string
    master_match.at[index, 'deberta_lora'] = ' '.join(prediction_values)


In [57]:
master_match

Unnamed: 0,prompt,context,A,B,C,D,E,id,instruction,prediction,answer,lora_choice_1,lora_choice_2,lora_choice_3,deberta_choice_1,deberta_choice_2,deberta_choice_3,deberta_lora
0,What is the method of transcription in the lif...,DNA-templated transcription is the method of t...,RNA-templated transcription is the method of t...,Transcription occurs through a unique mechanis...,Reverse transcription is the method of transcr...,DNA-templated transcription is the method of t...,Transcription does not occur in the life cycle...,0,What is the method of transcription in the lif...,A C D,D,A,C,D,D,A,C,D A C
1,What is the role of the viral fiber glycoprote...,Entry into the host cell is achieved by attach...,The viral fiber glycoproteins are involved in ...,The viral fiber glycoproteins code for 40 prot...,The viral fiber glycoproteins are responsible ...,The viral fiber glycoproteins mediate endocyto...,The viral fiber glycoproteins are responsible ...,1,What is the role of the viral fiber glycoprote...,C D A,D,C,D,A,D,B,C,D C B
2,What is the significance of the faint Hα emiss...,"Gamma Geminorum (γ Geminorum, abbreviated Gamm...",The emission lines indicate that 3 Geminorum i...,The emission lines indicate that 3 Geminorum i...,The emission lines indicate that 3 Geminorum i...,The emission lines indicate that 3 Geminorum i...,The emission lines indicate that 3 Geminorum i...,2,What is the significance of the faint Hα emiss...,B C E,A,B,C,E,B,E,C,B C E
3,What is the significance of the pedicellariae ...,Peziza vesiculosa is a species of apothecial f...,They are used for climbing on corals.,They resemble the traps of the Venus fly trap ...,They are covered by short and stout spines.,They are found on the central disc of the sea ...,They are a characteristic feature of the Gonia...,3,What is the significance of the pedicellariae ...,C B A,B,C,B,A,C,A,E,C B A
4,What is the role of the microprocessor complex...,The microprocessor complex is a protein comple...,The microprocessor complex is responsible for ...,The microprocessor complex is responsible for ...,The microprocessor complex is involved in the ...,The microprocessor complex is involved in the ...,The microprocessor complex is responsible for ...,4,What is the role of the microprocessor complex...,A D B,A,A,D,B,A,D,C,A D B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,What is the significance of the anti-de Sitter...,This observation is the starting point for AdS...,The AdS/CFT correspondence is a conjectured re...,The AdS/CFT correspondence provides a non-pert...,The AdS/CFT correspondence represents a major ...,The AdS/CFT correspondence is a strong-weak du...,The AdS/CFT correspondence was first proposed ...,295,What is the significance of the anti-de Sitter...,D A C,C,D,A,C,A,C,B,A D C
296,What is the branch of physics that seeks to de...,String theories are quantum theories of gravit...,String theory,Quantum gravity,AdS/CFT correspondence,General relativity,M-theory,296,What is the branch of physics that seeks to de...,C A B,B,C,A,B,B,A,E,B C A
297,What is the AdS/CFT correspondence according t...,The conformal field theory is like a hologram ...,The AdS/CFT correspondence is a relationship b...,The AdS/CFT correspondence is the theory that ...,The AdS/CFT correspondence is a dictionary tha...,The AdS/CFT correspondence is the equivalence ...,The AdS/CFT correspondence is a mathematical c...,297,What is the AdS/CFT correspondence according t...,B C D,D,B,C,D,B,D,C,B C D
298,What is the purpose of superstring theory acco...,Superstring theory is an attempt to explain al...,To explain the behavior of fundamental particl...,To explain the behavior of large-scale structu...,To describe the four fundamental forces acting...,To harmonize the theory of general relativity ...,To eliminate the infinities in quantum field t...,298,What is the purpose of superstring theory acco...,E A D,A,E,A,D,A,D,E,A E D


In [58]:

print( 'CV MAP@3 =', MAP_at_3(master_match.deberta_lora.values, master_match.answer.values))

CV MAP@3 = 0.7738888888888893


In [59]:
master_match['choice_1'] = master_match['deberta_lora'].str[0]
master_match['choice_2'] = master_match['deberta_lora'].str[2]
master_match['choice_3'] = master_match['deberta_lora'].str[4]


top1_mixed = accuracy_score(master_match['answer'].values,master_match['choice_1'].values )
top2_mixed = accuracy_score(master_match['answer'].values,master_match['choice_2'].values )
top3_mixed = accuracy_score(master_match['answer'].values,master_match['choice_3'].values )
print(f"TOP 1 :{round(top1_mixed,2)} , TOP 2 :{round(top2_mixed,2)}, TOP 3 :{round(top3_mixed,2)}")

TOP 1 :0.66 , TOP 2 :0.16, TOP 3 :0.11


In [None]:
## 

In [149]:
from sklearn import model_selection
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold


def create_folds(data, num_splits):
    dfx = pd.get_dummies(data, columns=["answer"]).groupby(["id"], as_index=False).sum()
    cols = [c for c in dfx.columns if c.startswith("answer") or c == "id" and c != "answer"]
    dfx = dfx[cols]
    
    mskf = MultilabelStratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)
    labels = [c for c in dfx.columns if c != "id"]
    dfx_labels = dfx[labels]
    dfx["kfold"] = -1
    
    for fold, (trn_, val_) in enumerate(mskf.split(dfx, dfx_labels)):
        dfx.loc[val_, "kfold"] = fold
    
    data = data.merge(dfx[["id", "kfold"]], on="id", how="left")
    
    return data

In [163]:
df_train.head()

Unnamed: 0,prompt,context,A,B,C,D,E,answer
0,"In relation to Eunice Fay McKenzie's career, w...","Eunice Fay McKenzie (February 19, 1918 – April...",McKenzie showcased her singing talents in nume...,McKenzie is primarily remembered for her starr...,McKenzie gained recognition for her role as a ...,McKenzie's collaborations with director Blake ...,McKenzie's successful career in sound films co...,B
1,How does Modified Newtonian Dynamics (MOND) im...,The presence of a clustered thick disk-like co...,MOND is a theory that increases the discrepanc...,MOND explains the missing baryonic mass in gal...,MOND is a theory that reduces the observed mis...,MOND is a theory that eliminates the observed ...,MOND's impact on the observed missing baryonic...,E
2,Which of the following statements accurately d...,Woody Hartman is a retired American soccer goa...,Ray Montgomerie is a former footballer who pla...,Ray Montgomerie is a former footballer who pla...,Ray Montgomerie is a former footballer who pla...,Ray Montgomerie is a former footballer who pla...,Ray Montgomerie is a former footballer who pla...,B
3,What is the significance of the Museum of the ...,The Museum of the Occupation of Latvia () is a...,The Museum of the Occupation of Latvia is a me...,The Museum of the Occupation of Latvia showcas...,The Museum of the Occupation of Latvia was est...,The Museum of the Occupation of Latvia primari...,The Museum of the Occupation of Latvia is a mu...,C
4,What was the previous name of the Christian Sc...,It was named the Evangelical School for the De...,The Christian School for the Deaf (CSD),The Christian School for the Blind (CSB),The Evangelical School and Chapel for the Deaf...,The Evangelical School for the Deaf (ESD),The Evangelical School for the Blind (ESB),D


In [186]:
# df_actual = pd.read_csv('input_data/validation_data/stem1k/dataset_wiki_new_1_balanced.csv')
df_train = pd.read_csv('input_data/all_12_with_context2.csv')
df_train = df_train.drop(columns="source")

# df_train = df_train.fillna('')#.sample(NUM_TRAIN_SAMPLES)
# df_train = df_train.dropna(how='any', axis=0) # delete 4 choice question



# df_train = df_train.apply(make_random_4_from_3,axis=1)
# df_train = df_train.dropna(how='any', axis=0) # delete 4 choice question


# train["id"] = list(range(len(train)))
# train['answer'] = df_actual['answer']


# train = create_folds(train, num_splits=4)
# train.to_csv('./train_folds.csv', index=False)

In [187]:
df_train.shape

(60347, 8)

In [188]:
df_train['id'] = list(range(len(df_train)))

In [189]:
df_no_null = df_train.dropna(how='any')

In [191]:
df_train_fill = df_train[~df_train.id.isin(df_no_null.id)]

In [192]:
df_train_fill.shape , df_no_null.shape

((13660, 9), (46687, 9))

In [1]:
import pandas as pd
import random
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

# Load pre-trained Word2Vec model (Download and specify path accordingly)
model = KeyedVectors.load_word2vec_format('models/googelnews/GoogleNews-vectors-negative300.bin', binary=True)


In [2]:
def fill_missing_options(row):
    options = ['A', 'B', 'C', 'D', 'E']
    available_options = {}
    
    # Get available options
    for opt in options:
        if pd.notna(row[f'{opt}']):
            available_options[opt] = row[f'{opt}']
    
    # Remove the correct answer from available_options
    correct_option = row['answer']
    if correct_option in available_options:
        del available_options[correct_option]
    
    similar_words_list = []
    
    # Generate similar words for available incorrect options
    for option_word in available_options.values():
        print(f"Processing option_word: {option_word}")  # Debug print
        words = option_word.split()
        vectors = []
        for word in words:
            try:
                vectors.append(model[word])
            except KeyError:
                print(f"Word '{word}' not in vocabulary.")  # Debug print

        if vectors:
            avg_vector = sum(vectors) / len(vectors)
            try:
                similar_words = [item[0] for item in model.similar_by_vector(avg_vector, topn=3)]
                print(f"Similar words found: {similar_words}")  # Debug print
                similar_words_list.extend(similar_words)
            except Exception as e:
                print(f"Error in finding similar words: {e}")  # Debug print

    # Shuffle the similar words to randomize
    random.shuffle(similar_words_list)
    
    # Fill missing options
    for opt in options:
        if pd.isna(row[f'{opt}']):
            if similar_words_list:
                row[f'{opt}'] = similar_words_list.pop(0)
    
    return row



In [7]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import numpy as np
import random

# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# model = AutoModel.from_pretrained("bert-base-uncased")

# def get_bert_embedding(word):
#     inputs = tokenizer(word, return_tensors="pt")
#     with torch.no_grad():
#         outputs = model(**inputs)
#     return outputs.last_hidden_state.mean(dim=1)



In [14]:
def fill_missing_options(row):
    options = ['A', 'B', 'C', 'D', 'E']
    available_options = {opt: row[f'{opt}'] for opt in options if pd.notna(row[f'{opt}'])}
    
    # Remove the correct answer from available options
    correct_option = row['answer']
    available_options.pop(correct_option, None)
    
    # Generate similar words for available incorrect options
    similar_words_list = []
    for option_word in available_options.values():
        try:
            similar_words = [item[0].replace('_', ' ') for item in model.most_similar(option_word, topn=3)]
            similar_words_list.extend(similar_words)
        except KeyError:  # Skip words not in vocabulary
            continue

    random.shuffle(similar_words_list)
    
    # Fill missing options
    for opt in options:
        if pd.isna(row[f'{opt}']):
            if similar_words_list:
                row[f'{opt}'] = similar_words_list.pop(0)
    
    return row



In [21]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub

# Load the Universal Sentence Encoder's TF Hub module
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

def fill_missing_sentence_options(row):
    options = ['A', 'B', 'C', 'D', 'E']
    correct_option = row['answer']
    
    # Filter out the available incorrect options
    incorrect_options = [row[opt] for opt in options if pd.notna(row[opt]) and opt != correct_option]
    
    # If there are missing options
    missing_options = [opt for opt in options if pd.isna(row[opt])]
    for opt in missing_options:
        # Get the most similar sentence from a predefined list of sentences (you can customize this list)
        # Note: You might want to have a large diverse list of sentences for better results.
        # Here, I'm just reusing the incorrect options for simplicity.
        similar_sentence = get_most_similar_sentences(row[correct_option], incorrect_options)
        row[opt] = similar_sentence

    return row

def get_most_similar_sentences(main_sentence, sentences):
    """
    Return the most similar sentence to the main_sentence from a list of sentences.
    """
    # Compute embeddings for main_sentence and list of sentences
    main_embedding = embed([main_sentence])
    sentences_embedding = embed(sentences)

    # Compute similarity between main_sentence and each sentence in the list
    cosine_similarities = np.inner(main_embedding, sentences_embedding).flatten()

    # Get the index of the most similar sentence
    most_similar_index = np.argmax(cosine_similarities)

    return sentences[most_similar_index]





2023-10-05 20:49:38.856772: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 30550 MB memory:  -> device: 0, name: Tesla V100-SXM3-32GB, pci bus id: 0000:34:00.0, compute capability: 7.0
2023-10-05 20:49:38.859643: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 30550 MB memory:  -> device: 1, name: Tesla V100-SXM3-32GB, pci bus id: 0000:36:00.0, compute capability: 7.0
2023-10-05 20:49:38.862300: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 30550 MB memory:  -> device: 2, name: Tesla V100-SXM3-32GB, pci bus id: 0000:39:00.0, compute capability: 7.0
2023-10-05 20:49:38.864440: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:3 with 30550 MB memory:  -> device: 3, name: Tesla V100-SXM3-32GB, pci bus id

NameError: name 'df' is not defined

In [42]:
import re

In [110]:


def fill_missing_sentence_options(row):
    options = ['A', 'B', 'C', 'D', 'E']
    correct_option = row['answer']

    # Filter out the available incorrect options
    incorrect_options = [row[opt] for opt in options if pd.notna(row[opt]) and opt != correct_option]
    
    # Compute the maximum length from the available options
    max_option_length = max(len(option) for option in incorrect_options)
    
    # Compute embeddings for the incorrect options
    incorrect_embeddings = embed(incorrect_options)
    
    # Compute the average embedding for the missing option
    avg_embedding = np.mean(incorrect_embeddings, axis=0)
    
    # Split the context into individual sentences
    context_sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', row['context'])

    # Exclude sentences which are very short or empty
    context_sentences = [s.strip() for s in context_sentences if s and len(s.split()) > 5]

    # Ensure there are valid sentences to process
    if not context_sentences:
        return row

    # Search for a similar sentence in the context that isn't already an option
    similar_sentence = find_similar_sentence_in_corpus(avg_embedding, context_sentences, incorrect_options + [row[correct_option]], max_option_length)
    
    # Fill the missing option
    for opt in options:
        if pd.isna(row[opt]):
            row[opt] = similar_sentence

    return row

def find_similar_sentence_in_corpus(embedding, corpus, exclude_list, max_length):
    """
    Find a sentence in the corpus that is most similar to the given embedding 
    but is not in the exclude_list and doesn't exceed max_length.
    """
    try:
        corpus_embeddings = embed(corpus)
        cosine_similarities = np.inner(embedding, corpus_embeddings).flatten()
        sorted_indexes = np.argsort(cosine_similarities)[::-1]  # Descending order
        
        for idx in sorted_indexes:
            if corpus[idx] not in exclude_list and len(corpus[idx]) <= max_length:
                return corpus[idx]
        return None
    except Exception as e:
        print(f"Error processing corpus: {e}")
        return None



In [63]:
df_train_fill = pd.read_csv('input_data/RACE_with_context_original.csv')

In [64]:
df_train_fill['E'] = np.nan

In [47]:
dummy_df = df_train_fill.head(10)

In [48]:
# # Assuming 'df' is your DataFrame and it has columns like 'option_A', 'option_B', etc.
# all_options = pd.concat([dummy_df[f'{opt}'] for opt in ['A', 'B', 'C', 'D', 'E']])
# unique_words = set()
# for option in all_options.dropna():
#     unique_words.update(option.split())

# # This will be your dynamic vocab_list
# vocab_list = list(unique_words)


In [68]:
# Assuming df is your DataFrame
df_filled = df_train_fill.apply(fill_missing_sentence_options, axis=1)

# dummy_df = df_train_fill.apply(fill_missing_options, axis=1)

In [60]:
df_filled.to_csv('./df_train_fill_RACE.csv', index=False)

In [70]:
# df_filled

In [72]:
df_filled_notnone = df_filled[~df_filled.E.isnull()]

In [74]:
df_filled_none = df_filled[df_filled.E.isnull()]

In [116]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
model = BertForMaskedLM.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

def generate_option_using_bert(context, false_options, model, tokenizer):
    # Create the prompt
    prompt = f"{context} Option A: {false_options[0]} Option B: {false_options[1]} Option C: {false_options[2]} The next false option is: [MASK]."
    
    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
    
    # Get prediction
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = outputs.logits

    # Get the predicted token for [MASK]
    predicted_index = torch.argmax(predictions[0, -1, :]).item()
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]

    return predicted_token

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'cls.seq_relationship.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will stor

In [117]:
def fill_missing_sentence_options(row):
    options = ['A', 'B', 'C', 'D', 'E']
    correct_option = row['answer']

    # Filter out the available incorrect options
    incorrect_options = [row[opt] for opt in options if pd.notna(row[opt]) and opt != correct_option]

    # If there's a missing option, generate it using BERT
    if len(incorrect_options) == 3:
        generated_option = generate_option_using_bert(row['context'], incorrect_options, model, tokenizer)
        missing_option = list(set(options) - set([correct_option]) - set([k for k, v in row.items() if pd.notna(v)]))[0]
        row[missing_option] = generated_option

    return row


In [20]:
df_old = pd.read_csv('input_data/train_folds_article_context.csv')

In [21]:
df_old.columns

Index(['id', 'prompt', 'A', 'B', 'C', 'D', 'E', 'answer', 'kfold', 'context'], dtype='object')

In [22]:
df_70k = pd.read_csv('input_data/len70021_with_context.csv')

In [27]:
df_70k.dropna(inplace=True)

In [30]:
df_70k = df_70k[['id', 'prompt', 'A', 'B', 'C', 'D', 'E', 'answer', 'context']]

In [33]:
df_new = pd.concat([df_old,df_70k])

In [35]:
df_new['id'] =  range(len(df_new))

In [130]:
df_filled_notnone['id'] = range(len(df_filled_notnone))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filled_notnone['id'] = range(len(df_filled_notnone))


In [149]:
df_all.kfold.value_counts()

3    39315
2    16472
1    16472
0    16471
Name: kfold, dtype: int64

In [141]:
df_filled_notnone['kfold'] =3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filled_notnone['kfold'] =3


In [143]:
df_all = pd.concat([df_old,df_filled_notnone])

In [146]:
df_all.to_csv('input_data/train_folds_article_context_added.csv',index=False)

In [150]:
df_all.shape

(88730, 10)

In [36]:
from sklearn import model_selection
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold


def create_folds(data, num_splits):
    dfx = pd.get_dummies(data, columns=["answer"]).groupby(["id"], as_index=False).sum()
    cols = [c for c in dfx.columns if c.startswith("answer") or c == "id" and c != "answer"]
    dfx = dfx[cols]
    
    mskf = MultilabelStratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)
    labels = [c for c in dfx.columns if c != "id"]
    dfx_labels = dfx[labels]
    dfx["kfold"] = -1
    
    for fold, (trn_, val_) in enumerate(mskf.split(dfx, dfx_labels)):
        dfx.loc[val_, "kfold"] = fold
    
    data = data.merge(dfx[["id", "kfold"]], on="id", how="left")
    
    return data


In [37]:
df_all1 = create_folds(df_new, 3)

In [39]:
df_all1.kfold.value_counts()

2    39775
1    39774
0    39774
Name: kfold, dtype: int64

In [40]:
df_all1.columns

Index(['id', 'prompt', 'A', 'B', 'C', 'D', 'E', 'answer', 'context', 'kfold'], dtype='object')

In [165]:
df_all1.columns = ['id', 'prompt', 'A', 'B', 'C', 'D', 'E', 'answer', 'context',
       'kfold']

In [41]:
df_all1.to_csv('input_data/train_folds_article_context_70+40.csv',index=False)

In [118]:
cc = df_filled_none.head(10)

In [119]:
df_filled_none1 = cc.apply(fill_missing_sentence_options, axis=1)

In [121]:
df_filled_none1

Unnamed: 0,prompt,context,A,B,C,D,answer,is_question,dataset,E
0,We can know from the passage that the author w...,Last week I talked with some of my students ab...,doctor,model,teacher,reporter,C,False,train,.
1,Many graduates today turn to cosmetic surgery ...,Last week I talked with some of my students ab...,marry a better man/woman,become a model,get an advantage over others in job-hunting,attract more admirers,C,False,train,.
4,What could be the best title for this passage?,"YUZHOU, HENAN -An accident in a central China ...",Death Toll Rises in an Accident in China,A Coal Mine Accident in Central China,An Accident in Central China,Coal Mine Accidents in China,B,True,train,the
8,How many tips does the author give on career m...,Understanding the process of making career cho...,1.,2.,3.,4.,D,True,train,you
11,Which of the following best describes the fami...,Astronauts on shorter shuttle missions often w...,They are caring and thoughtful.,They are worried and upset.,They are impatient and annoyed.,They are excited and curious.,A,True,train,.
12,The passage mainly discusses how astronauts _ .,Astronauts on shorter shuttle missions often w...,work for longer missions in space,connect with people on the Earth,spend their free time in space,observe the Earth from space,C,False,train,.
13,McCulloch and his group used_in their research.,Dogs have long been used to find explosives an...,10 dogs and 55 people,5 dogs and 86 people,10 dogs and 83 people,5 dogs and 169 people,D,False,train,the
14,We can infer from the passage that_.,Dogs have long been used to find explosives an...,dogs can smell signs of other cancers except t...,the final goal of the researchers is to design...,dogs can detect odors 10 000 to 100 000 times ...,dogs will soon be widely used to smell signs o...,B,False,train,the
15,Which of the following would be the best title?,Dogs have long been used to find explosives an...,Special Noses of Dogs,Dogs and Cancer,Dogs Smell Signs of Cancer,McCulloch'S New Discovery,C,True,train,the
16,"If you have free time only on Saturday, you ca...",Friends and Buddies\nThis program is planned f...,Friends and Buddies,Club Saturday Swim,Sibshops (Ages 10-13),Banana Splits,B,False,train,the


In [76]:
df_filled_none

Unnamed: 0,prompt,context,A,B,C,D,answer,is_question,dataset,E
0,We can know from the passage that the author w...,Last week I talked with some of my students ab...,doctor,model,teacher,reporter,C,False,train,
1,Many graduates today turn to cosmetic surgery ...,Last week I talked with some of my students ab...,marry a better man/woman,become a model,get an advantage over others in job-hunting,attract more admirers,C,False,train,
4,What could be the best title for this passage?,"YUZHOU, HENAN -An accident in a central China ...",Death Toll Rises in an Accident in China,A Coal Mine Accident in Central China,An Accident in Central China,Coal Mine Accidents in China,B,True,train,
8,How many tips does the author give on career m...,Understanding the process of making career cho...,1.,2.,3.,4.,D,True,train,
11,Which of the following best describes the fami...,Astronauts on shorter shuttle missions often w...,They are caring and thoughtful.,They are worried and upset.,They are impatient and annoyed.,They are excited and curious.,A,True,train,
...,...,...,...,...,...,...,...,...,...,...
97679,The writer might feel _ before the Math Test.,"One day in the eighth grade, I was taking a Ma...",surprised,relaxed,worried,excited,C,False,test,
97680,Which is the best expression to fill in the bl...,"One day in the eighth grade, I was taking a Ma...",had breakfast,went to bed,took exercise,rode to school,B,True,test,
97681,The writer's father was _ .,"One day in the eighth grade, I was taking a Ma...",proud of him,tired of him,strict with him,pleased with him,C,False,test,
97682,"From the passage, we know that the writer _ .","One day in the eighth grade, I was taking a Ma...",could read Serbian,didn't cheat at last,got a good grade at last,didn't work hard that night,B,False,test,


In [None]:
dummy_df

In [274]:
dummy_df.dropna(how='any', axis=0).shape , dummy_df.shape

((9618, 9), (13660, 9))

In [276]:
missing_dummy_df  = dummy_df.dropna(how='any', axis=0)

In [293]:
missing_dummy_df.to_csv('./df_train_fill.csv', index=False)

In [278]:
missing_dummy_df_fill = dummy_df[~dummy_df.id.isin(missing_dummy_df.id)]

In [280]:
missing_dummy_df_fill

Unnamed: 0,prompt,context,A,B,C,D,E,answer,id
46688,What phenomenon makes global winds blow northe...,These winds blow predominantly from the northe...,muon effect,,tropical effect,centrifugal effect,coriolis effect,E,46688
46690,What is the least dangerous radioactive decay?,"If the beta decay of 222Rn is possible, it is ...",alpha decay,beta decay,zeta decay,,gamma decay,A,46690
46693,What kind of a reaction occurs when a substanc...,The direct reaction of O2 with fuel is preclud...,,invention reaction,Fluid Reaction,nitrogen reaction,combustion reaction,E,46693
46694,Organisms categorized by what species descript...,It can be distinguished from allopatric specia...,,species complex,surface species,ring species,fitting species,D,46694
46697,Zinc is more easily oxidized than iron because...,Zinc is more reactive than iron or steel and t...,much metal,,active metal,Trap metal,usually metal,C,46697
...,...,...,...,...,...,...,...,...,...
60337,"Animal claws, spines, and shells are examples ...",Since survival behaviours are so vital for an ...,defense mechanism,display behavior,learned behavior,spontaneous mutation,,A,60337
60338,The jejunum is about 0.9 meters (3 feet) long ...,* The jejunum is typically of larger diameter ...,,black at death,empty at death,time.the at death,weeks at death,C,60338
60341,What is the number waves that pass a fixed poi...,If there is a periodic travelling wave solutio...,combination frequency,heating frequency,wave tendency,,wave frequency,E,60341
60345,"Melting glaciers, rising temperatures and drou...",Impacts include changes in regional rainfall p...,nature's natural cycle,air pollution,global warming,sudden warming,,C,60345


In [282]:
missing_dummy_df_fill[['prompt','A', 'B', 'C', 'D', 'E', 'answer', 'id']].to_csv('./missing_complex_df.csv',index=False)

In [285]:
def fill_missing_options_complex_cases(row):
    options = ['A', 'B', 'C', 'D', 'E']
    available_options = {opt: row[f'{opt}'] for opt in options if pd.notna(row[f'{opt}'])}
    
    # Remove the correct answer from available options
    correct_option = row['answer']
    available_options.pop(correct_option, None)
    
    # Generate similar words for available incorrect options
    similar_words_list = []
    for option_word in available_options.values():
        # Considering only the last word in the phrase
        option_last_word = option_word.split()[-1]
        try:
            similar_words = [item[0].replace('_', ' ') for item in model.most_similar(option_last_word, topn=3)]
            for word in similar_words:
                # Prepending with the original prefix to create a similar phrase
                similar_phrase = option_word.replace(option_last_word, word)
                similar_words_list.append(similar_phrase)
        except KeyError:  # Skip words not in vocabulary
            continue

    random.shuffle(similar_words_list)
    
    # Fill missing options
    for opt in options:
        if pd.isna(row[f'{opt}']):
            if similar_words_list:
                row[f'{opt}'] = similar_words_list.pop(0)
    
    return row


In [None]:
missing_dummy_df_fill = missing_dummy_df_fill.apply(fill_missing_options_complex_cases, axis=1)

In [281]:
missing_dummy_df_fill.head()

Index(['prompt', 'context', 'A', 'B', 'C', 'D', 'E', 'answer', 'id'], dtype='object')

In [None]:
missing_dummy_df_fill.to_csv('./df_train_fill_complex.csv', index=False)

In [288]:
missing_dummy_df_fill

Unnamed: 0,prompt,context,A,B,C,D,E,answer,id
46688,What phenomenon makes global winds blow northe...,These winds blow predominantly from the northe...,muon effect,tropical impact,tropical effect,centrifugal effect,coriolis effect,E,46688
46690,What is the least dangerous radioactive decay?,"If the beta decay of 222Rn is possible, it is ...",alpha decay,beta decay,zeta decay,beta degeneration,gamma decay,A,46690
46693,What kind of a reaction occurs when a substanc...,The direct reaction of O2 with fuel is preclud...,Fluid reactions,invention reaction,Fluid Reaction,nitrogen reaction,combustion reaction,E,46693
46694,Organisms categorized by what species descript...,It can be distinguished from allopatric specia...,surface vertebrate species,species complex,surface species,ring species,fitting species,D,46694
46697,Zinc is more easily oxidized than iron because...,Zinc is more reactive than iron or steel and t...,much metal,much Metal,active metal,Trap metal,usually metal,C,46697
...,...,...,...,...,...,...,...,...,...
60337,"Animal claws, spines, and shells are examples ...",Since survival behaviours are so vital for an ...,defense mechanism,display behavior,learned behavior,spontaneous mutation,display behavious,A,60337
60338,The jejunum is about 0.9 meters (3 feet) long ...,* The jejunum is typically of larger diameter ...,black at deaths,black at death,empty at death,time.the at death,weeks at death,C,60338
60341,What is the number waves that pass a fixed poi...,If there is a periodic travelling wave solutio...,combination frequency,heating frequency,wave tendency,heating ####.# MHz,wave frequency,E,60341
60345,"Melting glaciers, rising temperatures and drou...",Impacts include changes in regional rainfall p...,nature's natural cycle,air pollution,global warming,sudden warming,air air pollution,C,60345


In [289]:
missing_dummy_df_fill.dropna(how='any', axis=0).shape

(3915, 9)

In [2]:
cntx = 'MOND is an example of a class of theories known as modified gravity, and is an alternative to the hypothesis that the dynamics of galaxies are determined by massive, invisible dark matter halos. Since Milgrom\'s original proposal, proponents of MOND have claimed to successfully predict a variety of galactic phenomena that they state are difficult to understand as consequences of dark matter.Though MOND explains the anomalously great rotational velocities of galaxies at their perimeters, it does not fully explain the velocity dispersions of individual galaxies within galaxy clusters. MOND reduces the discrepancy between the velocity dispersions and clusters\' observed missing baryonic mass from a factor of around 10 to a factor of about 2. However, the residual discrepancy cannot be accounted for by MOND, requiring that other explanations close the gap such as the presence of as-yet undetected missing baryonic matter.The accurate measurement of the speed of gravitational waves compared to the speed of light in 2017 ruled out a certain class of modified gravity theories but concluded that other MOND theories that dispense with the need for dark matter remained viable. Two years later, theories put forth by Constantinos Skordis and Tom Zlosnik were consistent with gravitational waves that always travel at the speed of light. Later still in 2021, Skordis and Zlosnik developed a subclass of their theory called "RMOND", for "relativistic MOND", which had "been shown to reproduce in great detail the main observations in cosmology, including the cosmic-microwave-background power spectrum, and the matter structure power spectrum.'

In [6]:
cntx = 'Outstanding problems for MOND The most serious problem facing Milgrom\'s law is that it cannot eliminate the need for dark matter in all astrophysical systems: galaxy clusters show a residual mass discrepancy even when analyzed using MOND. The fact that some form of unseen mass must exist in these systems detracts from the adequacy of MOND as a solution to the missing mass problem, although the amount of extra mass required is a fifth that of a Newtonian analysis, and there is no requirement that the missing mass be non-baryonic. It has been speculated that 2 eV neutrinos could account for the cluster observations in MOND while preserving the hypothesis\'s successes at the galaxy scale. Indeed, analysis of sharp lensing data for the galaxy cluster Abell 1689 shows that MOND only becomes distinctive at Mpc distance from the center, so that Zwicky\'s conundrum remains, and 1.8 eV neutrinos are needed in clusters.The 2006 observation of a pair of colliding galaxy clusters known as the "Bullet Cluster", poses a significant challenge for all theories proposing a modified gravity solution to the missing mass problem, including MOND. Astronomers measured the distribution of stellar and gas mass in the clusters using visible and X-ray light, respectively, and in addition mapped the inferred dark matter density using gravitational lensing. In MOND, one would expect the "missing mass" to be centred on regions of visible mass which experience accelerations lower than a0 (assuming the external field effect is negligible). In ΛCDM, on the other hand, one would expect the dark matter to be significantly offset from the visible mass because the halos of the two colliding clusters would pass through each other (assuming, as is conventional, that dark matter is collisionless), whilst the cluster gas would interact and end up at the centre. An offset is clearly seen in the observations. It has been suggested, however, that MOND-based models may be able to generate such an offset in strongly non-spherically symmetric systems, such as the Bullet Cluster.A significant piece of evidence in favor of standard dark matter is the observed anisotropies in the cosmic microwave background. While ΛCDM is able to explain the observed angular power spectrum, MOND has a much harder time, though recently it has been shown that MOND can fit the observations too. MOND also encounters difficulties explaining structure formation, with density perturbations in MOND perhaps growing so rapidly that too much structure is formed by the present epoch. However, forming galaxies more rapidly than in ΛCDM can be a good thing to some extent.Several other studies have noted observational difficulties with MOND. For example, it has been claimed that MOND offers a poor fit to the velocity dispersion profile of globular clusters and the temperature profile of galaxy clusters, that different values of a0 are required for agreement with different galaxies\' rotation curves, and that MOND is naturally unsuited to forming the basis of cosmology. Furthermore, many versions of MOND predict that the speed of light is different from the speed of gravity, but in 2017 the speed of gravitational waves was measured to be equal to the speed of light to high precision. This is well understood in modern relativistic theories of MOND, with the constraint from gravitational waves actually helping by substantially restricting how a covariant theory might be constructed.Besides these observational issues, MOND and its relativistic generalizations are plagued by theoretical difficulties. Several ad hoc and inelegant additions to general relativity are required to create a theory compatible with a non-Newtonian non-relativistic limit, though the predictions in this limit are rather clear. This is the case for the more commonly used modified gravity versions of MOND, but some formulations (most prominently those based on modified inertia) have long suffered from poor compatibility with cherished physical principles such as conservation laws. Researchers working on MOND generally do not interpret it as a modification of inertia, with only very limited work done on this area.'

In [7]:
len(cntx)

4174

In [8]:
cntx[:2500]

'Outstanding problems for MOND The most serious problem facing Milgrom\'s law is that it cannot eliminate the need for dark matter in all astrophysical systems: galaxy clusters show a residual mass discrepancy even when analyzed using MOND. The fact that some form of unseen mass must exist in these systems detracts from the adequacy of MOND as a solution to the missing mass problem, although the amount of extra mass required is a fifth that of a Newtonian analysis, and there is no requirement that the missing mass be non-baryonic. It has been speculated that 2 eV neutrinos could account for the cluster observations in MOND while preserving the hypothesis\'s successes at the galaxy scale. Indeed, analysis of sharp lensing data for the galaxy cluster Abell 1689 shows that MOND only becomes distinctive at Mpc distance from the center, so that Zwicky\'s conundrum remains, and 1.8 eV neutrinos are needed in clusters.The 2006 observation of a pair of colliding galaxy clusters known as the "B

In [9]:
cntx[2500:4200]

"too much structure is formed by the present epoch. However, forming galaxies more rapidly than in ΛCDM can be a good thing to some extent.Several other studies have noted observational difficulties with MOND. For example, it has been claimed that MOND offers a poor fit to the velocity dispersion profile of globular clusters and the temperature profile of galaxy clusters, that different values of a0 are required for agreement with different galaxies' rotation curves, and that MOND is naturally unsuited to forming the basis of cosmology. Furthermore, many versions of MOND predict that the speed of light is different from the speed of gravity, but in 2017 the speed of gravitational waves was measured to be equal to the speed of light to high precision. This is well understood in modern relativistic theories of MOND, with the constraint from gravitational waves actually helping by substantially restricting how a covariant theory might be constructed.Besides these observational issues, MON

In [None]:
def get_tokens(index, row, tokenizer, retrieved_articles_parsed):
        system_prefix = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input_prefix}"
        instruction = f"Your task is to analyze the question and answer below. If the answer is correct, respond yes, if it is not correct respond no. As a potential aid to your answer, background context from Wikipedia articles is at your disposal, even if they might not always be pertinent."
        wiki_context = f"{retrieved_articles_parsed[index][-3][2]}\n{retrieved_articles_parsed[index][-2][2]}\n{retrieved_articles_parsed[index][-1][2]}" 
#         context1 = f"{retrieved_articles[index][-4][2]}\n{retrieved_articles[index][-3][2]}\n{retrieved_articles[index][-2][2]}\n{retrieved_articles[index][-1][2]}"
        input_prefix = f"Context: {wiki_context[:MAX_CONTEXT]}\nQuestion: {row['prompt']}\nProposed answer: "
        prompt_prefix = system_prefix.format(instruction=instruction, input_prefix=input_prefix)
        prefix = tokenizer(prompt_prefix, return_tensors="pt", return_attention_mask=False, truncation=True, max_length=MAX_LENGTH)['input_ids']
        prompt_suffix = [f"{row[letter]}\n\n### Response:\n" for letter in 'ABCDE']
        suffix = tokenizer(prompt_suffix, return_tensors="pt", return_attention_mask=False, truncation=True, max_length=MAX_LENGTH, padding=True)['input_ids'][:, 1:]
        return prefix, suffix 
    
    

def run_model(device, df):
    model = ShardedLlama(checkpoint_path, device=f'cuda:{device}')
    inputs = df.reset_index().apply(lambda row: get_tokens(row['index'], row, tokenizer=model.tokenizer, retrieved_articles_parsed=retrieved_articles_parsed), axis=1).values
    batches = np.array_split(inputs, N_BATCHES)
    outputs = []
    for batch in batches:
        outputs += model(batch, output_token=4874)
    return outputs

# Run model

with ThreadPoolExecutor() as executor:
    outputs = list(executor.map(run_model, [0, 1], np.array_split(df, 2)))
    outputs = sum(outputs, [])

In [None]:
def get_tokens(row, tokenizer):
    context = row['context'][:MAX_CONTEXT]
    prompt = row['prompt']
    
    # Create a key for the cache
    cache_key = f"context:{context}_prompt:{prompt}"
    
    if cache_key in tokenized_cache:
        return tokenized_cache[cache_key]
    
    # ... Your existing tokenization code ...
    
    prefix, suffix = tokenizer(prompt_prefix, return_tensors="pt", return_attention_mask=False, truncation=True, max_length=MAX_LENGTH)["input_ids"], tokenizer(prompt_suffix, return_tensors="pt", return_attention_mask=False, truncation=True, max_length=MAX_LENGTH, padding=True)["input_ids"][:, 1:]
    
    # Store tokenized output in the cache
    tokenized_cache[cache_key] = (prefix, suffix)
    
    return prefix, suffix

In [158]:
import torch

In [159]:
labels = torch.tensor([3, 0, 3, 3, 0, 4, 3, 0, 3, 4, 4, 3])

y_preds = torch.tensor([[ 0.0809,  0.1396, -0.0039,  0.1592,  0.0755],
        [ 0.0881,  0.1466, -0.0024,  0.1619,  0.0827],
        [ 0.0910,  0.1463,  0.0056,  0.1694,  0.0862],
        [ 0.0878,  0.1381, -0.0026,  0.1658,  0.0776],
        [ 0.0795,  0.1336, -0.0080,  0.1588,  0.0768],
        [ 0.0787,  0.1359, -0.0029,  0.1639,  0.0768],
        [ 0.0841,  0.1335, -0.0039,  0.1603,  0.0808],
        [ 0.0921,  0.1506,  0.0006,  0.1663,  0.0889],
        [ 0.0819,  0.1423, -0.0025,  0.1623,  0.0814],
        [ 0.0825,  0.1419, -0.0056,  0.1591,  0.0781],
        [ 0.0906,  0.1427, -0.0031,  0.1647,  0.0822],
        [ 0.0837,  0.1398, -0.0032,  0.1606,  0.0798]])

In [160]:
correct_indices = labels.view(-1)

In [161]:
correct_indices

tensor([3, 0, 3, 3, 0, 4, 3, 0, 3, 4, 4, 3])

In [163]:
positive_scores = y_preds[torch.arange(y_preds.size(0)), correct_indices]

In [164]:
positive_scores

tensor([0.1592, 0.0881, 0.1694, 0.1658, 0.0795, 0.0768, 0.1603, 0.0921, 0.1623,
        0.0781, 0.0822, 0.1606])

In [None]:
negative_scores = 

In [165]:
# Step 1: Create a mask
mask = torch.ones(y_preds.shape, dtype=torch.bool)
mask[torch.arange(y_preds.size(0)), correct_indices] = 0

# Step 2: Use this mask to select the scores associated with the incorrect labels
negative_scores_all = y_preds[mask].view(y_preds.size(0), -1)

# Step 3: If you just want one negative score (maximum) for each instance
max_negative_scores = negative_scores_all.max(dim=1).values

In [166]:
max_negative_scores

tensor([0.1396, 0.1619, 0.1463, 0.1381, 0.1588, 0.1639, 0.1335, 0.1663, 0.1423,
        0.1591, 0.1647, 0.1398])

In [167]:
negative_scores_all

tensor([[ 0.0809,  0.1396, -0.0039,  0.0755],
        [ 0.1466, -0.0024,  0.1619,  0.0827],
        [ 0.0910,  0.1463,  0.0056,  0.0862],
        [ 0.0878,  0.1381, -0.0026,  0.0776],
        [ 0.1336, -0.0080,  0.1588,  0.0768],
        [ 0.0787,  0.1359, -0.0029,  0.1639],
        [ 0.0841,  0.1335, -0.0039,  0.0808],
        [ 0.1506,  0.0006,  0.1663,  0.0889],
        [ 0.0819,  0.1423, -0.0025,  0.0814],
        [ 0.0825,  0.1419, -0.0056,  0.1591],
        [ 0.0906,  0.1427, -0.0031,  0.1647],
        [ 0.0837,  0.1398, -0.0032,  0.0798]])

In [168]:
max_negative_scores.unsqueeze(0)

tensor([[0.1396, 0.1619, 0.1463, 0.1381, 0.1588, 0.1639, 0.1335, 0.1663, 0.1423,
         0.1591, 0.1647, 0.1398]])

#### Check Loader

In [81]:
import torch
import numpy as np
from typing import Optional, Union
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy

In [96]:
from transformers import AutoTokenizer, AutoConfig
from torch.utils.data import DataLoader

In [104]:
MAX_INPUT= 768

In [105]:

def prepare_answering_input_deberta(
        tokenizer, # longformer_tokenizer
        question,  # str
        options,   # List[str]
        context,   # str
        max_seq_length=4096,
    ):
    
    first_sentence = [ "[CLS] " + context ] * 5
    second_sentences = [" #### " + question + " [SEP] " + option + " [SEP]" for option in options]
    tokenized_examples = tokenizer(first_sentence, second_sentences, truncation='only_first', 
                                  max_length=max_seq_length, add_special_tokens=False)
      

    return tokenized_examples






class LlmseDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, is_train=False, aug_prob=0.8):
        self.df = df
        self.tokenizer = tokenizer
        self.is_train = is_train
        self.aug_prob = aug_prob
        self.option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
        
    def __len__(self):
        return len(self.df)
        
    def __getitem__(self, idx):
        example = self.df.iloc[idx]
        tokenized_example = dict()
        
        if self.is_train and torch.rand(1)<self.aug_prob:
            prm = torch.randperm(5).numpy()

            
            permed_a2e = np.array(['A','B','C','D','E'])[prm]
            permed_dict_a2p = {a: p for p, a in enumerate(permed_a2e)}
            
            # options = [ example[option] + " [SEP]" for option in permed_a2e] # for longformer
            options = [ example[option] for option in permed_a2e] 
            

            tokenized_example = prepare_answering_input_deberta(tokenizer=self.tokenizer, question=example['prompt'], options=options, context= example['context'], max_seq_length = MAX_INPUT)
            

            tokenized_example['label'] = permed_dict_a2p[example['answer']]

            
        else:

            options = [ example[option] for option in 'ABCDE']
            # tokenized_example = prepare_answering_input(tokenizer=self.tokenizer, question=example['prompt'], options=options, context= example['context'], max_seq_length = config.MAX_INPUT )
        
            tokenized_example = prepare_answering_input_deberta(tokenizer=self.tokenizer, question=example['prompt'], options=options, context= example['context'], max_seq_length = MAX_INPUT )
            
            tokenized_example['label'] = self.option_to_index[example['answer']]

        return tokenized_example
            


@dataclass
class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    
    def __call__(self, features):
        label_name = 'label' if 'label' in features[0].keys() else 'labels'
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])
        
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch
        
   

In [106]:
train_df= pd.read_csv('input_data/train_folds_article_context_70+40.csv')

In [107]:
train_df['len_context'] = train_df['context'].apply(lambda x: len(x))

In [108]:
tokenizer = AutoTokenizer.from_pretrained('models/microsoft/deberta-v3-large/')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [109]:
train_set = train_df[train_df.len_context >4000].head(5)

In [110]:
train_ds = LlmseDataset(train_set, tokenizer, is_train=True, aug_prob=1.0)
data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
train_dataloader = DataLoader(
        train_ds, 
        batch_size=5, 
        shuffle=True, 
        collate_fn=data_collator,
        pin_memory=True,
        drop_last=True
    )


In [111]:
cc = next(iter(train_dataloader))

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [115]:
cc['input_ids'][1]

tensor([[     1,    341, 119524,  ...,    268,    260,      2],
        [     1,    341, 119524,  ...,    260,    309,      2],
        [     1,    341, 119524,  ...,   3389,    260,      2],
        [     1,    341, 119524,  ...,    912,    260,      2],
        [     1,    341, 119524,  ...,    268,    260,      2]])

In [125]:
train_set.iloc[0].values

array([1,
       'How does Modified Newtonian Dynamics (MOND) impact the observed "missing baryonic mass" discrepancy in galaxy clusters, according to the provided excerpt from Wikipedia?',
       'MOND is a theory that increases the discrepancy between the observed missing baryonic mass in galaxy clusters and the measured velocity dispersions.',
       'MOND explains the missing baryonic mass in galaxy clusters that was previously considered dark matter by demonstrating that the mass is in the form of neutrinos and axions.',
       'MOND is a theory that reduces the observed missing baryonic mass in galaxy clusters by postulating the existence of a new form of matter called "fuzzy dark matter."',
       'MOND is a theory that eliminates the observed missing baryonic mass in galaxy clusters by imposing a new mathematical formulation of gravity that does not require the existence of dark matter.',
       "MOND's impact on the observed missing baryonic mass in galaxy clusters remains a s

In [119]:
tokenizer.decode(cc['input_ids'][1][0])

'[CLS] -Modified Newtonian dynamics (MOND) is a hypothesis that proposes a modification of Newton\'s law of universal gravitation to account for observed properties of galaxies. It is an alternative to the hypothesis of dark matter in terms of explaining why galaxies do not appear to obey the currently understood laws of physics. -MOND is an example of a class of theories known as modified gravity, and is an alternative to the hypothesis that the dynamics of galaxies are determined by massive, invisible dark matter halos. Since Milgrom\'s original proposal, proponents of MOND have claimed to successfully predict a variety of galactic phenomena that they state are difficult to understand as consequences of dark matter.Though MOND explains the anomalously great rotational velocities of galaxies at their perimeters, it does not fully explain the velocity dispersions of individual galaxies within galaxy clusters. MOND reduces the discrepancy between the velocity dispersions and clusters\' 

In [121]:
print(tokenizer.decode(cc['input_ids'][1][1]))

[CLS] -Modified Newtonian dynamics (MOND) is a hypothesis that proposes a modification of Newton's law of universal gravitation to account for observed properties of galaxies. It is an alternative to the hypothesis of dark matter in terms of explaining why galaxies do not appear to obey the currently understood laws of physics. -MOND is an example of a class of theories known as modified gravity, and is an alternative to the hypothesis that the dynamics of galaxies are determined by massive, invisible dark matter halos. Since Milgrom's original proposal, proponents of MOND have claimed to successfully predict a variety of galactic phenomena that they state are difficult to understand as consequences of dark matter.Though MOND explains the anomalously great rotational velocities of galaxies at their perimeters, it does not fully explain the velocity dispersions of individual galaxies within galaxy clusters. MOND reduces the discrepancy between the velocity dispersions and clusters' obse