In [158]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import string
import spacy

In [159]:
nlp = spacy.load('en_core_web_sm')

In [160]:
data = pd.read_csv('/kaggle/input/kaggle-llm-science-exam/train.csv')
data.head()

Unnamed: 0,id,prompt,A,B,C,D,E,answer
0,0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D
1,1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A
2,2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A
3,3,What is the significance of regularization in ...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,C
4,4,Which of the following statements accurately d...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,D


In [161]:
data.isnull().sum()

id        0
prompt    0
A         0
B         0
C         0
D         0
E         0
answer    0
dtype: int64

In [162]:
target = data['answer']

In [163]:
data = data.drop(['id'], axis=1)

In [164]:
NO_OF_SAMPLES = 2

def display_column(data, no_of_samples):
    for i in range(no_of_samples):
        print(str(i) + " ======> " + data[i] + '\n')

In [165]:
display_column(data['prompt'], NO_OF_SAMPLES)





In [166]:
display_column(data['A'], NO_OF_SAMPLES)





In [167]:
display_column(data['B'], NO_OF_SAMPLES)





In [168]:
display_column(data['C'], NO_OF_SAMPLES)





In [169]:
display_column(data['D'], NO_OF_SAMPLES)





In [170]:
display_column(data['E'], NO_OF_SAMPLES)





In [171]:
import re
def clean_text(text):
    text = re.sub("[^A-Za-z0-9]", " ", text)
    return text.lower()

In [172]:
def lemmitization(text):
    words = ''
    text = nlp(text)
    for word in text:
        words += ' ' + word.lemma_
    return words

In [173]:
for i in data.columns:
    data[i] = data[i].apply(lambda x: clean_text(x))
    data[i] = data[i].apply(lambda x: lemmitization(x))

In [174]:
data['prompt'][2]

' which of the follow statement accurately describe the origin and significance of the triskele symbol'

In [175]:
# For convenience we'll turn our pandas Dataframe into a Dataset
from datasets import Dataset
train_ds = Dataset.from_pandas(data)

In [176]:
train_ds

Dataset({
    features: ['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'],
    num_rows: 200
})

In [177]:
from transformers import AutoTokenizer
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer

# The path of the model checkpoint we want to use
model_dir = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Now we'll instatiate the model that we'll finetune on our public dataset, then use to
# make prediction on the private dataset.
model = AutoModelForMultipleChoice.from_pretrained(model_dir)

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [178]:
# We'll create a dictionary to convert option names (A, B, C, D, E) into indices and back again
options = 'abcde'
indices = list(range(5))

option_to_index = {option: index for option, index in zip(options, indices)}
index_to_option = {index: option for option, index in zip(options, indices)}

def preprocess(example):
    # The AutoModelForMultipleChoice class expects a set of question/answer pairs
    # so we'll copy our question 5 times before tokenizing
    first_sentence = [example['prompt']] * 5
    second_sentence = []
    for option in options:
        second_sentence.append(example[option.upper()])
        
    # Our tokenizer will turn our text into token IDs BERT can understand
    tokenized_example = tokenizer(first_sentence, second_sentence, truncation=True)
    
#     if 'answer' not in example.keys():
#         return tokenized_example
    tokenized_example['label'] = option_to_index[example['answer'].replace(' ', '')]
    return tokenized_example

tokenized_train_ds = train_ds.map(preprocess, batched=False, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])

  0%|          | 0/200 [00:00<?, ?ex/s]

Transformers doesn’t have a data collator for multiple choice, so you’ll need to adapt the DataCollatorWithPadding to create a batch of examples. It’s more efficient to dynamically pad the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length


Here's the reformatted text with headings and bullet points for clarity:

---

**Code Explanation: Data Collator for Multiple-Choice Tasks**

This code defines a data collator class for multiple-choice tasks using the Hugging Face Transformers library. A data collator is used to process and prepare data for model training. In multiple-choice tasks, the model must choose the correct answer from a set of options for a given question or prompt. The data collator is responsible for dynamically padding and formatting the input data for these tasks. Here's an explanation of the code:

**Importing Necessary Modules:**

- The code imports various modules and classes from the Transformers library, including dataclasses for creating structured data classes, PreTrainedTokenizerBase for tokenization, and some types for function annotations.

**DataCollatorForMultipleChoice Data Class:**

- This class is defined as a dataclass using the @dataclass decorator. Dataclasses are a convenient way to define classes that are primarily used to store data.

**Constructor:**

- The constructor (__init__ method) takes the following arguments:
  - `tokenizer`: A tokenizer object for tokenizing the input data.
  - `padding`: An optional argument specifying the padding strategy. It can be a boolean (True or False), a string, or a PaddingStrategy object.
  - `max_length`: An optional argument specifying the maximum length of the padded input data.
  - `pad_to_multiple_of`: An optional argument specifying the padding to a multiple of a given value.

**__call__ Method:**

- This method is called when an instance of the DataCollatorForMultipleChoice class is invoked.
- It takes a list of features as input, where each feature is a dictionary containing information related to multiple-choice questions and their options.
- It extracts the labels from each feature and populates them into a separate list called labels.
- It computes the batch size and the number of choices (options) in each question.
- It flattens the features to create a list of dictionaries, where each dictionary corresponds to one choice for each question.
- It uses the tokenizer to pad and format the flattened features based on the specified padding strategy, maximum length, and pad_to_multiple_of value.
- It returns a batch of data with the following components:
  - `input_ids`: Padded input IDs for each choice.
  - `attention_mask`: Padded attention masks.
  - `token_type_ids`: Padded token type IDs (usually not relevant for multiple-choice tasks).
  - `labels`: The labels for the correct choices.

This data collator is designed to work with the Transformers library, especially for multiple-choice tasks, where you have a list of questions, each with multiple choices, and you want to prepare the data for training a model that can choose the correct answer from these choices. The collator ensures that all inputs are padded and formatted correctly for training.

---

In [179]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that dynamically pads the inputs for multiple choice questions.
    """

    # Constructor
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        # Determine the label name based on the presence of 'label' or 'labels'
        label_name = "label" if 'label' in features[0].keys() else 'labels'
        
        # Extract and remove labels from features
        labels = [feature.pop(label_name) for feature in features]
        
        # Compute batch size and number of choices
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])
        
        # Flatten the features to create a list of dictionaries
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        
        # Combine the flattened features into a single list
        flattened_features = sum(flattened_features, [])
        
        # Use the tokenizer to pad and format the features
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        
        # Reshape the batch
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        
        # Create a tensor for labels
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        
        return batch


In [180]:
# The arguments here are selected to run quickly; feel free to play with them.
model_dir = 'finetuned_bert'
training_args = TrainingArguments(
    output_dir=model_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to='none'
)

In [181]:
# Generally it's a bad idea to validate on your training set, but because our training set
# for this problem is so small we're going to train on all our data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_train_ds,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
)

In [182]:
# Training should take about a minute
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,1.593356
2,No log,1.531491
3,No log,1.368984




TrainOutput(global_step=75, training_loss=1.5594384765625, metrics={'train_runtime': 48.4724, 'train_samples_per_second': 12.378, 'train_steps_per_second': 1.547, 'total_flos': 140249266584720.0, 'train_loss': 1.5594384765625, 'epoch': 3.0})

In [183]:
# Now we can actually make predictions on our questions
predictions = trainer.predict(tokenized_train_ds)



In [184]:
# The following function gets the indices of the highest scoring answers for each row
# and converts them back to our answer format (A, B, C, D, E)
import numpy as np
def predictions_to_map_output(predictions):
    sorted_answer_indices = np.argsort(-predictions)
    top_answer_indices = sorted_answer_indices[:,:3] # Get the first three answers in each row
    top_answers = np.vectorize(index_to_option.get)(top_answer_indices)
    return np.apply_along_axis(lambda row: ' '.join(row), 1, top_answers)

In [185]:
# Let's double check our output looks correct:
predictions_to_map_output(predictions.predictions)

array(['d b e', 'd c a', 'a d c', 'a b c', 'd e b', 'b a c', 'b a c',
       'd e b', 'c d a', 'a c b', 'b c e', 'd a b', 'c b e', 'e d c',
       'b d a', 'd b a', 'e b a', 'a d b', 'd a b', 'e a b', 'b d c',
       'b e c', 'c d a', 'c b a', 'e a d', 'e b c', 'a d e', 'd a b',
       'e b c', 'c b e', 'b e d', 'e c d', 'd e b', 'd a c', 'e c d',
       'd b e', 'e d a', 'a d e', 'e d a', 'a e c', 'e a d', 'e a c',
       'b c e', 'b d e', 'd e c', 'a b c', 'b c e', 'c b e', 'd c e',
       'b a c', 'b d e', 'e c d', 'a c b', 'a d c', 'b c a', 'b e d',
       'c d a', 'c b d', 'a d e', 'c b a', 'b e c', 'c a b', 'c d b',
       'c b a', 'a c e', 'c e d', 'c d a', 'e a b', 'e c d', 'd e c',
       'e c a', 'a b d', 'd e a', 'b d a', 'd b e', 'e b c', 'c d a',
       'b d e', 'c d e', 'a e b', 'c d e', 'd a c', 'e c d', 'a e d',
       'b c d', 'b d a', 'c d b', 'a d b', 'e c a', 'b d a', 'd b a',
       'b c d', 'e c a', 'e d a', 'd e b', 'c e b', 'c a d', 'b d c',
       'c e b', 'd e

In [186]:
# Now we can load up our test set to use our model on!
# The public test.csv isn't the real dataset (it's actually just a copy of train.csv without the answer column)
# but it has the same format as the real test set, so using it is a good way to ensure our code will work when we submit.
test_df = pd.read_csv('/kaggle/input/kaggle-llm-science-exam/test.csv')
test_df.head()

Unnamed: 0,id,prompt,A,B,C,D,E
0,0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...
1,1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...
2,2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...
3,3,What is the significance of regularization in ...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...
4,4,Which of the following statements accurately d...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...


In [187]:
# There are more verbose/elegant ways of doing this, but if we give our test set a random `answer` column
# we can make predictions directly with our trainer.
test_df['answer'] = 'a'

# Other than that we'll preprocess it in the same way we preprocessed test.csv
test_ds = Dataset.from_pandas(test_df)
tokenized_test_ds = test_ds.map(preprocess, batched=False, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])

  0%|          | 0/200 [00:00<?, ?ex/s]

In [188]:
# Here we'll generate our "real" predictions on the test set
test_predictions = trainer.predict(tokenized_test_ds)



In [189]:
# Now we can create our submission using the id column from test.csv
submission_df = test_df[['id']]
submission_df['prediction'] = predictions_to_map_output(test_predictions.predictions)
submission_df['prediction'] = submission_df['prediction'].apply(lambda x :x.upper())
submission_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission_df['prediction'] = predictions_to_map_output(test_predictions.predictions)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission_df['prediction'] = submission_df['prediction'].apply(lambda x :x.upper())


Unnamed: 0,id,prediction
0,0,B D E
1,1,D C B
2,2,D B A
3,3,B D C
4,4,D C E


In [190]:
# Once we write our submission file we're good to submit!
submission_df.to_csv('submission.csv', index=False)

In [191]:
# # pip install accelerate
# from transformers import T5Tokenizer, T5ForConditionalGeneration

# tokenizer = T5Tokenizer.from_pretrained("/kaggle/input/flan-t5/pytorch/base/4")
# model = T5ForConditionalGeneration.from_pretrained("/kaggle/input/flan-t5/pytorch/base/4", device_map="auto")

# input_text = "translate English to German: How old are you?"
# input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

# outputs = model.generate(input_ids)
# print(tokenizer.decode(outputs[0]))
