#### LLM Science Exam: Wiki-RAG + BERT
##### https://www.kaggle.com/competitions/kaggle-llm-science-exam


Context created following these notebooks:
https://www.kaggle.com/code/emmermarcell/context-creation & 
https://www.kaggle.com/code/judehunt23/data-collection-training-data-wiki-pages

In [146]:
# Let's import the public training set and take a look
import pandas as pd
train_df = pd.read_csv('./data/train.csv')
print(train_df.shape)
train_df.head()

(200, 8)


Unnamed: 0,id,prompt,A,B,C,D,E,answer
0,0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D
1,1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A
2,2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A
3,3,What is the significance of regularization in ...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,C
4,4,Which of the following statements accurately d...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,D


In [147]:
# For convenience we'll turn our pandas Dataframe into a Dataset
from datasets import Dataset
train_ds = Dataset.from_pandas(train_df)

In [148]:
context_df=pd.concat([pd.read_csv('./data/train_pages_df.csv',header=0),\
                     pd.read_csv('./data/physics_pages_df.csv',header=0)],axis=0)

In [149]:
context_df=context_df[['cleaned_text','chunk_id']]

In [150]:
#context text
context_df.head()

Unnamed: 0,cleaned_text,chunk_id
0,understanding consequences supersymmetry susy ...,1
1,schr dinger equation harmonic oscillator takes...,1
2,"special case shape invariance, discussed below...",1
3,schr dinger equation harmonic oscillator takes...,1
4,"special case shape invariance, discussed below...",1


In [151]:
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
import torch
from transformers import BertTokenizer, BertForSequenceClassification, BertTokenizer, BertModel
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader


#### https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2

In [176]:
from sentence_transformers import SentenceTransformer
model_st = SentenceTransformer('all-MiniLM-L6-v2') 

In [177]:
import faiss
from tqdm import tqdm
index = faiss.IndexFlatIP(model.get_sentence_embedding_dimension())

# Iterate over the dataset in batches
# Define batch size
batch_size = 32  # Adjust based on your available memory

# Iterate over the dataset in batches
for start_idx in tqdm(range(0, len(context_df['cleaned_text']), batch_size)):
    end_idx = min(start_idx + batch_size, len(context_df['cleaned_text']))
    
    # Encode a batch of context sentences
    batch_sentences = context_df['cleaned_text'].iloc[start_idx:end_idx]
    context_embeddings = model_st.encode(
        batch_sentences.tolist(),  # Convert to list for batch processing
        convert_to_tensor=True,
        normalize_embeddings=True).half()  # Use mixed-precision training (FP16) if supported
    
    # Convert the embeddings to a numpy array
    context_embeddings_np = context_embeddings.detach().cpu().numpy().astype('float32')

    # Add the embeddings to the Faiss index
    index.add(context_embeddings_np)


100%|█████████████████████████████████████████| 173/173 [01:48<00:00,  1.60it/s]


In [178]:
# Function to retrieve most similar documents
def retrieve_most_similar(query, k=1):
    query_embedding = model_st.encode(query, show_progress_bar=False, convert_to_tensor=True, normalize_embeddings=True)
    query_embedding = query_embedding.reshape(1, -1)  # Reshape for Faiss
    query_embedding = query_embedding.detach().cpu().numpy()
    _, idx = index.search(query_embedding, k)
    return idx[0]

# Example usage
query_text = train_df['prompt'].iloc[1]
print(f'example prompt:\n {query_text}')
similar_documents_indices = retrieve_most_similar(query_text)

# Print similar documents
for idx in similar_documents_indices:
    print('\n example context:')
    print(context_df['cleaned_text'].iloc[int(idx)])


example prompt:
 Which of the following is an accurate definition of dynamic scaling in self-similar systems?

 example context:
dynamic scaling sometimes known family vicsek scaling litmus test shows whether evolving system exhibits self similarity. general function said exhibit dynamic scaling satisfies f x , x z . displaystyle f x,t sim theta varphi left frac x z right . exponent displaystyle theta fixed dimensional requirement f displaystyle f theta . numerical value f displaystyle f theta remain invariant despite unit measurement displaystyle changed factor since displaystyle varphi dimensionless quantity. many systems evolve self similar fashion sense data obtained snapshot fixed time similar respective data taken snapshot earlier later time. is, system similar different times. litmus test self similarity provided dynamic scaling.


In [173]:
train_df['context_idx']=train_df['prompt'].apply(lambda x: retrieve_most_similar(x))

In [183]:
# context_df and train_df are defined
train_df['context'] = train_df['context_idx'].apply(lambda idx: context_df['cleaned_text'].iloc[idx[0]])

In [194]:
train_df.head()

Unnamed: 0,id,prompt,A,B,C,D,E,answer,embeddings,context_idx,context
0,0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D,"[-0.04548792, -0.016431049, 0.0439609, 0.10899...",[354],given galaxy gravitational acceleration equals...
1,1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A,"[-0.0026680573, -0.057913736, -0.040776283, 0....",[816],dynamic scaling sometimes known family vicsek ...
2,2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A,"[-0.06822429, 0.09698395, -0.07818291, -0.0894...",[587],triskelion triskeles ancient motif consisting ...
3,3,What is the significance of regularization in ...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,C,"[-0.039146785, 0.005445784, 0.07188326, 0.0365...",[4949],"physics, especially quantum field theory, regu..."
4,4,Which of the following statements accurately d...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,D,"[-0.06695653, -0.022392983, 0.03409412, -0.031...",[532],several qualitative observations made diffract...


In [184]:
# train_df.to_csv('train_with_context.csv')

In [185]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModelForMultipleChoice.from_pretrained('bert-base-uncased')

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [235]:
predicted_answers_idx = []

# Iterate through each row in the DataFrame
for index, row in train_df.iterrows():
    context = row['context']
    question = row['prompt']
    options = row[['A','B','C','D','E']]

    # Encode context and question once
    encoded_prompt = tokenizer.encode_plus(
        question,
        #context,
        add_special_tokens=True,
        return_tensors='pt',
        padding='max_length',
        max_length=352,
        truncation=True
    )
    
    # Separate encoding for options (without duplicating the prompt)
    input_ids = []
    attention_masks = []

    for option in options:
        option_encoding = tokenizer.encode_plus(
            option,
            add_special_tokens=True,
            return_tensors='pt',
            padding='max_length',
            max_length=32,  # Separate max length for options
            truncation=True
        )

        # Combine the prompt with each option
        combined_input_ids = torch.cat([encoded_prompt['input_ids'], option_encoding['input_ids']], dim=-1)
        combined_attention_mask = torch.cat([encoded_prompt['attention_mask'], option_encoding['attention_mask']], dim=-1)
        
        input_ids.append(combined_input_ids)
        attention_masks.append(combined_attention_mask)

    # Convert to tensors
    input_ids = torch.cat(input_ids).unsqueeze(0)  # Add batch dimension
    attention_masks = torch.cat(attention_masks).unsqueeze(0)

    # Get model predictions
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_masks)
        logits = outputs.logits

    # Get predicted answer
    predicted_answer_index = torch.argmax(logits, dim=1).item()
    option_mapping = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E'}
    predicted_answers_idx.append(option_mapping[predicted_answer_index])

print(predicted_answers_idx)

['B', 'B', 'A', 'B', 'A', 'A', 'B', 'E', 'B', 'A', 'C', 'E', 'A', 'B', 'C', 'E', 'C', 'D', 'C', 'B', 'A', 'A', 'C', 'B', 'E', 'A', 'B', 'C', 'B', 'B', 'E', 'D', 'C', 'C', 'B', 'B', 'A', 'B', 'C', 'B', 'D', 'D', 'D', 'C', 'D', 'C', 'B', 'D', 'D', 'C', 'C', 'C', 'C', 'C', 'D', 'B', 'C', 'A', 'C', 'D', 'A', 'A', 'B', 'C', 'D', 'D', 'C', 'C', 'A', 'B', 'B', 'E', 'B', 'B', 'D', 'E', 'D', 'A', 'D', 'C', 'E', 'D', 'A', 'B', 'A', 'C', 'B', 'B', 'E', 'E', 'A', 'A', 'B', 'E', 'A', 'D', 'C', 'A', 'D', 'B', 'E', 'E', 'C', 'A', 'A', 'D', 'A', 'B', 'D', 'A', 'B', 'A', 'D', 'E', 'D', 'D', 'A', 'E', 'C', 'A', 'E', 'C', 'B', 'C', 'D', 'B', 'B', 'D', 'B', 'B', 'A', 'C', 'E', 'A', 'C', 'C', 'B', 'C', 'A', 'C', 'C', 'B', 'D', 'C', 'E', 'B', 'D', 'D', 'E', 'C', 'C', 'E', 'B', 'E', 'D', 'E', 'A', 'E', 'E', 'C', 'A', 'A', 'A', 'A', 'B', 'B', 'A', 'B', 'E', 'B', 'B', 'C', 'D', 'C', 'D', 'A', 'C', 'A', 'B', 'E', 'D', 'A', 'E', 'E', 'E', 'A', 'B', 'D', 'E', 'B', 'C', 'E', 'C', 'A', 'D', 'C', 'A', 'C', 'A', 'C']

In [238]:
train_df['predicted_answer']=predicted_answers_idx

In [241]:
count=0
for i in range(train_df.shape[0]):
    if (train_df['answer'].iloc[i]==train_df['predicted_answer'].iloc[i]):
        count=count+1
print(count/200)

0.24
