In [29]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, T5EncoderModel, TFBertModel, BertModel, AutoModelForMaskedLM
from transformers import AutoTokenizer, BertTokenizer
from transformers import GenerationConfig
from transformers import AutoTokenizer, AutoModelForCausalLM

In [30]:
train_df = pd.read_csv("data/ScienceQA/train.csv")

In [31]:
# Load model directly
from transformers import AutoModelForCausalLM
# tokenizer = AutoTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5", use_fast=True)
# model = AutoModelForCausalLM.from_pretrained("lmsys/vicuna-7b-v1.5", device_map="cuda:1")

tokenizer = AutoTokenizer.from_pretrained("daryl149/llama-2-7b-chat-hf", use_fast=True)
model = AutoModelForCausalLM.from_pretrained("daryl149/llama-2-7b-chat-hf", device_map="cuda:1")

# tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl", use_fast=True)
# model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-xl", device_map="cuda:1")

Downloading (…)okenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [32]:
train_df = pd.read_csv('data/ScienceQA/train.csv')

In [33]:
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

if device == torch.device("cpu"):
    print("RUNNING ON CPU")
else:
    print("RUNNING ON CUDA")
    torch.cuda.synchronize(device)

RUNNING ON CUDA


In [34]:
prefix = '''Answer the question by chosing one option from below.'''
suffixes = 'Correct Answer: '
question = train_df.loc[0, 'prompt']
option_1 = train_df.loc[0, 'A']
option_2 = train_df.loc[0, 'B']
option_3 = train_df.loc[0, 'C']
option_4 = train_df.loc[0, 'D']
option_5 = train_df.loc[0, 'E']
sample_answer = train_df.loc[0, 'answer']
sample_query = f'{prefix}\n{question} \nA: {option_1}\nB: {option_2}\nC: {option_3}\nD: {option_4}\nE: {option_5}\nCorrect Answer: {sample_answer}'
print(sample_query)

Answer the question by chosing one option from below.
Which of the following statements accurately describes the impact of Modified Newtonian Dynamics (MOND) on the observed "missing baryonic mass" discrepancy in galaxy clusters? 
A: MOND is a theory that reduces the observed missing baryonic mass in galaxy clusters by postulating the existence of a new form of matter called "fuzzy dark matter."
B: MOND is a theory that increases the discrepancy between the observed missing baryonic mass in galaxy clusters and the measured velocity dispersions from a factor of around 10 to a factor of about 20.
C: MOND is a theory that explains the missing baryonic mass in galaxy clusters that was previously considered dark matter by demonstrating that the mass is in the form of neutrinos and axions.
D: MOND is a theory that reduces the discrepancy between the observed missing baryonic mass in galaxy clusters and the measured velocity dispersions from a factor of around 10 to a factor of about 2.
E: MO

In [35]:
# Function to create prompts
def create_prompts(df):
    query_prompts = []
    sample_answer = ''
    for index, rows in tqdm(df.iterrows()):
        question = rows['prompt']
        option_1 = rows['A']
        option_2 = rows['B']
        option_3 = rows['C']
        option_4 = rows['D']
        option_5 = rows['E']
        
        # query = f'{sample_query}\n \n{question} \nA: {option_1}\nB: {option_2}\nC: {option_3}\nD: {option_4}\nE: {option_5}\n{suffixes}\n'
        query = f'You are a helpful assistant on multiple choices questions. Question: {question}\nOption A: {option_1}\nOption B: {option_2}\nOption C: {option_3}\nOption D: {option_4}\nOption E: {option_5}\n Please rank three correct options based on your confidence in descending order and do not provide any explanation.'
        query_prompts.append(query)
    
    return query_prompts
        
prompts = create_prompts(train_df)

200it [00:00, 11089.00it/s]


In [36]:
# Function to modify a prompt
def modify_prompt(answer_set, index):
    option = {}
    question = train_df.loc[index, 'prompt']
    query = f'{sample_query}\n \n{question}'
    for i in sorted(answer_set):
        option[i] = train_df.loc[index, i]
        query = query + f'\n{i}: {train_df.loc[index, i]}'
    query = query + f'\n{suffixes}\n'
    return query

In [37]:
def predict_answers(model, tokenizer, prompts):
    '''
    Funtion to predict one option at a time using Flan T5 - XL model
    '''
    predictions = []
    valid = set(['A', 'B', 'C', 'D', 'E'])
    
    for index, prompt in tqdm(zip(range(0, len(prompts)), prompts)):
        final_pred = []
        # Passing the newly constructed prompt instead of the dialogue
        tokenized_input = tokenizer(prompt, return_tensors='pt').to(device)
        model_output = tokenizer.decode(
          model.generate(
              tokenized_input['input_ids'],
              max_new_tokens=50
          )[0],
          skip_special_tokens=True
        )
          
        # add remaining letters
        to_add = valid - set(final_pred)
        count = 0
        
        # Predicting remaining options by changing the prompts
        while to_add:
            if len(to_add)<=2:
                break
            elif count>2:
                break
            elif count>2 and len(to_add)>3:
                final_pred.append(['C A E'])
                
            prompt_modified = modify_prompt(to_add, index)
            tokenized_input = tokenizer(prompt_modified, return_tensors='pt').to(device)
            model_output = tokenizer.decode(
              model.generate(
                  tokenized_input['input_ids'],
                  max_new_tokens=50
              )[0],
              skip_special_tokens=True
            )
            final_pred.append(model_output)
            to_add = valid - set(final_pred)
            count += 1
        
        predictions.append(final_pred)
    return predictions

In [38]:
predictions = predict_answers(model, tokenizer, prompts)
predictions[0]

200it [24:13,  7.27s/it]


['Answer the question by chosing one option from below.\nWhich of the following statements accurately describes the impact of Modified Newtonian Dynamics (MOND) on the observed "missing baryonic mass" discrepancy in galaxy clusters? \nA: MOND is a theory that reduces the observed missing baryonic mass in galaxy clusters by postulating the existence of a new form of matter called "fuzzy dark matter."\nB: MOND is a theory that increases the discrepancy between the observed missing baryonic mass in galaxy clusters and the measured velocity dispersions from a factor of around 10 to a factor of about 20.\nC: MOND is a theory that explains the missing baryonic mass in galaxy clusters that was previously considered dark matter by demonstrating that the mass is in the form of neutrinos and axions.\nD: MOND is a theory that reduces the discrepancy between the observed missing baryonic mass in galaxy clusters and the measured velocity dispersions from a factor of around 10 to a factor of about 2

In [27]:
def average_precision_at_k(recommended, relevant, k=3):
    if len(recommended) > k:
        recommended = recommended[:k]
    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(recommended):
        if p in relevant and p not in recommended[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    if not relevant:
        return 0.0
    return score / min(len(relevant), k)

def mean_average_precision_at_k(recommended_lists, relevant_lists, k=3):
    assert len(recommended_lists) == len(relevant_lists)
    return sum(average_precision_at_k(r, rel, k) for r, rel in zip(recommended_lists, relevant_lists)) / len(recommended_lists)

label = train_df['answer'].to_numpy().tolist()
print(f"MAP@3: {mean_average_precision_at_k(predictions, label, k=3)}\n------------------------\n")

MAP@3: 0.7025
------------------------

