### This script evaluates the original Llama-3.2-1B-Instruct model on the test dataset

In [1]:
#%% import packages

from datasets import load_dataset
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
import torch
import time
import pandas as pd
import numpy as np
import copy

In [2]:
# Use the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print(device)

cuda


In [3]:
#%% Load the tokenizer and the model

model_path = "./Llama-3.2-1B-Instruct"

# The tokenizer reuses its end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token

# Load the weights in bfloat16 with low_cpu_mem_usage enabled, then move the
# model to `device` and switch it to evaluation mode.
load_options = dict(torch_dtype=torch.bfloat16, low_cpu_mem_usage=True)
model = AutoModelForCausalLM.from_pretrained(model_path, **load_options)
model = model.to(device)
model.eval()


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm):

In [4]:
#%% load dataset

# Read the test CSV, then show how many rows carry each value of 'relevance'.
test_csv_path = "./data/relevance_test3.csv"
df = pd.read_csv(test_csv_path)

df.loc[:, 'relevance'].value_counts()


relevance
FAIL    7895
PASS    1105
Name: count, dtype: int64

In [5]:
#%% testing

# System message used for every row: it asks the model to judge whether
# DOCUMENTS is relevant to QUESTION and to answer with an `Output: "PASS"` /
# `"FAIL"` line. Everything inside the literal (including its indentation and
# line breaks) is sent to the model as-is, so edit it with care.
system_prompt = """ Given the following QUESTION and DOCUMENTS, 
                    Please analyze the contents of DOCUMENTS and determine 
                    whether it is relevant in answering the QUESTION. 
                        
                    The output should strictly use the following template: 
                    Output: "PASS" if the contents of DOCUMENTS is relevant in answering the QUESTION
                    and "FAIL" if the contents is not relevant in answering the QUESTION’ on the last line.
                    """
                    
                    
# Token ids handed to `model.generate` as `eos_token_id`: the tokenizer's EOS
# token plus the id the tokenizer assigns to "<|eot_id|>".
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
    


# Run the model on each (question, document) pair and store its PASS/FAIL
# verdict in the new 'relevance_1b' column.

# Number of rows to evaluate. The default (21) is a quick smoke test; set it
# to None to evaluate the whole dataset, which takes several hours depending
# on your hardware.
MAX_ROWS = 21

# NOTE(review): generation below samples (do_sample=True), so verdicts can
# differ from run to run unless the torch RNG is seeded beforehand.

df['relevance_1b'] = ''

# Pair every row label with its question/document values instead of doing
# label-based `df[col][i]` lookups, which only work on a default index.
row_labels = df.index[:MAX_ROWS]  # [:None] keeps every row

for i, (row_label, query_sentence, document) in enumerate(
        zip(row_labels, df['question'], df['text'])):

    # The whitespace inside this literal is part of the prompt sent to the model.
    user_prompt = """
                QUESTION: {},
                
                DOCUMENTS: {}
    
                """.format(query_sentence, document)

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(device)

    # A single, un-padded prompt: every position is a real token. The mask is
    # passed explicitly so generate() does not have to infer it, which is
    # ambiguous when the pad token is the same as the EOS token.
    attention_mask = torch.ones_like(input_ids)

    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=512,
        eos_token_id=terminators,
        # Use the pad token configured on the tokenizer rather than a
        # hard-coded id.
        pad_token_id=tokenizer.pad_token_id,
        do_sample=True,
        temperature=0.4,
        top_p=0.9,
    )

    # Keep only the newly generated tokens (drop the echoed prompt).
    generated_tokens = outputs[0][input_ids.shape[-1]:]
    generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    # The verdict is the text right after the first 'Output:' marker (up to
    # the next marker, if any) with quotes removed. Anything other than an
    # exact "PASS" -- including a missing marker -- counts as "FAIL".
    segments = generated_text.split('Output:')
    verdict = segments[1].strip().replace('"', '') if len(segments) > 1 else ''
    df.loc[row_label, 'relevance_1b'] = "PASS" if verdict == "PASS" else "FAIL"

    # Lightweight progress indicator.
    if i % 10 == 0:
        print(i)

# save the file (uncomment to persist the results; the next cell reads this file)

# df.to_csv('./data/relevance_1b_test.csv', index=False)

0
10
20


In [6]:
#%% load the saved evaluation result

# Read back the CSV holding the model's verdicts and print the label
# distribution of the reference column followed by the predicted column.
df = pd.read_csv("./data/relevance_1b_test.csv")

for column in ('relevance', 'relevance_1b'):
    print(df[column].value_counts())


relevance
FAIL    7895
PASS    1105
Name: count, dtype: int64
relevance_1b
PASS    7793
FAIL    1207
Name: count, dtype: int64


In [7]:
#%% accuracy(confusion matrix)

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Element-wise label masks. 'relevance' is the reference label and
# 'relevance_1b' the model's verdict; "PASS" is treated as the positive class.
# Rows whose labels are neither "PASS" nor "FAIL" match no mask and are
# therefore left out of every count, as before.
predicted_pass = df['relevance_1b'] == 'PASS'
predicted_fail = df['relevance_1b'] == 'FAIL'
actual_pass = df['relevance'] == 'PASS'
actual_fail = df['relevance'] == 'FAIL'

tp = int((predicted_pass & actual_pass).sum())
fp = int((predicted_pass & actual_fail).sum())
fn = int((predicted_fail & actual_pass).sum())
tn = int((predicted_fail & actual_fail).sum())

total = tp + fp + fn + tn

# Each metric falls back to 0.0 when its denominator is zero (e.g. the model
# never predicts "PASS") instead of raising ZeroDivisionError.
accuracy = _safe_ratio(tp + tn, total)
precision = _safe_ratio(tp, tp + fp)
recall = _safe_ratio(tp, tp + fn)
# Harmonic mean of precision and recall; it is 0.0 when either one is zero.
f1 = 2 / (1 / precision + 1 / recall) if precision and recall else 0.0

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1:', f1)

Accuracy: 0.23666666666666666
Precision: 0.13011677146156808
Recall: 0.9176470588235294
F1: 0.22791638570465275
