### This script evaluates the fine-tuned Llama-3.2-1B model (loaded with a sequence-classification head) on the test dataset

In [1]:
#%% import packages


from torch.utils.data import DataLoader
from torch.utils.data import Dataset

from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import torch
import time
import pandas as pd
import numpy as np
import copy

In [2]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [3]:
#%% Loading the model and tokenizer

model_path = "./llama3_2_1b_1101_3"

# --- tokenizer -------------------------------------------------------------
# The EOS token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token

# --- model -----------------------------------------------------------------
# Two-label sequence-classification model, loaded in bfloat16.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)

# Mirror the tokenizer choice in the model config: padding id == EOS id.
model.config.pad_token_id = model.config.eos_token_id

# Move the weights to the selected device and switch to inference mode.
model.to(device)
model.eval()


LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048, padding_idx=128001)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), ep

In [4]:
#%% load dataset 

class RelevanceDataset(Dataset):
    """Question/document relevance examples read from a CSV file.

    Each item is rendered into a fixed system + user prompt, tokenized to a
    fixed length of 512 tokens, and returned together with its label.

    The CSV must provide the columns ``question``, ``text`` and ``output``.

    Attributes:
        data: DataFrame returned by ``load_data``.
        tokenizer: Callable used to tokenize the rendered prompt.
        cutoffdoc: Maximum number of whitespace-separated words of the
            document that are kept in the prompt.
        system_prompt: Instruction text placed before every example.
    """

    def __init__(self, tokenizer, file_path):
        """Load the CSV at ``file_path`` and keep ``tokenizer`` for later use."""
        self.data = self.load_data(file_path)
        self.tokenizer = tokenizer

        self.cutoffdoc = 300
        # NOTE(review): the whitespace inside this literal is part of the model
        # input; presumably it has to match the prompt used for fine-tuning, so
        # it is kept byte-for-byte -- confirm before reformatting.
        self.system_prompt = """ Given the following QUESTION and DOCUMENTS, 
                            Please analyze the contents of DOCUMENTS and determine 
                            whether it is relevant in answering the QUESTION. 
                                
                            The output should strictly use the following template: 
                            Output: "PASS" if the contents of DOCUMENTS is relevant in answering the QUESTION
                            and "FAIL" if the contents is not relevant in answering the QUESTION’ on the last line.
                            """

    def load_data(self, file_path):
        """Read the CSV at ``file_path`` into a DataFrame."""
        return pd.read_csv(file_path)

    def __len__(self):
        """Return the number of rows in the loaded DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, output)`` for row ``idx``.

        ``input_ids`` and ``attention_mask`` are the tokenizer outputs with
        the leading batch dimension removed; ``output`` is the raw value of
        the ``output`` column.
        """
        # Positional access, so the lookup does not depend on index labels.
        question = self.data['question'].iloc[idx]
        context = self.data['text'].iloc[idx]
        output = self.data['output'].iloc[idx]

        # Keep only the first `cutoffdoc` whitespace-separated words.
        shorttext = ' '.join(context.split()[:self.cutoffdoc])
        user_prompt = """
                    QUESTION: {},
                    
                    DOCUMENTS: {}

                    """.format(question, shorttext)    

        # The pieces are joined with single spaces. The "\ncontent:" and
        # "\nrole:" entries spell out what used to be implicit concatenation
        # of adjacent literals, so the rendered prompt is unchanged.
        text = ["role:", "system", "\ncontent:", self.system_prompt,
                "\nrole:", "user", '\n',
                "content", user_prompt]
        texts = " ".join(text)

        # Use the tokenizer given to the constructor, not a module-level global.
        inputs = self.tokenizer(texts, padding='max_length', max_length=512,
                                truncation=True, return_tensors='pt')
        # return_tensors='pt' adds a batch dimension of size 1; drop it so the
        # DataLoader can stack the items itself.
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)
        return input_ids, attention_mask, output
    

# Test split wrapped as a Dataset.
test_dataset = RelevanceDataset(tokenizer, "./data/relevance_test3a.csv")

# Batches are served in file order (no shuffling) from pinned host memory.
batchsize = 40
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batchsize,
    shuffle=False,
    pin_memory=True,
)


In [5]:
#%% testing

# Number of batches to evaluate; set to None to run over the whole test set.
MAX_BATCHES = 21
# Print the batch index every LOG_EVERY batches as a progress indicator.
LOG_EVERY = 10

true = []  # one list of ground-truth labels per batch
pred = []  # one list of predicted class ids per batch
for i, batch in enumerate(test_dataloader):
    # Only the model inputs need to live on the device; the labels are just
    # recorded, so they are read directly from the batch.
    input_ids = batch[0].to(device)
    attention_mask = batch[1].to(device)
    true.append(batch[2].tolist())

    # Forward pass without gradient tracking; the predicted class is the
    # index of the largest logit.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=-1)
    pred.append(predictions.cpu().tolist())

    if i % LOG_EVERY == 0:
        print(i)
    # Stop after the configured number of batches, before fetching another.
    if MAX_BATCHES is not None and i + 1 >= MAX_BATCHES:
        break

# Flat per-example lists, under the names used by the "save the result" cell.
true_list = [label for batch_labels in true for label in batch_labels]
pred_list = [label for batch_preds in pred for label in batch_preds]

       

0
10
20


In [None]:
#%% save the result

#data = pd.DataFrame()
#data['true'] = true_list
#data['pred'] = pred_list
#data.to_csv('./data/llama3_2_1b_1101_3_test.csv', index=False)   


In [6]:
#%% load the previously saved test results
# The metrics cell below reads the 'true' and 'pred' columns of this frame.
df = pd.read_csv("./data/llama3_2_1b_1101_3_test.csv")



In [7]:
#%% accuracy(confusion matrix)

# Confusion-matrix counts with label 1 as the positive class.
# Rows whose 'pred' or 'true' value is neither 0 nor 1 match none of the
# masks and are therefore left out of every count.
pred_is_pos = df['pred'] == 1
pred_is_neg = df['pred'] == 0
true_is_pos = df['true'] == 1
true_is_neg = df['true'] == 0

tp = int((pred_is_pos & true_is_pos).sum())
fp = int((pred_is_pos & true_is_neg).sum())
fn = int((pred_is_neg & true_is_pos).sum())
tn = int((pred_is_neg & true_is_neg).sum())

total = tp + fp + fn + tn

# Each ratio falls back to 0.0 when its denominator is zero (for example no
# positive predictions, or an empty frame) instead of raising
# ZeroDivisionError.
accuracy = (tp + tn) / total if total else 0.0
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
# Harmonic mean of precision and recall; defined as 0.0 when either is zero.
f1 = 2 / (1 / precision + 1 / recall) if precision and recall else 0.0

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1:', f1)

Accuracy: 0.9353333333333333
Precision: 0.7749737118822292
Recall: 0.6669683257918552
F1: 0.7169260700389105
