In [1]:
import os

# Expose only CUDA device 0 to this process and switch off tokenizer
# parallelism. Both variables are set here, before the ML libraries are
# imported in a later cell.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split

In [4]:
filename = "./data/X_test_df.csv"
column_names = ["sentiment", "text"]

# Read the held-out test set. Undecodable bytes are replaced rather than
# raising, so a few bad characters do not abort the load.
X_test_df = pd.read_csv(filename,
                 names=column_names,
                 encoding="utf-8", encoding_errors="replace")

# Because `names=` is passed without `header=`, pandas treats a header line in
# the file (if there is one) as an ordinary data row.
# NOTE(review): the recorded evaluation reports 901 rows split 300/301/300,
# which suggests exactly one such stray row -- confirm against the CSV.
# The row is dropped only when it literally repeats the column names, so a
# header-less file is left untouched.
if len(X_test_df) and [str(v).strip() for v in X_test_df.iloc[0].tolist()] == column_names:
    X_test_df = X_test_df.iloc[1:].reset_index(drop=True)

# Ground-truth labels, taken after any stray header row has been removed.
y_true = X_test_df.sentiment


In [5]:
from main.prompt_engineer import PromptGenerator

# Capture the ground-truth labels before X_test_df is rebound to the output
# of generate_dataframe_prompts below.
y_true = X_test_df["sentiment"]

prompt_generator = PromptGenerator()
X_test_df = prompt_generator.generate_dataframe_prompts(
    X_test_df,
    prompt_type="test",
)

In [6]:
from main.model_fine_tune import ModelReloader

# Fine-tuned model location and base model name.
# Base models tried in earlier revisions of this cell:
#   "NousResearch/Llama-2-7b-hf", "MaziyarPanahi/Mistral-7B-Instruct-v0.2"
fine_tuned_model_id = "./trained_model"
model_name = "alpindale/gemma-7b"

# Build the reloader from the base model name and the fine-tuned model ID,
# then get back the fine-tuned model together with its tokenizer.
reloader = ModelReloader(model_name, fine_tuned_model_id)
ft_model, tokenizer = reloader.reload()

Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.07s/it]


In [7]:
from main.predict import ModelPredictor
from main.evaluation import ModelEvaluator

# Run the reloaded model over the test dataframe.
predictor = ModelPredictor(ft_model, tokenizer)
y_pred = predictor.predict(X_test_df)

# Compare the predictions with the ground-truth labels in y_true.
# NOTE(review): the recorded output puts almost every row in class 1
# (accuracy 0.334, i.e. chance level for three classes) -- the prompt format
# and the fine-tuned weights loading are worth checking.
evaluator = ModelEvaluator()
evaluator.evaluate(y_true, y_pred)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCa

Accuracy: 0.334
Accuracy for label 0: 0.003
Accuracy for label 1: 0.997
Accuracy for label 2: 0.000

Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.00      0.01       300
           1       0.33      1.00      0.50       301
           2       0.00      0.00      0.00       300

    accuracy                           0.33       901
   macro avg       0.28      0.33      0.17       901
weighted avg       0.28      0.33      0.17       901


Confusion Matrix:
[[  1 299   0]
 [  1 300   0]
 [  0 300   0]]





: 