In [None]:
import os

from huggingface_hub import login

# Do not paste an access token into the notebook: it would be saved with the
# file and could leak through version control or shared outputs. The token is
# read from the HF_TOKEN environment variable instead. When the variable is
# unset, `token` is None and `login` is left to obtain the token itself
# (it prompts for one interactively).
login(token=os.environ.get("HF_TOKEN"))

In [None]:
import pandas as pd

# Location of the context/response training split. Despite the `_dir` suffix,
# this is the path of a single CSV file, not a directory.
data_dir = "../data/lm_finetune_data/ctx-resp_train.csv"

# Parse the CSV into a DataFrame; the prompting and evaluation cells read `data`.
data = pd.read_csv(filepath_or_buffer=data_dir)

In [None]:
import os
from random import randrange
from functools import partial
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    HfArgumentParser,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback,
    pipeline,
    logging,
    set_seed
)
import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel, AutoPeftModelForCausalLM
from trl import SFTTrainer

# --- Model selection ---------------------------------------------------------
# Hugging Face Hub id of the causal LM to prompt. Swap it to run the same
# prompting experiment with a different model.
model_name = "wxjiao/alpaca-7b"

# --- bitsandbytes quantization settings --------------------------------------
# Data type used for computation on top of the 4-bit weights.
bnb_4bit_compute_dtype = torch.bfloat16

# Quantization scheme for the 4-bit model: "fp4" or "nf4".
bnb_4bit_quant_type = "nf4"

# Turn on nested (double) quantization for the 4-bit model.
bnb_4bit_use_double_quant = True

# Load the base model in 4-bit precision.
load_in_4bit = True

In [None]:
# Load model from Hugging Face Hub with model name and bitsandbytes configuration
def create_bnb_config(load_in_4bit, bnb_4_bit_use_double_quant, bnb_4bit_quant_type, bnb_4bit_compute_dtype):
    """
        Build the bitsandbytes quantization configuration used when loading the model
        :param load_in_4bit: Load the model in 4-bit precision mode
        :param bnb_4_bit_use_double_quant: nested quantization for 4-bit model
        :param bnb_4bit_quant_type: The quantization type for 4-bit model
        :param bnb_4bit_compute_dtype: The compute dtype for 4-bit model
        :return: the configured BitsAndBytesConfig instance
    """

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=load_in_4bit,
        # The nested-quantization flag must be forwarded explicitly; without
        # this line the argument is accepted but has no effect on the config.
        bnb_4bit_use_double_quant=bnb_4_bit_use_double_quant,
        bnb_4bit_quant_type=bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
    )
    return bnb_config

def load_model(model_name, bnb_config, max_memory_mb=22960):
    """
        Load the model and tokenizer
        :param model_name: Huggingface model name
        :param bnb_config: Bitsandbytes configuration
        :param max_memory_mb: Memory budget granted to each visible GPU, in megabytes
        :return: (model, tokenizer) tuple
    """

    # Give every visible GPU the same memory budget.
    n_gpus = torch.cuda.device_count()
    per_gpu_budget = f"{max_memory_mb}MB"
    # With no visible GPU the mapping would be empty. Pass None in that case so
    # the loader applies its own defaults instead of an empty budget table.
    max_memory = {i: per_gpu_budget for i in range(n_gpus)} or None

    # Load model
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        max_memory=max_memory,
    )

    # Load tokenizer
    # NOTE(review): `use_auth_token` is deprecated in recent transformers
    # releases in favour of `token`. Left unchanged because the installed
    # version is not visible from here; confirm and migrate.
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
    # Reuse the end-of-sequence token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

bnb_config = create_bnb_config(load_in_4bit, bnb_4bit_use_double_quant, bnb_4bit_quant_type, bnb_4bit_compute_dtype)

model, tokenizer = load_model(model_name, bnb_config)

In [None]:
from tqdm import tqdm
import pickle as p

# Worked dialogues used only by the optional few-shot variant of the prompt.
_FEW_SHOT_EXAMPLE_TRUE = "USER: I'm planning a trip, can you help me look for a flight? SYSTEM: Which day are you planning to return and from which city? USER: I want to go from NYC the day after tomorrow and return on the 13th of this month. SYSTEM: Where would you like to go? USER: I want to go to Vancouver, BC. Can you look for a Premium Economy class ticket. SYSTEM: I found 1 flight for you. It is a Delta Airlines flight that takes off at 6 am and returns at 2:50 am. The price is $505. USER: What is the departure airport, and how many stops does the flight have?"
_FEW_SHOT_EXAMPLE_FALSE = "USER: Get me bus tickets to an Cher event on March 6th SYSTEM: How many to buy? USER: only one please"


def _is_blank(value):
    """Return True when `value` carries no usable text (falsy, or float NaN)."""
    # pandas represents an empty CSV cell as float NaN, which is truthy.
    # NaN is the only float that is not equal to itself.
    if isinstance(value, float) and value != value:
        return True
    return not value


def create_prompt_formats(sample, few_shot=False):
    """
        Create a formatted prompt template for a prompt in the instruction dataset
        :param sample: Mapping with 'instruction', 'input' and 'output' entries;
                       it is modified in place (a 'text' entry is added)
        :param few_shot: When True, insert two worked examples (one answered True,
                         one answered False) between the instruction and the input
        :return: the same `sample`, with the rendered prompt stored under 'text'
    """
    # Initialize static strings for the prompt template
    INTRO_BLURB = "Below is an instruction that describes a task. Please respond with 'True' or 'False' only that appropriately completes the request."
    INSTRUCTION_KEY = "### Instruction:"
    INPUT_KEY = "Input:"
    RESPONSE_KEY = "### Response:"
    END_KEY = "### End"

    # Combine a prompt with the static strings
    blurb = f"{INTRO_BLURB}"
    instruction = f"{INSTRUCTION_KEY}\n{sample['instruction']}"
    # Skip the input section when the field is missing, empty or NaN.
    input_text = None if _is_blank(sample['input']) else f"{INPUT_KEY}\n{sample['input']}"
    response = f"{RESPONSE_KEY}\n{sample['output']}"
    end = f"{END_KEY}"

    # Optional demonstrations, placed after the instruction.
    demonstrations = []
    if few_shot:
        demonstrations = [
            "Example 1: ", _FEW_SHOT_EXAMPLE_TRUE, "### Response: True",
            "Example 2: ", _FEW_SHOT_EXAMPLE_FALSE, "### Response: False",
        ]

    # Create a list of prompt template elements, dropping the absent ones
    parts = [part for part in [blurb, instruction, *demonstrations, input_text, response, end] if part]
    # Combine the prompt template elements into a single string
    formatted_prompt = "\n\n".join(parts)

    sample["text"] = formatted_prompt

    return sample

# Marker that opens the answer section in the text built by create_prompt_formats.
_RESPONSE_MARKER = "### Response:"


def _inference_prompt(row):
    """
        Render the prompt for `row` without its gold answer
        :param row: One row of `data`
        :return: the prompt text, ending right after the response marker
    """
    # create_prompt_formats renders the full template, including the row's
    # 'output' value after the response marker. That value is also the label
    # the predictions are scored against, so it must not be shown to the model.
    # Cut the text right after the last response marker.
    text = create_prompt_formats(row)["text"]
    cut = text.rindex(_RESPONSE_MARKER) + len(_RESPONSE_MARKER)
    return text[:cut] + "\n"


predictions = []
for _, row in tqdm(data.iterrows(), total=len(data)):
    prompt = _inference_prompt(row)
    # Place the input on the model's device.
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    output = model.generate(input_ids, max_new_tokens=20)
    # Decode only the newly generated tokens. Slicing the decoded text by
    # len(prompt) breaks whenever decoding does not reproduce the prompt
    # character for character.
    new_tokens = output[0][input_ids.shape[1]:]
    output_text = tokenizer.decode(new_tokens, skip_special_tokens=True).lower()
    # 'false' is checked first, so it wins when both words appear.
    if 'false' in output_text:
        predictions.append("False")
    elif 'true' in output_text:
        predictions.append("True")
    else:
        predictions.append("error")

# Save the predictions. Create the target directory first so a missing folder
# does not discard the results of the inference run, and close the file
# deterministically.
output_path = "../../outputs/prompt_pred/zero_alpaca-7b_prediction.pkl"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "wb") as handle:
    p.dump(predictions, handle)

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import pickle as p

# Gold labels as 0/1 integers.
# NOTE(review): assumes the 'output' column holds booleans or 0/1 values — confirm.
labels = [int(label) for label in data['output']]

# Map the textual predictions to 0/1. Anything other than "True", including
# the "error" placeholder, counts as a negative prediction, so report how
# many predictions fall into that bucket.
converted_outputs = [1 if output == 'True' else 0 for output in predictions]
n_errors = sum(1 for output in predictions if output == "error")
print(f"Unparseable predictions counted as negative: {n_errors}")

# Per-class precision, ordered [negative, positive].
print(precision_score(labels, converted_outputs, average=None))

# Force a 2x2 matrix so the unpacking below also works when only one class
# occurs in the labels and predictions.
conf_matrix = confusion_matrix(labels, converted_outputs, labels=[0, 1])
tn, fp, fn, tp = conf_matrix.ravel()


def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


false_positive_rate = _ratio(fp, fp + tn)
print(f"False Positive Rate: {false_positive_rate}")

precision = _ratio(tp, tp + fp)
recall = _ratio(tp, tp + fn)

print(f"Positive Precision: {precision}")
print(f"Positive Recall: {recall}")
# Default binary averaging scores the positive class (label 1) only, which is
# what the "Positive F1" line reports.
f1 = f1_score(labels, converted_outputs)
print(f"Positive F1: {f1}")

# ROC curve over the hard 0/1 predictions.
fpr, tpr, _ = roc_curve(labels, converted_outputs)
print(f"Area Under Curve: {auc(fpr, tpr)}")
print(f"tn: {tn}, fp: {fp}, fn: {fn}, tp: {tp}")