In [17]:
import numpy as np
import random
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import transformers
# from datasets import Dataset
from peft import LoraConfig, PeftConfig, get_peft_model
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix,
                             f1_score,
                             recall_score)
from sklearn.model_selection import train_test_split
from torch.utils.tensorboard import SummaryWriter


In [14]:
# Anchor every path to the directory the notebook was launched from.
cwd = os.getcwd()
data_dir, model_dir = (os.path.join(cwd, subdir) for subdir in ('data', 'model'))

# Read the labelled-name table that the splits below are carved from.
# NOTE(review): this frame is named train_df but is read from
# 'gptTestNames.csv' -- confirm that this is the intended file.
train_df = pd.read_csv(os.path.join(data_dir, 'gptTestNames.csv'))
# test_df = pd.read_csv(os.path.join(data_dir, 'gptTestNames.csv'))
# val_df = pd.read_csv(os.path.join(data_dir, 'gptValNames.csv'))

In [15]:
# Disabled alternative: draw a fixed number of names per label instead of a
# plain random subsample.
# X_train = list()
# X_test = list()
# for race in ["API", "White", "Black", "Hispanic"]:
#     train, test  = train_test_split(train_df[train_df.label==race], 
#                                     train_size=300,
#                                     test_size=300, 
#                                     random_state=42)
#     X_train.append(train)
#     X_test.append(test)

# Random subsample of the full table: 0.08% of the rows for training (split
# again into train/eval below) and a disjoint 0.02% for testing.
X_train, X_test = train_test_split(train_df,
                                   train_size=int(len(train_df) * 0.0008),
                                   test_size=int(len(train_df) * 0.0002),
                                   random_state=42)

# X_train = pd.concat(X_train).sample(frac=1, random_state=10)
# X_test = pd.concat(X_test)

# eval_idx = [idx for idx in train_df.index if idx not in list(train.index) + list(test.index)]
# X_eval = train_df[train_df.index.isin(eval_idx)]

# Hold out 10% of the training subsample for evaluation.
X_train, X_eval = train_test_split(X_train,
                                   test_size=0.1,
                                   random_state=42)

# Both frames are later wrapped in a Dataset that is indexed with 0..n-1, so
# each needs a clean positional index. Previously only X_train was reset,
# leaving X_eval with its shuffled original row labels.
X_train = X_train.reset_index(drop=True)
X_eval = X_eval.reset_index(drop=True)

In [5]:
# Row counts of the three splits, in the order (train, test, eval).
len(X_train), len(X_test), len(X_eval)

(1867, 518, 208)

In [16]:
def _build_prompt(name, shuffle):
    """Build the instruction text shared by training and inference prompts.

    The text is assembled from explicit lines so that no source-code
    indentation ends up inside the prompt, and so that the training and
    inference prompts are guaranteed to be identical up to the answer.

    Args:
        name: the name to classify; it is placed inside square brackets.
        shuffle: when true, the four categories are listed in a random order
            (drawn from the global ``random`` state); otherwise the fixed
            wording "Asian, Black, Hispanic, or White" is used.

    Returns:
        The prompt, ending with the line ``ANSWER:`` (no trailing space).
    """
    if shuffle:
        categories = ["Hispanic", "Black", "White", "Asian"]
        random.shuffle(categories)
        categories_str = ', '.join(categories)
    else:
        categories_str = "Asian, Black, Hispanic, or White"

    return "\n".join([
        "Guess the race of the name enclosed in square brackets into 1 of the "
        f"following 4 categories: {categories_str}.",
        "Your answer should only be the category name.",
        f"[{name}]",
        "ANSWER:",
    ])


def generate_prompt(data_point, shuffle=False):
    """Return a training prompt: the instruction text followed by the gold label.

    Args:
        data_point: mapping-like row exposing ``"name"`` and ``"label"``.
        shuffle: randomise the order in which the categories are listed.

    Returns:
        The prompt string, ending in ``ANSWER: <label>``.

    NOTE(review): the prompt lists the category "Asian" while other cells of
    this notebook spell that label "API" -- confirm that the values in the
    ``label`` column match the category names shown to the model.
    """
    prompt = _build_prompt(data_point["name"], shuffle)
    # strip() keeps the historical behaviour of trimming the prompt's edges.
    return f'{prompt} {data_point["label"]}'.strip()


def generate_test_prompt(data_point, shuffle=False):
    """Return an inference prompt: the same instruction text with no answer.

    Args:
        data_point: mapping-like row exposing ``"name"``.
        shuffle: randomise the order in which the categories are listed.

    Returns:
        The prompt string, ending in ``ANSWER:`` so the model completes it.
    """
    return _build_prompt(data_point["name"], shuffle)


# Seed Python's RNG so the shuffled category order inside each prompt is the
# same on every run; the data splits are already pinned with random_state=42.
random.seed(42)


def _prompt_frame(frame, prompt_fn, shuffle):
    """Render one prompt per row of `frame` into a one-column ("name") DataFrame."""
    prompts = frame.apply(lambda row: prompt_fn(row, shuffle=shuffle), axis=1)
    return pd.DataFrame(prompts, columns=["name"])


# Training / evaluation prompts include the gold answer.
X_train_1 = _prompt_frame(X_train, generate_prompt, shuffle=True)
X_eval_1 = _prompt_frame(X_eval, generate_prompt, shuffle=True)

# Test prompts stop at "ANSWER:"; the gold labels are kept separately.
y_true = X_test.label
X_test_1 = _prompt_frame(X_test, generate_test_prompt, shuffle=True)
X_test_2 = _prompt_frame(X_test, generate_test_prompt, shuffle=False)

# train_data = Dataset.from_pandas(X_train_1)
# eval_data = Dataset.from_pandas(X_eval_1)

In [18]:
class MyDataset(Dataset):
    """Map-style dataset serving the prompt strings stored under ``data['name']``.

    Args:
        data: container whose ``'name'`` entry holds the prompts, e.g. a
            pandas DataFrame with a ``name`` column.
    """

    def __init__(self, data):
        self.data = data['name']

    def __len__(self):
        """Number of prompts available."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the prompt at *position* ``idx``.

        A pandas Series indexed with ``series[idx]`` looks ``idx`` up as a
        label, which raises ``KeyError`` when the frame still carries the
        row labels of a shuffled split. ``.iloc`` makes the lookup positional;
        plain sequences (no ``.iloc``) are indexed directly.
        """
        rows = getattr(self.data, "iloc", self.data)
        return rows[idx]

# Wrap the prompt frames so they can be fed to PyTorch one prompt at a time.
train_data = MyDataset(X_train_1)
eval_data = MyDataset(X_eval_1)

# `DataLoader` is imported directly from torch.utils.data; no `dataloader`
# module is in scope, so the previous `dataloader.DataLoader` raised NameError.
train_loader = DataLoader(train_data, batch_size=1, shuffle=True)
eval_loader = DataLoader(eval_data, batch_size=1, shuffle=True)

In [7]:
def evaluate(y_true, y_pred):
    """Print accuracy, a per-class report and a confusion matrix.

    Args:
        y_true: gold labels.
        y_pred: predicted labels, in the same order as ``y_true``.

    The report and the confusion matrix are both restricted to, and ordered
    by, the four labels listed below.

    NOTE(review): the label vocabulary here uses 'API', while the prompts in
    this notebook list the category as "Asian" -- confirm that predictions
    are mapped to this vocabulary before calling.
    """
    labels = ['API', 'Black', 'Hispanic', 'White']

    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')

    # Generate classification report. `labels` must be passed together with
    # `target_names`: otherwise the names are attached, in order, to whatever
    # sorted labels occur in the data, which mislabels rows (or raises) as
    # soon as a class is missing or an unexpected prediction appears.
    class_report = classification_report(y_true=y_true, y_pred=y_pred,
                                         labels=labels, target_names=labels)
    print('\nClassification Report:')
    print(class_report)

    # Generate confusion matrix (rows = true label, columns = predicted label)
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=labels)
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [19]:
model_name = "meta-llama/Llama-2-7b-chat-hf"

# SECURITY: never hard-code access tokens in a notebook. The token that was
# previously embedded here has been exposed and should be revoked. Export
# HF_TOKEN in the environment instead; when it is unset, `None` is passed and
# the library falls back to any locally stored login.
hf_token = os.environ.get("HF_TOKEN")

# Training hyper-parameters.
# NOTE(review): the original cell passed lr="cosine" and read the epoch count
# from an undefined `args`; the values below are placeholders -- confirm them.
LEARNING_RATE = 2e-4
NUM_EPOCHS = 3

model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        token=hf_token,
        )

output_dir="trained_weigths"

# LoRA adapter configuration. `tokenizer_name_or_path` was removed: it is not
# a LoraConfig field, so passing it fails when the config is constructed.
peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.1,
        r=64,
        bias="none",
        task_type="CAUSAL_LM",
)

# The tokenizer lives in the same gated repository, so it needs the token too.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

model = get_peft_model(model, peft_config)
trainable_params, all_param = model.get_nb_trainable_parameters()

# Optimise only the parameters that are trainable after wrapping with LoRA.
optimizer = torch.optim.AdamW(
        [param for param in model.parameters() if param.requires_grad],
        lr=LEARNING_RATE,
        )
# The scheduler factory is reached through the already-imported `transformers`
# module (the bare name was never imported). The step count is examples x
# epochs, which equals the number of optimiser steps at batch size 1.
lr_scheduler = transformers.get_linear_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=0,
            num_training_steps=(len(train_data) * NUM_EPOCHS),
        )


# compute_dtype = getattr(torch, "float16")

# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True, 
#     bnb_4bit_quant_type="nf4", 
#     bnb_4bit_compute_dtype=compute_dtype,
#     bnb_4bit_use_double_quant=False,
# )

# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     device_map="auto",
#     torch_dtype=compute_dtype,
#     # quantization_config=bnb_config, 
# )

# model.config.use_cache = False
# model.config.pretraining_tp = 1

# tokenizer = AutoTokenizer.from_pretrained(model_name, 
#                                           trust_remote_code=True,
#                                          )
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right"

# model, tokenizer = setup_chat_format(model, tokenizer)

config.json: 100%|██████████| 614/614 [00:00<00:00, 348kB/s]
model.safetensors.index.json: 100%|██████████| 26.8k/26.8k [00:00<00:00, 13.6MB/s]
model-00001-of-00002.safetensors:  56%|█████▋    | 5.63G/9.98G [00:30<00:23, 182MB/s] 
Downloading shards:   0%|          | 0/2 [00:31<?, ?it/s]


OSError: [Errno 28] No space left on device