In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix,
                             f1_score)
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(f"pytorch version {torch.__version__}")

pytorch version 2.2.0+cu121


In [22]:
# get working directory
cwd = os.getcwd()
data_dir = os.path.join(cwd, 'data')
model_dir = os.path.join(cwd, 'model')

# load data and pre-process datasets
train_df = pd.read_csv(os.path.join(data_dir, 'gptTestNames.csv'))
# test_df = pd.read_csv(os.path.join(data_dir, 'gptTestNames.csv'))
# val_df = pd.read_csv(os.path.join(data_dir, 'gptValNames.csv'))

In [23]:
# X_train = list()
# X_test = list()
# for race in ["API", "White", "Black", "Hispanic"]:
#     train, test  = train_test_split(train_df[train_df.label==race], 
#                                     train_size=300,
#                                     test_size=300, 
#                                     random_state=42)
#     X_train.append(train)
#     X_test.append(test)

X_train, X_test  = train_test_split(train_df, 
                                train_size=int(len(train_df) * 0.008),
                                test_size=int(len(train_df) * 0.002), 
                                random_state=42)

# X_train = pd.concat(X_train).sample(frac=1, random_state=10)
# X_test = pd.concat(X_test)

# eval_idx = [idx for idx in train_df.index if idx not in list(train.index) + list(test.index)]
# X_eval = train_df[train_df.index.isin(eval_idx)]
X_train, X_eval = train_test_split(X_train, 
                                test_size=0.1,
                                random_state=42)
X_train = X_train.reset_index(drop=True)

In [5]:
len(X_train), len(X_test), len(X_eval)

(18678, 5188, 2076)

In [24]:
def generate_prompt(data_point):
    return f"""
            Category the name enclosed in square brackets, 
            into 1 of the following 4 categories: Asian Pacific Islander, Black, Hispanic, or White.
            Your answer should only be the category name. 
            [{data_point["name"]}].
            ANSWER: {data_point["label"]}
            """.strip()

def generate_test_prompt(data_point):
    return f"""
            Category the name enclosed in square brackets, 
            into 1 of the following 4 categories: Asian Pacific Islander, Black, Hispanic, or White.
            Your answer should only be the category name.
            [{data_point["name"]}]
            ANSWER: """.strip()

X_train = pd.DataFrame(X_train.apply(generate_prompt, axis=1), 
                       columns=["name"])
X_eval = pd.DataFrame(X_eval.apply(generate_prompt, axis=1), 
                      columns=["name"])

y_true = X_test.label
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["name"])

train_data = Dataset.from_pandas(X_train)
eval_data = Dataset.from_pandas(X_eval)

In [7]:
def evaluate(y_true, y_pred):
    labels = ['API', 'Black', 'Hispanic', 'White','none']
    mapping = {'API': 0, 'Black': 1, 'Hispanic':2, 'White': 3, 'none': 4}
    def map_func(x):
        return mapping.get(x, 1)
    
    y_true = np.vectorize(map_func)(y_true)
    y_pred = np.vectorize(map_func)(y_pred)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')
    
    # Generate accuracy report
    unique_labels = set(y_true)  # Get unique labels
    
    for label in unique_labels:
        label_indices = [i for i in range(len(y_true)) 
                         if y_true[i] == label]
        label_y_true = [y_true[i] for i in label_indices]
        label_y_pred = [y_pred[i] for i in label_indices]
        accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {label}: {accuracy:.3f}')
        
    # Generate classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred, target_names=labels)
    print('\nClassification Report:')
    print(class_report)
    
    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=labels)
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [8]:
model_name = "meta-llama/Llama-2-7b-chat-hf"

compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_quant_type="nf4", 
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=compute_dtype,
    quantization_config=bnb_config, 
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name, 
                                          trust_remote_code=True,
                                         )
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model, tokenizer = setup_chat_format(model, tokenizer)

Loading checkpoint shards: 100%|██████████| 2/2 [00:18<00:00,  9.47s/it]


In [27]:
def predict(test, model, tokenizer):
    y_pred = []
    for i in tqdm(range(len(test))):
    # for i in [69, 222, 676, 1270, 2060, 3684, 3827, 4472, 4799, 4972, 5120]:
        prompt = test.iloc[i]["name"]
        pipe = pipeline(task="text-generation", 
                        model=model, 
                        tokenizer=tokenizer, 
                        max_new_tokens = 4, 
                        # temperature = 0.0,
                        do_sample = False,
                       )
        result = pipe(prompt)
        answer = result[0]['generated_text'].split(":")[-1]
        if "Pacific Islander" in answer:
            y_pred.append("API")
        elif "Black" in answer:
            y_pred.append("Black")
        elif "Hispanic" in answer:
            y_pred.append("Hispanic")
        elif "White" in answer:
            y_pred.append("White")
        else:
            y_pred.append("none")
            print(prompt,answer)
    return y_pred

In [26]:
y_pred = predict(X_test, model, tokenizer)



Category the name enclosed in square brackets, 
            into 1 of the following 4 categories: Asian Pacific Islander, Black, Hispanic, or White.
            Your answer should only be the category name.
            [Houston Jennifer]
            ANSWER:  Asian Pacific Islander
Category the name enclosed in square brackets, 
            into 1 of the following 4 categories: Asian Pacific Islander, Black, Hispanic, or White.
            Your answer should only be the category name.
            [Coma Allter]
            ANSWER:  Asian Pacific Islander
Category the name enclosed in square brackets, 
            into 1 of the following 4 categories: Asian Pacific Islander, Black, Hispanic, or White.
            Your answer should only be the category name.
            [Showers Mark]
            ANSWER:  Asian Pacific Islander
Category the name enclosed in square brackets, 
            into 1 of the following 4 categories: Asian Pacific Islander, Black, Hispanic, or White.
            Yo

In [50]:
unique_res = set(y_pred)  # Get unique labels
print(unique_res)

unique_labels = set(y_true)
print(unique_labels)

labels = list(set(list(set(y_true))+list(set(y_pred))))
labels

{'Hispanic', 'White', 'API', 'Black', 'none'}
{'Black', 'Hispanic', 'White', 'API'}


['Hispanic', 'White', 'API', 'Black', 'none']

In [33]:
indices = [i for i, x in enumerate(y_pred) if x == "none"]
indices

[69, 222, 676, 1270, 2060, 3684, 3827, 4472, 4799, 4972, 5120]

In [36]:
print(X_test.iloc[indices].name.values)

['Category the name enclosed in square brackets, \n            into 4 categories: Asia Pacific Islander, Black, Hispanic, or White.\n\n            [Houston Jennifer] ='
 'Category the name enclosed in square brackets, \n            into 4 categories: Asia Pacific Islander, Black, Hispanic, or White.\n\n            [Coma Allter] ='
 'Category the name enclosed in square brackets, \n            into 4 categories: Asia Pacific Islander, Black, Hispanic, or White.\n\n            [Showers Mark] ='
 'Category the name enclosed in square brackets, \n            into 4 categories: Asia Pacific Islander, Black, Hispanic, or White.\n\n            [Cubas Jhon] ='
 'Category the name enclosed in square brackets, \n            into 4 categories: Asia Pacific Islander, Black, Hispanic, or White.\n\n            [Honer Barbara] ='
 'Category the name enclosed in square brackets, \n            into 4 categories: Asia Pacific Islander, Black, Hispanic, or White.\n\n            [Toler Passion] ='
 'Categ

In [43]:
# y_pred = ['API' if 'Asia Pacific Islander' in x else x for x in y_pred]
evaluate(y_true, y_pred)

Accuracy: 0.418
Accuracy for label 0: 0.894
Accuracy for label 1: 0.042
Accuracy for label 2: 0.421
Accuracy for label 3: 0.478

Classification Report:
              precision    recall  f1-score   support

         API       0.03      0.89      0.07       104
       Black       0.49      0.04      0.08       706
    Hispanic       0.85      0.42      0.56       874
       White       0.85      0.48      0.61      3504
        none       0.00      0.00      0.00         0

    accuracy                           0.42      5188
   macro avg       0.45      0.37      0.26      5188
weighted avg       0.79      0.42      0.52      5188



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ValueError: At least one label specified must be in y_true

In [51]:
conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=labels)

In [52]:
print(conf_matrix)

[[ 368   59  440    6    1]
 [  55 1676 1742   25    6]
 [   4    7   93    0    0]
 [   5  229  438   30    4]
 [   0    0    0    0    0]]


In [55]:
f1_micro = f1_score(y_true, y_pred, average='micro')
f1_macro = f1_score(y_true, y_pred, average='macro')
f1_weighted = f1_score(y_true, y_pred, average='weighted')
print(f"F1 Score (Micro): {f1_micro:.3f}")
print(f"F1 Score (Macro): {f1_macro:.3f}")
print(f"F1 Score (Weighted): {f1_weighted:.3f}")

F1 Score (Micro): 0.418
F1 Score (Macro): 0.264
F1 Score (Weighted): 0.520
