In [1]:
import pandas as pd
import numpy as np
import torch
import random
from datasets import Dataset,DatasetDict
from sklearn.metrics import (accuracy_score, 
                            precision_score, 
                            recall_score,
                            f1_score,  
                            roc_auc_score,
                            average_precision_score,
                            precision_recall_fscore_support,
                            precision_recall_curve,
                            auc)
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, BitsAndBytesConfig
import evaluate
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.impute import KNNImputer
# from peft import LoraConfig,get_peft_model
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig
from trl import SFTTrainer
from transformers import AutoTokenizer, DataCollatorWithPadding
import functools
import os
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"


# Function to set seed for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed()

# Disable wandb
os.environ["WANDB_DISABLED"] = "true"

# GPU or CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Ensure PyTorch uses all available threads
torch.set_num_threads(torch.get_num_threads())

def data_pre_processing(df):
    timestamp_columns = [col for col in df.columns if "timestamp" in col]
    other_non_imp_column_to_remove = ["id", "general_relapse_class"]
    columns_with_all_nan = df.columns[df.isna().all(axis=0)].tolist()
    print("shape of the dataframe before dropping columns", df.shape)
    df.drop(timestamp_columns + other_non_imp_column_to_remove + columns_with_all_nan, axis=1, inplace=True)
    print("shape of the dataframe after dropping columns", df.shape)
    categorical = df.select_dtypes(include=['bool', 'object']).columns.tolist()
    numerical = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
    print("Length of categorical and numerical columns:", len(categorical), len(numerical))
    imputer = KNNImputer(n_neighbors=5, weights='uniform')
    imputed_data = imputer.fit_transform(df[numerical])
    df_imputed_numerical = pd.DataFrame(imputed_data, columns=numerical, index=df.index)
    df[numerical] = df_imputed_numerical
    return pd.get_dummies(df, columns=categorical, drop_first=True)

# In[3]:

df = pd.read_csv("/home/ubuntu/Tabular_Machine_Learning_Using_LLM/Data/patient_features_early_stage.csv")
df.rename(columns={'relapse?': 'general_relapse_class'}, inplace=True)
y = df["general_relapse_class"].values
df_encoded = data_pre_processing(df)
X = df_encoded.values
df_encoded.head()

def row_to_sentence_full(row):
    sentence_parts = [
        f"A patient of age {row['age']} started chemotherapy at {row['chemotherapy@t1_start_time']} days",
        f"underwent radiotherapy for {row['radiotherapy@t1_duration_days']} days",
        f"and had surgery at {row['surgery@t1_time']} days",
        f"with a history of smoking {row['nb_cig_packs_year']} cigarette packs per year",
        f"and {row['nb_cigs_day']} cigarettes per day",
    ]
    family_history = []
    if row['family_lung_cancer']:
        family_history.append("lung cancer")
    if row['family_other_cancer']:
        family_history.append("other types of cancer")
    if family_history:
        sentence_parts.append(f"and a family history that includes: {', '.join(family_history)}")
    else:
        sentence_parts.append("and no family history of cancer")
    for col in row.index:
        if pd.api.types.is_numeric_dtype(row[col]) and not '@' in col and col not in ['age', 'Unnamed: 0']:
            sentence_parts.append(f"{col.replace('_', ' ')} of {row[col]}")
        elif pd.api.types.is_bool_dtype(row[col]) and row[col]:
            sentence_parts.append(f"and has a condition of {col.replace('@', ' at ').replace('_', ' ')}")
    sentence = ', '.join(sentence_parts) + '.'
    return sentence


texts = df_encoded.apply(row_to_sentence_full, axis=1).tolist()
labels = y.tolist()

data = {'text': texts,
        'labels':labels}
new_df = pd.DataFrame(data)

X = new_df["text"]
y = new_df["labels"]

relapse_df = new_df[new_df['labels'] == 1]
non_relapse_df = new_df[new_df['labels'] == 0]

# For relapse patients
relapse_train, relapse_test = train_test_split(relapse_df, test_size=100, random_state=42)
relapse_train, relapse_val = train_test_split(relapse_train, test_size=50, random_state=42)

# For non-relapse patients
non_relapse_train, non_relapse_test = train_test_split(non_relapse_df, test_size=100, random_state=42)
non_relapse_train, non_relapse_val = train_test_split(non_relapse_train, test_size=50, random_state=42)

train_df = pd.concat([relapse_train, non_relapse_train])
test_df = pd.concat([relapse_test, non_relapse_test])
val_df = pd.concat([relapse_val, non_relapse_val])

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
val_dataset = Dataset.from_pandas(val_df)
ds = DatasetDict({'train': train_dataset, 'val': val_dataset,'test':test_dataset})



# model name
# model_name = 'mistralai/Mistral-7B-v0.1'
# model_name = 'tiiuae/falcon-7b'
# model_name = 'meta-llama/Llama-2-7b'
model_name = 'dfurman/llama-7b'

# preprocess dataset with tokenizer
def tokenize_examples(examples, tokenizer):
    tokenized_inputs = tokenizer(examples['text'])
    tokenized_inputs['labels'] = examples['labels']
    return tokenized_inputs

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenized_ds = ds.map(functools.partial(tokenize_examples, tokenizer=tokenizer), batched=True)
tokenized_ds = tokenized_ds.with_format('torch')

# qunatization config
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, # enable 4-bit quantization
    bnb_4bit_quant_type = 'nf4', # information theoretically optimal dtype for normally distributed weights
    bnb_4bit_use_double_quant = True, # quantize quantized weights //insert xzibit meme
    bnb_4bit_compute_dtype = torch.bfloat16 # optimized fp format for ML
)

# lora config
lora_config = LoraConfig(
    r = 16, # the dimension of the low-rank matrices
    lora_alpha = 8, # scaling factor for LoRA activations vs pre-trained weight activations
    # target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout = 0.05, # dropout probability of the LoRA layers
    bias = 'none', # wether to train bias weights, set to 'none' for attention layers
    task_type = 'SEQ_CLS'
)
# load model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    num_labels=2
)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.config.pad_token_id = tokenizer.pad_token_id

def collate_fn(batch, tokenizer):
    dict_keys = ['input_ids', 'attention_mask', 'labels']
    d = {k: [dic[k] for dic in batch] for k in dict_keys}
    d['input_ids'] = torch.nn.utils.rnn.pad_sequence(
        d['input_ids'], batch_first=True, padding_value=tokenizer.pad_token_id
    )
    d['attention_mask'] = torch.nn.utils.rnn.pad_sequence(
        d['attention_mask'], batch_first=True, padding_value=0
    )
    d['labels'] = torch.stack(d['labels'])
    return d

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = (logits[:, 0] > 0).astype(int)
    
    # Calculate basic metrics
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    acc = accuracy_score(labels, predictions)
    
    # Calculate probabilities for the positive class
    probabilities = torch.sigmoid(torch.tensor(logits[:, 0])).numpy()
    
    # Calculate AUC-ROC
    auc_roc = roc_auc_score(labels, probabilities)
    
    # Calculate AUC-PR
    precision_curve, recall_curve, _ = precision_recall_curve(labels, probabilities)
    auc_pr = auc(recall_curve, precision_curve)
    
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'auc_roc': auc_roc,
        'auc_pr': auc_pr
    }

class CustomTrainer(Trainer):
    
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = CrossEntropyLoss()
        # No need to apply sigmoid or softmax as CrossEntropyLoss does that internally
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# class CustomTrainer(Trainer):
#     def compute_loss(self, model, inputs, return_outputs=False):
#         labels = inputs.pop("labels")
        
#         # forward pass
#         outputs = model(**inputs)
#         logits = outputs.get("logits")
        
#         # compute custom loss without pos_weight
#         loss = F.binary_cross_entropy_with_logits(logits, labels.to(torch.float32))
#         return (loss, outputs) if return_outputs else loss

# define training args
training_args = TrainingArguments(
    output_dir = 'binary_classification_falcon',
    learning_rate = 1e-4,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 10,
    weight_decay = 0.01,
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True
)

# train
trainer = CustomTrainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_ds['train'],
    eval_dataset = tokenized_ds['val'],
    tokenizer = tokenizer,
    data_collator = functools.partial(collate_fn, tokenizer=tokenizer),
    compute_metrics = compute_metrics
  
)



  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda
shape of the dataframe before dropping columns (1348, 76)
shape of the dataframe after dropping columns (1348, 70)
Length of categorical and numerical columns: 59 11


Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1048/1048 [00:00<00:00, 18338.36 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 12530.41 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 17853.80 examples/s]
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Downloading shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

In [2]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Auc Roc,Auc Pr
1,No log,0.689946,0.49,0.653061,0.494845,0.96,0.4328,0.426982
2,No log,0.628279,0.4,0.565217,0.443182,0.78,0.3148,0.379779
3,No log,0.849236,0.5,0.666667,0.5,1.0,0.2916,0.376439
4,No log,0.687512,0.5,0.666667,0.5,1.0,0.2764,0.370428
5,No log,0.620872,0.5,0.666667,0.5,1.0,0.2496,0.361385
6,No log,0.644327,0.49,0.657718,0.494949,0.98,0.2656,0.366424
7,No log,0.628783,0.42,0.567164,0.452381,0.76,0.2412,0.357156
8,0.615700,0.558042,0.33,0.464,0.386667,0.58,0.2304,0.353699
9,0.615700,0.605239,0.4,0.558824,0.44186,0.76,0.232,0.35398
10,0.615700,0.638906,0.43,0.589928,0.460674,0.82,0.2184,0.349058


Checkpoint destination directory binary_classification_falcon/checkpoint-66 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory binary_classification_falcon/checkpoint-132 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory binary_classification_falcon/checkpoint-198 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory binary_classification_falcon/checkpoint-264 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory binary_classification_falcon/checkpoint-330 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory binary_classification_falcon/checkpoint-396 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destin

TrainOutput(global_step=660, training_loss=0.6008430365360145, metrics={'train_runtime': 2672.8629, 'train_samples_per_second': 3.921, 'train_steps_per_second': 0.247, 'total_flos': 3.779809052236186e+16, 'train_loss': 0.6008430365360145, 'epoch': 10.0})

In [3]:
tokenized_ds['test']

Dataset({
    features: ['text', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 200
})

In [4]:
logits, _, _ = trainer.predict(tokenized_ds['test'])

In [5]:
tokenized_ds['test']['labels']

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

In [6]:
predictions = np.argmax(logits, axis=-1)
print("accuracy:", accuracy_score(tokenized_ds['test']['labels'], predictions))
print("precision:", precision_score(tokenized_ds['test']['labels'], predictions))
print("recall:", recall_score(tokenized_ds['test']['labels'], predictions))
print("f1:", f1_score(tokenized_ds['test']['labels'], predictions))
print("auc_roc:", roc_auc_score(tokenized_ds['test']['labels'], torch.softmax(torch.tensor(logits), dim=-1).cpu().numpy()[:, 1]))
print("auc_pr:", average_precision_score(tokenized_ds['test']['labels'], torch.softmax(torch.tensor(logits), dim=-1).cpu().numpy()[:, 1]))

accuracy: 0.625
precision: 0.6213592233009708
recall: 0.64
f1: 0.6305418719211823
auc_roc: 0.6617999999999999
auc_pr: 0.6738688490064637
