In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import torch
import transformers
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import BitsAndBytesConfig
from torch.utils.data import Dataset, DataLoader
from dataclasses import dataclass, field
from typing import Optional, Dict, Sequence, Tuple, List
from peft import LoraConfig, get_peft_model
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np
from scipy.special import softmax
import sklearn.metrics

In [3]:
@dataclass
class TrainingArguments(transformers.TrainingArguments):
    cache_dir: Optional[str] = field(default=None)
    run_name: str = field(default="run")
    optim: str = field(default="adamw_torch")
    model_max_length: int = field(default=256, metadata={"help": "Maximum sequence length."})
    gradient_accumulation_steps: int = field(default=1)
    per_device_train_batch_size: int = field(default=8)
    per_device_eval_batch_size: int = field(default=4)
    num_train_epochs: int = field(default=5)
    fp16: bool = field(default=False)
    #logging_steps: int = field(default=1000)
    save_steps: int = field(default=200)
    eval_steps: int = field(default=200)
    evaluation_strategy: str = field(default="steps")
    load_best_model_at_end: bool = field(default=True)     # load the best model when finished training (default metric is loss)
    metric_for_best_model: str = field(default="eval_loss") # the metric to use to compare models
    greater_is_better: bool = field(default=False)           # whether the `metric_for_best_model` should be maximized or not
    logging_strategy: str = field(default="steps")  # Log every "steps"
    logging_steps: int = field(default=200)  # Log every 100 steps
    warmup_ratio: int = field(default=0.1)
    weight_decay: float = field(default=1e-2)
    learning_rate: float = field(default=1e-5)
    lr_scheduler_type: str = field(default='linear')
    save_total_limit: int = field(default=5)
    load_best_model_at_end: bool = field(default=True)
    output_dir: str = field(default="/common/zhangz2lab/zhanh/Jupyter_Scripts/output_0828/llama_results")
    find_unused_parameters: bool = field(default=False)
    checkpointing: bool = field(default=False)
    dataloader_pin_memory: bool = field(default=False)
    eval_and_save_results: bool = field(default=True)
    save_model: bool = field(default=False)
    seed: int = field(default=42)
    logging_first_step: bool = field(default=True)
    early_stopping_patience: int = field(default = 5)  # number of evaluations without improvement to wait
    early_stopping_threshold: float = field(default = 1e-3)  # threshold for an improvement
training_args = TrainingArguments() 

In [4]:
class SimpleDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        encoding = self.tokenizer(text, return_tensors="pt", max_length=self.max_length, padding="max_length", truncation=True)
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }

class DNADataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=training_args.model_max_length):
        self.texts = dataframe["sequence"].tolist()  # CHANGED
        self.labels = dataframe["label"].tolist()  # CHANGED
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        encoding = self.tokenizer(text, return_tensors="pt", max_length=self.max_length, padding="max_length", truncation=True)
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }

In [5]:
def custom_data_collator(data):
    input_ids = torch.stack([item['input_ids'] for item in data])
    attention_mask = torch.stack([item['attention_mask'] for item in data])
    labels = torch.stack([item['labels'] for item in data])
    return {
        'input_ids': input_ids,  # CHANGED key name
        'attention_mask': attention_mask,  # CHANGED key name
        'labels': labels
    }

In [6]:
@dataclass
class ModelArguments:
    #model_name_or_path: Optional[str] = field(default="guardrail/llama-2-7b-guanaco-instruct-sharded")
    model_name_or_path: Optional[str] = field(default="togethercomputer/LLaMA-2-7B-32K")
    #model_name_or_path: Optional[str] = field(default="facebook/bart-base")
    use_lora: bool = field(default=True, metadata={"help": "whether to use LoRA"})
    lora_r: int = field(default=8, metadata={"help": "hidden dimension for LoRA"})
    lora_alpha: int = field(default=32, metadata={"help": "alpha for LoRA"})
    lora_dropout: float = field(default=0.05, metadata={"help": "dropout rate for LoRA"})
    #lora_target_modules: str = field(default="k_proj,q_proj,v_proj,fc1,fc2,output_proj", metadata={"help": "where to perform LoRA"})
    lora_target_modules: str = field(default="q_proj,v_proj", metadata={"help": "where to perform LoRA"})
    #lora_target_modules: str = field(default="query,key,value", metadata={"help": "where to perform LoRA"})
    use_4bit: bool = field(default=True, metadata={"help": "whether to use 4-bit quantization"})
    use_nested_quant: bool = field(default=False, metadata={"help": "Activate nested quantization for 4-bit base models"})
    bnb_4bit_compute_dtype: str = field(default="bfloat16", metadata={"help": "Compute dtype for 4-bit base models"})
    bnb_4bit_quant_type: str = field(default="nf4", metadata={"help": "Quantization type (fp4 or nf4)"})
model_args = ModelArguments()

In [7]:
@dataclass
class DataArguments:
    data_path: str = field(default="/common/zhangz2lab/zhanh/GUE/EMP/H3", metadata={"help": "Path to the training data."})
data_args = DataArguments()        

In [8]:
#model_name_or_path = "guardrail/llama-2-7b-guanaco-instruct-sharded"
#model_name_or_path = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path,
        #cache_dir=training_args.cache_dir,
        model_max_length=training_args.model_max_length,
        padding_side="right",
        use_fast=True,
        trust_remote_code=True)  # CHANGED (though this was originally okay)
texts = ["I love this!", "This is bad.", "Could be better.", "Excellent!"]
labels = [1, 0, 0, 1]

train_dataset = SimpleDataset(texts, labels, tokenizer)
test_texts = ["This is amazing!", "I don't like it."]
test_labels = [1, 0]
test_dataset = SimpleDataset(test_texts, test_labels, tokenizer)

train_df = pd.read_csv(os.path.join(data_args.data_path, "train.csv"))  # ADDED
test_df = pd.read_csv(os.path.join(data_args.data_path, "test.csv"))  # ADDED
val_df = pd.read_csv(os.path.join(data_args.data_path, "dev.csv"))  # ADDED

train_dataset = DNADataset(train_df, tokenizer)  # CHANGED
test_dataset = DNADataset(test_df, tokenizer)  # CHANGED
val_dataset = DNADataset(val_df, tokenizer)  # CHANGED

In [9]:
first_sequence = val_df.iloc[0]["sequence"]
print(first_sequence)
print(len(first_sequence))
# Tokenize the sequence without padding
tokens = tokenizer.encode(first_sequence, add_special_tokens=True, padding=False, return_tensors="pt")

# Print out the token IDs
print(tokens[0].tolist())
print(len(tokens[0].tolist()))

ACAATAATAATAATAATAATAATAATAATAACAATAACAATAGTGGTAATAGTAGTAATAATAATAACAATAATAACAATAATAAAAATAATAATGACTTCGGCATTAAGATTGATAACAATTCACCGTCTTATGAAGGGTTTCCCCAGTTACAAATACCGCTTTCACAAGACAATTTGAACATAGAAGATAAAGAGGAGATGTCACCTAATATTGAAATTAAAAACGAACAAAATATGACTGACTCAAACGATATTCTTGGAGTATTCGATCAGTTAGATGCTCAGCTATTTGGGAAATACCTACCTTTAAATTACCCCTCTGAATGAAAACGTTATCTTTGATTTATATTCTATAATATCGTGGCTACAGCACTTGCTGAACATAAGCTTAAAACGTTTATGTGTGTATTTATATATGATAATAATAATAATAATAATAATAATAATAATAATAATAATAATAATAATAATAATAATGGTGATAAACTGCAATAACAA
500
[319, 5454, 8254, 8254, 8254, 8254, 8254, 8254, 8254, 8254, 8254, 2477, 29909, 8254, 2477, 29909, 8254, 23799, 26788, 6040, 8254, 29954, 16881, 6040, 8254, 8254, 8254, 2477, 29909, 8254, 8254, 2477, 29909, 8254, 1299, 23184, 8254, 8254, 1299, 12739, 1783, 9472, 29954, 8766, 1299, 6040, 10051, 1299, 29911, 29954, 8254, 2477, 29909, 1299, 9472, 2477, 11135, 29911, 1783, 29911, 1299, 29954, 6344, 26788, 29954, 19988, 29911, 4174, 4174, 10051, 29911, 8687, 6344, 1299, 2477, 11135, 1783, 29911, 

In [10]:
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, model_args.bnb_4bit_compute_dtype)
print(compute_dtype)
#device_map = {"": 0}
#device_map = {"": "cuda:" + str(int(os.environ.get("LOCAL_RANK") or 0))}
device_map = "auto"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=model_args.use_4bit,
    bnb_4bit_quant_type=model_args.bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=model_args.use_nested_quant,
)

if compute_dtype == torch.float16 and model_args.use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
        print("=" * 80)

#n_gpus = torch.cuda.device_count()
#max_memory = {i: '80GB' for i in range(n_gpus)}

torch.bfloat16


In [11]:
model = AutoModelForSequenceClassification.from_pretrained(model_args.model_name_or_path, num_labels=2, device_map=device_map,
    quantization_config=bnb_config, torch_dtype=torch.float32)  # CHANGED

#model = AutoModelForSequenceClassification.from_pretrained(model_args.model_name_or_path, num_labels=2, device_map=device_map,
#    quantization_config=bnb_config) 
#model = AutoModelForSequenceClassification.from_pretrained(model_args.model_name_or_path, num_labels=2, device_map='sequential',
#                                                           max_memory=max_memory, quantization_config=bnb_config, torch_dtype=torch.float32)
if model_args.use_lora:
        lora_config = LoraConfig(
            r = model_args.lora_r,
            lora_alpha = model_args.lora_alpha,
            target_modules = list(model_args.lora_target_modules.split(",")),
            lora_dropout = model_args.lora_dropout,
            bias="none",
            task_type="SEQ_CLS",
            inference_mode=False,
            #peft_type="ADALORA",
        )
        #print(list(model_args.lora_target_modules.split(",")))
        model = get_peft_model(model, lora_config)
        #model = AdaLoraModel(model, lora_config, "default")
        model.print_trainable_parameters()

# training_args = TrainingArguments(
#     output_dir="./results",
#     num_train_epochs=3,
#     per_device_train_batch_size=2,
#     logging_dir="./logs",
# )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at togethercomputer/LLaMA-2-7B-32K and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 4,210,688 || all params: 3,373,551,616 || trainable%: 0.12481469025194841


In [12]:
def compute_metrics(pred):
    logits = pred.predictions[0]
    labels = pred.label_ids
    preds = logits.argmax(-1)
    accuracy = (preds == labels).mean()
    return {"accuracy": accuracy}

def compute_metrics(pred):
    logits = pred.predictions[0]
    labels = pred.label_ids
    
    print("Logits Shape:", logits.shape)
    print("Some Logits:", logits[:5])  # Print the first 5 samples
    print("Labels Shape:", labels.shape)
    print("Some Labels:", labels[:5]) 
    
    preds = logits.argmax(-1)
    
    probs = np.exp(logits) / np.sum(np.exp(logits), axis=-1, keepdims=True)
    print("Some Probs:", probs[:5]) 
    
    # If it's binary classification, take the probability of the second class (usually the positive class)
    # This assumes that the output shape is [batch_size, 2]
    preds_prob = probs[:, 0]

    # Apply thresholding to get binary predictions
    threshold = 0.5
    preds = (preds_prob > threshold).astype(int)

    # Now, compute your metrics
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)
    mcc = matthews_corrcoef(labels, preds)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'mcc': mcc
    }

def calculate_metric_with_sklearn(pred, threshold=0.5):
    logits = pred.predictions
    labels = pred.label_ids
    # Compute probabilities from logits
    probabilities = softmax(logits, axis=-1)
    #print("Logits:", logits[:5]) # Printing the first 5 logits
    #print("Labels:", labels[:5]) # Printing the first 5 labels
    #print("Probabilities:", probabilities[:5]) # Printing the first 5 probabilities

    # Assuming the second column is the positive class
    valid_scores = probabilities[:, 1] 

    # Convert probabilities to binary predictions based on threshold
    predictions = (valid_scores > threshold).astype(int)

    #valid_mask = labels != -100  # Exclude padding tokens (assuming -100 is the padding token ID)
    #valid_predictions = predictions[valid_mask]
    #valid_labels = labels[valid_mask]
    #valid_scores = valid_scores[valid_mask] 

    # Debugging prints
    #print("Logits:", logits[:5]) # Printing the first 5 logits
    #print("Labels:", valid_labels[:5]) # Printing the first 5 labels
    #print("Probabilities:", probabilities[:5]) # Printing the first 5 probabilities

#     return {
#         "accuracy": sklearn.metrics.accuracy_score(valid_labels, valid_predictions),
#         "f1": sklearn.metrics.f1_score(valid_labels, valid_predictions, average="macro", zero_division=0),
#         "matthews_correlation": sklearn.metrics.matthews_corrcoef(valid_labels, valid_predictions),
#         "precision": sklearn.metrics.precision_score(valid_labels, valid_predictions, average="macro", zero_division=0),
#         "recall": sklearn.metrics.recall_score(valid_labels, valid_predictions, average="macro", zero_division=0),
#         "pr_auc": sklearn.metrics.average_precision_score(valid_labels, valid_scores),
#         "roc_auc": sklearn.metrics.roc_auc_score(valid_labels, valid_scores),
#         "brier_score": sklearn.metrics.brier_score_loss(valid_labels, valid_scores)
#     }
    return {
        "accuracy": sklearn.metrics.accuracy_score(labels, predictions),
        "f1": sklearn.metrics.f1_score(labels, predictions, average="macro", zero_division=0),
        "matthews_correlation": sklearn.metrics.matthews_corrcoef(labels, predictions),
        "precision": sklearn.metrics.precision_score(labels, predictions, average="macro", zero_division=0),
        "recall": sklearn.metrics.recall_score(labels, predictions, average="macro", zero_division=0),
        "pr_auc": sklearn.metrics.average_precision_score(labels, valid_scores),
        "roc_auc": sklearn.metrics.roc_auc_score(labels, valid_scores),
        "brier_score": sklearn.metrics.brier_score_loss(labels, valid_scores)
    }

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=calculate_metric_with_sklearn,
    data_collator=custom_data_collator
)

In [14]:
trainer.train()
results = trainer.evaluate()
print(results)

Step,Training Loss,Validation Loss,Accuracy,F1,Matthews Correlation,Precision,Recall,Pr Auc,Roc Auc,Brier Score
200,0.8116,0.750422,0.523046,0.522535,0.045675,0.522873,0.522801,0.506038,0.521217,0.273853
400,0.7253,0.716216,0.545758,0.522111,0.097496,0.554172,0.543867,0.526037,0.546631,0.259958
600,0.6849,0.612821,0.680695,0.679613,0.362572,0.682337,0.680241,0.746138,0.752861,0.211362
800,0.5309,0.481323,0.779559,0.779507,0.559917,0.780165,0.779751,0.83199,0.854471,0.156907
1000,0.4653,0.465807,0.799599,0.799579,0.599161,0.799589,0.799572,0.835012,0.862805,0.149234
1200,0.461,0.47399,0.800935,0.800928,0.601867,0.800924,0.800943,0.843604,0.868483,0.148309
1400,0.4614,0.505203,0.788911,0.785709,0.592505,0.804837,0.787909,0.853396,0.868269,0.1595
1600,0.4338,0.474986,0.804275,0.803715,0.610672,0.806807,0.803872,0.85644,0.873165,0.14641
1800,0.4176,0.433525,0.813627,0.813347,0.628169,0.814819,0.813352,0.862431,0.87969,0.137622
2000,0.4202,0.441847,0.799599,0.79949,0.600626,0.800765,0.799863,0.863263,0.883756,0.140104


{'eval_loss': 0.39563965797424316, 'eval_accuracy': 0.8463593854375417, 'eval_f1': 0.8460691778911151, 'eval_matthews_correlation': 0.6941895991914795, 'eval_precision': 0.8481466515812726, 'eval_recall': 0.8460461255600578, 'eval_pr_auc': 0.8947050788330689, 'eval_roc_auc': 0.9103943164170579, 'eval_brier_score': 0.11823498775671602, 'eval_runtime': 70.0168, 'eval_samples_per_second': 21.381, 'eval_steps_per_second': 5.356, 'epoch': 5.0}
