In [None]:
! pip install trl
! pip install torch
! pip install transformers
! pip install bitsandbytes
! pip install accelerate
! pip install peft




In [None]:
import os
import argparse
import torch
# from torch.utils.data import Dataset
from datasets import Dataset
from transformers import AutoTokenizer
import pandas as pd
import torch
from trl import SFTTrainer
from datasets import load_dataset
from transformers import TrainingArguments
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

In [None]:
# Colab-only step: mount the user's Google Drive under /content/drive so that
# files stored in Drive can be reached through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import locale
# Force UTF-8 as the reported preferred encoding for the rest of the session
# (presumably the usual Colab workaround for "A UTF-8 locale is required"
# errors - confirm it is still needed).
# The replacement keeps the standard-library signature
# `getpreferredencoding(do_setlocale=True)` so callers that pass the flag,
# e.g. `locale.getpreferredencoding(False)`, do not raise TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [None]:
class ExplanationDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset of (prompt, response) pairs read from a CSV file.

    The class derives from ``torch.utils.data.Dataset``. The name ``Dataset``
    imported in this notebook is ``datasets.Dataset``, which is not meant to be
    subclassed this way and exposes ``data`` as a read-only property, so
    assigning ``self.data`` on it fails.

    Args:
        file_path: Path of the CSV file to load with ``pandas.read_csv``.
        tokenizer: Callable tokenizer; it is invoked as
            ``tokenizer(prompt, response, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt')`` and must return a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        max_length: Length every example is padded/truncated to.
        prompt_column: Name of the CSV column holding the prompt text.
        response_column: Name of the CSV column holding the response text.

    Raises:
        KeyError: If either column is missing from the CSV.
    """

    def __init__(self, file_path, tokenizer, max_length=256,
                 prompt_column="prompt", response_column="response"):
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Read the CSV once and keep the pairs in memory as plain tuples.
        df = pd.read_csv(file_path)
        self.data = list(zip(df[prompt_column], df[response_column]))

    def __len__(self):
        """Return the number of (prompt, response) pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize pair ``idx`` and return 1-D ``input_ids``, ``attention_mask`` and ``labels``."""
        prompt, response = self.data[idx]

        # Tokenize prompt and response together as one padded/truncated sequence.
        encoded = self.tokenizer(prompt, response, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        # return_tensors='pt' yields a leading batch axis of size 1; drop it so a
        # DataLoader collates items to (batch, seq) rather than (batch, 1, seq).
        input_ids = encoded['input_ids'].squeeze(0)
        attention_mask = encoded['attention_mask'].squeeze(0)

        # Causal-LM labels are the input ids, with padding positions set to -100
        # so they are ignored by the loss.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
        }


In [None]:



def print_trainable_parameters(model):
    """
    Report how many of the model's parameters are trainable.

    Walks ``model.named_parameters()``, totals the element counts of all
    parameters and of those with ``requires_grad`` set, and prints both totals
    together with the trainable share as a percentage.
    """
    # Collect (element count, trainable flag) for every parameter in one pass.
    sizes = [(tensor.numel(), tensor.requires_grad) for _, tensor in model.named_parameters()]
    total = sum(count for count, _ in sizes)
    trainable = sum(count for count, flag in sizes if flag)
    print(f"trainable params: {trainable} || all params: {total} || trainable%: {100 * trainable / total}")


def format_instruction(sample):
    """Render one dataset row as the instruction-style training text.

    Args:
        sample: Mapping with a ``"text"`` entry (placed under ``### Input:``)
            and a ``"label"`` entry (placed under ``### Response:``).

    Returns:
        The prompt string. The leading space, the eight-space indentation of
        the body lines and the trailing four spaces are part of the text.
    """
    body_indent = " " * 8
    pieces = [
        " You are a medical explanation generator. You take a clinical note and diagnosis to generate the explanation for the diagnosis.",
        body_indent + "### Input:",
        f"{body_indent}{sample['text']}",
        "",
        body_indent + "### Response:",
        f"{body_indent}{sample['label']}",
        " " * 4,
    ]
    return "\n".join(pieces)

def finetune_model(args):
    """Fine-tune a 4-bit quantized causal LM with LoRA adapters using TRL's SFTTrainer.

    Args:
        args: Namespace-like object. ``args.base_model`` is the model id passed
            to ``from_pretrained``; ``args.file_path`` is the CSV training file,
            whose rows must provide the ``text`` and ``label`` fields that
            ``format_instruction`` reads.

    Returns:
        Tuple ``(trainer.model, tokenizer)``.

    Side effects:
        Writes trainer output to ``bio_mistral_finetuned`` and saves the model
        to ``biomistral_instruct_explanation_full`` and
        ``biomistral_instruct_explanation_0-100`` in the working directory.
    """

    # base model to finetune
    model_id = args.base_model

    # BitsAndBytesConfig to quantize the model int-4 config
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    )

    # load model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, use_cache=False, device_map="auto")
    model.config.pretraining_tp = 1

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Reuse the EOS token for padding.
    tokenizer.pad_token = tokenizer.eos_token

    # Load the training CSV from the path carried in `args`, not from a
    # notebook-level `file_path` global that only exists if another cell has
    # already been run.
    df = pd.read_csv(args.file_path)
    exp_dataset = Dataset.from_pandas(df)
    print("exp_dataset: ", exp_dataset)

    # LoRA config based on QLoRA paper
    peft_config = LoraConfig(
        r=32,
        lora_alpha=64,
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
            "lm_head",
        ],
        bias="none",
        lora_dropout=0.05,
        task_type="CAUSAL_LM",
    )

    # prepare model for training
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)

    # print the number of trainable model params
    print_trainable_parameters(model)

    # NOTE(review): with lr_scheduler_type="constant" the warmup_ratio below
    # appears to have no effect; "constant_with_warmup" would be needed if a
    # warmup phase is intended - confirm before changing.
    model_args = TrainingArguments(
        output_dir="bio_mistral_finetuned",
        num_train_epochs=5,
        per_device_train_batch_size=16,
        gradient_accumulation_steps=2,
        gradient_checkpointing=True,
        optim="paged_adamw_32bit",
        logging_steps=10,
        save_strategy="epoch",
        learning_rate=2e-4,
        bf16=False,
        tf32=False,
        max_grad_norm=0.3,
        warmup_ratio=0.03,
        lr_scheduler_type="constant",
        disable_tqdm=False
    )

    max_seq_length = 256

    trainer = SFTTrainer(
        model=model,
        train_dataset=exp_dataset,
        dataset_text_field="text",
        peft_config=peft_config,
        max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        packing=True,
        formatting_func=format_instruction,
        args=model_args,
    )

    # train
    trainer.train()

    # save model (into the TrainingArguments output_dir)
    trainer.save_model()

    torch.cuda.empty_cache()

    new_model = "biomistral_instruct_explanation_full"
    # Save the fine-tuned model
    trainer.model.save_pretrained(new_model)
    # Second copy under a different directory name, kept as in the original run.
    model.save_pretrained('biomistral_instruct_explanation_0-100')

    return trainer.model, tokenizer



In [None]:
def main(base_model, model_name, file_path, auth_token=None, push_to_hub=False):
    """Fine-tune the model, then print its completion for one fixed test prompt.

    Args:
        base_model: Model id forwarded to ``finetune_model`` via the namespace.
        model_name: Stored on the namespace as ``model_name``.
        file_path: Training CSV path, stored on the namespace as ``file_path``.
        auth_token: Stored on the namespace as ``auth_token``.
        push_to_hub: Stored on the namespace as ``push_to_hub``.

    Returns:
        None. The namespace and the decoded generation are printed.
    """
    args = argparse.Namespace(base_model=base_model, model_name=model_name, file_path=file_path, auth_token=auth_token, push_to_hub=push_to_hub)
    print("Args: ", args)
    model, tokenizer = finetune_model(args)

    eval_prompt = """The following is a clinical note and the patient was diagnosed with Stable angina. Explain with evidence the diagnosis of Stable angina.
    Patient Details:
    Age: 72
    Sex: Female

    Chief Complaint (CC):
    The patient reports symptoms that increase with physical exertion but are alleviated with rest.

    History of Present Illness (HPI):
    The patient describes experiencing pain in various locations including the lower chest, side of the chest, upper chest, left breast, and epigastric region. The pain is characterized as tedious, heavy, and exhausting. The pain radiates to under the jaw, right and left triceps, and right shoulder. The patient reports that the pain is intense in the epigastric region and precisely located in the right shoulder. The onset of the pain was rapid.

    Past Medical History (PMH):
    The patient has a history of being significantly overweight, high cholesterol, excessive alcohol consumption, smoking, high blood pressure, and heart disease including angina and a previous heart attack. The patient has not traveled out of the country in the last 4 weeks.

    Medications and Allergies:
    The patient is likely on medications for high cholesterol and high blood pressure, but specific medications and dosages are not provided in the information given. No known allergies are reported.

    Physical Examination (PE):
    Vitals and general appearance are not provided in the information given. The patient's reported symptoms suggest possible cardiovascular issues, particularly given the patient's past medical history.

    What are the evidences for the diagnosis of Stable angina?
    """

    # Place the prompt on the device the model reports instead of assuming "cuda".
    model_input = tokenizer(eval_prompt, return_tensors="pt").to(model.device)
    model.eval()

    # Take the padding id from the tokenizer rather than hard-coding 2; fall
    # back to EOS when the tokenizer defines no pad token.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    print("The reponse for the test: ")
    with torch.no_grad():
        output_ids = model.generate(**model_input, max_new_tokens=256, pad_token_id=pad_token_id)[0]
        generated_code = tokenizer.decode(output_ids, skip_special_tokens=True)
    print(generated_code)

In [None]:
if __name__ == "__main__":
    # Run configuration, kept as module-level names exactly as before.
    base_model = "BioMistral/BioMistral-7B"
    model_name = "mistral-7b-style-instruct"
    file_path = 'final_data/full_train.csv'
    auth_token, push_to_hub = None, False
    # Not passed to main() below.
    dataset = "neuralwork/fashion-style-instruct"

    # Launch fine-tuning followed by the single-prompt smoke test.
    main(
        base_model=base_model,
        model_name=model_name,
        file_path=file_path,
        auth_token=auth_token,
        push_to_hub=push_to_hub,
    )

Args:  Namespace(base_model='BioMistral/BioMistral-7B', model_name='mistral-7b-style-instruct', file_path='/content/drive/MyDrive/meng_project/explanation_training/full_train.csv', auth_token=None, push_to_hub=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


exp_dataset:  Dataset({
    features: ['Unnamed: 0', 'row_id', 'diagnosis', 'text', 'label'],
    num_rows: 1000
})
trainable params: 85041152 || all params: 3837112320 || trainable%: 2.2162799758751914


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
10,1.1272
20,0.7053
30,0.6295
40,0.5833
50,0.553
60,0.5343
70,0.5124
80,0.4957
90,0.4262
100,0.3421




The reponse for the test: 
The following is a clinical note and the patient was diagnosed with Stable angina. Explain with evidence the diagnosis of Stable angina.
    Patient Details: 
    Age: 72
    Sex: Female

    Chief Complaint (CC): 
    The patient reports symptoms that increase with physical exertion but are alleviated with rest.

    History of Present Illness (HPI): 
    The patient describes experiencing pain in various locations including the lower chest, side of the chest, upper chest, left breast, and epigastric region. The pain is characterized as tedious, heavy, and exhausting. The pain radiates to under the jaw, right and left triceps, and right shoulder. The patient reports that the pain is intense in the epigastric region and precisely located in the right shoulder. The onset of the pain was rapid. 

    Past Medical History (PMH): 
    The patient has a history of being significantly overweight, high cholesterol, excessive alcohol consumption, smoking, high blood 

In [None]:
# destination_file_path = '/content/drive/MyDrive/meng_project'
# new_model = "/content/biomistral_instruct_explanation_full"

# !cp -r $new_model $destination_file_path

In [None]:
# destination_file_path = '/content/drive/MyDrive/meng_project'
# new_model = "/content/biomistral_instruct_explanation"

# !cp -r $new_model $destination_file_path