In [2]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.26" trl peft accelerate bitsandbytes

In [63]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)



==((====))==  Unsloth: Fast Llama patching release 2024.4
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unused kwargs: ['quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


ValueError: 
                    Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the
                    quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules
                    in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to
                    `from_pretrained`. Check
                    https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu
                    for more details.
                    

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


## **data prepocessing**

In [None]:
#this code remaps the csv file and and json file to make a single csv that has symptoms and disease for finetuning.

import pandas as pd
import json

# Read the CSV file
csv_path = 'symdisease - symdisease.csv'  # Change this to your actual CSV file path
df = pd.read_csv(csv_path)

# Read the JSON file
json_path = 'mapping.json'  # Change this to your actual JSON file path
with open(json_path, 'r') as file:
    disease_mapping = json.load(file)

# Map the label numbers in the CSV to disease names using the JSON mapping
df['disease'] = df['label'].map(lambda x: next((k for k, v in disease_mapping.items() if v == x), 'Unknown'))

# Rename the text column to symptoms
df.rename(columns={'text': 'symptoms'}, inplace=True)

# Create the new dataset with the symptoms and disease columns
new_df = df[['symptoms', 'disease']]

# Save the new dataset to a CSV file
output_path = 'output_dataset.csv'  # Change this to your desired output file path
new_df.to_csv(output_path, index=False)

print('New dataset created successfully and saved to:', output_path)


In [6]:
#data preporssing for finetuning:
import pandas as pd
from datasets import Dataset, Features, Value

# Define a template for the formatted text
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Given the symptoms, name the disease:

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token  # Assume this is defined somewhere as part of your model's tokenizer

def formatting_prompts_func(examples):
    inputs = examples["symptoms"]
    outputs = examples["disease"]
    texts = []
    for input, output in zip(inputs, outputs):
        # Format each entry according to the alpaca_prompt template and add EOS_TOKEN
        text = alpaca_prompt.format(input, output) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

# Load your CSV data
csv_file = 'output_dataset.csv'  # Adjust path as necessary
data_df = pd.read_csv(csv_file)

# Convert the DataFrame to a Hugging Face dataset, specifying only existing columns
features = Features({
    'symptoms': Value('string'),
    'disease': Value('string')
})
dataset = Dataset.from_pandas(data_df, features=features)

# Apply the formatting function to the dataset
formatted_dataset = dataset.map(formatting_prompts_func, batched=True)

# Optionally, save the formatted dataset for later use
formatted_dataset.save_to_disk('/mnt/data/path_to_save_dataset')  # Adjust path as necessary

# Print the first five formatted entries to verify
print(formatted_dataset['text'][:5])



Map:   0%|          | 0/7043 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7043 [00:00<?, ? examples/s]

["Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nGiven the symptoms, name the disease:\n\n### Input:\nI have been having migraines and headaches. I can't sleep. My whole body is shaking and shivering. I feel dizzy sometimes.\n\n### Response:\nDrug Reaction<|end_of_text|>", 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nGiven the symptoms, name the disease:\n\n### Input:\nI have asthma and I get wheezing and breathing problems. I also have fevers, headaches, and I feel tired all the time.\n\n### Response:\nAllergy<|end_of_text|>', "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nGiven the symptoms,

### **training the model**

In [7]:
#settings to get best results


from trl import SFTTrainer
from transformers import TrainingArguments

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="outputs",
    num_train_epochs=3,  # Adjust based on dataset size
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=3e-4,
    warmup_steps=50,  # Assuming about 500 training steps
    max_steps=500,  # Set a reasonable limit based on your dataset and epoch calculation
    weight_decay=0.01,
    logging_dir='logs',
    logging_steps=10,
    eval_steps=50,  # Adjust based on preference for frequency of evaluation
    evaluation_strategy="epoch",  # Changed to match save_strategy
    save_strategy="epoch",  # Keep this aligned with evaluation_strategy
    fp16=True,  # Enable if supported
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',  # Assuming you have a validation metric setup
    greater_is_better=True
)
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=formatted_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=training_args
)


  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/7043 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


## Training

In [None]:
trainer_stats = trainer.train()

### **model testing**

In [73]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "name the disease  ", # instruction
        "I've been having back pain, a cough, and numbness in my arms and legs. My neck hurts too, and I've been feeling dizzy and off balance.", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 84, use_cache = True)
tokenizer.batch_decode(outputs)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


["<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nname the disease  \n\n### Input:\nI've been having back pain, a cough, and numbness in my arms and legs. My neck hurts too, and I've been feeling dizzy and off balance.\n\n### Response: you may be experencing :\nCervical Spondylosis Cervical spondylosis is a common condition that occurs when the discs between the bones in your neck (vertebrae) wear down. It's also called cervical osteoarthritis or neck arthritis. Cervical spondylosis usually doesn't cause symptoms until later in life. But it can cause neck pain and stiffness, as well as pain that"]