In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the **base model** (without fine-tuning)
base_model_id = "meta-llama/Llama-3.2-1B"  # Base model from Hugging Face
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# Ensure tokenizer is consistent
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Avoids padding issues

# Load model with correct precision
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto"
)

# **Ensure model embeddings match tokenizer**
model.resize_token_embeddings(len(tokenizer))

# Move model to device
model.to(device)


def generate_response(prompt, max_length=256):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)  # ‚úÖ Explicitly pass attention mask

    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,  # ‚úÖ Fixes warning
            max_length=max_length,
            do_sample=True,  # Enable randomness for diverse responses
            temperature=0.3,  # Higher temp allows more creativity
            top_p=0.9,  # Controls diversity
            pad_token_id=tokenizer.pad_token_id
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)


# **Test with a medical query**
query = "what is x^2+2x+3, find a and b"
response = generate_response(query)

print("\nüîç **Medical Response (Base Model):**")
print(response)


üîç **Medical Response (Base Model):**
Generate spark code to add 100 of numbers in a list
I have a list of numbers and I want to add 100 of them in a list.
I have tried to use the following code but it doesn't work:
val list = List(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
val list2 = list.take(100)
val list3 = list2 ++ list
val list4 = list3 ++ list
val list5 = list4 ++ list
val list6 = list5 ++ list
val list7 = list6 ++ list
val list8 = list7 ++ list
val list9 = list8 ++ list
val list10 = list9 ++ list
val list11 = list10 ++ list
val list12 = list11 ++ list
val list13 = list12 ++ list
val list14 = list13 ++ list
val list15 = list14 ++ list
val list16 = list15 ++ list
val list17 = list16 ++ list
val list18 = list17 ++ list
val list19 = list18 ++ list
val list20 = list19 ++ list
val list21 =


In [2]:
# **Test with a medical query**
query = "what is x^2+2x+3, find a and b"
response = generate_response(query)

print("\nüîç **Medical Response (Base Model):**")
print(response)


üîç **Medical Response (Base Model):**
what is x^2+2x+3, find a and b?
What is the equation of the line that passes through the point (2, 4) and is parallel to the line 3x+2y-5=0?
What is the equation of the line that passes through the point (2, 4) and is parallel to the line 3x+2y-5=0?
What is the equation of the line that passes through the point (2, 4) and is parallel to the line 3x+2y-5=0?
What is the equation of the line that passes through the point (2, 4) and is parallel to the line 3x+2y-5=0?
What is the equation of the line that passes through the point (2, 4) and is parallel to the line 3x+2y-5=0?
What is the equation of the line that passes through the point (2, 4) and is parallel to the line 3x+2y-5=0?
What is the equation of the line that passes through the point (2, 4) and is parallel to the line 3x+2


In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
import bitsandbytes as bnb

# Load Model and Tokenizer
model_id = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Select Device
device = "cuda" if torch.cuda.is_available() else "cpu"

#  Load Model in **8-bit Quantization** for Memory Efficiency
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32, 
    load_in_8bit=True,  
    device_map="auto"
)

# Prepare model for LoRA fine-tuning (Reduce memory usage)
model = prepare_model_for_kbit_training(model)

# Apply LoRA fine-tuning with **Minimal Trainable Parameters**
peft_config = LoraConfig(
    r=8,  # Increase LoRA rank for more expressive updates
    lora_alpha=16,  # Increase alpha to scale LoRA activations
    lora_dropout=0.05,  
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, peft_config)

# Move LoRA model to correct device
model.to(device)

#  Load Medical Instruction Dataset
dataset = load_dataset("medalpaca/medical_meadow_wikidoc")

#  Print available dataset splits
print("Available dataset splits:", dataset.keys())

#  Split dataset into **train and validation sets**
dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)  # 90% Train, 10% Validation

# Print new dataset splits
print(f"Train size: {len(dataset['train'])}, Validation size: {len(dataset['test'])}")

# Tokenization function (Ensure Labels Are Correctly Created)
def tokenize_function(examples):
    """Format the input text for instruction-tuned fine-tuning."""
    prompt = f"### Instruction:\n{examples['instruction']}\n\n"
    if examples['input']:
        prompt += f"### Context:\n{examples['input']}\n\n"
    prompt += f"### Response:\n{examples['output']}"

    tokenized = tokenizer(prompt, truncation=True, max_length=512, return_tensors="pt")

    # Ensure labels are correct by shifting input_ids left for causal modeling
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized

#  Tokenize dataset (Ensures Loss Can Be Computed)
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)

#  Assign the correct splits
train_dataset = tokenized_datasets["train"]
val_dataset = tokenized_datasets["test"]  # Previously used train for validation, now using real val set

# Data Collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=False)  

#  Training Arguments (Aggressive Memory Optimization)
training_args = TrainingArguments(
    output_dir="./llama3-medical-wikidoc",
    per_device_train_batch_size=1,  # **Minimal batch size to prevent OOM**
    per_device_eval_batch_size=1,  # **Minimal batch size**
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=500,  # **Evaluate every 500 steps**
    save_steps=500,
    logging_steps=10,
    learning_rate=5e-4,  # Slightly higher learning rate for small batch sizes
    num_train_epochs=5,
    weight_decay=0.01,
    push_to_hub=False
)

#  Trainer Setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  #  Correct validation dataset
    tokenizer=tokenizer,
    data_collator=data_collator
)

#  Train the model
trainer.train()

#  Save the fine-tuned model
model.save_pretrained("./llama3-medical-wikidoc")
tokenizer.save_pretrained("./llama3-medical-wikidoc")


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Available dataset splits: dict_keys(['train'])
Train size: 9000, Validation size: 1000


  trainer = Trainer(
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss


('./llama3-medical-wikidoc\\tokenizer_config.json',
 './llama3-medical-wikidoc\\special_tokens_map.json',
 './llama3-medical-wikidoc\\tokenizer.json')

In [7]:
val_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1
})

In [14]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
import bitsandbytes as bnb

dataset = load_dataset("medalpaca/medical_meadow_medical_flashcards")


README.md:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


(‚Ä¶)l_meadow_wikidoc_medical_flashcards.json:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/33955 [00:00<?, ? examples/s]

In [3]:
dataset

NameError: name 'dataset' is not defined

In [15]:
df = dataset["train"].to_pandas().head(10)


In [20]:
print(df.loc[0])

input          What is the relationship between very low Mg2+...
output         Very low Mg2+ levels correspond to low PTH lev...
instruction                      Answer this question truthfully
Name: 0, dtype: object


In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# ‚úÖ Load the fine-tuned model path
model_path = "./llama3-medical-wikidoc"  # Path where you saved the fine-tuned model
base_model_id = "meta-llama/Llama-3.2-1B"  # The original base model

# ‚úÖ Load the tokenizer from the fine-tuned model
tokenizer = AutoTokenizer.from_pretrained(model_path)

# ‚úÖ Ensure the tokenizer has a pad token to avoid warnings
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Avoids padding issues

# ‚úÖ Select device
device = "cuda" if torch.cuda.is_available() else "cpu"

# ‚úÖ Load the base model first
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto"
)

# ‚úÖ Load the fine-tuned LoRA adapter **on top of the base model**
model = PeftModel.from_pretrained(base_model, model_path)

# ‚úÖ Ensure model embeddings match tokenizer
model.resize_token_embeddings(len(tokenizer))

# ‚úÖ Move model to correct device
model.to(device)


def generate_response(prompt, max_length=256):
    """Generates a response using the fine-tuned model."""
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)  # ‚úÖ Explicitly pass attention mask

    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,  # ‚úÖ Fixes warning
            max_length=max_length,
            do_sample=True,  # Enable randomness for diverse responses
            temperature=0.4,  # Lower temperature for factual responses
            top_p=0.9,  # Controls diversity
            pad_token_id=tokenizer.pad_token_id
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)


# **Test the fine-tuned model with a medical query**
query = "Can you provide an overview of the symptoms of pneumonia?"
response = generate_response(query)

print("\nüîç **Medical Response (Fine-Tuned Model):**")
print(response)


Some parameters are on the meta device because they were offloaded to the cpu.



üîç **Medical Response (Fine-Tuned Model):**
Can you provide an overview of the symptoms of pneumonia? What causes pneumonia?
Pneumonia is an infection of the lungs. It is caused by an infection of the respiratory tract, such as a virus or bacteria. The infection causes inflammation of the lungs, which results in the production of mucus and thickened secretions. The mucus and secretions cause the patient to cough and produce sputum. The sputum can be yellow, green, or brown in color. The mucus and secretions can also cause the patient to have difficulty breathing. The patient may also have a fever, chills, and body aches. The patient may also have a cough that produces a thick, white or yellow substance. The cough may also produce blood or pus. The cough may also produce a large amount of sputum. The cough may also produce a large amount of sputum. The cough may also produce a large amount of sputum. The cough may also produce a large amount of sputum. The cough may also produce a la

In [None]:
def generate_response(prompt, max_length=256):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            do_sample=True,  # Enable randomness for diverse responses
            temperature=0.1,  # Lower temp makes it more factual
            top_p=0.9,  # Controls diversity
            pad_token_id=tokenizer.pad_token_id
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)

query = "Can you provide an overview of the symptoms of pneumonia?"
response = generate_response(query)

print("\nüîç **Medical Response:**")
print(response)



üîç **Medical Response:**
Can you provide an overview of the symptoms of pneumonia? What are the causes of pneumonia?
Pneumonia is an infection of the lungs. It is caused by bacteria, viruses, fungi, or parasites. The causes of pneumonia are many and varied. The most common cause of pneumonia is a virus. Other causes of pneumonia include bacteria, fungi, and parasites. The symptoms of pneumonia are many and varied. The most common symptom of pneumonia is a cough. Other symptoms of pneumonia include fever, chills, chest pain, shortness of breath, and fatigue. The causes of pneumonia are many and varied. The most common cause of pneumonia is a virus. Other causes of pneumonia include bacteria, fungi, and parasites. The symptoms of pneumonia are many and varied. The most common symptom of pneumonia is a cough. Other symptoms of pneumonia include fever, chills, chest pain, shortness of breath, and fatigue. The causes of pneumonia are many and varied. The most common cause of pneumonia is

In [None]:

query = "Can you provide an overview of the lung's squamous cell carcinoma?"
response = generate_response(query)

print("\nüîç **Medical Response:**")
print(response)



üîç **Medical Response:**
Can you provide an overview of the lung's squamous cell carcinoma? What are the symptoms of this disease?
Answer: Squamous cell carcinoma is a type of lung cancer that is the most common type of lung cancer. It is also the most common type of lung cancer in men. Squamous cell carcinoma is a type of lung cancer that is the most common type of lung cancer. It is also the most common type of lung cancer in men. Squamous cell carcinoma is a type of lung cancer that is the most common type of lung cancer. It is also the most common type of lung cancer in men. Squamous cell carcinoma is a type of lung cancer that is the most common type of lung cancer. It is also the most common type of lung cancer in men. Squamous cell carcinoma is a type of lung cancer that is the most common type of lung cancer. It is also the most common type of lung cancer in men. Squamous cell carcinoma is a type of lung cancer that is the most common type of lung cancer. It is also the most