In [None]:
%%capture
# (%%capture above hides the pip output of this cell.)
# Install the released Unsloth package first.
!pip install unsloth
# Then replace it with the latest nightly build from GitHub; --no-deps means pip
# does not install or upgrade any dependencies during this second step.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
from unsloth import FastLanguageModel
import torch
# Loader settings; the same three names are reused when the fine-tuned model
# is reloaded from the Hub further down.
max_seq_length = 512   # also handed to the SFTTrainer below
dtype = None           # forwarded unchanged to from_pretrained
load_in_4bit = True    # forwarded unchanged to from_pretrained

# Fetch the base Mistral checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/mistral-7b-v0.3",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.12.2: Fast Mistral patching. Transformers:4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/157 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/446 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [None]:
# Hyperparameters for this run.
rank = 8      # passed as `r` to get_peft_model
alpha = 64    # passed as `lora_alpha` to get_peft_model
epochs = 5    # passed as `num_train_epochs` to the trainer

# Run / Hub repository name. It is derived from the values above so the name
# can never drift from the configuration it advertises.
model_name = f"Mistral_7b_test_ep_{epochs}_r_{rank}_a_{alpha}"

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [None]:
# Attach LoRA adapters to the attention and MLP projection layers of the base model.
model = FastLanguageModel.get_peft_model(
    model,
    r=rank,
    lora_alpha=alpha,
    lora_dropout=0,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",   # attention projections
        "gate_proj", "up_proj", "down_proj",      # MLP projections
    ],
    use_gradient_checkpointing="unsloth",
    use_rslora=False,
    loftq_config=None,
    random_state=3407,
)

Unsloth 2024.12.2 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
# Mount Google Drive at /content/drive; the training CSV is read from there.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import pandas as pd
from datasets import Dataset
# Load the question dataset from the mounted Google Drive. The formatting
# function below reads the columns 'Topic', 'Sub-Topic', 'Difficulty' and 'Question'.
df = pd.read_csv('/content/drive/My Drive/pass_QA_topic_stopic_diff (1).csv')
# Alpaca-style prompt template with one placeholder per field:
# {topic}, {subtopic}, {difficulty} and {question}.
# NOTE(review): the template ends with a newline after {question}. When the
# inference cells pass question="" the prompt therefore ends with an empty line
# after "### Response:", whereas training examples have the question directly
# on that line. Confirm this mismatch is intended.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are an interviewer who asks a Java question to the user based on topic, subtopic, and difficulty.

### Input:
Topic: {topic}
Subtopic: {subtopic}
Difficulty: {difficulty}

### Response:
{question}
"""

# End-of-sequence token; it is appended to every formatted training example.
EOS_TOKEN = tokenizer.eos_token  # `tokenizer` is created in the model-loading cell
# Define the function to format the dataset
def formatting_prompts_func(examples):
    """Render a batch of rows into training prompts.

    Args:
        examples: Column-oriented batch (mapping of column name to a list of
            values) that provides the 'Topic', 'Sub-Topic', 'Difficulty' and
            'Question' columns.

    Returns:
        A dict with a single key, "text", holding one string per row: the
        module-level ``alpaca_prompt`` filled with that row's fields, followed
        by ``EOS_TOKEN``.
    """
    # The 'Topic' column determines how many rows the batch holds.
    row_count = len(examples['Topic'])
    rendered = [
        alpaca_prompt.format(
            topic=examples['Topic'][idx],
            subtopic=examples['Sub-Topic'][idx],
            difficulty=examples['Difficulty'][idx],
            question=examples['Question'][idx],
        ) + EOS_TOKEN  # terminate every example with the EOS token
        for idx in range(row_count)
    ]
    return {"text": rendered}
# Stand-in DataFrame with the columns the formatting function reads, in case
# the CSV is not available:
# df = pd.DataFrame({
#     "Topic": ["OOP", "Arrays"],
#     "Sub-Topic": ["Inheritance", "Sorting"],
#     "Difficulty": ["Easy", "Medium"],
#     "Question": ["What is Inheritance in Java?", "Explain Merge Sort algorithm."],
#     "Answer": ["Inheritance is a mechanism where one class acquires the property and behavior of another class.",
#                "Merge Sort is a divide-and-conquer algorithm that divides the array into halves, sorts them, and merges them."]
# })

# Wrap the DataFrame in a Hugging Face Dataset and, batch by batch, add the
# "text" column produced by formatting_prompts_func.
dataset = Dataset.from_pandas(df).map(formatting_prompts_func, batched=True)

# Now, 'dataset' contains the formatted prompts that can be used for model training


Map:   0%|          | 0/1266 [00:00<?, ? examples/s]

In [None]:
# Spot-check the formatted prompts: print the 'text' of the 1st and the 5th example.
first_example_text, fifth_example_text = (dataset[row]["text"] for row in (0, 4))
print(first_example_text, fifth_example_text, sep="\n")

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are an interviewer who asks a Java question to the user based on topic, subtopic, and difficulty.

### Input:
Topic: Java Variables
Subtopic: Variable Declaration and Initialization
Difficulty: Easy

### Response:
What is the purpose of declaring a variable in Java?
</s>
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are an interviewer who asks a Java question to the user based on topic, subtopic, and difficulty.

### Input:
Topic: Java Variables
Subtopic: Variable Declaration and Initialization
Difficulty: Medium

### Response:
Can a variable be declared and initialized at the same time in Java?
</s>


<a name="Train"></a>
### Train the model
Now let's use Hugging Face TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). In this notebook we train for the number of epochs set in `epochs` (`num_train_epochs = epochs`); for a quick smoke test you can instead cap the run with `max_steps=60`. We also support TRL's `DPOTrainer`!

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning over the "text" column of the formatted dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # packing can speed up training on short sequences; left off here
    args=TrainingArguments(
        # Run length and batching (32 per device x 4 accumulation steps).
        num_train_epochs=epochs,
        per_device_train_batch_size=32,
        gradient_accumulation_steps=4,
        # Optimizer and learning-rate schedule.
        optim="adamw_8bit",
        learning_rate=2e-4,
        weight_decay=0.01,
        lr_scheduler_type="linear",
        warmup_steps=5,
        # Mixed precision: bf16 when the GPU supports it, otherwise fp16.
        bf16=is_bfloat16_supported(),
        fp16=not is_bfloat16_supported(),
        # Bookkeeping.
        seed=3407,
        logging_steps=10,
        output_dir="outputs",
        report_to="none",  # Use this for WandB etc
    ),
)

Map (num_proc=2):   0%|          | 0/1266 [00:00<?, ? examples/s]

In [None]:
#@title Show current memory stats
# Report GPU 0's name and total memory, plus the peak memory reserved so far,
# with byte counts converted to GB (1024 ** 3 bytes) and rounded to 3 decimals.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024 ** 3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
print(
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
    sep="\n",
)

GPU = Tesla T4. Max memory = 14.748 GB.
4.441 GB of memory reserved.


In [None]:
# Run the fine-tuning loop; the value returned by train() is stored in `trainer_stats`.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,266 | Num Epochs = 5
O^O/ \_/ \    Batch size per device = 32 | Gradient Accumulation steps = 4
\        /    Total batch size = 128 | Total steps = 50
 "-____-"     Number of trainable parameters = 20,971,520


Step,Training Loss
10,1.0085
20,0.2414
30,0.1605
40,0.1231
50,0.1007


<a name="Inference"></a>
### Inference
Let's run the model! You can change the topic, subtopic, and difficulty - leave the question blank!

In [None]:
# Uses the `alpaca_prompt` template defined in the data-preparation cell.
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

# Build a single prompt; the question slot stays empty so the model fills it in.
inputs = tokenizer(
    [
        alpaca_prompt.format(
            topic="Java Variables", subtopic="Variable Declaration",
            difficulty="Easy", question="",
        ),
    ],
    return_tensors="pt",
).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
# Last expression of the cell: the decoded strings are shown as the cell output.
tokenizer.batch_decode(outputs)

['<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are an interviewer who asks a Java question to the user based on topic, subtopic, and difficulty.\n\n### Input:\nTopic: Java Variables\nSubtopic: Variable Declaration\nDifficulty: Easy\n\n### Response:\n\nWhat is the purpose of declaring a variable in Java?\n</s>']

 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [None]:
from transformers import TextStreamer

# Uses the `alpaca_prompt` template defined in the data-preparation cell.
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

# Same prompt as the previous inference cell; the question slot stays empty.
inputs = tokenizer(
    [
        alpaca_prompt.format(
            topic="Java Variables", subtopic="Variable Declaration",
            difficulty="Easy", question="",
        ),
    ],
    return_tensors="pt",
).to("cuda")

# The streamer prints tokens while they are generated, so the return value of
# generate() is discarded.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are an interviewer who asks a Java question to the user based on topic, subtopic, and difficulty.

### Input:
Topic: Java Variables
Subtopic: Variable Declaration
Difficulty: Easy

### Response:

What is the purpose of declaring a variable in Java?
</s>


<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
# Write the model and the tokenizer files into the local "lora_model" directory.
# The second call is the cell's last expression, so its return value is displayed.
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")


('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.model',
 'lora_model/added_tokens.json',
 'lora_model/tokenizer.json')

In [None]:
# Upload the model and the tokenizer to the Hugging Face Hub under `model_name`.
model.push_to_hub(model_name) # Online saving
tokenizer.push_to_hub(model_name) # Online saving

README.md:   0%|          | 0.00/590 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

Saved model to https://huggingface.co/Mistral_7b_test_ep_5_r_8_a_64


  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

In [None]:

from unsloth import FastLanguageModel

# Reload the fine-tuned adapters from the Hugging Face Hub.
# The repository id is built from `model_name` (the same name that was passed
# to push_to_hub) instead of repeating the run name by hand, so the model that
# is loaded here always matches the one that was uploaded.
hub_repo_id = f"Jiten1024/{model_name}"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=hub_repo_id,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Switch to inference mode. The trailing semicolon stops the notebook from
# printing the returned model's full module tree as the cell output.
FastLanguageModel.for_inference(model);



==((====))==  Unsloth 2024.12.2: Fast Mistral patching. Transformers:4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32768, 4096, padding_idx=770)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): l

In [None]:
# Repeat the earlier generation with the model reloaded from the Hub; the
# question slot stays empty so the model fills it in.
inputs = tokenizer(
    [
        alpaca_prompt.format(
            topic="Java Variables", subtopic="Variable Declaration",
            difficulty="Easy", question="",
        ),
    ],
    return_tensors="pt",
).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
# Last expression of the cell: the decoded strings are shown as the cell output.
tokenizer.batch_decode(outputs)

['<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are an interviewer who asks a Java question to the user based on topic, subtopic, and difficulty.\n\n### Input:\nTopic: Java Variables\nSubtopic: Variable Declaration\nDifficulty: Easy\n\n### Response:\n\nWhat is the purpose of declaring a variable in Java?\n</s>']

Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`: