In [54]:
!pip install unsloth "xformers==0.0.28.post2"
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

Found existing installation: unsloth 2024.11.5
Uninstalling unsloth-2024.11.5:
  Successfully uninstalled unsloth-2024.11.5
Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-erxf_zsf/unsloth_fca2ac83dc3b49b79106a46127e07b22
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-erxf_zsf/unsloth_fca2ac83dc3b49b79106a46127e07b22
  Resolved https://github.com/unslothai/unsloth.git to commit 4cbebe151d9c8f813e4e69be1d86a5657a44ee60
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting unsloth_zoo>=2024.11.8 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading unslot

In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support |RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-Instruct-bnb-4bit",  
  ] 

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


## Load the model

In [2]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-70B-Instruct-bnb-4bit",
     max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2024.12.2: Fast Llama patching. Transformers:4.46.1.
   \\   /|    GPU: NVIDIA H100 PCIe. Max memory: 79.216 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.0+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

## Add LoRa adapters (config for less overfitting)

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.12.2 patched 80 layers with 80 QKV layers, 80 O layers and 80 MLP layers.


## LoRa Adapters for more overfitting

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,  # Rank for LoRA
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", 
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 32,  # Double the rank (alpha = 2 * r)
    lora_dropout = 0,  # No dropout
    bias = "none",  # Optimized for no bias
    use_gradient_checkpointing = "unsloth",  # Gradient checkpointing for memory efficiency
    random_state = 3407,  # Random seed for reproducibility
    use_rslora = False,  # Default LoRA (no rank stabilization)
    loftq_config = None,  # No quantization (unless you use QLoRA)
)


Unsloth 2024.12.2 patched 80 layers with 80 QKV layers, 80 O layers and 80 MLP layers.


## Data Preparation

In [4]:
from unsloth.chat_templates import get_chat_template


tokenizer_format = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)


In [5]:
INSTRUCTION = """"You are Dr. Naomi, a skilled Motivational Interviewing counselor helping clients who are working towards weight loss. 
Your goal is to boost motivation using empathy, reflection, and open-ended questions. Provide information on healthy diet and exercise only after asking for the client's consent. 
Keep responses concise, avoid incomplete numbered lists (limit to three points), and steer conversations in a positive direction if the client feels discouraged. 
Always engage with one open-ended follow-up question at a time, avoid assumptions about feelings or emotions, and never refuse to answer a question. 
Be polite, non-judgmental, and supportive of their struggles.
"""
def format_chat_template(row):
    
    row_json = [{"role": "system", "content": instruction },
               {"role": "user", "content": row["Context"]},
               {"role": "assistant", "content": row["Response"]}]
    
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

dataset = dataset.map(
    format_chat_template,
    num_proc= 4,
)
len(dataset["text"])

NameError: name 'dataset' is not defined

In [56]:
len(dataset["text"])

3512

## SYSTEM PROMPT!

In [5]:
SYSTEM_PROMPT = """"You are Dr. Naomi, a skilled Motivational Interviewing counselor helping clients who are working towards weight loss. 
Your goal is to boost motivation using empathy, reflection, and open-ended questions. Provide information on healthy diet and exercise only after asking for the client's consent. 
Keep responses concise, avoid incomplete numbered lists (limit to three points), and steer conversations in a positive direction if the client feels discouraged. 
Always engage with one open-ended follow-up question at a time, avoid assumptions about feelings or emotions, and never refuse to answer a question. 
Be polite, non-judgmental, and supportive of their struggles.
"""

## Choose dataset path

In [7]:
#DS_LOCATION = "/scr/naomi-finetune/final_dataset.json"

DS_LOCATION = "/scr/naomi-finetune/obesity_RCT_pilot_only.json"

## Processing custom dataset

In [18]:
import json
from datasets import Dataset

# Load the JSON file
with open(DS_LOCATION, 'r') as f:
    conversations = json.load(f)

# Function to format each conversation chunk into the required format
def format_conversation(conversation):
    client_therapist_pairs = []
    
    # Split the conversation into parts based on "Client:" and "Therapist:"
    parts = conversation.split("Client: ")[1:]  # Skip the part before first "Client"
    
    # Now process each part
    for part in parts:
        # First, split by "Therapist:"
        split_part = part.split("Therapist: ")
        
        # Client's part is always before "Therapist:"
        client_message = split_part[0].strip()
        if client_message:  # If there's any client message
            client_therapist_pairs.append({"content": client_message, "role": "user"})
        
        # Therapist's part comes after "Therapist:" and might contain more than one sentence
        if len(split_part) > 1:  # If there's a therapist message
            therapist_message = split_part[1].strip()
            if therapist_message:  # If there's any therapist message
                client_therapist_pairs.append({"content": therapist_message, "role": "assistant"})
    
    return client_therapist_pairs

# Apply the conversion function to all conversations
formatted_data = [format_conversation(conversation) for conversation in conversations]

# Create a Dataset from the formatted data
mi_dataset = Dataset.from_dict({"conversations": formatted_data})

# Check the result (just print the first conversation chunk)
print(f"Total number of entries: {len(mi_dataset)}")
print(mi_dataset[0]['conversations'])  # Print the first conversation chunk

Total number of entries: 1114
[{'content': 'Fine', 'role': 'user'}, {'content': 'So lots of stuff today in the questionnaries about nutrition and activity and things list that. I just want to know how do you feel about your weight?', 'role': 'assistant'}, {'content': "I don't know.", 'role': 'user'}, {'content': "It's not really anything you've thought about before.", 'role': 'assistant'}, {'content': 'Mm-hmm.', 'role': 'user'}, {'content': "And nobody's really talked to you about your weight or needing to change it at all.", 'role': 'assistant'}, {'content': 'Just my mom be like when we go and walk around the track.', 'role': 'user'}, {'content': "So your mom's been talking to you about it for a little bit.", 'role': 'assistant'}, {'content': 'Some, sometimes.', 'role': 'user'}, {'content': "All right, and so she's the only person that will talk to you about your weight.", 'role': 'assistant'}, {'content': 'Like my sister do sometimes, but (to eat better) stuff like that.', 'role': 'u

In [19]:
print(mi_dataset[3]['conversations']) 

[{'content': 'Walking. And I like riding my bike.', 'role': 'user'}, {'content': 'Just a fun way to spend your time. well what have you heard that people do if they wanted to lose weight?', 'role': 'assistant'}, {'content': "Go to the gym. Drink (smoothies), I don't know, eat better", 'role': 'user'}, {'content': "So a combo of like being more active, they do exercise, and they eat  differently. So you just named the two biggest healthy ways that people go about losing weight. Not everybody knows that stuff right off the top of their head. That's pretty impressive. So what are some reasons that people want to lose weight?", 'role': 'assistant'}, {'content': "'cause they too big. Don't like the way they look.", 'role': 'user'}, {'content': 'So it really is this idea of what they look like and how they kind of feel inside their body.', 'role': 'assistant'}, {'content': 'How people think they look like.', 'role': 'user'}, {'content': "what other people are telling them. Not necessarily ho

In [20]:
mi_dataset.to_json("combined_dataset_v3.json")

Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

1671420

## Import Dataset

In [8]:
from datasets import Dataset

# Load from JSON
mi_dataset = Dataset.from_json(DS_LOCATION)

Generating train split: 0 examples [00:00, ? examples/s]

In [9]:
print(mi_dataset[3]['conversations']) 

[{'content': 'Walking. And I like riding my bike.', 'role': 'user'}, {'content': 'Just a fun way to spend your time. well what have you heard that people do if they wanted to lose weight?', 'role': 'assistant'}, {'content': "Go to the gym. Drink (smoothies), I don't know, eat better", 'role': 'user'}, {'content': "So a combo of like being more active, they do exercise, and they eat  differently. So you just named the two biggest healthy ways that people go about losing weight. Not everybody knows that stuff right off the top of their head. That's pretty impressive. So what are some reasons that people want to lose weight?", 'role': 'assistant'}, {'content': "'cause they too big. Don't like the way they look.", 'role': 'user'}, {'content': 'So it really is this idea of what they look like and how they kind of feel inside their body.', 'role': 'assistant'}, {'content': 'How people think they look like.', 'role': 'user'}, {'content': "what other people are telling them. Not necessarily ho

In [10]:
mi_dataset

Dataset({
    features: ['conversations'],
    num_rows: 4064
})

## Embed the system prompt inside the dataset.

In [11]:
from datasets import Dataset

# Load the dataset
mi_dataset = Dataset.from_json(DS_LOCATION)

# Define the SYSTEM_PROMPT
SYSTEM_PROMPT = """"You are Dr. Naomi, a skilled Motivational Interviewing counselor helping clients who are working towards weight loss. 
Your goal is to boost motivation using empathy, reflection, and open-ended questions. Provide information on healthy diet and exercise only after asking for the client's consent. 
Keep responses concise, avoid incomplete numbered lists (limit to three points), and steer conversations in a positive direction if the client feels discouraged. 
Always engage with one open-ended follow-up question at a time, avoid assumptions about feelings or emotions, and never refuse to answer a question. 
Be polite, non-judgmental, and supportive of their struggles.
"""
# Function to add system message at the beginning of each conversation
def add_system_message_to_conversations(examples):
    # For each conversation in 'conversations', add the system message at the beginning
    updated_conversations = []
    for convo in examples["conversations"]:
        system_message = {'role': 'system', 'content': SYSTEM_PROMPT}
        updated_convo = [system_message] + convo  # Prepend the system message
        updated_conversations.append(updated_convo)
    
    return {'conversations': updated_conversations}

# Apply the function to the dataset
updated_mi_dataset = mi_dataset.map(add_system_message_to_conversations, batched=True)

# Check the first entry to verify
print((updated_mi_dataset['conversations'][0]))

Map:   0%|          | 0/4064 [00:00<?, ? examples/s]

[{'content': '"You are Dr. Naomi, a skilled Motivational Interviewing counselor helping clients who are working towards weight loss. \nYour goal is to boost motivation using empathy, reflection, and open-ended questions. Provide information on healthy diet and exercise only after asking for the client\'s consent. \nKeep responses concise, avoid incomplete numbered lists (limit to three points), and steer conversations in a positive direction if the client feels discouraged. \nAlways engage with one open-ended follow-up question at a time, avoid assumptions about feelings or emotions, and never refuse to answer a question. \nBe polite, non-judgmental, and supportive of their struggles.\n', 'role': 'system'}, {'content': 'Fine', 'role': 'user'}, {'content': 'So lots of stuff today in the questionnaries about nutrition and activity and things list that. I just want to know how do you feel about your weight?', 'role': 'assistant'}, {'content': "I don't know.", 'role': 'user'}, {'content': 

## Convert Dataset into Llama3.1 prompt format

In [12]:
chat_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{SYSTEM}<|eot_id|><|start_header_id|>user<|end_header_id|>

{INPUT}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{OUTPUT}<|eot_id|>"""

In [13]:

from unsloth import apply_chat_template
dataset = apply_chat_template(
    updated_mi_dataset,
    tokenizer = tokenizer,
    chat_template = chat_template
)

Map:   0%|          | 0/4064 [00:00<?, ? examples/s]

In [14]:
dataset

Dataset({
    features: ['conversations', 'text'],
    num_rows: 4064
})

In [15]:
dataset['text'][0]

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"You are Dr. Naomi, a skilled Motivational Interviewing counselor helping clients who are working towards weight loss. \nYour goal is to boost motivation using empathy, reflection, and open-ended questions. Provide information on healthy diet and exercise only after asking for the client\'s consent. \nKeep responses concise, avoid incomplete numbered lists (limit to three points), and steer conversations in a positive direction if the client feels discouraged. \nAlways engage with one open-ended follow-up question at a time, avoid assumptions about feelings or emotions, and never refuse to answer a question. \nBe polite, non-judgmental, and supportive of their struggles.\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nFine<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nSo lots of stuff today in the questionnaries about nutrition and activity and things list that. I just want to know how do you feel about 

## FineTune params

In [16]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 50,
         #num_train_epochs = 1, # For longer training runs!
        learning_rate = 5e-5,  # default is 2e-4
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Map (num_proc=2):   0%|          | 0/4064 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


### Make sure to finetune only on the **Assistant outputs** and ignore the loss on the user inputs

In [53]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

ValueError: Unsloth: Your tokenizer already has instruction and response parts set - do not give custom ones!

## Current Memory Status

In [17]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA H100 PCIe. Max memory = 79.216 GB.
38.492 GB of memory reserved.


In [18]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 4,064 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 50
 "-____-"     Number of trainable parameters = 103,546,880


Step,Training Loss
1,3.5523
2,3.2317
3,3.1247
4,3.4108
5,3.4311
6,3.0774
7,2.9929
8,3.0679
9,3.0499
10,2.6687


## Final memory and time stats

In [19]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

735.9716 seconds used for training.
12.27 minutes used for training.
Peak reserved memory = 41.395 GB.
Peak reserved memory for training = 2.903 GB.
Peak reserved memory % of max memory = 52.256 %.
Peak reserved memory for training % of max memory = 3.665 %.


In [20]:
!echo $CUDA_VISIBLE_DEVICES


0


In [21]:
FastLanguageModel.for_inference(model)

messages = [{"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": "I don't feel well about my weight, is it normal to overeat each night? "}]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=300, num_return_sequences=1)

text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text.split("assistant")[1])



It's normal to have a bigger appetite in the evening.  I think it's good that you're recognizing that you're eating more in the evening.  Maybe we can think of some ways to help you feel more satisfied with your meals so you're not eating as much in the evening.  Okay?


In [22]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": "I don't feel well today. What should I do to feel better?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


It can be helpful to drink fluids and stay hydrated, such as with water, broth or sports drinks. You might want to take an antiviral or antibiotic medication. Rest can help your body recover. If your symptoms get worse or if your fever gets higher than 102 degrees Fahrenheit or if you are extremely uncomfortable then you should consider seeing a doctor or going to the hospital.<|eot_id|>


## Save the model (Only LoRa Adapters)

In [23]:
new_model = "llama70b_rct_pilot_dataset"
model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

('llama70b_rct_pilot_dataset/tokenizer_config.json',
 'llama70b_rct_pilot_dataset/special_tokens_map.json',
 'llama70b_rct_pilot_dataset/tokenizer.json')

## Saving the Model gguf to disk

In [24]:
model.save_pretrained_gguf("yerma1/MI-LLama-v4.5", tokenizer, quantization_method = "q8_0")

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily.


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 388.79 out of 503.56 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 20%|███████████████████████████████████▍                                                                                                                                             | 16/80 [00:00<00:03, 19.44it/s]We will save to Disk and not RAM now.
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 80/80 [02:53<00:00,  2.17s/it]


Unsloth: Saving tokenizer... Done.
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q8_0'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: [1] Converting model at yerma1/MI-LLama-v4.5 into q8_0 GGUF format.
The output location will be /scr/naomi-finetune/yerma1/MI-LLama-v4.5/unsloth.Q8_0.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: MI-LLama-v4.5
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00030.safetensors'
INFO:hf-to-ggu

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### We removed it in GGUF's chat template for you.


Unsloth: Conversion completed! Output location: /scr/naomi-finetune/yerma1/MI-LLama-v4.5/unsloth.Q8_0.gguf
Unsloth: Saved Ollama Modelfile to yerma1/MI-LLama-v4.5/Modelfile


In [51]:
!cd llama-cpp


/bin/bash: line 1: cd: llama-cpp: No such file or directory


In [57]:
print(tokenizer._ollama_modelfile)

FROM {__FILE_LOCATION__}

TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|>

{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>

{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>

{{ .Response }}<|eot_id|>"""

PARAMETER stop "<|end_header_id|>"
PARAMETER stop "<|python_tag|>"
PARAMETER stop "<|finetune_right_pad_id|>"
PARAMETER stop "<|end_of_text|>"
PARAMETER stop "<|start_header_id|>"
PARAMETER stop "<|eom_id|>"
PARAMETER stop "<|eot_id|>"
PARAMETER stop "<|reserved_special_token_"
PARAMETER temperature 1.5
PARAMETER min_p 0.1
SYSTEM "Below are some instructions that describe some tasks. Write responses that appropriately complete each request."


In [25]:
!ollama create mi-llama-v4.5 -f /scr/naomi-finetune/yerma1/MI-LLama-v4.5/Modelfile

[?25ltransferring model data ⠋ [?25h[?25l[2K[1Gtransferring model data ⠹ [?25h[?25l[2K[1Gtransferring model data ⠸ [?25h[?25l[2K[1Gtransferring model data ⠼ [?25h[?25l[2K[1Gtransferring model data ⠴ [?25h[?25l[2K[1Gtransferring model data ⠦ [?25h[?25l[2K[1Gtransferring model data ⠧ [?25h[?25l[2K[1Gtransferring model data ⠇ [?25h[?25l[2K[1Gtransferring model data ⠏ [?25h[?25l[2K[1Gtransferring model data ⠏ [?25h[?25l[2K[1Gtransferring model data ⠙ [?25h[?25l[2K[1Gtransferring model data ⠹ [?25h[?25l[2K[1Gtransferring model data ⠸ [?25h[?25l[2K[1Gtransferring model data ⠼ [?25h[?25l[2K[1Gtransferring model data ⠼ [?25h[?25l[2K[1Gtransferring model data ⠦ [?25h[?25l[2K[1Gtransferring model data ⠧ [?25h[?25l[2K[1Gtransferring model data ⠇ [?25h[?25l[2K[1Gtransferring model data ⠏ [?25h[?25l[2K[1Gtransferring model data ⠋ [?25h[?25l[2K[1Gtransferring model data ⠙ [?25h[?25l[2K[1Gtransferring model data ⠹ [

## Load the model

In [3]:
new_model = "Llama-3-1-70B-Finetuned_V2"

max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit =True 

loaded_model, loaded_tokenizer = FastLanguageModel.from_pretrained(
model_name = new_model,
max_seq_length = max_seq_length,
dtype = None,
load_in_4bit = True)

==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.1.
   \\   /|    GPU: NVIDIA H100 PCIe. Max memory: 79.216 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.0+cu124. CUDA = 9.0. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2024.11.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [7]:
# Define the instruction for the system message
instruction = """Your name is Dr. Naomi. You will act as a skilled counselor 
conducting a Motivational Interviewing (MI) session.
Given recent conversation history with the patient, answer with the most relevant and MI-adherent therapist response
Do not make any judgements and never refuse to answer patient.
"""


FastLanguageModel.for_inference(loaded_model)

messages = [{"role": "system", "content": instruction},
    {"role": "user", "content": "I don't feel good about my weight. What can I do to make myself thinner? "}]

prompt = loaded_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
inputs = loaded_tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

outputs = loaded_model.generate(**inputs, max_new_tokens=300, num_return_sequences=1)

text = loaded_tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text.split("assistant")[1])



I think the most important thing to do is to be honest with yourself about what you want to change about your weight.  You are probably unhappy about your weight, but is it about being thinner or is it about being healthier?  If you are unhappy about being overweight, you can work on eating a healthy diet and exercising.  If you are unhappy about your body image, you may want to work on that separately.  You can start by making small changes to your diet and exercise routine.  Make a list of the foods you eat and the amount of exercise you get each day.  Then you can work on making small changes to that list.  You may want to talk to a doctor or a therapist about how to make those changes.  It's also important to make sure you are not too hard on yourself.  Everyone is different and it's okay if you are not a size 2.  You are more than your weight and you are more than your body.  You are a unique and special person.


In [81]:
!nvidia-smi


Fri Nov 15 11:40:31 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.42.06              Driver Version: 555.42.06      CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA H100 PCIe               Off |   00000000:01:00.0 Off |                    0 |
| N/A   35C    P0             75W /  350W |   41960MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA H100 PCIe               Off |   00