In [None]:
!pip install pip3-autoremove
!pip-autoremove torch torchvision torchaudio -y
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121
!pip install unsloth

In [11]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 2x faster
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # 4bit for 405b!
    "unsloth/Mistral-Small-Instruct-2409",     # Mistral 22b 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
    
    "unsloth/Llama-3.2-1B-bnb-4bit",           # NEW! Llama 3.2 models
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.1.
   \\   /|    GPU: Tesla P100-PCIE-16GB. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 6.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [12]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

In [13]:
from unsloth.chat_templates import get_chat_template
import re
from datasets import load_dataset

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

# Formatting function
def formatting_prompts_func(examples):
    texts = []
    for example in examples['formatted']:  # Assuming your data is in a 'formatted' column
        # Extract instruction and response using regex
        match = re.search(r"<s>\[INST\](.*?)\[/INST\](.*?)</s>", example, re.DOTALL)
        if match:
            instruction = match.group(1).strip()
            response = match.group(2).strip()
            # Format into Llama-3.1 chat template
            conversation = [
                {"role": "user", "content": instruction},
                {"role": "assistant", "content": response}
            ]
            formatted_text = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=False)
            texts.append(formatted_text)
        else:
            print("Warning: Could not parse example:", example)
    return {"text": texts}  # Only return the 'text' key

# Load the dataset
dataset = load_dataset("cypsiSAS/transformed_bipolar", split="train")

# Apply the formatting function to the dataset
dataset = dataset.map(formatting_prompts_func, batched=True)

# Optionally, check the resulting dataset format:
print(dataset[0]["text"])  # Check the first element to ensure the formatting is correct
dataset = dataset.train_test_split(test_size=0.1)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

Can you provide a sentence that illustrates bipolar disorder?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

This seroquel withdrawal sucks Problems sleeping even with trazodone, nausea anytime I try to eat or drink even the littlest things, no appetite, depression is coming back with a vengeance. I emailed my pdoc, hopefully I hear back tomorrow. I need something for the nausea at the very least. I can't get my latuda calories down without wanting to throw up. I haven't had a full meal in days. I feel awful. Please don't let this last much longer or please let me go back on seroquel. I'll deal with the weight gain if it means I feel ok. Please please let this all stop and let me feel like a person again.<|eot_id|>


In [36]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 20,
    packing = True, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 40,
        warmup_steps = 5,
        num_train_epochs = 1, # Set this for 1 full training run.
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

In [37]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

In [38]:
tokenizer.decode(trainer.train_dataset[0]["input_ids"])

' with my emotions to the point of numbness, I get extremely emotional and then numb and then all I want to do is sleep because I\'ve hyped myself up so much that in the end I feel just nothing like every emotion leaves my body. And then an hour later or even less it\'s like that again.\n\nI have health issues including major depression, Tics and autism. I\'m not sure if this contributes. I feel like I can never be happy whenever I try I will think of one horrible thing and all my happiness will go down the drain. I\'m either sad or happy there is no inbetween with me and when I\'m sad I\'m extremely low to the point of sale elf destruction. Including binge eating or taking excessive pain killers because it helps with the headaches and my pain from my health issues. \n\nI don\'t necessarily feel like I\'ve ruined relationships in my life because most people in my life have just been nasty to me including boyfriends and people from school. I\'ve always been with bad men. People (my moth

In [39]:
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[0]["labels"]])

"                                                                                                                                                                                                                                                                                                                               \n\nDo you do cocaine to cope with the lows? My PDOC is much more afraid of me having a manic episode with psychosis, than he is about the lows of depression, so he never prescribed me any anti-depressants, even when I was on Olanzapine tired all the time, he would not give me any stimulant which eventually led to me being let go from my last job about a year ago. Today I got some bad news regarding other job opportunity on different continent I was looking forward to... feeling down, I've reached to my stash and had some white nose powder, how about you and a drug habit? Do you self medicate with illicit substances?<|eot_id|><|eot_id|><|begin_of_text|><|begin_of_text|><|

In [40]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla P100-PCIE-16GB. Max memory = 15.888 GB.
6.596 GB of memory reserved.


In [42]:
from unsloth.chat_templates import get_chat_template

# Load the bipolar diagnostic chat template specifically for the psychiatrist role
chat_template = "llama-3.1"

# Initialize the tokenizer with the appropriate chat template
tokenizer = get_chat_template(
    tokenizer,
    chat_template=chat_template,  # LLaMA-based psychiatrist diagnostic template
)

# Configure the model for faster inference and optimized performance
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

# Define a set of psychiatric diagnostic prompts focusing on bipolar disorder
diagnostic_prompts = [
    {"role": "user", "content": "I feel like I am having trouble sleeping and my thoughts are racing all the time."},
    {"role": "assistant", "content": "Can you describe your mood over the past week? Have you felt unusually energetic, irritable, or euphoric?"},
    {"role": "user", "content": "Yes, I felt like I could do anything. I started so many projects but couldn't finish them."},
    {"role": "assistant", "content": "Have you experienced any periods where you felt very low, hopeless, or lacked energy?"},
]

# Apply the chat template with tokenization for model inputs
inputs = tokenizer.apply_chat_template(
    diagnostic_prompts,
    tokenize=True,
    add_generation_prompt=True,  # Enable generation for dynamic responses
    return_tensors="pt",
).to("cuda")

# Generate the model's output with specific parameters tuned for clinical interactions
outputs = model.generate(
    input_ids=inputs, 
    max_new_tokens=128,  # Longer outputs for detailed psychiatric analysis
    use_cache=True,      # Leverage caching for efficiency
    temperature=1.2,     # Moderate creativity to adapt to diverse user inputs
    min_p=0.15           # Ensure balanced output confidence
)

# Decode and format the responses for user readability
responses = tokenizer.batch_decode(outputs)

# Print or log the responses for review by the psychiatrist
for response in responses:
    print(response)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

I feel like I am having trouble sleeping and my thoughts are racing all the time.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Can you describe your mood over the past week? Have you felt unusually energetic, irritable, or euphoric?<|eot_id|><|start_header_id|>user<|end_header_id|>

Yes, I felt like I could do anything. I started so many projects but couldn't finish them.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Have you experienced any periods where you felt very low, hopeless, or lacked energy?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Have you experienced any periods where you felt very low, hopeless, or lacked energy?<|eot_id|>


In [None]:
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")
model.push_to_hub("Hamatoysin/EMBS",token = HF_token) # Online saving
tokenizer.push_to_hub("Hamatoysin/EMBS", token = HF_token) # Online saving

README.md:   0%|          | 0.00/599 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/97.3M [00:00<?, ?B/s]

Saved model to https://huggingface.co/Hamatoysin/EMBS


  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [46]:
if True:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": "I have too many ideas "},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.1.
   \\   /|    GPU: Tesla P100-PCIE-16GB. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 6.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Idea overload can be overwhelming and difficult to manage. Here are some strategies that can help you better manage your ideas and channel them in a positive direction:

**Categorize your ideas**: 

Organize your ideas by topic, type (problem-solving, ideas, stories, etc.), or by frequency of thought (e.g., things you've been thinking about daily for weeks). This can help you group related ideas together and find connections that you may have missed otherwise.

**Prioritize ideas**: 

Determine which ideas are the most important

In [None]:
# Save to 8bit Q8_0
if True: model.push_to_hub_gguf("Hamatoysin/EMBS-G", tokenizer, token = HF_token)

Cloning into 'llama.cpp'...
Submodule 'kompute' (https://github.com/nomic-ai/kompute.git) registered for path 'ggml/src/ggml-kompute/kompute'
Cloning into '/kaggle/working/llama.cpp/ggml/src/ggml-kompute/kompute'...
Submodule path 'ggml/src/ggml-kompute/kompute': checked out '4565194ed7c32d1d2efa32ceab4d3c6cae006306'
make: Entering directory '/kaggle/working/llama.cpp'
cmake: /opt/conda/lib/libcurl.so.4: no version information available (required by cmake)
-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Found Git: /usr/bin/git (found 

Unsloth: You have 2 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 2.2G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 20.32 out of 31.35 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 28/28 [00:00<00:00, 51.74it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving Hamatoysin/EMBS-G/pytorch_model-00001-of-00002.bin...
Unsloth: Saving Hamatoysin/EMBS-G/pytorch_model-00002-of-00002.bin...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q8_0'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: [1] Converting model at Hamatoysin/EMBS-G into q8_0 GGUF format.
The output location will be /kaggle/working/Hamatoysin/EMBS-G/unsloth.Q8_0.gguf
This might take 3 minutes...
Writing: 100%|██████████| 3.41G/3.41G [00:57<00:00, 59.6Mbyte/s]
Unsloth: Conversion completed! Output location: /kaggle/working/Hamatoysin/EMBS-G/unsloth.Q8_0.gguf
Unsloth: Uploading GGUF to Huggingface Hub...


  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.Q8_0.gguf:   0%|          | 0.00/3.42G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/Hamatoysin/EMBS-G
