In [1]:
# Environment setup for the fine-tuning run. The commands run top to bottom, so a
# later install can upgrade or downgrade a package pulled in by an earlier one.
!pip cache purge
# Core dependencies, upgraded to their latest releases.
!pip install --upgrade bitsandbytes datasets torch
# NOTE(review): --force-reinstall with this pin presumably also replaces the torch
# build installed above; the Unsloth banner printed later in this notebook reports
# Pytorch 2.3.0+cu121 with Xformers 0.0.26.post1 — confirm that is the intended pairing.
!pip install "xformers<0.0.27" --force-reinstall
# Unsloth is installed straight from the GitHub repository (unpinned).
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# trl is capped below 0.9.0; bitsandbytes is listed here a second time (already installed above).
!pip install "trl<0.9.0" peft accelerate bitsandbytes
!pip install tensorboard
!pip install -U scikit-learn
# transformers is upgraded last, after the packages that depend on it.
!pip install transformers -U

Files removed: 40
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting torch
  Downloading torch-2.5.1-cp310-cp310-manylinux1_x86_64.whl.metadata (28 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-18.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Downloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Downloading tqdm-4.67.0-py3-none-any.whl.metada

In [2]:
from huggingface_hub import hf_hub_download
import os
from pathlib import Path
import sys

def download_from_huggingface(repo_id, filename):
    """
    Downloads a specific file from a HuggingFace repository to the current directory.

    Args:
        repo_id (str): The repository ID in format 'username/repository_name'
        filename (str): The name of the file to download

    Raises:
        SystemExit: With exit code 1 when validation or the download fails. The
            underlying error message is printed before exiting.
    """
    try:
        # Ensure we're working with proper string inputs
        if not isinstance(repo_id, str) or not isinstance(filename, str):
            raise ValueError("Repository ID and filename must be strings")

        # Get the current working directory
        current_dir = os.getcwd()

        print(f"Attempting to download {filename} from {repo_id}...")

        # `local_dir_use_symlinks` is deliberately not passed: the installed
        # huggingface_hub reports it as deprecated (see the warning in this
        # cell's output), and `local_dir` alone places a real file there.
        downloaded_path = Path(
            hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                repo_type="model",
                local_dir=current_dir,
            )
        )

        # Move the file only if it really lives somewhere else. Comparing resolved
        # paths avoids a pointless rename when the two spellings name the same file.
        final_path = Path(current_dir) / filename
        if downloaded_path.resolve() != final_path.resolve():
            # `filename` may contain sub-folders; make sure the target folder exists.
            final_path.parent.mkdir(parents=True, exist_ok=True)
            # replace() overwrites an existing target on every platform.
            downloaded_path.replace(final_path)

        print(f"Successfully downloaded {filename} to {current_dir}")

    except Exception as e:
        print(f"Error occurred while downloading: {str(e)}")
        sys.exit(1)


# Hugging Face repository (queried as a model repo by download_from_huggingface)
# and the transcript file to fetch from it.
REPO_ID = "MandTDigital/Llama8b_CBT_Depression_Finetune_Nov_4"
FILENAME = "Combined_Synthetic_Transcripts_Nov_2.txt"

# Fetch the transcripts into the current working directory; on failure the helper
# prints the error and exits with status 1.
download_from_huggingface(REPO_ID, FILENAME)

Attempting to download Combined_Synthetic_Transcripts_Nov_2.txt from MandTDigital/Llama8b_CBT_Depression_Finetune_Nov_4...


For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Combined_Synthetic_Transcripts_Nov_2.txt:   0%|          | 0.00/24.5M [00:00<?, ?B/s]

Successfully downloaded Combined_Synthetic_Transcripts_Nov_2.txt to /workspace


In [3]:
import re
import json
from xml.etree import ElementTree as ET
from unsloth import FastLanguageModel
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import TrainerCallback, TrainingArguments
from trl import SFTTrainer
from unsloth import is_bfloat16_supported
import os
import torch
from tqdm import tqdm


def parse_conversation(content):
    """Turn one transcript into Alpaca-style prompt/response training pairs.

    The transcript is split into turns at every line that starts with
    ``Therapist:`` or ``Patient:``. Each therapist turn produces one pair whose
    ``input`` is the newline-joined history of all earlier turns and whose
    ``output`` is the therapist's text.

    Anything that is not a recognised speaker turn is ignored instead of being
    recorded as a bogus ``"<text>: "`` turn. Because turns are only split at
    speaker labels, this can only be the text before the first label, such as a
    leftover ``<conversation>`` tag, a blank prefix, or empty input.

    Args:
        content (str): Raw text of a single conversation.

    Returns:
        list[dict]: One dict per therapist turn with the keys ``instruction``,
        ``input`` and ``output``. Empty when there is no therapist turn.
    """
    known_speakers = ('Therapist', 'Patient')

    # Split the content into turns
    turns = re.split(r'\n(?=(?:Therapist:|Patient:))', content)
    conversation_history = []
    prompt_response_pairs = []

    for turn in turns:
        speaker, _, text = turn.partition(':')
        # Tolerate stray whitespace around the label of the first chunk.
        speaker = speaker.strip()
        text = text.strip()

        # Skip preamble / empty chunks so they never leak into later prompts.
        if speaker not in known_speakers:
            continue

        # A therapist turn becomes a training target; the prompt is everything said so far.
        if speaker == 'Therapist':
            prompt_response_pairs.append({
                'instruction': "You are an AI CBT therapist. Respond appropriately in the following conversation: ",
                'input': '\n'.join(conversation_history),
                'output': text
            })

        # Record the turn only after building the prompt so the answer is not part of its own prompt.
        conversation_history.append(f"{speaker}: {text}")

    return prompt_response_pairs


def process_combined_file(file_path):
    """Read a combined transcript file and build prompt/response pairs for all conversations.

    The file is split into conversations at every ``</conversation>`` marker.
    Each chunk is stripped first. Its first line is then treated as a header
    (e.g. an opening tag) and dropped, unless that line already starts with a
    speaker label. Stripping before the header check matters: a chunk that
    follows a closing tag usually begins with a newline, and dropping the
    "first line" of the unstripped chunk would remove only that blank line and
    keep the header.

    Args:
        file_path (str): Path of the UTF-8 encoded transcript file.

    Returns:
        list[dict]: The concatenated results of ``parse_conversation`` for every
        non-empty conversation, in file order.
    """
    speaker_labels = ('Therapist:', 'Patient:')

    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    all_pairs = []
    # Split the content into individual conversations
    for conv in content.split('</conversation>'):
        conv_content = conv.strip()
        if not conv_content:
            continue

        first_line, newline, remainder = conv_content.partition('\n')
        # Drop a header line, but never a real first turn. A single-line chunk is kept as is.
        if newline and not first_line.startswith(speaker_labels):
            conv_content = remainder.strip()

        all_pairs.extend(parse_conversation(conv_content))

    return all_pairs


# Parse the downloaded transcripts into prompt/response records.
input_file = 'Combined_Synthetic_Transcripts_Nov_2.txt'
output_file = 'cbt_dataset.json'
prompt_response_pairs = process_combined_file(input_file)

# Pivot the list of records into one list per field (column layout), keeping
# the field order instruction -> input -> output.
dataset_dict = {
    field: [pair[field] for pair in prompt_response_pairs]
    for field in ('instruction', 'input', 'output')
}

# Write the column-oriented dataset to disk as pretty-printed, non-ASCII-escaped JSON.
with open(output_file, 'w', encoding='utf-8') as json_file:
    json.dump(dataset_dict, json_file, ensure_ascii=False, indent=2)

print(f"Processed {len(prompt_response_pairs)} prompt-response pairs and saved to {output_file}")



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Processed 31695 prompt-response pairs and saved to cbt_dataset.json


In [4]:
# Define model and tokenizer parameters
model_name = "unsloth/Qwen2.5-7B"
max_seq_length = 10000  # also handed to SFTTrainer later in the notebook
dtype = None  # NOTE(review): presumably lets Unsloth choose the dtype itself — confirm
load_in_4bit = True  # forwarded unchanged as from_pretrained's load_in_4bit flag

# Load the model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Configure the PEFT model
# `model` is rebound to the adapter-wrapped model returned by get_peft_model, so
# re-running this part without reloading would wrap an already wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    # Attention projections (q/k/v/o) plus the MLP projections (gate/up/down).
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj",],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,  # same value as the trainer seed used later
    use_rslora=False,
    loftq_config=None,
)

# Define the EOS_TOKEN and Alpaca prompt format
# EOS_TOKEN is appended to every formatted example by formatting_prompts_func.
EOS_TOKEN = tokenizer.eos_token
# The three `{}` placeholders are filled positionally: instruction, input, response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Function to format prompts
def formatting_prompts_func(examples):
    """Render a batch of examples into Alpaca-formatted training strings.

    Args:
        examples: Column-oriented batch exposing equally long "instruction",
            "input" and "output" sequences.

    Returns:
        dict: A single "text" key holding one string per example, built from
        ``alpaca_prompt`` filled with (instruction, input, output) and followed
        by ``EOS_TOKEN``.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    return {
        "text": [
            alpaca_prompt.format(task, context, answer) + EOS_TOKEN
            for task, context, answer in rows
        ]
    }

# Create a Hugging Face dataset
dataset = Dataset.from_dict(dataset_dict)

print("Dataset size:", len(dataset))
print("Dataset features:", dataset.features)

# Show one sample record. The label names the index that is actually printed
# (this used to print record 30 under the label "First example"), and the index
# is clamped so a small dataset does not raise IndexError.
preview_index = min(30, len(dataset) - 1)
if preview_index >= 0:
    print(f"Example {preview_index}:", dataset[preview_index])

# Format the entire dataset (adds the "text" column consumed by the trainer)
formatted_dataset = dataset.map(formatting_prompts_func, batched=True)

# Split the formatted dataset into train, validation, and test sets:
# 10% is held out as the test set, then 10% of the remainder as validation.
# NOTE(review): the split is done per prompt/response pair. Each pair's input holds
# the preceding turns of its conversation (see parse_conversation), so pairs from the
# same conversation can end up in different splits. Consider splitting by conversation
# if the validation/test loss has to be measured on unseen conversations.
train_val_test = formatted_dataset.train_test_split(test_size=0.1, seed=42)
train_val = train_val_test['train'].train_test_split(test_size=0.1, seed=42)

formatted_train_dataset = train_val['train']
formatted_val_dataset = train_val['test']
formatted_test_dataset = train_val_test['test']


==((====))==  Unsloth 2024.11.7: Fast Qwen2 patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA A40. Max memory: 44.352 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/4.87k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Unsloth 2024.11.7 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


Dataset size: 31695
Dataset features: {'instruction': Value(dtype='string', id=None), 'input': Value(dtype='string', id=None), 'output': Value(dtype='string', id=None)}
First example: {'instruction': 'You are an AI CBT therapist. Respond appropriately in the following conversation: ', 'input': 'Patient: Hi, I\'m back for session 12. Here is the summary you provided me at the end of our last session:\n\n- Main topics discussed: Challenging perfectionistic thoughts, addressing guilt around self-care, and improving sleep quality.\n- Techniques or exercises used: Thought records, CBT triangle, downward arrow technique, identifying intermediate and core beliefs, cognitive restructuring, and sleep hygiene strategies. In future sessions, we will continue practicing these techniques and introduce new ones as needed, such as cognitive and behavioral experiments, reframing core beliefs, and addressing specific depressive symptoms like rumination and social withdrawal.\n- Homework assigned: Conti

Map:   0%|          | 0/31695 [00:00<?, ? examples/s]

In [None]:
# Define a callback for logging losses
class LossLoggingCallback(TrainerCallback):
    """Trainer callback that appends training and evaluation losses to a text file.

    Each log event that carries a ``loss`` and/or ``eval_loss`` entry adds one
    line per entry, prefixed with the current global step. Only the local main
    process writes.
    """

    def __init__(self, log_file):
        """Remember the path of the log file; nothing is created until the first log event."""
        self.log_file = log_file

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Append the losses found in ``logs`` to the log file.

        Args:
            args: Unused.
            state: Object providing ``is_local_process_zero`` and ``global_step``.
            control: Unused.
            logs (dict | None): Logged values; ``loss`` and ``eval_loss`` are recorded.
            **kwargs: Ignored.
        """
        if not state.is_local_process_zero or logs is None:
            return

        # The target directory may not exist yet when the first values are logged;
        # create it so open() cannot fail with FileNotFoundError.
        log_dir = os.path.dirname(self.log_file)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

        with open(self.log_file, 'a') as f:
            if 'loss' in logs:
                f.write(f"Step: {state.global_step}, Loss: {logs['loss']}\n")
            if 'eval_loss' in logs:
                f.write(f"Step: {state.global_step}, Eval Loss: {logs['eval_loss']}\n")

# Loss log lives in the same "outputs" directory that is used as the trainer's output_dir.
log_file = os.path.join("outputs", "loss_log.txt")

# Configure the SFTTrainer
# Trains on the pre-formatted "text" column and evaluates on the validation split.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=formatted_train_dataset,
    eval_dataset=formatted_val_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        # 4 samples per device x 4 accumulation steps = 16 samples per optimizer
        # step on one GPU (matches the "Total batch size = 16" line in the run output).
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        num_train_epochs=1,
        gradient_checkpointing=True,
        learning_rate=2e-4,
        # Exactly one of fp16 / bf16 is enabled, depending on bfloat16 support.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        # NOTE(review): newer transformers releases name this argument `eval_strategy`;
        # confirm against the installed version before re-running.
        evaluation_strategy="steps",
        eval_steps=100,
        # Checkpoints are written at the same 100-step cadence as evaluation.
        save_strategy="steps",
        save_steps=100,
    ),
    callbacks=[LossLoggingCallback(log_file)],
)

# Train the model
trainer_stats = trainer.train()

# Save the fine-tuned model
# Runs only after training has finished; exports under "Qwen_CBT_Nov_15" via
# Unsloth's GGUF saver with quantization_method "q8_0".
model.save_pretrained_gguf("Qwen_CBT_Nov_15", tokenizer, quantization_method="q8_0")



Map (num_proc=2):   0%|          | 0/25672 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/2853 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 25,672 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 1,604
 "-____-"     Number of trainable parameters = 40,370,176


Step,Training Loss,Validation Loss
100,0.6305,0.62288
200,0.4986,0.503355
300,0.4087,0.424806
400,0.3667,0.359587
500,0.2967,0.307283
600,0.2842,0.260589
700,0.2351,0.222468
800,0.2306,0.187979
900,0.1251,0.160614
1000,0.1429,0.139912
