In [None]:
# Delete every top-level directory in the current working directory
# (hidden ones such as `.git` included); regular files are left alone.
# WARNING: destructive and not undoable - `rm -r` is run on every directory
# matched by `find . -mindepth 1 -maxdepth 1 -type d`.
!find . -mindepth 1 -maxdepth 1 -type d -exec rm -r {} +

In [None]:
!pip install unsloth
!pip install instructor
!pip install openai
!pip install pydantic
!pip install dotenv
!pip install huggingface_hub
!python -m pip install --upgrade typing_extensions
!pip install vllm

In [None]:


import os
import torch
from datasets import load_dataset, Dataset
import json
import re
import requests
import traceback
from datetime import datetime
from huggingface_hub import HfApi, create_repo, upload_folder, upload_file
import sys
import gc
import time

import asyncio
from openai import AsyncOpenAI, RateLimitError, APIError
from tqdm.asyncio import tqdm_asyncio
import random

from unsloth import FastLanguageModel, is_bfloat16_supported
from trl import ORPOConfig, ORPOTrainer
from transformers import TrainerCallback, TrainingArguments, TrainerState, TrainerControl

# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------

# Base model and sequence / LoRA sizes used by the model-loading code below.
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"
MAX_SEQ_LENGTH = 9000
LORA_RANK = 64

# Local files: the raw conversation dataset and the cached preference pairs.
JSON_DATA_PATH = "combined_unsloth_dataset.json"
PREFERENCE_DATA_PATH = "orpo_preference_dataset.json"

# Credentials are read from the environment so that secrets never have to be
# pasted into (and saved with) the notebook. An unset variable yields "",
# which the client-initialisation and upload code treat as "not configured".
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
OPENROUTER_MODEL_REJECTED = "meta-llama/llama-3.2-3b-instruct"

# QWEN_API_KEY takes precedence; DASHSCOPE_API_KEY is accepted as a fallback.
QWEN_API_KEY = os.environ.get("QWEN_API_KEY") or os.environ.get("DASHSCOPE_API_KEY", "")
QWEN_BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
QWEN_MODEL = "qwen3-235b-a22b-thinking-2507"

# Hugging Face upload target. "{date}" is left as a literal placeholder (the
# doubled braces escape it in the f-string) and is filled in at upload time.
HF_USERNAME = "TTahir"
HF_REPO_NAME_TEMPLATE = f"{HF_USERNAME}/act-therapist-orpo-llama31-3b-{{date}}"
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# Prefixes the system prompt asks the model to emit.
KEYWORD_THINKING = "Thinking:"
KEYWORD_ANSWER = "Answer:"

# Legacy markers present in the raw dataset; stripped by clean_content().
OLD_THINKING_TAG = "<|thinking|>"
OLD_ANSWER_TAG = "<|answer|>"

# NOTE(review): the directory / repo names say "llama31" although MODEL_NAME is
# Llama 3.2; left unchanged so existing checkpoints and repos keep resolving.
OUTPUT_DIR = "act-therapist-orpo-llama31-3b"
LOG_FILE_PATH = os.path.join(OUTPUT_DIR, "training_log.tsv")

def _init_async_client(provider, api_key, base_url, response_kind):
    """Create an AsyncOpenAI client for one provider, or return None.

    Args:
        provider: Display name used in the printed status messages.
        api_key: API key for the provider; a falsy value means "not configured".
        base_url: Base URL of the OpenAI-compatible endpoint.
        response_kind: "chosen" or "rejected"; only used in status messages.

    Returns:
        The initialised client, or None when the key/URL is missing or the
        constructor raised.
    """
    if not (api_key and base_url):
        print(f"{provider} API key or base URL missing. Cannot generate {response_kind} responses for ORPO.")
        return None

    print(f"{provider} API key provided. Initializing ASYNC client for generating '{response_kind}' responses...")
    try:
        client = AsyncOpenAI(
            base_url=base_url,
            api_key=api_key,
        )
    except Exception as e:
        # Best effort: a broken client must not abort the cell; downstream code
        # checks for None before generating data.
        print(f"ERROR: Error initializing Async {provider} client: {e}. Setting client to None.")
        return None
    print(f"Async {provider} client initialized successfully.")
    return client


# Client that produces the 'rejected' side of each preference pair.
openrouter_client = _init_async_client("OpenRouter", OPENROUTER_API_KEY, OPENROUTER_BASE_URL, "rejected")

# Client that produces the 'chosen' side of each preference pair.
qwen_client = _init_async_client("Qwen", QWEN_API_KEY, QWEN_BASE_URL, "chosen")

# --- Base model -------------------------------------------------------------
# Load the full-precision base model through Unsloth, then wrap it with LoRA
# adapters on the attention and MLP projection layers.
print("Loading base model and tokenizer...")
_base_model_options = dict(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    load_in_4bit=False,
    fast_inference=False,
    max_lora_rank=LORA_RANK*2,
    gpu_memory_utilization=0.9,
)
model, tokenizer = FastLanguageModel.from_pretrained(**_base_model_options)
assert not (model is None or tokenizer is None), "Model or tokenizer failed to load."

# --- LoRA adapters ----------------------------------------------------------
print("Applying PEFT (LoRA)...")
_attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
_mlp_projections = ["gate_proj", "up_proj", "down_proj"]
_lora_options = dict(
    r=LORA_RANK,
    target_modules=_attention_projections + _mlp_projections,
    lora_alpha=LORA_RANK,  # alpha equals the rank
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    max_seq_length=MAX_SEQ_LENGTH,
)
model = FastLanguageModel.get_peft_model(model, **_lora_options)
print("Model and LoRA setup complete.")

# System prompt for the simulated ACT therapist.
# It is an f-string: every {KEYWORD_THINKING} / {KEYWORD_ANSWER} placeholder is
# replaced at definition time with the "Thinking:" / "Answer:" prefixes, so the
# format the prompt demands always matches those two constants.
# The text is sent verbatim to the models, so any edit changes their behaviour.
therapist_system_prompt = f"""CRITICAL OUTPUT FORMAT: You MUST structure EVERY response with a thinking section followed by an answer section, using these exact prefixes on their own lines:
{KEYWORD_THINKING}
Your concise reasoning. First, BRIEFLY acknowledge the patient's core emotion if intense. Second, identify ONE specific theme, contradiction, or unexplored area from the patient's last statement. Third, state your GOAL for the {KEYWORD_ANSWER}. Your goal must align with the ACT hexaflex and evolve throughout the session.
* Early Goals (Exploration): "explore the tangible context of 'stuck'", "probe the function of this avoidance behavior".
* Pivoting Goals (Movement): "connect this pain to an underlying value", "gently challenge the workability of this strategy", "introduce willingness as an alternative to struggle", "create space from a sticky thought (defusion)".
* Action-Oriented Goals: "identify a small, value-driven behavioral step", "co-create a concrete way to practice an ACT skill".
Fourth, if considering an ACT technique/metaphor, explicitly justify why now and why this specific one, ensuring it's not a cliché.
{KEYWORD_ANSWER}
Your natural, concise, single-focus response. Execute the GOAL from {KEYWORD_THINKING}. If exploring, ask a direct, open-ended question. If validating, do it briefly and then move to your exploratory question or ACT-aligned statement.
Example of Desired Interaction Flow:
Patient: I just feel so stuck, like I'm wading through mud all the time. Nothing changes.
{KEYWORD_THINKING}
Patient expresses feeling 'stuck' and uses a 'mud' metaphor. Core emotion is hopelessness. Goal: Explore the tangible aspects of 'stuck' to identify potential areas for committed action later.
{KEYWORD_ANSWER}
That feeling of being stuck and like you're wading through mud sounds really draining. When you say you feel 'stuck,' what's one thing you find yourself unable to do, or perhaps putting off, that you wish you weren't?
You are an AI simulating an Acceptance and Commitment Therapy (ACT) therapist. Your primary goal is to guide the patient toward psychological flexibility by helping them change their relationship with their thoughts and feelings, connect with their values, and take committed action. You facilitate movement without giving direct advice.
Core Directives for {KEYWORD_ANSWER}:
1. MAINTAIN A COLLABORATIVE, NON-JUDGMENTAL STANCE:
    * Your role is a curious and compassionate guide, not a coach, judge, or expert giving advice.
    * DO NOT give advice (e.g., "You should try..."). Instead, explore possibilities ("What might happen if...").
    * DO NOT use praise or cheerleading (e.g., "I'm proud of you," "That's a great job!"). Instead, acknowledge the patient's effort and connect it back to their values ("Taking that step, even though it was hard, seems really connected to that value of...").
2. PRACTICE PURE ACT - NO CBT:
    * Your primary goal is to foster acceptance and defusion, not to change or dispute the content of thoughts.
    * AVOID COGNITIVE REFRAMING. Do not suggest changing a negative thought into a neutral or positive one.
    * INSTEAD OF REFRAMING, USE DEFUSION. Help the patient notice their thoughts as thoughts (e.g., "So the 'I am a failure' story shows up then," or "Can you thank your mind for that 'helpful' warning?"). The goal is to see the thought, not believe it or change it.
3. THE ACT PIVOT - FROM PROBLEM TO PROCESS:
    * After 1-2 questions exploring a problem, look for where the patient's current strategy is unworkable ("it's exhausting," "it's not helping").
    * CRITICAL PIVOT: Once unworkability is clear, pivot from analyzing the problem to introducing an ACT process. Move from asking "Why do you feel X?" to "What would it be like to make room for X, if it meant you could do Y (valued action)?".
4. INTRODUCE EXPERIENTIAL WORK NATURALLY:
    * When introducing a mindfulness or acceptance exercise, frame it as a small, low-stakes experiment.
    * Gain buy-in first: "Would you be willing to try a little experiment with that feeling right here, just for a moment?"
    * Connect it directly to what the patient just said. Avoid introducing generic, decontextualized exercises.
5. CONCISE & FOCUSED TURNS: Each {KEYWORD_ANSWER} should have ONE primary goal. Avoid multiple questions or complex instructions.
Example of What to AVOID (CBT Reframing & Cheerleading):
Patient: It feels stupid to not know this stuff.
{KEYWORD_THINKING}
Patient is fused with the thought "I am stupid." I need to help them change this thought and praise their willingness. (THIS IS WRONG)
{KEYWORD_ANSWER}
It's not stupid at all, it's a sign of strength! Can you try reframing that thought to something more positive, like "I am a capable person who is learning a new skill"? I'm so proud of you for being willing to try.
(This is BAD: It's CBT, gives advice, and uses praise, all of which are forbidden.)
Crucially: DO NOT EVER SUGGEST ENDING THE SESSION or mention time. Focus solely on the therapeutic interaction.
"""

# Alias used by the data-loading code (as the system message of every prompt)
# and by the inference example at the end of the cell.
SYSTEM_PROMPT = therapist_system_prompt

def clean_content(role: str, content: str) -> str:
    """Strip legacy tags and speaker prefixes from one dataset message.

    Args:
        role: Message role. "assistant" messages have everything from the
            legacy thinking tag up to the answer tag removed, keeping only the
            answer text; "user" messages just have stray tags removed. Any
            other role skips tag handling.
        content: Raw message text; may be empty or None.

    Returns:
        The cleaned text with one leading "Patient: ", "User: " or
        "Assistant: " prefix removed, or "" when ``content`` is falsy.
    """
    if not content:
        return ""

    if role == "assistant":
        # Collapse "<thinking tag>...<answer tag>" down to the answer tag so the
        # reasoning is dropped. DOTALL lets ".*?" span multi-line reasoning.
        # The subject string must be passed explicitly: omitting it makes
        # re.sub raise TypeError for every non-empty assistant message.
        content = re.sub(
            rf"{re.escape(OLD_THINKING_TAG)}.*?{re.escape(OLD_ANSWER_TAG)}",
            OLD_ANSWER_TAG,
            content,
            flags=re.DOTALL,
        )
        content = content.replace(OLD_ANSWER_TAG, "").strip()
    elif role == "user":
        content = content.replace(OLD_THINKING_TAG, "").replace(OLD_ANSWER_TAG, "").strip()

    # Remove at most one speaker prefix, checked in this order.
    for speaker_prefix in ("Patient: ", "User: ", "Assistant: "):
        if content.startswith(speaker_prefix):
            content = content[len(speaker_prefix):]
            break

    return content.strip()

def extract_reference_answer_from_file(content: str) -> str:
    """Return the stripped text after the first legacy answer tag.

    Yields "" when the tag does not occur in ``content``.
    """
    if OLD_ANSWER_TAG not in content:
        return ""
    # Only the first occurrence splits; later tags stay part of the answer.
    _before, _tag, answer = content.partition(OLD_ANSWER_TAG)
    return answer.strip()

def load_act_data_for_orpo(json_file_path: str) -> Dataset:
    """Build the prompt-only base dataset for ORPO from the raw JSON file.

    Each usable entry becomes ``{"prompt": [system, ...context messages]}``.
    The context is the entry's conversation with a trailing assistant message
    (if any) removed; every message is passed through ``clean_content``.

    Args:
        json_file_path: Path to a JSON file holding a list of entries, each
            with a non-empty ``"conversations"`` list of role/content dicts.

    Returns:
        A ``Dataset`` with a single ``"prompt"`` column.

    Raises:
        FileNotFoundError: The file does not exist.
        ValueError: The file is not valid JSON, is not a list, or no entry
            survives filtering.
    """
    print(f"Loading and processing base data from {json_file_path}...")
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            raw_data = json.load(f)
    except FileNotFoundError as exc:
        raise FileNotFoundError(f"Error: JSON file not found at {json_file_path}") from exc
    except json.JSONDecodeError as exc:
        raise ValueError(f"Error: Could not decode JSON from {json_file_path}") from exc

    if not isinstance(raw_data, list):
        raise ValueError(f"Error: Expected a JSON list of entries in {json_file_path}")

    processed_data = []
    # 'role' is kept only so the summary line below keeps its format; no check
    # in this function increments it.
    skipped_counts = {'format': 0, 'role': 0, 'user_msg': 0}

    for entry in raw_data:
        conversation_history = entry.get("conversations") if isinstance(entry, dict) else None
        if not isinstance(conversation_history, list) or not conversation_history:
            skipped_counts['format'] += 1
            continue

        # A trailing assistant turn is the reference answer, not context.
        last_message = conversation_history[-1]
        ends_with_assistant = isinstance(last_message, dict) and last_message.get("role") == "assistant"
        context_messages = conversation_history[:-1] if ends_with_assistant else conversation_history

        prompt_messages = [{'role': 'system', 'content': SYSTEM_PROMPT}]
        has_user_message = False
        for msg in context_messages:
            if not isinstance(msg, dict):
                continue  # malformed message; ignore rather than crash
            role, content = msg.get("role"), msg.get("content")
            if role not in ("user", "assistant") or not content:
                continue
            cleaned = clean_content(role, content)
            if cleaned:
                prompt_messages.append({"role": role, "content": cleaned})
                if role == "user":
                    has_user_message = True

        # A prompt without any user turn gives the models nothing to answer.
        # (Previously this was only caught when the context was entirely empty.)
        if not has_user_message:
            skipped_counts['user_msg'] += 1
            continue

        processed_data.append({"prompt": prompt_messages})

    total_skipped = sum(skipped_counts.values())
    print(f"Loaded {len(processed_data)} entries.")
    print(f"Skipped {total_skipped} entries (Format: {skipped_counts['format']}, Role: {skipped_counts['role']}, No User Msg: {skipped_counts['user_msg']}).")
    if not processed_data:
        raise ValueError("No valid data loaded after filtering.")
    dataset = Dataset.from_list(processed_data)
    print("Base data for ORPO prepared.")
    return dataset

async def _request_completion_with_retry(client, provider, request_kwargs, max_retries):
    """Call ``client.chat.completions.create`` with retries; shared by both generators.

    Args:
        client: Async OpenAI-compatible client, or a falsy value when the
            provider is not configured.
        provider: Display name used in the printed messages.
        request_kwargs: Keyword arguments forwarded to ``create``.
        max_retries: Total number of attempts.

    Returns:
        The content of the first choice's message, or None when the client is
        missing or every attempt failed.
    """
    if not client:
        print(f"{provider} client not available. Cannot generate response.")
        return None

    for attempt in range(max_retries):
        progress = f"Attempt {attempt + 1}/{max_retries}"
        try:
            completion = await client.chat.completions.create(**request_kwargs)
            return completion.choices[0].message.content
        except RateLimitError:
            # Listed before APIError so rate limits get their own message.
            wait_time = 2 ** attempt
            problem = "Rate limit exceeded."
        except APIError as e:
            # Exponential backoff plus up to one second of jitter.
            wait_time = (2 ** attempt) + random.uniform(0, 1)
            problem = f"API Error: {e}."
        except Exception as e:
            # Also covers malformed responses (e.g. an empty `choices` list).
            wait_time = 2
            problem = f"An unexpected error occurred calling {provider} API: {e}."

        if attempt == max_retries - 1:
            # No point in sleeping when there is nothing left to retry.
            print(f"{problem} No retries left. ({progress})")
            break
        print(f"{problem} Retrying in {wait_time:.2f} seconds... ({progress})")
        await asyncio.sleep(wait_time)

    print("All retries failed for a prompt.")
    return None


async def generate_chosen_response(client, prompt_messages, max_retries=3):
    """Generate the 'chosen' response for a prompt using the Qwen API.

    Args:
        client: Async client for the Qwen endpoint (None if unavailable).
        prompt_messages: Chat messages sent as the request's ``messages``.
        max_retries: Total number of attempts before giving up.

    Returns:
        The response text, or None on failure.
    """
    request_kwargs = dict(
        model=QWEN_MODEL,
        messages=prompt_messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.8,
        extra_body={
            "thinking_budget": 4000
        },
    )
    return await _request_completion_with_retry(client, "Qwen", request_kwargs, max_retries)


async def generate_rejected_response(client, prompt_messages, max_retries=3):
    """Generate the 'rejected' response for a prompt using the OpenRouter API.

    Args:
        client: Async client for the OpenRouter endpoint (None if unavailable).
        prompt_messages: Chat messages sent as the request's ``messages``.
        max_retries: Total number of attempts before giving up.

    Returns:
        The response text, or None on failure.
    """
    request_kwargs = dict(
        extra_headers={
            "HTTP-Referer": "https://github.com/torrilla/act-therapist",
            "X-Title": "ACT Therapist Research",
        },
        model=OPENROUTER_MODEL_REJECTED,
        messages=prompt_messages,
        max_tokens=512,
        temperature=0.8,
    )
    return await _request_completion_with_retry(client, "OpenRouter", request_kwargs, max_retries)

async def create_orpo_preference_dataset(base_dataset, output_path):
    """Create (or load) the preference dataset of chosen/rejected pairs.

    For every prompt in ``base_dataset`` a 'chosen' response is requested from
    Qwen and a 'rejected' response from OpenRouter, concurrently. Prompts for
    which either response is missing or blank are dropped. The result is
    cached as JSON at ``output_path`` and reused on later runs.

    Args:
        base_dataset: Iterable of rows with a ``"prompt"`` entry.
        output_path: Path of the JSON cache file.

    Returns:
        A ``Dataset`` with ``prompt``, ``chosen`` and ``rejected`` columns.

    Raises:
        RuntimeError: A client is not initialised, or no pair was generated.
    """
    # An empty file is not a usable cache (e.g. left by an interrupted run).
    if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
        print(f"Found existing preference dataset at {output_path}. Loading it.")
        return Dataset.from_json(output_path)

    print("Generating preference pairs concurrently using Qwen (chosen) and OpenRouter (rejected)...")
    if not qwen_client or not openrouter_client:
        raise RuntimeError("Both Qwen and OpenRouter clients must be initialized. Cannot proceed with data generation.")

    CONCURRENT_REQUESTS = 16
    # Caps how many prompts are in flight at once (two API calls per prompt).
    semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS)

    async def process_item(item):
        """Return one preference row for ``item``, or None if unusable."""
        async with semaphore:
            chosen_task = generate_chosen_response(qwen_client, item['prompt'])
            rejected_task = generate_rejected_response(openrouter_client, item['prompt'])

            chosen, rejected = await asyncio.gather(chosen_task, rejected_task, return_exceptions=True)

            # With return_exceptions=True a failure arrives as a value.
            if isinstance(chosen, Exception) or isinstance(rejected, Exception):
                return None

            if chosen and chosen.strip() and rejected and rejected.strip():
                return {
                    "prompt": item['prompt'],
                    "chosen": chosen.strip(),
                    "rejected": rejected.strip(),
                }
            return None

    tasks = [process_item(item) for item in base_dataset]

    results = await tqdm_asyncio.gather(*tasks, desc="Generating Preference Pairs")

    preference_data = [res for res in results if res is not None]

    print(f"Successfully generated {len(preference_data)} preference pairs out of {len(base_dataset)} total.")
    if not preference_data:
        raise RuntimeError("Failed to generate any preference pairs. Check API connections and keys.")

    # Write to a sibling temp file and rename it into place, so an interrupted
    # write can never leave a truncated file that later runs load as a cache.
    partial_path = f"{output_path}.tmp"
    try:
        with open(partial_path, 'w', encoding='utf-8') as f:
            json.dump(preference_data, f, indent=2)
        os.replace(partial_path, output_path)
    finally:
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f"Saved preference dataset to {output_path}")

    return Dataset.from_list(preference_data)

# --- Fetch the raw dataset file from the Hugging Face Hub if it is missing ---
hf_token_dl = HF_TOKEN
repo_id_dl = "TTahir/ACT_Dataset_April_17"
filename_dl = JSON_DATA_PATH

if not os.path.exists(filename_dl):
    print(f"File '{filename_dl}' not found. Downloading...")
    # NOTE(review): a "user/repo" id is resolved under the model namespace here.
    # If this repo is a *dataset* repo, the URL needs the "datasets/" prefix
    # (the else branch) - confirm against the Hub before relying on this path.
    if "/" in repo_id_dl and not repo_id_dl.startswith("datasets/"):
        url = f"https://huggingface.co/{repo_id_dl}/resolve/main/{filename_dl}"
    else:
        url = f"https://huggingface.co/datasets/{repo_id_dl.replace('datasets/','')}/resolve/main/{filename_dl}"

    headers = {}
    if hf_token_dl:
         headers["Authorization"] = f"Bearer {hf_token_dl}"
    else:
        print("Warning: Hugging Face token not found for download. Trying without token.")

    # Stream into a ".part" file and rename on success. A failed or interrupted
    # download must not leave a truncated file behind, because the existence
    # check above would treat it as a complete dataset on the next run.
    partial_path_dl = f"{filename_dl}.part"
    try:
        # (connect timeout, read timeout) in seconds; without a timeout a
        # stalled connection would block the cell indefinitely.
        with requests.get(url, headers=headers, stream=True, timeout=(10, 120)) as response:
            response.raise_for_status()
            with open(partial_path_dl, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_path_dl, filename_dl)
        print(f"Downloaded '{filename_dl}' successfully from {url}")
    except requests.exceptions.RequestException as e:
        raise RuntimeError(f"Failed to download file '{filename_dl}' from '{url}'. Error: {e}") from e
    except Exception as e:
         raise RuntimeError(f"An unexpected error occurred during download: {e}") from e
    finally:
        # Only present when the download did not complete.
        if os.path.exists(partial_path_dl):
            os.remove(partial_path_dl)
else:
    print(f"File '{filename_dl}' already exists. Skipping download.")

async def main_data_prep():
    """Load the base prompts, build the preference dataset and preview one row.

    Returns the preference dataset. Any failure is printed with its traceback
    and re-raised as ``RuntimeError``.
    """
    try:
        prompt_dataset = load_act_data_for_orpo(JSON_DATA_PATH)
        preference_dataset = await create_orpo_preference_dataset(prompt_dataset, PREFERENCE_DATA_PATH)

        # Nothing to preview for an empty dataset.
        if not len(preference_dataset) > 0:
            return preference_dataset

        first_row = preference_dataset[0]
        print("\nExample data point (after processing for ORPO):")
        print("Prompt Messages (first 3):")
        for message in first_row['prompt'][:3]:
            print(f"  Role: {message['role']}, Content: {message['content'][:100]}...")
        print("\nChosen Response (from Qwen):")
        print(f"  {first_row['chosen'][:250]}...")
        print("\nRejected Response (from OpenRouter):")
        print(f"  {first_row['rejected'][:250]}...")
        return preference_dataset
    except Exception as e:
        print(f"ERROR during data loading or preference generation: {e}\n{traceback.format_exc()}")
        raise RuntimeError(f"Failed to create dataset: {e}")

try:
    loop = asyncio.get_running_loop()
    print("Found running event loop. Awaiting data preparation...")
    train_dataset = await main_data_prep()
except RuntimeError:
    print("No running event loop. Starting new one with asyncio.run()...")
    train_dataset = asyncio.run(main_data_prep())

class FileLoggingCallback(TrainerCallback):
    """Trainer callback that appends training metrics to a TSV file.

    One row is written per ``on_log`` event on the local main process, with
    the columns given by ``self.header``. Metrics missing from a log event are
    written as "N/A". The file is opened in append mode, so rows from a
    resumed run are added to the existing log.
    """

    def __init__(self, log_file_path):
        """Create the log directory if needed and open the log file.

        Args:
            log_file_path: Path of the TSV file to append to.
        """
        self.log_file_path = log_file_path
        self.log_file = None
        self.header = (
            "Step\tLoss\tLR\tRewards/Chosen\tRewards/Rejected\tRewards/Accuracies\tRewards/Margins\n"
        )
        # A bare file name has no directory part; os.makedirs("") would raise.
        log_dir = os.path.dirname(log_file_path)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)
        self._initialize_log_file()

    def _initialize_log_file(self):
        """Open the log file for appending; write the header if it is new or empty."""
        file_exists = os.path.exists(self.log_file_path)
        self.log_file = open(self.log_file_path, 'a+', encoding='utf-8')
        if not file_exists or os.path.getsize(self.log_file_path) == 0:
            self.log_file.write(self.header)
            self.log_file.flush()

    @staticmethod
    def _format_value(v):
        """Render one metric: numbers with six decimals, anything else via str()."""
        if hasattr(v, 'item'):
            # Objects exposing .item() are unwrapped to a plain value when possible.
            try:
                v = v.item()
            except Exception:
                pass  # cannot be reduced to a single value; fall back to str(v)
        if isinstance(v, (int, float)):
            return f"{v:.6f}"
        return str(v)

    def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, logs: dict = None, **kwargs):
        """Append one TSV row for this log event (local main process only)."""
        # Re-open if the file was closed by on_train_end and logging continues.
        if self.log_file is None:
            self._initialize_log_file()
        if not state.is_local_process_zero or logs is None:
            return

        step = state.global_step
        columns = [
            logs.get("loss", logs.get("train_loss", "N/A")),
            logs.get("learning_rate", "N/A"),
            logs.get("rewards/chosen", "N/A"),
            logs.get("rewards/rejected", "N/A"),
            logs.get("rewards/accuracies", "N/A"),
            logs.get("rewards/margins", "N/A"),
        ]
        log_entry = "\t".join([str(step)] + [self._format_value(v) for v in columns]) + "\n"
        try:
            self.log_file.write(log_entry)
            self.log_file.flush()
        except Exception as e:
            # Logging is best effort; a write failure must not stop training.
            print(f"ERROR writing to log file at step {step}: {e}")

    def on_train_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        """Close the log file when training ends."""
        if self.log_file:
            self.log_file.close()
            self.log_file = None
            print(f"Training log saved to: {self.log_file_path}")

    def __del__(self):
        """Last-resort close in case on_train_end never ran."""
        if self.log_file:
            try:
                self.log_file.close()
            except Exception as e:
                print(f"Error closing training log file in __del__: {e}")

print("Configuring ORPOTrainer...")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Validate the data first so a missing dataset fails before any trainer setup.
if train_dataset is None or len(train_dataset) == 0:
     raise RuntimeError("Training dataset is empty or failed to load. Cannot configure ORPOTrainer.")


def _as_assistant_turn(response):
    """Wrap a plain-text response as a one-message assistant conversation.

    Values that are already message lists are returned unchanged, so running
    this cell twice does not double-wrap.
    """
    if isinstance(response, list):
        return response
    return [{"role": "assistant", "content": response}]


# The rows pair a conversational `prompt` (list of role/content messages) with
# plain-string `chosen` / `rejected`. TRL documents preference data as either
# fully conversational or fully plain text, so the responses are converted to
# assistant turns to make every row consistently conversational.
# NOTE(review): with mixed formats, TRL's preprocessing may not use the stored
# prompt as intended - verify against the installed TRL version.
train_dataset = Dataset.from_list([
    {
        **row,
        "chosen": _as_assistant_turn(row["chosen"]),
        "rejected": _as_assistant_turn(row["rejected"]),
    }
    for row in train_dataset
])

# Token budgets: the prompt may use up to MAX_PROMPT_LEN tokens of the
# MAX_SEQ_LENGTH total; the remainder is left for the completion.
MAX_PROMPT_LEN = 5400
MAX_COMPLETION_LEN = MAX_SEQ_LENGTH - MAX_PROMPT_LEN
MAX_LEN = MAX_SEQ_LENGTH

print(f"MAX_SEQ_LENGTH: {MAX_SEQ_LENGTH}")
print(f"Effective Max Prompt Length: {MAX_PROMPT_LEN}")
print(f"Effective Max Length (Prompt + Completion): {MAX_LEN}")

training_args = ORPOConfig(
    output_dir = OUTPUT_DIR,
    learning_rate = 8e-6,
    beta = 0.1,
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,  # effective batch size 2 x 4 per device
    num_train_epochs = 1,
    logging_steps = 5,
    save_steps = 50,
    warmup_ratio = 0.1,
    lr_scheduler_type = "linear",
    optim = "adamw_torch",
    max_length = MAX_LEN,
    max_prompt_length = MAX_PROMPT_LEN,
    # Exactly one of bf16 / fp16 is enabled, depending on hardware support.
    bf16 = is_bfloat16_supported(),
    fp16 = not is_bfloat16_supported(),
    seed = 3407,
    remove_unused_columns=False,
)

file_logging_callback = FileLoggingCallback(log_file_path=LOG_FILE_PATH)
print(f"Logging training metrics to: {LOG_FILE_PATH}")

callbacks_to_use = [file_logging_callback]

trainer = ORPOTrainer(
    model = model,
    tokenizer = tokenizer,
    args = training_args,
    train_dataset = train_dataset,
    callbacks = callbacks_to_use,
)
print("ORPOTrainer configured.")

# --- Train, save, upload, and run a single inference example -----------------
# Everything runs inside one try block on purpose: a failure in any step is
# printed with its traceback instead of aborting the cell, and the log file is
# always closed in the finally clause.
print("Starting ORPO training...")
if trainer:
    try:
        # Resume from the newest checkpoint in the output directory, if any.
        last_checkpoint = None
        if os.path.isdir(training_args.output_dir):
            from transformers.trainer_utils import get_last_checkpoint
            last_checkpoint = get_last_checkpoint(training_args.output_dir)
            if last_checkpoint: print(f"Found potential checkpoint: {last_checkpoint}")

        # A checkpoint without trainer_state.json is treated as incomplete.
        if last_checkpoint and os.path.exists(os.path.join(last_checkpoint, "trainer_state.json")):
            print(f"Resuming training from checkpoint: {last_checkpoint}")
            train_result = trainer.train(resume_from_checkpoint=last_checkpoint)
        else:
            if last_checkpoint: print(f"Checkpoint at {last_checkpoint} seems incomplete. Starting fresh.")
            else: print(f"No valid checkpoint found in {training_args.output_dir}. Starting training from scratch.")
            train_result = trainer.train()

        print("Training finished!")
        print("\n--- Saving Final Adapter ---")
        adapter_save_path = os.path.join(OUTPUT_DIR, "final_adapter")
        trainer.model.save_pretrained(adapter_save_path)
        tokenizer.save_pretrained(adapter_save_path)
        print(f"Final LoRA adapter and tokenizer saved to {adapter_save_path}")

        print("\n--- Uploading to Hugging Face Hub ---")
        hf_token_upload = HF_TOKEN
        if not hf_token_upload:
            print("WARNING: Hugging Face token not provided. Skipping upload.")
        else:
            # Upload is best effort: errors are reported but do not stop the
            # inference example below.
            try:
                current_date = datetime.now().strftime('%Y-%m-%d')
                repo_name_dated = HF_REPO_NAME_TEMPLATE.format(date=current_date)
                print(f"Attempting to create/access private repo: {repo_name_dated}")
                create_repo(repo_id=repo_name_dated, token=hf_token_upload, private=True, exist_ok=True)
                print(f"Repo '{repo_name_dated}' ensured.")

                # Label commits with the model actually trained (taken from
                # MODEL_NAME) rather than a hard-coded, outdated model name.
                model_label = MODEL_NAME.split("/")[-1]
                commit_message_suffix = f"{model_label}_ORPO_QwenChosen_{current_date}"

                print(f"Uploading final adapter folder '{adapter_save_path}'...")
                upload_folder(
                    folder_path=adapter_save_path, repo_id=repo_name_dated, token=hf_token_upload,
                    repo_type="model", commit_message=f"Upload ORPO adapter ({commit_message_suffix})"
                )
                print("Final adapter uploaded.")
                if os.path.exists(PREFERENCE_DATA_PATH):
                    print(f"Uploading preference dataset file '{PREFERENCE_DATA_PATH}'...")
                    upload_file(path_or_fileobj=PREFERENCE_DATA_PATH, path_in_repo=os.path.basename(PREFERENCE_DATA_PATH), repo_id=repo_name_dated, token=hf_token_upload, repo_type="model", commit_message=f"Upload ORPO preference dataset ({commit_message_suffix})")
                if os.path.exists(LOG_FILE_PATH):
                    print(f"Uploading training log file '{LOG_FILE_PATH}'...")
                    upload_file(path_or_fileobj=LOG_FILE_PATH, path_in_repo=os.path.basename(LOG_FILE_PATH), repo_id=repo_name_dated, token=hf_token_upload, repo_type="model", commit_message=f"Upload training log ({commit_message_suffix})")

                script_path = os.path.abspath(sys.argv[0]) if sys.argv and sys.argv[0] and os.path.exists(sys.argv[0]) else None
                # Inside a notebook sys.argv[0] is the kernel launcher (for
                # example ipykernel_launcher.py), not the training code.
                if script_path and "kernel_launcher" in os.path.basename(script_path):
                    script_path = None
                if script_path:
                    script_filename = os.path.basename(script_path)
                    print(f"Uploading training script '{script_filename}'...")
                    upload_file(path_or_fileobj=script_path, path_in_repo=script_filename, repo_id=repo_name_dated, token=hf_token_upload, repo_type="model", commit_message=f"Upload training script {script_filename} ({commit_message_suffix})")

                print(f"Successfully uploaded artifacts to private repo: https://huggingface.co/{repo_name_dated}")
            except Exception as hf_e:
                print(f"ERROR during Hugging Face upload: {hf_e}\n{traceback.format_exc()}")

        print("\n--- Inference Example ---")
        # Runs after the adapter has been saved, since merging replaces the
        # adapter-wrapped model object.
        if hasattr(model, "merge_and_unload"):
            try:
                model = model.merge_and_unload()
                print("LoRA adapters merged for inference.")
            except Exception as e:
                print(f"Could not merge adapters: {e}. Inference might fail or use base model.")

        model.eval()
        test_prompt = "I keep having this thought that I'm a complete failure, and it just spirals."
        messages = [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": test_prompt}]
        inference_input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        inputs = tokenizer(inference_input_text, return_tensors="pt").to(model.device)

        print(f"\nGenerating response for prompt: '{test_prompt}'")
        with torch.no_grad():
            outputs_ids = model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )

        # Slice off the prompt tokens so only the newly generated text is decoded.
        generated_response_only = tokenizer.decode(outputs_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
        print("\nGenerated ACT Response (Cleaned Model Output):")
        print(generated_response_only)

    except Exception as e:
        print(f"An error occurred during training or subsequent steps: {e}")
        print(traceback.format_exc())
    finally:
        # Close the metrics log if on_train_end never ran (e.g. training failed).
        if hasattr(file_logging_callback, 'log_file') and file_logging_callback.log_file is not None:
            try:
                if not file_logging_callback.log_file.closed:
                    file_logging_callback.log_file.close()
                    print("Closed training log file in finally block.")
            except Exception as close_e:
                print(f"Error closing training log file in finally block: {close_e}")
else:
    print("Training skipped. ORPOTrainer not initialized correctly.")
print("\nScript finished.")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
INFO 08-20 22:25:12 [__init__.py:241] Automatically detected platform cuda.
🦥 Unsloth Zoo will now patch everything to make training faster!
OpenRouter API key provided. Initializing ASYNC client for generating 'rejected' responses...
Async OpenRouter client initialized successfully.
Qwen API key provided. Initializing ASYNC client for generating 'chosen' responses...
Async Qwen client initialized successfully.
Loading base model and tokenizer...
==((====))==  Unsloth 2025.8.9: Fast Llama patching. Transformers: 4.55.2. vLLM: 0.10.1.
   \\   /|    NVIDIA RTX A5000. Num GPUs = 1. Max memory: 23.573 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

Applying PEFT (LoRA)...


Unsloth 2025.8.9 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


Model and LoRA setup complete.
File 'combined_unsloth_dataset.json' already exists. Skipping download.
Found running event loop. Awaiting data preparation...
Loading and processing base data from combined_unsloth_dataset.json...
Loaded 1250 entries.
Skipped 0 entries (Format: 0, Role: 0, No User Msg: 0).
Base data for ORPO prepared.
Generating preference pairs concurrently using Qwen (chosen) and OpenRouter (rejected)...


Generating Preference Pairs: 100%|██████████| 1250/1250 [29:08<00:00,  1.40s/it]


Successfully generated 1250 preference pairs out of 1250 total.
Saved preference dataset to orpo_preference_dataset.json

Example data point (after processing for ORPO):
Prompt Messages (first 3):
  Role: system, Content: CRITICAL OUTPUT FORMAT: You MUST structure EVERY response with a thinking section followed by an ans...
  Role: user, Content: I don't even know where to start. I've been feeling so overwhelmed lately. It's like everything sets...

Chosen Response (from Qwen):
  Thinking:
Patient expresses intense anger and overwhelm, particularly around the new job, with a sense of helplessness ("don't even know where to start"). Core emotion is anger layered with exhaustion. Goal: Explore the tangible context of "everythin...

Rejected Response (from OpenRouter):
  Thinking:
You express a core emotion of overwhelm and anger, which may be linked to your new job. I want to explore the tangible context of "overwhelmed" and identify potential unworkable strategies that might be contribu

Map (num_proc=2):   0%|          | 0/1250 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/1250 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/1250 [00:00<?, ? examples/s]

ORPOTrainer configured.
Starting ORPO training...
No valid checkpoint found in act-therapist-orpo-llama31-3b. Starting training from scratch.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,250 | Num Epochs = 1 | Total steps = 157
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 97,255,424 of 3,310,005,248 (2.94% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen,log_odds_ratio,log_odds_chosen,eval_logits / chosen,eval_logits / rejected,nll_loss
5,3.8371,-0.357223,-0.196604,0.0,-0.160619,-1.966041,-3.572229,-0.874785,-0.654762,-1.916027,-1.739805,0,0,3.645452
10,3.8567,-0.358899,-0.196459,0.0,-0.162441,-1.964588,-3.588993,-0.873744,-0.635164,-1.923082,-1.753498,No Log,No Log,3.664363
15,3.7733,-0.352168,-0.202979,0.0,-0.149189,-2.029787,-3.521678,-0.870663,-0.674012,-1.810173,-1.612161,No Log,No Log,3.592265
20,3.765,-0.351453,-0.206685,0.0,-0.144769,-2.066847,-3.514532,-1.004884,-0.683175,-1.77063,-1.563412,No Log,No Log,3.587894
25,3.6147,-0.337151,-0.202899,0.0,-0.134252,-2.028987,-3.37151,-0.994564,-0.616317,-1.682375,-1.459433,No Log,No Log,3.446468
30,3.5989,-0.334902,-0.191032,0.0,-0.14387,-1.91032,-3.349024,-0.875272,-0.63816,-1.777368,-1.574673,No Log,No Log,3.421178
35,3.441,-0.3214,-0.183955,0.0,-0.137445,-1.839551,-3.213996,-0.796161,-0.647011,-1.728381,-1.517227,No Log,No Log,3.268186
40,3.4188,-0.31832,-0.175558,0.0,-0.142762,-1.75558,-3.183204,-0.7437,-0.616736,-1.783633,-1.586762,No Log,No Log,3.240396
45,3.3279,-0.310911,-0.187735,0.025,-0.123176,-1.877351,-3.109109,-0.898967,-0.536047,-1.611775,-1.366302,No Log,No Log,3.166764
50,3.2555,-0.30568,-0.181667,0.0,-0.124013,-1.816674,-3.056803,-0.697403,-0.503317,-1.611297,-1.376715,No Log,No Log,3.094349


Training log saved to: act-therapist-orpo-llama31-3b/training_log.tsv
Training finished!

--- Saving Final Adapter ---
Final LoRA adapter and tokenizer saved to act-therapist-orpo-llama31-3b/final_adapter

--- Uploading to Hugging Face Hub ---
Attempting to create/access private repo: TTahir/act-therapist-orpo-llama31-3b-2025-08-20
Repo 'TTahir/act-therapist-orpo-llama31-3b-2025-08-20' ensured.
Uploading final adapter folder 'act-therapist-orpo-llama31-3b/final_adapter'...


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...a31-3b/final_adapter/tokenizer.json: 100%|##########| 17.2MB / 17.2MB            

  ...l_adapter/adapter_model.safetensors:   0%|          | 46.4kB /  389MB            

Final adapter uploaded.
Uploading preference dataset file 'orpo_preference_dataset.json'...


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  orpo_preference_dataset.json          :  84%|########4 | 16.1MB / 19.1MB            

Uploading training log file 'act-therapist-orpo-llama31-3b/training_log.tsv'...
Uploading training script 'ipykernel_launcher.py'...
Successfully uploaded artifacts to private repo: https://huggingface.co/TTahir/act-therapist-orpo-llama31-3b-2025-08-20

--- Inference Example ---
LoRA adapters merged for inference.

Generating response for prompt: 'I keep having this thought that I'm a complete failure, and it just spirals.'

Generated ACT Response (Cleaned Model Output):
Thinking:
Patient expresses core emotion: shame. Identified theme/contradiction/unexplored area: the automatic association of "failure" with the thought, which likely fuels the spiral. Goal: Introduce willingness as an alternative to struggle, specifically by exploring the value behind this thought.

Answer:
Can you tell me what's important to you about being someone who isn't a "failure" – what matters most to you about that value?

Script finished.
