In [16]:
!pip install --upgrade \
    transformers>=4.32.0 \
    datasets>=2.13.0 \
    evaluate>=0.4.0 \
    accelerate>=0.20.0 \
    torch>=2.0.0 \
    sentencepiece \
    pandas \
    scikit-learn \
    nlpaug \
    --quiet

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.2.3 which is incompatible.
fastai 2.7.18 requires torch<2.6,>=1.10, but you have torch 2.6.0 which is incompatible.
torchvision 0.20.1+cu124 requires torch==2.5.1, but you have torch 2.6.0 which is incompatible.
torchaudio 2.5.1+cu124 requires torch==2.5.1, but you have torch 2.6.0 which is incompatible.[0m[31m
[0m

In [17]:
# ==============================
#  Imports
# ==============================
import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    TrainingArguments,
    Trainer
)
import evaluate
import nlpaug.augmenter.word as naw

In [32]:
import pandas as pd
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
import torch
import nlpaug.augmenter.word as naw
import numpy as np

def create_dataset():
    """
    Loads specific CBT-Bench subsets plus the Amod counseling dataset.
    Maps columns to (client_statement, response) if meaningful.
    Merges them, cleans them, augments them, and returns HF Datasets.
    """
    # -- 1) CBT-Bench subsets --
    cbt_subsets = [
        "qa_test", "qa_seed",
        "distortions_test", "distortions_seed",
        "core_major_test", "core_major_seed",
        "core_fine_test", "core_fine_seed"
    ]

    all_dfs = []

    # -- 1.1) Load & Map CBT-Bench --
    for subset in cbt_subsets:
        try:
            ds = load_dataset("Psychotherapy-LLM/CBT-Bench", subset)
            if 'train' not in ds:
                print(f"Subset {subset} has no 'train' split. Skipping...")
                continue

            raw_df = ds['train'].to_pandas()
            print(f"Loaded CBT-Bench subset '{subset}' with shape: {raw_df.shape}")
            print("Columns found:", raw_df.columns.tolist())

            # Decide how to rename based on the subset
            if subset.startswith("qa_"):
                # question -> client_statement, a -> response
                if 'question' in raw_df.columns and 'a' in raw_df.columns:
                    mapped_df = raw_df.rename(columns={
                        'question': 'client_statement',
                        'a': 'response'
                    })
                    mapped_df = mapped_df[['client_statement', 'response']].dropna()
                    all_dfs.append(mapped_df)
                else:
                    print(f"Skipping {subset}, missing 'question' or 'a' columns.")

            elif subset.startswith("distortions_"):
                # thoughts -> client_statement, distortions -> response
                if 'thoughts' in raw_df.columns and 'distortions' in raw_df.columns:
                    mapped_df = raw_df.rename(columns={
                        'thoughts': 'client_statement',
                        'distortions': 'response'
                    })
                    mapped_df = mapped_df[['client_statement', 'response']].dropna()
                    all_dfs.append(mapped_df)
                else:
                    print(f"Skipping {subset}, missing 'thoughts' or 'distortions'.")

            elif subset.startswith("core_major_"):
                # thoughts -> client_statement, core_belief_major -> response
                if 'thoughts' in raw_df.columns and 'core_belief_major' in raw_df.columns:
                    mapped_df = raw_df.rename(columns={
                        'thoughts': 'client_statement',
                        'core_belief_major': 'response'
                    })
                    mapped_df = mapped_df[['client_statement', 'response']].dropna()
                    all_dfs.append(mapped_df)
                else:
                    print(f"Skipping {subset}, missing 'thoughts' or 'core_belief_major'.")

            elif subset.startswith("core_fine_"):
                # thoughts -> client_statement, core_belief_fine_grained -> response
                if 'thoughts' in raw_df.columns and 'core_belief_fine_grained' in raw_df.columns:
                    mapped_df = raw_df.rename(columns={
                        'thoughts': 'client_statement',
                        'core_belief_fine_grained': 'response'
                    })
                    mapped_df = mapped_df[['client_statement', 'response']].dropna()
                    all_dfs.append(mapped_df)
                else:
                    print(f"Skipping {subset}, missing 'thoughts' or 'core_belief_fine_grained'.")

        except Exception as e:
            print(f"Could not load subset '{subset}': {e}")

    # -- 1.2) Load & Map Amod Dataset --
    try:
        amod_raw = load_dataset("Amod/mental_health_counseling_conversations")
        if 'train' in amod_raw:
            amod_df = amod_raw['train'].to_pandas()
            print(f"\nLoaded Amod counseling dataset with shape: {amod_df.shape}")
            print("Amod columns:", amod_df.columns.tolist())

            # Context -> client_statement, Response -> response
            if 'Context' in amod_df.columns and 'Response' in amod_df.columns:
                amod_df = amod_df.rename(columns={
                    'Context': 'client_statement',
                    'Response': 'response'
                })
                amod_df = amod_df[['client_statement', 'response']].dropna()
                all_dfs.append(amod_df)
            else:
                print("Skipping Amod: missing 'Context' or 'Response'.")
        else:
            print("Amod dataset doesn't have 'train' split.")
    except Exception as e:
        print(f"Could not load Amod dataset: {e}")

    # -- 1.3) Combine all DataFrames --
    if not all_dfs:
        raise ValueError("No data was successfully mapped into (client_statement, response).")

    combined_df = pd.concat(all_dfs, ignore_index=True)
    print(f"\nCombined shape => {combined_df.shape}")

    # -- 1.4) Clean & Convert to Strings BEFORE dropping duplicates --
    combined_df.dropna(inplace=True)
    combined_df['client_statement'] = combined_df['client_statement'].astype(str)
    combined_df['response'] = combined_df['response'].astype(str)

    # Now safe to drop duplicates
    combined_df.drop_duplicates(inplace=True)

    # -- 2) Data Augmentation Setup --
    aug = naw.ContextualWordEmbsAug(
        model_path='bert-base-uncased',
        action="substitute",
        device='cuda' if torch.cuda.is_available() else 'cpu'
    )

    # -- 2.1) Perform Augmentation --
    augmented_rows = []
    for _, row in combined_df.iterrows():
        try:
            # Augment client_statement
            aug_client = aug.augment(row['client_statement'])
            if isinstance(aug_client, list):
                aug_client = " ".join(aug_client)
            elif isinstance(aug_client, np.ndarray):
                aug_client = " ".join(aug_client.astype(str))

            # Augment response
            aug_response = aug.augment(row['response'])
            if isinstance(aug_response, list):
                aug_response = " ".join(aug_response)
            elif isinstance(aug_response, np.ndarray):
                aug_response = " ".join(aug_response.astype(str))

            augmented_rows.append({
                'client_statement': aug_client,
                'response': aug_response
            })
        except Exception as e:
            print(f"Augmentation failed on row: {row}, error: {e}")
            continue

    # -- 2.2) Combine Original + Augmented --
    augmented_df = pd.DataFrame(augmented_rows)

    # Force to string again to avoid any leftover arrays
    augmented_df['client_statement'] = augmented_df['client_statement'].astype(str)
    augmented_df['response'] = augmented_df['response'].astype(str)

    final_df = pd.concat([combined_df, augmented_df], ignore_index=True)
    print(f"Final shape after augmentation => {final_df.shape}")

    # -- 2.3) Drop duplicates again after augmentation --
    final_df.drop_duplicates(inplace=True)
    print(f"Final shape after dropping duplicates => {final_df.shape}")

    # -- 3) Train-Validation Split --
    # If you have extremely short or long responses, filter them out
    # before splitting. For demonstration, let's do it here:
    final_df = final_df[
        (final_df['client_statement'].str.len() > 10) &
        (final_df['response'].str.len().between(15, 300))
    ]
    print(f"Final shape after length filtering => {final_df.shape}")

    train_df, val_df = train_test_split(
        final_df,
        test_size=0.15,
        random_state=42,
        # Stratify based on length for better distribution
        stratify=pd.qcut(final_df['response'].str.len(), q=4, labels=False)
    )

    train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
    val_dataset   = Dataset.from_pandas(val_df.reset_index(drop=True))

    return train_dataset, val_dataset


# ============== Create the Dataset ==============
train_dataset, val_dataset = create_dataset()
print(f"Train size: {len(train_dataset)}, Validation size: {len(val_dataset)}")


Loaded CBT-Bench subset 'qa_test' with shape: (220, 7)
Columns found: ['id', 'question', 'a', 'b', 'c', 'd', 'e']
Loaded CBT-Bench subset 'qa_seed' with shape: (47, 7)
Columns found: ['id', 'question', 'a', 'b', 'c', 'd', 'e']
Loaded CBT-Bench subset 'distortions_test' with shape: (146, 5)
Columns found: ['id', 'ori_text', 'situation', 'thoughts', 'distortions']
Loaded CBT-Bench subset 'distortions_seed' with shape: (20, 5)
Columns found: ['id', 'ori_text', 'situation', 'thoughts', 'distortions']
Loaded CBT-Bench subset 'core_major_test' with shape: (184, 5)
Columns found: ['id', 'ori_text', 'situation', 'thoughts', 'core_belief_major']
Loaded CBT-Bench subset 'core_major_seed' with shape: (20, 5)
Columns found: ['id', 'ori_text', 'situation', 'thoughts', 'core_belief_major']
Loaded CBT-Bench subset 'core_fine_test' with shape: (112, 5)
Columns found: ['id', 'ori_text', 'situation', 'thoughts', 'core_belief_fine_grained']
Loaded CBT-Bench subset 'core_fine_seed' with shape: (20, 5)
Col

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Final shape after augmentation => (7042, 2)
Final shape after dropping duplicates => (7042, 2)
Final shape after length filtering => (1658, 2)
Train size: 1409, Validation size: 249


In [33]:
import pandas as pd
from google.colab import files

# Convert Hugging Face datasets to Pandas DataFrames
train_df = pd.DataFrame(train_dataset)
val_df = pd.DataFrame(val_dataset)

# Save train dataset as CSV
train_df.to_csv("train_dataset.csv", index=False)
files.download("train_dataset.csv")  # Download in Colab

# Save validation dataset as CSV
val_df.to_csv("val_dataset.csv", index=False)
files.download("val_dataset.csv")  # Download in Colab

print("Train and validation datasets saved and ready for download!")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Train and validation datasets saved and ready for download!


In [98]:
# ==============================
#  Step 2: Model Initialization
# ==============================
model_name = "t5-large"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

def tokenize_fn(examples):
    """
    Tokenize function for the Trainer.
    Prefix prompts with 'cbt response:' to encourage consistent style.
    """
    inputs = ["cbt response: " + text for text in examples['client_statement']]
    model_inputs = tokenizer(
        inputs,
        max_length=128,
        truncation=True,
        padding='max_length'
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples['response'],
            max_length=128,
            truncation=True,
            padding='max_length'
        ).input_ids

    model_inputs['labels'] = labels
    return model_inputs

# Map the tokenization over the datasets
train_dataset = train_dataset.map(tokenize_fn, batched=True)
val_dataset = val_dataset.map(tokenize_fn, batched=True)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/1409 [00:00<?, ? examples/s]

  """


Map:   0%|          | 0/249 [00:00<?, ? examples/s]

In [99]:
# ==============================
#  Step 3: Model Fine-Tuning
# ==============================
training_args = TrainingArguments(
    output_dir="./cbt-therapist-checkpoints",
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16=True,  # Mixed precision training - suitable for T4 GPU
    report_to="none",
    gradient_accumulation_steps=2,
    logging_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

  loaded_dict = json.loads(passed_value)


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 20.12 MiB is free. Process 2364 has 14.72 GiB memory in use. Of the allocated memory 14.54 GiB is allocated by PyTorch, and 42.29 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [48]:
trainer.evaluate()

{'eval_loss': 0.47670862078666687,
 'eval_runtime': 2.0409,
 'eval_samples_per_second': 122.006,
 'eval_steps_per_second': 15.679,
 'epoch': 9.892655367231638}

In [93]:
# ==============================
#  Step 4: Prompt Engineering
# ==============================
def generate_response(prompt, max_length=128):
    """
    Generate a response from the fine-tuned T5 model.
    Includes a basic safety filter example.
    """
    input_text = f"cbt response: {prompt}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(model.device)

    outputs = model.generate(
        input_ids,
        max_length=max_length,
        temperature=0.7,
        top_k=40,
        top_p=0.90,
        repetition_penalty=2.5,
        num_beams=3,
        early_stopping=True,
        no_repeat_ngram_size=2,
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Example safety filter
    if any(word in response.lower() for word in ["diagnose", "prescribe", "medical"]):
        return "I recommend discussing this with a licensed professional."

    return response

In [50]:
# ==============================
#  Step 5: Evaluation Metrics
# ==============================
!pip install rouge_score
rouge = evaluate.load('rouge')
bleu = evaluate.load('bleu')

def evaluate_model(model, tokenizer, dataset, num_samples=100):
    """
    Evaluates the model using ROUGE, BLEU, and Perplexity metrics on a small sample.
    """
    # Evaluate perplexity
    eval_results = trainer.evaluate(dataset.select(range(min(num_samples, len(dataset)))))
    perplexity = np.exp(eval_results['eval_loss'])

    # Text generation metrics
    references = []
    predictions = []

    sampled_dataset = dataset.select(range(min(num_samples, len(dataset))))
    for example in sampled_dataset:
        prompt = example['client_statement']
        reference = example['response']

        prediction = generate_response(prompt)
        references.append([reference])
        predictions.append(prediction)

    rouge_scores = rouge.compute(
        predictions=predictions,
        references=references,
        use_stemmer=True
    )

    bleu_scores = bleu.compute(
        predictions=predictions,
        references=references
    )

    return {
        'perplexity': perplexity,
        'rouge1': rouge_scores['rouge1'],
        'rougeL': rouge_scores['rougeL'],
        'bleu': bleu_scores['bleu']
    }

# Run evaluation on the validation dataset
metrics = evaluate_model(model, tokenizer, val_dataset, num_samples=100)
print(f"""
Evaluation Results:
- Perplexity: {metrics['perplexity']:.2f}
- ROUGE-1: {metrics['rouge1']:.2f}
- ROUGE-L: {metrics['rougeL']:.2f}
- BLEU: {metrics['bleu']:.2f}
""")



  if self.temperature is not None and self.temperature != 1.0:
  if self.top_p is not None and self.top_p != 1.0:
  ):  # contrastive search uses top_k



Evaluation Results:
- Perplexity: 1.59
- ROUGE-1: 0.12
- ROUGE-L: 0.11
- BLEU: 0.02



In [97]:
def generate_response(prompt, max_length=128):
    # Insert prefix if the model was trained that way
    input_text = f"cbt response: {prompt}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(model.device)

    outputs = model.generate(
    input_ids,
    max_length=128,
    num_beams=5,
    do_sample=False,      # or True if you want sampling, but that can lower exact overlap
    early_stopping=True,
    no_repeat_ngram_size=2
)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

test_prompt = "What should I do if I'm feeling anxious?"
answer = generate_response(test_prompt)
print("Prompt:", test_prompt)
print("Generated response:", answer)

Prompt: What should I do if I'm feeling anxious?
Generated response: ['helpless' 'unlovable']
