In [1]:
!nvidia-smi

Tue Mar 29 21:52:37 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 470.42.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro RTX 6000     Off  | 00000000:1B:00.0 Off |                  Off |
| 33%   33C    P8    26W / 260W |      0MiB / 24220MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import sys
if 'google.colab' in sys.modules:
    !pip install -Uqq transformers datasets wandb bitsandbytes-cuda111 rouge_score

## Setup

In [3]:
import os
from pathlib import Path
import random
import pandas as pd
from IPython.display import display, HTML

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from transformers.trainer_pt_utils import get_parameter_names
from datasets import DatasetDict, Dataset, load_metric

import bitsandbytes as bnb

Training hyperparameters

In [4]:
# model
model_id = "EleutherAI/gpt-neo-1.3B"
output_dir = "./gpt-neo-therapist"
# data
max_length = 1024
bs = 2
val_bs = bs*2
eff_bs = 256
# training
lr = 8e-5

# Data

## Get data

In [5]:
!mkdir ./data && cd data && wget https://raw.githubusercontent.com/nbertagnolli/counsel-chat/master/data/20200325_counsel_chat.csv
!ls -hl data

mkdir: cannot create directory ‘./data’: File exists
total 3.2M
-rw-r--r-- 1 root root 3.2M Mar 29 20:19 20200325_counsel_chat.csv


## EDA

In [6]:
df = pd.read_csv("data/20200325_counsel_chat.csv", index_col=0)

In [7]:
n_qs = len(df.questionID.unique())
n_topics = len(df.topic.unique())
print(f"Total number of samples {df.shape[0]}, {n_qs} unique questions on {n_topics} topics")

Total number of samples 2129, 815 unique questions on 31 topics


In [8]:
# standardise spaces
df["questionTitle"] = df.questionTitle.map(lambda x: " ".join(x.split()))
df["questionText"] = df.questionText.map(lambda x: " ".join(x.split()))
df["answerText"] = df.answerText.map(lambda x: " ".join(x.split()))

def mb_add_period(text):
    if text[-1] not in {"?", ".", "!"}:
        return text + "."
    return text

df["questionTitle"] = df.questionTitle.map(mb_add_period)
assert (df.questionTitle.str.endswith("?") | df.questionTitle.str.endswith(".") | df.questionTitle.str.endswith("!")).all()

df["prompt"] = "Answer like a therapist:\n" + df.questionTitle + " " + df.questionText + "\nAnswer: "
df["fullText"] = df.prompt + df.answerText
df.head(3)

Unnamed: 0,questionID,questionTitle,questionText,questionLink,topic,therapistInfo,therapistURL,answerText,upvotes,views,split,prompt,fullText
0,0,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,https://counselchat.com/questions/can-i-change...,depression,"Sherry Katz, LCSWCouples and Family Therapist,...",https://counselchat.com/therapists/sherry-katz...,"If everyone thinks you're worthless, then mayb...",1,2899,train,Answer like a therapist:\nCan I change my feel...,Answer like a therapist:\nCan I change my feel...
1,0,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,https://counselchat.com/questions/can-i-change...,depression,"Robin Landwehr, DBH, LPCC, NCCMental Health in...",https://counselchat.com/therapists/robin-landw...,"Hello, and thank you for your question and see...",1,3514,train,Answer like a therapist:\nCan I change my feel...,Answer like a therapist:\nCan I change my feel...
2,0,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,https://counselchat.com/questions/can-i-change...,depression,Lee KingI use an integrative approach to treat...,https://counselchat.com/therapists/lee-king,First thing I'd suggest is getting the sleep y...,0,5,train,Answer like a therapist:\nCan I change my feel...,Answer like a therapist:\nCan I change my feel...


Let's compute prompt and answer length length in tokens:

In [9]:
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")

def get_length(text):
    return len(tokenizer(text)["input_ids"])

df["prompt_length"] = df.prompt.map(get_length)
df["answer_length"] = df.answerText.map(get_length)
df["full_length"] = df.fullText.map(get_length)

In [10]:
df.describe()

Unnamed: 0,questionID,upvotes,views,prompt_length,answer_length,full_length
count,2129.0,2129.0,2129.0,2129.0,2129.0,2129.0
mean,346.854861,0.489901,198.604979,85.186473,204.780648,288.916862
std,273.706241,0.942429,300.31428,55.650304,151.817316,165.131834
min,0.0,0.0,2.0,23.0,2.0,50.0
25%,78.0,0.0,58.0,54.0,106.0,179.0
50%,321.0,0.0,107.0,75.0,164.0,248.0
75%,588.0,1.0,210.0,102.0,252.0,342.0
max,884.0,9.0,3514.0,669.0,1108.0,1209.0


In [11]:
for name, group in df.groupby("split"):
    print(f"{name} split contains {len(group)} samples ({len(group.questionID.unique())} unique questions)")

test split contains 117 samples (39 unique questions)
train split contains 1839 samples (695 unique questions)
val split contains 173 samples (81 unique questions)


## Dataset prep

In [12]:
df.rename(columns={"answerText":"answer"}, inplace=True)
dataset = DatasetDict(**{
    k: Dataset.from_pandas(df.loc[df.split==k,["prompt", "answer", "topic"]]) for k in df.split.unique()
})

In [13]:
import random
import pandas as pd
from IPython.display import display, HTML
from pprint import pprint

def display_examples(dataset, num_examples=5, mode="pprint"):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    idx = random.sample(list(range(len(dataset))), num_examples)
    texts = [f'{sample["prompt"]}{sample["answer"]}' for sample in dataset.select(idx)]
    df = pd.DataFrame({"text":texts})
    
    if mode=="df":
        display(HTML(df.to_html()))
    elif mode=="pprint":
        for text in texts:
            pprint(text)
            print()
    else:
        raise ValueError(f"{mode} mode is not supported. Please select one of ['df' 'pprint']")

In [14]:
display_examples(dataset["train"])

('Answer like a therapist:\n'
 "Is it normal to go into therapy feeling nervous? I've gone to a couple "
 'therapy sessions so far and still everytime I walk in I get nervous and '
 'shaky. Is this normal? Should I still be feeling like this?\n'
 "Answer: of I would love to know a little bit more about what's going on in "
 'your life but I will attempt an answer.Yes, you could still be shaky and '
 'nervous going to therapy. This therapy thing your doing is sometimes scary. '
 'First, because your opening up things that you might have never wanted to. '
 'Second, your still building a relationship with this therapist person. You '
 'may never get over that. The therapist really can\'t be your "friend". They '
 'are there to push the buttons that you might not want pushed and help you '
 'heal. That in itself is scary and can make you anxious. Third, you really '
 'never know where this therapy thing will go. Yes, there are goals. But '
 'sometimes side roads need to be taken and somet

# Training

In [15]:
import wandb

%env WANDB_ENTITY = arampacha
wandb_entity = os.environ["WANDB_ENTITY"]

%env WANDB_PROJECT = ai-therapist
wandb_project = os.environ["WANDB_PROJECT"]

%env WANDB_LOG_MODEL = false
%env WANDB_WATCH = false

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
env: WANDB_ENTITY=arampacha
env: WANDB_PROJECT=ai-therapist
env: WANDB_LOG_MODEL=false
env: WANDB_WATCH=false


In [16]:
%env TOKENIZERS_PARALLELISM=true

env: TOKENIZERS_PARALLELISM=true


In [17]:
%%capture
if 'google.colab' in sys.modules:
    !curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
    !apt-get install git-lfs -y

In [18]:
!git lfs install

Error: Failed to call git rev-parse --git-dir: exit status 128 
Git LFS initialized.


In [19]:
from huggingface_hub import Repository, notebook_login

# notebook_login()

In [20]:
if not os.path.exists(output_dir):
    repo = Repository(local_dir="./gpt-neo-therapist", clone_from='arampacha/gpt-neo-therapist')

## Data preprocessing


In [21]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

In [22]:
def tokenize(batch):
    return tokenizer(batch['prompt'], batch["answer"], return_token_type_ids=True, verbose=False, return_length=True, truncation=True, max_length=max_length)

Tokenize the texts in dataset dataset:

In [23]:
column_names = dataset["train"].column_names

dataset = dataset.map(lambda x: {"answer":x["answer"]+tokenizer.eos_token}, batched=False)
tokenized_dataset = dataset.map(tokenize, batched=True, batch_size=100, remove_columns=column_names)

  0%|          | 0/1839 [00:00<?, ?ex/s]

  0%|          | 0/173 [00:00<?, ?ex/s]

  0%|          | 0/117 [00:00<?, ?ex/s]

  0%|          | 0/19 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

## DataCollator

In [24]:
from dataclasses import dataclass
from transformers import PreTrainedTokenizerBase, BatchEncoding
from typing import List, Dict, Union, Optional

@dataclass
class DataCollatorForPromptGeneration:
    """
    Data collator used for line-by-line causal language modeling. Inputs are 
    dynamically padded to the maximum length of a batch if theyare not all of 
    the same length. The labels are constructed according to `toke_type_ids` 
    setting `label=-100` where `token_type_ids == 0` which corresponds to prompt. 

    Args:
        tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
            The tokenizer used for encoding the data.
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
    """

    tokenizer: PreTrainedTokenizerBase
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(
        self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        batch = self.tokenizer.pad(examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of, max_length=self.max_length)
        
        labels = torch.where(batch["token_type_ids"].bool(), batch["input_ids"].clone(), torch.tensor(-100))
        batch["labels"] = labels
        return batch

In [25]:
data_collator = DataCollatorForPromptGeneration(tokenizer=tokenizer)

In [26]:
# data_collator([tokenized_dataset["train"][i] for i in range(8)])["input_ids"]

## Trainer

In [27]:
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    group_by_length=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=val_bs,
    gradient_accumulation_steps=eff_bs//bs,
    gradient_checkpointing=True,
    learning_rate=lr,
    weight_decay=0.01,
    adam_beta1=0.9,
    adam_beta2=0.98,
    adam_epsilon=1e-08,
    num_train_epochs=5,
    max_steps=-1,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    logging_strategy="steps",
    logging_steps=5,
    save_strategy="epoch",
    save_total_limit=1,
    seed=24,
    fp16=torch.cuda.is_available(),
    dataloader_drop_last=False,
    dataloader_num_workers=4,
    load_best_model_at_end=True,
    report_to="all",
    run_name="gpt-neo-13b"
)

In [28]:
# setting `use_cache=False because it's not compatible with gradient checkpointing
model = AutoModelForCausalLM.from_pretrained(model_id, use_cache=False, low_cpu_mem_usage=False)

In [29]:
decay_parameters = get_parameter_names(model, [torch.nn.LayerNorm])
decay_parameters = [name for name in decay_parameters if "bias" not in name]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if n in decay_parameters],
        "weight_decay": training_args.weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters() if n not in decay_parameters],
        "weight_decay": 0.0,
    },
]
optimizer = bnb.optim.Adam8bit(
    params=optimizer_grouped_parameters,
    lr=training_args.learning_rate,
    betas=(training_args.adam_beta1, training_args.adam_beta2),
    eps=training_args.adam_epsilon,
)

In [32]:
import nltk
import numpy as np
nltk.download('punkt')

rouge_metric = load_metric("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Remove prompt from predictions and labels.
    predictions = np.where(labels != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    # Add mean generated length
    # prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    # result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

def preprocess_logits_for_metrics(logits, labels):
    return logits.argmax(-1)

In [30]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator, 
    train_dataset=tokenized_dataset["train"], 
    eval_dataset=tokenized_dataset["val"],
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    tokenizer=tokenizer,
    optimizers=(optimizer, None)
)

Using amp half precision backend


In [33]:
out = trainer.train()

The following columns in the training set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: length. If length are not expected by `GPTNeoForCausalLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1839
  Num Epochs = 5
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 128
  Total optimization steps = 35
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33marampacha[0m (use `wandb login --relogin` to force relogin)


Epoch,Training Loss,Validation Loss
0,8.6724,6.243495
1,6.2392,3.695696
2,3.1567,2.929911
3,2.925,2.786297
4,2.6395,2.768935


The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: length. If length are not expected by `GPTNeoForCausalLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 173
  Batch size = 4
Saving model checkpoint to ./gpt-neo-therapist/checkpoint-7
Configuration saved in ./gpt-neo-therapist/checkpoint-7/config.json
Model weights saved in ./gpt-neo-therapist/checkpoint-7/pytorch_model.bin
tokenizer config file saved in ./gpt-neo-therapist/checkpoint-7/tokenizer_config.json
Special tokens file saved in ./gpt-neo-therapist/checkpoint-7/special_tokens_map.json
Deleting older checkpoint [gpt-neo-therapist/checkpoint-28] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: length. If length are not expected by `GPTNeoForCausalLM.forward`,  you can safely ignore this

In [34]:
push_to_hub = False

if push_to_hub:
    trainer.push_to_hub()
else:
    trainer.save_model()
    trainer.create_model_card("gpt-neo-therapist")

Saving model checkpoint to ./gpt-neo-therapist
Configuration saved in ./gpt-neo-therapist/config.json
Model weights saved in ./gpt-neo-therapist/pytorch_model.bin
tokenizer config file saved in ./gpt-neo-therapist/tokenizer_config.json
Special tokens file saved in ./gpt-neo-therapist/special_tokens_map.json
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}


# Inference

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [35]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
model = AutoModelForCausalLM.from_pretrained("./tmp", use_cache=True).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [36]:
df = pd.read_csv("data/20200325_counsel_chat.csv")
df = df[df.split=="test"]

In [38]:
import random

def generate_one(df):
    i = random.randint(0, len(df)-1)
    sample = df.iloc[i, :]

    prompt_text = f"Answer like a therapist:\n {sample.questionTitle} {sample.questionText}\nAnswer: "
    ref_answer = sample.answerText
    prompt = tokenizer(prompt_text, return_tensors="pt")["input_ids"]

    outputs = model.generate(
        prompt.to(device),
        min_length=None,
        max_length=prompt.size(1) + 400, 
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        top_p=0.92,
        top_k=0,
        temperature=1.,
        num_return_sequences=1,
        repetition_penalty=1.,
    )

    print("PROMPT:")
    print(prompt_text)
    print("\nGENERATED ANSWER:")
    print(tokenizer.decode(outputs[0, prompt.size(1):], skip_special_tokens=True))
    print("\nREFERENCE ANSWER:")
    print(ref_answer)

In [39]:
generate_one(df)

PROMPT:
Answer like a therapist:
 Do I leave my cheating husband or share him with someone? I've been with my husband for eight years now. We have split twice before, and the first time was because he cheated. I took him back months later, and he really tried making it up to me by making a lot of changes. However, we continued to have issues because of my lack of trust. My insecurities and trust issues lead to physical abuse, which lead to us separating again. During that separation, he consoled himself by talking to the same girl he cheated on me with. But we then ended up back together and worked it out for a while until I got pregnant with our second child.
   The baby was a few months old, and he confessed to me about his secret relationship with her. He told me how he could never stop talking to her and how, during our issues, she has been and is the only women he's gone behind my back with (but on a friendship level because she's miles away). He confessed how he fought feelings f

In [40]:
generate_one(df)

PROMPT:
Answer like a therapist:
 Is it normal to cry during therapy? I start counseling/therapy in a few days (I'm freaking out) but my main fear is that I'll cry and embarrass myself, is it something to worry about?
Answer: 

GENERATED ANSWER:
 Crying is a normal response when you are upset and you don't know why. There are many reasons for crying, but first check with your therapist to find out why you are crying. If it is another problem then that will lead to your talking about it with a therapist. If it is not a problem then that will help open your mind to talk about whatever is causing your discomfort. Many times, we can control the frequency of our crying to how often we have cried to, but there are also other variables that can influence this such as the length of your therapy. You can also be supportive with your therapist by saying that you feel safe and not embarrassed when you cry during therapy. Also, because you trust your therapist to be your trusted friend and not jud

In [42]:
for i in range(10):
    print("-"*100)
    generate_one(df)

----------------------------------------------------------------------------------------------------
PROMPT:
Answer like a therapist:
 I am 18 years old with a baby on the way Me and the father of my child have been dating for a year. We had a big argument and at first he wanted to work it out. But with time he started asking others and they told him to leave me. Now he wants me to wait on him to mature. He wants to talk to other people. Then come back in four years. He doesn't want to deal with me while I'm pregnant. He doesn't take me to the doctor or anything. I feel alone. How do I handle this situation?
Answer: 

GENERATED ANSWER:
There is nothing to be done. All relationships take time and it is difficult to find someone to whom you are mature, emotionally and mentally, to be emotionally mature. You seem to want him to mature with your baby on the way. That is impossible. Not everything we are in is not possible. In this situation, it may not be possible to talk to him about anyt

In [43]:
for i in range(10):
    print("-"*100)
    generate_one(df)

----------------------------------------------------------------------------------------------------
PROMPT:
Answer like a therapist:
 Is it normal to go into therapy feeling nervous? I've gone to a couple therapy sessions so far and still everytime I walk in I get nervous and shaky. Is this normal? Should I still be feeling like this?
Answer: 

GENERATED ANSWER:
It is normal for people to feel nervous and shaky during sessions. Nervousness is part of life and can be uncomfortable. However, it is not a signal that someone is at risk for committing a crime. If someone feels they are at risk for committing a crime and anxiety is the reason, then it could simply be anxiety about doing things they enjoy or experiencing the feeling of being able to do certain things. This is simply a feeling and does not need to be a problem. If someone believes they are a danger and anxiety is the problem, then it may not be a problem at all, especially if their therapist can find a way to help them focus 