<a href="https://colab.research.google.com/github/vlordier/colabs/blob/main/CounselChat_EDA_and_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import sys
if 'google.colab' in sys.modules:
    !pip install -Uqq transformers datasets

In [None]:
import random
import pandas as pd
from IPython.display import display, HTML

from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict

# Get the data

In [None]:
!mkdir ./data && cd data && wget https://raw.githubusercontent.com/nbertagnolli/counsel-chat/master/data/20200325_counsel_chat.csv

mkdir: cannot create directory ‘./data’: File exists


In [None]:
# Sanity check: list the contents of the data directory with human-readable sizes.
!ls -hl data

total 6.1M
-rw-r--r-- 1 root root 3.2M Mar 28 11:04 20200325_counsel_chat.csv
-rw-r--r-- 1 root root 2.9M Mar 28 11:04 counselchat-data.csv


# EDA

In [None]:
# Load the raw CounselChat export; index_col=0 uses the CSV's first column as the row index.
df = pd.read_csv("data/20200325_counsel_chat.csv", index_col=0)

In [None]:
# Headline numbers: rows, distinct question ids and distinct topics.
# dropna=False keeps the count identical to len(Series.unique()), which includes NaN.
n_qs = df["questionID"].nunique(dropna=False)
n_topics = df["topic"].nunique(dropna=False)
print(f"Total number of samples {df.shape[0]}, {n_qs} unique questions on {n_topics} topics")

Total number of samples 2129, 815 unique questions on 31 topics


In [None]:
# Standardise spaces: collapse every run of whitespace (including newlines and tabs)
# into a single space and trim both ends, for each free-text column.
for col in ("questionTitle", "questionText", "answerText"):
    df[col] = df[col].map(lambda x: " ".join(x.split()))

def mb_add_period(text):
    """Return ``text`` terminated by sentence punctuation, appending "." if needed.

    Args:
        text: the string to check.

    Returns:
        ``text`` unchanged when it already ends in "?", "." or "!", otherwise
        ``text + "."``. An empty string is returned unchanged: there is no
        sentence to terminate.
    """
    # Guard the empty string explicitly: indexing text[-1] would raise IndexError on it.
    if not text or text.endswith(("?", ".", "!")):
        return text
    return text + "."

# Make sure every title closes with terminal punctuation, then verify the invariant
# by checking the last character of each title.
df["questionTitle"] = df["questionTitle"].map(mb_add_period)
assert df["questionTitle"].str[-1:].isin(["?", ".", "!"]).all()

# prompt   = instruction line, then title and question body on one line, then the answer cue.
# fullText = prompt immediately followed by the therapist's answer.
df["prompt"] = (
    "Answer like a therapist:\n"
    + df["questionTitle"]
    + " "
    + df["questionText"]
    + "\nAnswer: "
)
df["fullText"] = df["prompt"] + df["answerText"]
df.head(3)

Unnamed: 0,questionID,questionTitle,questionText,questionLink,topic,therapistInfo,therapistURL,answerText,upvotes,views,split,prompt,fullText
0,0,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,https://counselchat.com/questions/can-i-change...,depression,"Sherry Katz, LCSWCouples and Family Therapist,...",https://counselchat.com/therapists/sherry-katz...,"If everyone thinks you're worthless, then mayb...",1,2899,train,Answer like a therapist:\nCan I change my feel...,Answer like a therapist:\nCan I change my feel...
1,0,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,https://counselchat.com/questions/can-i-change...,depression,"Robin Landwehr, DBH, LPCC, NCCMental Health in...",https://counselchat.com/therapists/robin-landw...,"Hello, and thank you for your question and see...",1,3514,train,Answer like a therapist:\nCan I change my feel...,Answer like a therapist:\nCan I change my feel...
2,0,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,https://counselchat.com/questions/can-i-change...,depression,Lee KingI use an integrative approach to treat...,https://counselchat.com/therapists/lee-king,First thing I'd suggest is getting the sleep y...,0,5,train,Answer like a therapist:\nCan I change my feel...,Answer like a therapist:\nCan I change my feel...


Let's compute the prompt and answer lengths in tokens:

In [None]:
# Load the tokenizer of the EleutherAI/gpt-neo-1.3B checkpoint; it is used below to
# measure text lengths in tokens.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")

def get_length(text):
    """Return the number of token ids the notebook-level ``tokenizer`` produces for ``text``."""
    encoded = tokenizer(text)
    return len(encoded["input_ids"])

# Token counts for the prompt, the answer and their concatenation (one new column each).
for length_col, text_col in (
    ("prompt_length", "prompt"),
    ("answer_length", "answerText"),
    ("full_length", "fullText"),
):
    df[length_col] = df[text_col].map(get_length)

In [None]:
# Summary statistics of the numeric columns, including the token-length columns added above.
df.describe()

Unnamed: 0,questionID,upvotes,views,prompt_length,answer_length,full_length
count,2129.0,2129.0,2129.0,2129.0,2129.0,2129.0
mean,346.854861,0.489901,198.604979,84.918741,204.780648,288.649131
std,273.706241,0.942429,300.31428,55.576545,151.817316,165.083026
min,0.0,0.0,2.0,23.0,2.0,50.0
25%,78.0,0.0,58.0,53.0,106.0,178.0
50%,321.0,0.0,107.0,75.0,164.0,248.0
75%,588.0,1.0,210.0,102.0,252.0,342.0
max,884.0,9.0,3514.0,668.0,1108.0,1208.0


In [None]:
# Report, for each value of the "split" column, the number of rows and of distinct question ids.
for name, group in df.groupby("split"):
    n_samples = len(group)
    n_questions = group["questionID"].nunique(dropna=False)
    print(f"{name} split contains {n_samples} samples ({n_questions} unique questions)")

test split contains 117 samples (39 unique questions)
train split contains 1839 samples (695 unique questions)
val split contains 173 samples (81 unique questions)


# Dataset

In [None]:
# Expose the answer text under the shorter column name "answer". Rebinding instead of
# inplace=True keeps the cell safe to re-run (rename ignores labels that are already gone).
df = df.rename(columns={"answerText": "answer"})

# Build one Dataset per value of the "split" column, keeping only prompt, answer and topic.
# preserve_index=False: the boolean filter leaves a non-contiguous index, which
# Dataset.from_pandas would otherwise store as an extra "__index_level_0__" column.
dataset = DatasetDict({
    split_name: Dataset.from_pandas(
        df.loc[df["split"] == split_name, ["prompt", "answer", "topic"]],
        preserve_index=False,
    )
    for split_name in df["split"].unique()
})

In [None]:
import random
import pandas as pd
from IPython.display import display, HTML
from pprint import pprint

def display_examples(dataset, num_examples=5, mode="pprint"):
    """Show a random selection of prompt+answer texts drawn from ``dataset``.

    Args:
        dataset: object supporting ``len()`` and ``.select(indices)``; the selected
            items must expose "prompt" and "answer" keys.
        num_examples: number of distinct examples to draw; must not exceed ``len(dataset)``.
        mode: "pprint" pretty-prints each text followed by a blank line;
            "df" renders the texts as a one-column HTML table.

    Raises:
        AssertionError: if ``num_examples`` is larger than the dataset.
        ValueError: if ``mode`` is neither "df" nor "pprint".
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."

    # Draw distinct row positions, then glue each prompt to its answer.
    chosen = random.sample(range(len(dataset)), num_examples)
    texts = []
    for sample in dataset.select(chosen):
        texts.append(f'{sample["prompt"]}{sample["answer"]}')

    if mode == "pprint":
        for text in texts:
            pprint(text)
            print()
        return

    if mode == "df":
        table = pd.DataFrame({"text": texts})
        display(HTML(table.to_html()))
        return

    raise ValueError(f"{mode} mode is not supported. Please select one of ['df' 'pprint']")

In [None]:
# Render five (the default) random training examples as an HTML table.
display_examples(dataset["train"], mode="df")

Unnamed: 0,text
0,"Answer like a therapist:\nHow can I not miss my boyfriend while he's in Ireland? My boyfriend is in Ireland for 11 days, and I am an emotional wreck.\nAnswer: It sounds like you and your boyfriend are very close. Do you typically spend most of your time together? If so, it may be important to reflect on how you feel when you are apart. If any separation is difficult, you may need to examine why. Think about what it is that you miss and what you are anxious, upset or worried about. If you examine the causes of your distress you likely will experience some relief. It is important to learn how to be happy when you are alone, it will only improve the way you feel when you are with your boyfriend."
1,Answer like a therapist:\nHow do I find myself? I don't know what to say. I have never really known who I am.\nAnswer: Therapy may be an effective way for you to get a stronger sense of who you are. A competent therapist will work to create a safe and curious therapeutic relationship in which you can explore your identity. There are also many different exercises which you can do in and out of therapy which you may find helpful in this area as well.
2,"Answer like a therapist:\nHow can I control my anxiety? I started having anxiety three months ago. I'm new to having anxiety, and it's making me depressed.\nAnswer: As a past sufferer of anxiety myself, I have learned that it is a natural part of life, it is a natural part of us, the longer we try to run from it the more it entangles us in its clutches, if we deny certain parts of ourselves we will become depressed or even oppressed, the only way to deal with anxiety is to embrace it and accept it, and in fighting the fight without fighting we will eventually win, I would encourage you to look into Acceptance and Commitment Therapy, your answer I think lies in that area of research. Hope this helps,C"
3,"Answer like a therapist:\nHow can I be happy as a stay-at-home mom? Over the years, I have slowly lost everything: my jobs, my cars, my freedom, and my money. I am a stay-at-home mom who doesn't make any money, doesn't have a car, never goes out, and cleans all day long everyday.\nAnswer: There are an infinite number of possibilities here. You included a lot of things that you don't have. Is there anyway that you can have some freedom for a few hours a week, at least? A lot of times stay at home moms have groups when they meet at different public places where their children can play together and they can talk together. I don't know how old your child or children is/are, but perhaps going back to work is something that can happen in the future.I wonder if you may consider who you have in your life who can support you. I don't know whether you have a spouse or parents or siblings who could help out with some different things. I wonder if you could consider where you would like to go if you had two or three hours that you could do whatever you wanted? If you can sort out some possible answers to that, maybe you can work together with friends or family to make it happen.Also, again depending on the age of your children, some parents can do things while the children are sleeping, even if you are in the same room. For example, if your children are sleeping for about two hours, maybe 30 minutes of that could be reserved for you and the rest could be for cleaning or other things you need to do."
4,"Answer like a therapist:\nHow do I fix my marriage? I have been married for 20 years. He and I both cheated. I hid my cheating for 14 years. Over that time, I let life stress me out. I became depressed. When I finally woke up, he felt I had pushed him out.\nAnswer: Do you both want to fix the marriage?The relationship belongs to both of you so that one working without the other cooperating in this work too, will have one person who does all the changing.This dynamic itself creates problems of its own.About your husband telling you he feels pushed out, did he do anything on his own to address his problem of feeling like this?All you state is that he blames you for creating a situation he didn't like.If he did nothing because he felt unsure what to do, anything which shows he understands he has as vital a part in the marriage as you, is ok.Because affairs create mistrust between two people, the two of you would also need to be very aware to regain each other's trust.Talking as much as possible so you both know what goes on in the life of the other, how you each are feeling, what matters to you, is helpful to grow a new foundation for your marriage's future."


In [None]:
# Same, using the default "pprint" mode: each example is pretty-printed as wrapped text.
display_examples(dataset["train"])

('Answer like a therapist:\n'
 'Why am I experiencing dfficulty maintaining an erection? A few years ago I '
 'was making love to my wife when for no known reason I lost my erection, Now '
 "I'm In my early 30s and my problem has become more and more frequent. This "
 "is causing major problems for my ego and it's diminishing my self esteem. "
 'This has resulted in ongoing depression and tearing apart my marriage. I am '
 'devastated and cannot find a cause for these issues. I am very attracted to '
 'my wife and want to express it in the bedroom like I used to. What could be '
 'causing this, and what can I do about it?\n'
 'Answer: Erection maintenance is caused by connective tissue weakness as well '
 'as lymphatic stagnation of the area and accompanying channels. A proper '
 'detoxification is necessary to reverse the stagnation and get things flowing '
 'again.Lower circulation and pituitary might also be indicated.Medical will '
 'likely not recommend anything useful in the long

Note: "prompt" and "answer" are kept as separate fields because this will be handy later for proper labeling: tokens corresponding to "prompt" will be ignored in the loss computation, since we are only interested in generating answers.

In [None]:
# Encode the first training example as a (prompt, answer) text pair and show the result.
# return_token_type_ids=True additionally requests the token_type_ids field —
# presumably to tell prompt tokens from answer tokens for the labeling described in the
# note above; TODO confirm the ids actually differ between the two segments for this tokenizer.
sample = dataset["train"][0]
tokenizer(sample["prompt"], sample["answer"], return_token_type_ids=True)

{'input_ids': [33706, 588, 257, 24636, 25, 198, 6090, 314, 1487, 616, 4203, 286, 852, 28063, 284, 2506, 30, 314, 1101, 1016, 832, 617, 1243, 351, 616, 7666, 290, 3589, 13, 314, 8523, 3993, 290, 314, 466, 2147, 475, 892, 546, 703, 314, 1101, 28063, 290, 703, 314, 6584, 470, 307, 994, 13, 314, 1053, 1239, 3088, 393, 39496, 7341, 13, 314, 1053, 1464, 2227, 284, 4259, 616, 2428, 11, 475, 314, 1239, 651, 1088, 284, 340, 13, 1374, 460, 314, 1487, 616, 4203, 286, 852, 28063, 284, 2506, 30, 198, 33706, 25, 220, 1532, 2506, 6834, 345, 821, 28063, 11, 788, 3863, 345, 761, 284, 1064, 649, 661, 284, 8181, 503, 351, 13, 42338, 11, 262, 1919, 4732, 287, 543, 257, 1048, 3160, 318, 257, 1263, 4588, 287, 2116, 12, 31869, 13, 48059, 11, 345, 460, 467, 2835, 290, 2835, 2111, 284, 1833, 1521, 345, 821, 407, 28063, 11, 788, 467, 736, 284, 262, 976, 4315, 290, 307, 13642, 866, 757, 13, 1858, 389, 867, 40840, 6218, 345, 460, 1064, 287, 1919, 2056, 13, 6674, 1100, 617, 286, 262, 3392, 543, 1181, 326, 645, 104