In [2]:
# %pip install google-cloud-aiplatform==1.25.0
# %pip install google-api-core==1.33.1

In [1]:
import os
import vertexai
from vertexai.preview.language_models import TextGenerationModel
import pandas as pd
from PyPDF2 import PdfReader
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

import warnings
warnings.filterwarnings("ignore")

In [2]:
def pdf_parse(pdf_file):
    with open(pdf_file, 'rb') as pdf:
        reader = PdfReader(pdf, strict=False)
        pdf_text = []
        for page in reader.pages:
            content = page.extract_text()
            pdf_text.append(content)
        return " ".join(pdf_text)
    
def predict_large_language_model_sample(
    project_id: str,
    model_name: str,
    temperature: float,
    max_output_tokens: int,
    top_p: float,
    top_k: int,
    content: str,
    location: str = "us-central1",
    tuned_model_name: str = "",
    ) :
    
    vertexai.init(project=project_id, location=location)
    model = TextGenerationModel.from_pretrained(model_name)
    if tuned_model_name:
        model = model.get_tuned_model(tuned_model_name)
    response = model.predict(
        content,
        temperature=temperature,
        max_output_tokens=max_output_tokens,
        top_k=top_k,
        top_p=top_p,)
    return response.text

In [3]:
def convo_df(text):
    exchanges = [exchange.strip() for exchange in text.split('\n\n') if exchange.strip()]
    zen_responses = []
    client_responses = []
    for exchange in exchanges:
        if exchange.startswith("**Zen:**"):
            zen_responses.append(exchange.replace('**Zen:**', '').strip())
        elif exchange.startswith("**Client:**"):
            client_responses.append(exchange.replace('**Client:**', '').strip())

    max_length = max(len(zen_responses), len(client_responses))
    zen_responses.extend([''] * (max_length - len(zen_responses)))
    client_responses.extend([''] * (max_length - len(client_responses)))

    df = pd.DataFrame({'Client': client_responses,'Zen': zen_responses, })
    return df

In [154]:
path = 'Alexander Street/Batch 4/'
all_files = os.listdir(path)
pdf_files = [filename for filename in all_files if filename.lower().endswith('.pdf')]
source = [os.path.splitext(pdf_file)[0] for pdf_file in pdf_files]

In [102]:
prompt = """
            ### Prompt: Using the sample therapy transcript below, generate a synthetic therapy transcript\n
            between a mobile based AI therapist Zen and a client. Understand the client's struggles from the sample\n
            and then make Zen utilize state-of-the-art therapeutic techniques such as motivational interviewing.\n
            Zen should be empathetic and a great listener.\n
            The flow of the transcript needs to be mobile friendly and engaging.\n
            
            Use the format "**Client:**" and "**Zen:**"\n
            ----------------------------------------------------------------------------
            ### Transcript:
        """

In [155]:
df = pd.DataFrame()
for i in range(len(source)):
    src = source[i]
    conv_id = i+1
    transcript = pdf_parse(path+src+'.pdf')
    text = predict_large_language_model_sample(project_id = "cloud-lab-ff59", 
                                    model_name = "text-bison", 
                                    temperature = 0.4, 
                                    max_output_tokens = 1024,
                                    top_p = 0.8, 
                                    top_k = 40, 
                                    location = "us-central1",
                                    content = prompt + transcript)
    temp = convo_df(text)
    temp['conv_id'] = conv_id
    temp['source'] = src
    temp = temp[['conv_id','Client', 'Zen', 'source']]
    
    df = df.append(temp)

In [156]:
df.to_csv("df-batch-4.csv")

In [157]:
df_1 = pd.read_csv("df-batch-1.csv")
df_2 = pd.read_csv("df-batch-2.csv")
df_3 = pd.read_csv("df-batch-3.csv")
df_3 = pd.read_csv("df-batch-4.csv")
df = pd.concat([df_1, df_2, df_3])
source_to_number = {}
unique_sources = df['source'].unique()
for i, source in enumerate(unique_sources):
    source_to_number[source] = i + 1
df['conv_id'] = df['source'].map(source_to_number)

In [158]:
df.drop("Unnamed: 0", axis = 1, inplace = True)

In [159]:
df.shape

(483, 4)

In [160]:
len(df['source'].unique())

74

In [None]:
df = df.rename(columns={"Client": "USER", "Zen": "ASSISTANT"})

In [161]:
df.to_csv("processed/PALM_Alexander_Street.csv")

In [1]:
from tqdm.notebook import tqdm
import pandas as pd

In [8]:
from fastchat.conversation import get_conv_template, register_conv_template, Conversation, SeparatorStyle
SYSTEM_MSG = """Your name is ZenAI and you're a therapist. Please have a conversation with your patient and provide them with a helpful response to their concerns."""


In [9]:
try:
    register_conv_template(
        Conversation(
            name="ZenAI",
            system_message=SYSTEM_MSG,
            roles=("USER", "ASSISTANT"),
            sep_style=SeparatorStyle.ADD_COLON_TWO,
            sep=" ",
            sep2="</s>",
        )
    )
except AssertionError:
    pass

In [96]:
def get_df():
    csv_files = ["mental_health_chatbot_dataset.csv", "psychology-dataset.csv", "who_r_u.csv"]
    df = pd.DataFrame()

    for p in csv_files:
        df1 = pd.read_csv(f"../data/processed/{p}")[["human", "zen"]]
        df1 = df1.rename(columns={"human": "USER", "zen": "ASSISTANT"})
        df1 = df1.drop_duplicates(subset=["USER", "ASSISTANT"], keep="first", ignore_index=True)

        df1["USER"] = df1.USER.str.replace('\s+', ' ', regex=True)
        df1["USER"] = df1.USER.str.replace(r'\.([a-zA-Z0-9])', r'. \1', regex=True)
        df1["ASSISTANT"] = df1.ASSISTANT.str.replace('\s+', ' ', regex=True)
        df1["ASSISTANT"] = df1.ASSISTANT.str.replace(r'\.([a-zA-Z0-9])', r'. \1', regex=True)

        df = pd.concat([df, df1])

    df = df.sample(frac=1, random_state=42)
    return df


def get_conversations(df, reset):
    conv = get_conv_template("ZenAI")
    conversations = []
    
    conv.messages = []
    for index, row in df.iterrows():
        if reset:
            conv.messages = []
        conv.append_message("USER", row["USER"])
        conv.append_message("ASSISTANT", row["ASSISTANT"])
        conversations.append(conv.get_prompt())
    
    return conversations


def get_therapy_conv():
    df = pd.read_csv("../data/processed/PALM_Alexander_Street.csv")
    df = df.dropna(ignore_index=True)
    df = df.drop(index=0)
    df.reset_index(drop=True, inplace=True)
    
    conv_ids = df.conv_id.unique()
    conversations = []
    for c in conv_ids:
        conv_df = df[df.conv_id == c]
        conversations += get_conversations(conv_df, reset=False)
    
    return conversations

In [97]:
from sklearn.model_selection import train_test_split

In [98]:
df = get_df()
conversations = get_conversations(df, reset=True)
conversations += get_therapy_conv()

train, test = train_test_split(conversations, test_size=1000, random_state=42)

In [99]:
len(conversations), len(train), len(test)

(10905, 9905, 1000)

In [12]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    HfArgumentParser,
    PreTrainedTokenizer
)
from transformers.trainer_pt_utils import LabelSmoother

[2023-11-05 02:29:34,908] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [13]:
IGNORE_TOKEN_ID = LabelSmoother.ignore_index
model_id = "lmsys/vicuna-13b-v1.5"

In [14]:
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    model_max_length=1024,
    padding_side="right",
    use_fast=False,
)
tokenizer.pad_token = tokenizer.unk_token

In [15]:
import torch

def rank0_print(*args):
    print(*args)


In [100]:
conversations = conversations[0:1]

In [101]:
# Tokenize conversations
input_ids = tokenizer(
    conversations,
    return_tensors="pt",
    padding="max_length",
    max_length=tokenizer.model_max_length,
    truncation=True,
).input_ids
targets = input_ids.clone()

In [102]:
input_ids

tensor([[   1, 3575, 1024,  ...,    0,    0,    0]])

In [59]:
sources = [example["conversations"] for example in raw_data][0:1]

conv = get_conversation_template("vicuna")
roles = {"human": conv.roles[0], "gpt": conv.roles[1]}

# Apply prompt templates
conversations = []
for i, source in enumerate(sources):
    if roles[source[0]["from"]] != conv.roles[0]:
        # Skip the first one if it is not from human
        source = source[1:]

    conv.messages = []
    for j, sentence in enumerate(source):
        role = roles[sentence["from"]]
        assert role == conv.roles[j % 2], f"{i}"
        conv.append_message(role, sentence["value"])
    conversations.append(conv.get_prompt())

# Tokenize conversations
input_ids = tokenizer(
    conversations,
    return_tensors="pt",
    padding="max_length",
    max_length=tokenizer.model_max_length,
    truncation=True,
).input_ids
targets = input_ids.clone()

assert conv.sep_style == SeparatorStyle.ADD_COLON_TWO

In [60]:
targets[0] == input_ids[0]

tensor([True, True, True,  ..., True, True, True])

In [61]:
conv.sep + conv.roles[1] + ": "

' ASSISTANT: '

In [103]:
# Mask targets. Only compute loss on the assistant outputs.
sep = conv.sep + conv.roles[1] + ": "
for conversation, target in zip(conversations, targets):
    total_len = int(target.ne(tokenizer.pad_token_id).sum())

    turns = conversation.split(conv.sep2)
    cur_len = 1
    target[:cur_len] = IGNORE_TOKEN_ID
    for i, turn in enumerate(turns):
        if turn == "":
            break
        turn_len = len(tokenizer(turn).input_ids)

        parts = turn.split(sep)
        if len(parts) != 2:
            break
        parts[0] += sep
        # "-2" is hardcoded for the Llama tokenizer to make the offset correct.
        instruction_len = len(tokenizer(parts[0]).input_ids) - 2

        if i != 0 and not tokenizer.legacy:
            # The legacy and non-legacy modes handle special tokens differently
            instruction_len -= 1

        # Ignore the user instructions
        target[cur_len : cur_len + instruction_len] = IGNORE_TOKEN_ID
        cur_len += turn_len

        if i != 0 and not tokenizer.legacy:
            # The legacy and non-legacy modes handle special tokens differently
            cur_len -= 1

    target[cur_len:] = IGNORE_TOKEN_ID

    if False:  # Inspect and check the correctness of masking
        z = target.clone()
        z = torch.where(z == IGNORE_TOKEN_ID, tokenizer.unk_token_id, z)
        rank0_print(tokenizer.decode(z))
        exit()

    if cur_len < tokenizer.model_max_length:
        if cur_len != total_len:
            target[:] = IGNORE_TOKEN_ID
            rank0_print(
                f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
                f" #turn = {len(turns) - 1}. (ignored)"
            )

data = dict(
    input_ids=input_ids,
    labels=targets,
    attention_mask=input_ids.ne(tokenizer.pad_token_id),
)

In [111]:
parts[1]

"It can be difficult to come out to loved ones, but it's important to prioritize your own happiness and well-being. Let's work together to explore some techniques to improve your confidence and prepare for coming out. Have you considered seeking support groups or practicing self-care?"

In [109]:
z = target.clone()
z = torch.where(z == IGNORE_TOKEN_ID, tokenizer.unk_token_id, z)
rank0_print(tokenizer.decode(z))

<unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk> It can be difficult to come out to loved ones, but it's important to prioritize your own happiness and well-being. Let's work together to explore some techniques to improve your confidence and prepare for coming out. Have you considered seeking support groups or practicing self-care?</s><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><

In [108]:
total_len, cur_len

(124, 124)

In [106]:
tokenizer.model_max_length

1024

In [104]:
(target != -100).sum()

tensor(62)

In [94]:
turn.split(sep)

["Your name is ZenAI and you're a therapist. Please have a conversation with your patient and provide them with a helpful response to their concerns. USER: I am struggling with my sexuality and don't know how to come out to my family. ASSIATANT: It can be difficult to come out to loved ones, but it's important to prioritize your own happiness and well-being. Let's work together to explore some techniques to improve your confidence and prepare for coming out. Have you considered seeking support groups or practicing self-care?"]

In [89]:
(targets != -100).sum()

tensor(0)

In [71]:
turns = conversations[0].split(conv.sep2)
turns

["A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Who are you? ASSISTANT: I am Vicuna, a language model trained by researchers from Large Model Systems Organization (LMSYS).",
 'USER: Have a nice day! ASSISTANT: You too!',
 '']

In [74]:
turn = turns[0]
len(tokenizer(turn).input_ids)

67

In [76]:
parts = turn.split(sep)
parts

["A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Who are you?",
 'I am Vicuna, a language model trained by researchers from Large Model Systems Organization (LMSYS).']

In [78]:
parts[0] += sep

In [79]:
len(tokenizer(parts[0]).input_ids) - 2

42

In [80]:
cur_len

84

In [66]:
data['labels'].shape, data['input_ids'].shape, data['attention_mask'].shape

(torch.Size([1, 1024]), torch.Size([1, 1024]), torch.Size([1, 1024]))

In [69]:
conversations[0]

"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Who are you? ASSISTANT: I am Vicuna, a language model trained by researchers from Large Model Systems Organization (LMSYS).</s>USER: Have a nice day! ASSISTANT: You too!</s>"

In [67]:
data['attention_mask'].sum()

tensor(84)

In [56]:
data

{'input_ids': tensor([[   1, 3575, 1024,  ...,    0,    0,    0]]),
 'labels': tensor([[-100, -100, -100,  ..., -100, -100, -100]]),
 'attention_mask': tensor([[ True,  True,  True,  ..., False, False, False]])}

In [33]:
from fastchat.model.model_adapter import get_conversation_template

In [34]:
get_conversation_template('vicuna')

Conversation(name='vicuna_v1.1', system_template='{system_message}', system_message="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.", roles=('USER', 'ASSISTANT'), messages=[], offset=0, sep_style=<SeparatorStyle.ADD_COLON_TWO: 2>, sep=' ', sep2='</s>', stop_str=None, stop_token_ids=None)

In [39]:
import json

raw_data = json.load(open("../train/v2/dummy_conversation.json", "r"))
sources = [example["conversations"] for example in raw_data]
c, t = preprocess(sources, tokenizer)

In [43]:
len(c), len(t)

(500, 500)

In [44]:
c[0]

"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Who are you? ASSISTANT: I am Vicuna, a language model trained by researchers from Large Model Systems Organization (LMSYS).</s>USER: Have a nice day! ASSISTANT: You too!</s>"

In [48]:
conversations[0]

"Your name is ZenAI and you're a therapist. Please have a conversation with your patient and provide them with a helpful response to their concerns. USER: I am struggling with my sexuality and don't know how to come out to my family. ASSIATANT: It can be difficult to come out to loved ones, but it's important to prioritize your own happiness and well-being. Let's work together to explore some techniques to improve your confidence and prepare for coming out. Have you considered seeking support groups or practicing self-care?</s>"

In [21]:
# Mask targets. Only compute loss on the assistant outputs.
sep = " ASSISTANT: "
for conversation, target in zip(conversations, targets):
    total_len = int(target.ne(tokenizer.pad_token_id).sum())

    turns = conversation.split("</s>")
    cur_len = 1
    target[:cur_len] = IGNORE_TOKEN_ID
    for i, turn in enumerate(turns):
        if turn == "":
            break
        turn_len = len(tokenizer(turn).input_ids)

        parts = turn.split(sep)
        if len(parts) != 2:
            break
        parts[0] += sep
        # "-2" is hardcoded for the Llama tokenizer to make the offset correct.
        instruction_len = len(tokenizer(parts[0]).input_ids) - 2

        if i != 0 and not tokenizer.legacy:
            # The legacy and non-legacy modes handle special tokens differently
            instruction_len -= 1

        # Ignore the user instructions
        target[cur_len : cur_len + instruction_len] = IGNORE_TOKEN_ID
        cur_len += turn_len

        if i != 0 and not tokenizer.legacy:
            # The legacy and non-legacy modes handle special tokens differently
            cur_len -= 1

    target[cur_len:] = IGNORE_TOKEN_ID

    if True:  # Inspect and check the correctness of masking
        z = target.clone()
        z = torch.where(z == IGNORE_TOKEN_ID, tokenizer.unk_token_id, z)
        rank0_print(tokenizer.decode(z))
        exit()

#     if cur_len < tokenizer.model_max_length:
#         if cur_len != total_len:
#             target[:] = IGNORE_TOKEN_ID
#             rank0_print(
#                 f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
#                 f" #turn = {len(turns) - 1}. (ignored)"
#             )

data = dict(
    input_ids=input_ids,
    labels=targets,
    attention_mask=input_ids.ne(tokenizer.pad_token_id),
)

In [22]:
data

{'input_ids': tensor([[   1, 3575, 1024,  ...,    0,    0,    0]]),
 'labels': tensor([[-100, -100, -100,  ..., -100, -100, -100]]),
 'attention_mask': tensor([[ True,  True,  True,  ..., False, False, False]])}

In [23]:
z = target.clone()
z = torch.where(z == IGNORE_TOKEN_ID, tokenizer.unk_token_id, z)

In [25]:
z.sum()

tensor(0)

In [29]:
cur_len

1