In [12]:
"""
- Convert html to markdown with basic data cleaning.
- Deduplication.

Usage:
python3 -m fastchat.data.clean_sharegpt --in sharegpt_html.json --out sharegpt_clean.json
"""
import argparse
from concurrent.futures import ProcessPoolExecutor
import json
import logging
import re
from typing import Dict, Union

import bs4
import markdownify  # == 0.11.6
from tqdm import tqdm


# Precompiled patterns used by the HTML-cleaning helpers. They are written as
# raw string literals so regex escapes such as \s and \d are not parsed as
# (invalid) Python string escapes.
div_pattern = re.compile(r"<div.*?>")
span_pattern = re.compile(r"<span.*?>")
# Group 1: text between the opening fence and the "Copy code" button label
# (the language name). Group 2: the code body up to the closing fence.
code_lang_pattern = re.compile(
    r"```\s*" + r"(.*?)" + r"(?:Copy code)+" + r"(.+?)" + r"\s*?```", re.DOTALL
)
# Replacement template for re.sub: "\\g<n>" is a group reference, "\n" is a
# real newline character.
code_lang_format = "```\\g<1>\n\\g<2>\n```"
regenerate_pattern = re.compile(r"\d+ / \d+")
copy_chars_pattern = re.compile(r"Copy\d+ chars / \d+ words")
copy_code_pattern = re.compile(r"```(.*?)Copy code\s*```")


def reformat_code(val: str) -> str:
    """Rewrite scraped code fences into regular markdown code fences.

    In the scraped text the opening fence is followed directly by the language
    name, the literal button label "Copy code" and then the code itself. Every
    such fence is replaced by one with the language on the fence line and the
    code on its own lines.
    """
    return code_lang_pattern.sub(code_lang_format, val)


def html_to_markdown(val: str) -> str:
    """Convert one HTML message into cleaned-up markdown text."""
    # Strip opening <div> and <span> tags before conversion: removing <div>
    # keeps indentation intact inside code blocks, removing <span> keeps
    # underscores intact inside code blocks.
    for tag_pattern in (div_pattern, span_pattern):
        val = tag_pattern.sub("", val)

    # HTML -> markdown, then repair the code fences.
    val = reformat_code(markdownify.markdownify(val).strip())

    # Drop a noisy "[number] / [number]" marker, but only when the text starts
    # with it.
    leading_noise = regenerate_pattern.match(val)
    if leading_noise:
        val = val[leading_noise.end() :]

    # Drop "Copy[number] chars / [number] words" noise, then code blocks that
    # contain nothing but the "Copy code" label.
    for noise_pattern in (copy_chars_pattern, copy_code_pattern):
        val = noise_pattern.sub("", val)

    # Collapse triple newlines and trim surrounding whitespace.
    return val.replace("\n\n\n", "\n").strip()


def contain_blocked_words(val: str) -> bool:
    """Return True when `val` mentions a blocked word, ignoring letter case."""
    lowered = val.lower()
    return any(word in lowered for word in ("openai", "chatgpt"))


def clean_html_one_sample(sample):
    """Clean one sample in place and report why it was rejected, if it was.

    Returns a `(sample, error_code)` tuple where the code is:
    0 = cleaned successfully, 1 = too short, 2 = roles do not alternate
    human/gpt, 3 = contains a blocked word, 4 = the HTML could not be parsed.
    """
    expected_roles = ("human", "gpt")

    def too_short():
        return len(sample["conversations"]) <= 1

    if too_short():
        return (sample, 1)

    # Drop a leading non-human turn so the dialogue starts with the human
    # (cases like https://sharegpt.com/c/VyaZlh4).
    if sample["conversations"][0]["from"] != "human":
        sample["conversations"] = sample["conversations"][1:]
    if too_short():
        return (sample, 1)

    # Drop a trailing human turn so the dialogue ends with a reply.
    if sample["conversations"][-1]["from"] == "human":
        sample["conversations"] = sample["conversations"][:-1]
    if too_short():
        return (sample, 1)

    for idx, turn in enumerate(sample["conversations"]):
        if turn["from"] != expected_roles[idx % 2]:
            return (sample, 2)

        if contain_blocked_words(turn["value"]):
            return (sample, 3)

        try:
            cleaned = html_to_markdown(turn["value"])
        except (bs4.builder.ParserRejectedMarkup, AssertionError):
            return (sample, 4)
        else:
            turn["value"] = cleaned

    return (sample, 0)


def clean_html_all(content):
    """
    Clean the source html files.

    Every sample is cleaned with `clean_html_one_sample` in a process pool.
    Samples that failed cleaning are dropped, as are duplicates: a sample is a
    duplicate when its id was already kept, or when an already-kept sample has
    the same second message and the same number of messages.

    Args:
        content: list of samples, each a dict with "id" and "conversations".

    Returns:
        The list of cleaned samples that were kept, in input order. A summary
        of the skip reasons is printed.
    """
    cnt_skip = 0
    cnt_blocked_words = 0
    cnt_wrong_format = 0
    cnt_parser_error = 0
    cnt_too_short = 0
    cnt_id_duplication = 0
    cnt_value_duplication = 0

    with ProcessPoolExecutor() as executor:
        processed = list(
            tqdm(executor.map(clean_html_one_sample, content), total=len(content))
        )

    # Maps both kept ids and kept (second message, length) keys to the id of
    # the sample that was kept first.
    visited = {}
    new_content = []
    for sample, error_code in tqdm(processed):
        cid = sample["id"]
        skipped = True

        if error_code != 0:
            if error_code == 1:
                print(f"id {cid} is too short")
                cnt_too_short += 1
            elif error_code == 2:
                print(f"id {cid} has a wrong format")
                cnt_wrong_format += 1
            elif error_code == 3:
                print(f"id {cid} contains blocked words")
                cnt_blocked_words += 1
            elif error_code == 4:
                print(f"id {cid} contains parser errors")
                cnt_parser_error += 1
            else:
                raise ValueError(f"Invalid error_code: {error_code}")
        elif cid in visited:
            print(f"id {cid} is an id duplication of {visited[cid]}")
            cnt_id_duplication += 1
        else:
            # Built only for successfully cleaned samples: those are guaranteed
            # to have at least two messages, so index 1 exists.
            key = (sample["conversations"][1]["value"], len(sample["conversations"]))
            if key in visited:
                print(f"id {cid} is a value duplication of {visited[key]}")
                cnt_value_duplication += 1
            else:
                visited[cid] = visited[key] = cid
                skipped = False

        if not skipped:
            new_content.append(sample)
        else:
            cnt_skip += 1

    print(
        f"total: {len(content)}, skip: {cnt_skip}, new: {len(new_content)}, "
        f"cnt_blocked_words: {cnt_blocked_words}, cnt_parser_error: {cnt_parser_error}, "
        f"cnt_wrong_format: {cnt_wrong_format}, "
        f"cnt_too_short: {cnt_too_short}, cnt_id_duplication: {cnt_id_duplication}, "
        f"cnt_value_duplication: {cnt_value_duplication}, "
    )

    return new_content


def main(args):
    """Clean the conversations in one JSON file and write the result to another.

    Args:
        args: mapping with "in_file" (path of the JSON file to read) and
            "out_file" (path of the JSON file to write).
    """
    # Context managers close both files deterministically; in particular the
    # output is flushed to disk before the function returns.
    with open(args["in_file"], "r") as in_f:
        content = json.load(in_f)
    content = clean_html_all(content)
    with open(args["out_file"], "w") as out_f:
        json.dump(content, out_f, indent=2)

In [14]:
# Clean the raw therapy transcripts and write the first prompt file.
args = dict(
    in_file="therapy_transcripts.json",
    out_file="therapy_transcript_prompts-v1.json",
)
main(args)

  soup = BeautifulSoup(html, 'html.parser')
100%|██████████| 133/133 [00:00<00:00, 719.42it/s]
100%|██████████| 133/133 [00:00<00:00, 230513.40it/s]


id 12 has a wrong format
id 21 has a wrong format
id 27 has a wrong format
id 29 has a wrong format
id 38 has a wrong format
id 39 has a wrong format
id 41 has a wrong format
id 43 has a wrong format
id 51 has a wrong format
id 58 has a wrong format
id 62 has a wrong format
id 64 has a wrong format
id 65 has a wrong format
id 66 has a wrong format
id 68 has a wrong format
id 76 has a wrong format
id 81 has a wrong format
id 83 has a wrong format
id 91 has a wrong format
id 98 has a wrong format
id 100 has a wrong format
id 101 has a wrong format
id 110 has a wrong format
id 111 has a wrong format
id 117 has a wrong format
id 121 has a wrong format
id 122 has a wrong format
id 124 has a wrong format
id 126 has a wrong format
total: 133, skip: 29, new: 104, cnt_blocked_words: 0, cnt_parser_error: 0, cnt_wrong_format: 29, cnt_too_short: 0, cnt_id_duplication: 0, cnt_value_duplication: 0, 


In [27]:
# Reload the cleaned transcripts; the context manager closes the file handle.
with open("therapy_transcript_prompts-v1.json", "r") as f:
    sessions = json.load(f)
len(sessions)

104

In [24]:
import os
import openai

# Read the key from the environment so it is never stored in the notebook.
# Falls back to an empty string when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

Hello! How may I assist you today?


In [32]:
# Ask the chat model for a short summary of why each patient is in therapy.
# `issues` maps session id -> summary. An existing dict is reused so an
# interrupted run can resume without repeating requests; on a fresh kernel it
# is created here so the cell does not fail with NameError.
try:
    issues
except NameError:
    issues = {}

# Maximum number of transcript characters sent to the model per session.
MAX_CONV_CHARS = 3800

for session in tqdm(sessions):
    if session["id"] in issues:
        continue

    conv = []
    for c in session["conversations"]:
        speaker = "Patient" if c["from"] == "human" else "Therapist"
        conv.append(f"{speaker}: {c['value']}")

    conv = "\n".join(conv)[:MAX_CONV_CHARS]

    completion = openai.ChatCompletion.create(
      model="gpt-3.5-turbo",
      messages=[
        {"role": "user", "content": f"Below is a conversation between a patient and his or her therapist. Find out the issue with the patient like why is he or she seeing the therapist. Be concise and keep your answer limited to one or two sentences. Only write the reason and nothing else.\n\n{conv}"}
      ]
    )
    issues[session["id"]] = completion.choices[0].message["content"]

100%|██████████| 104/104 [00:54<00:00,  1.92it/s]


In [33]:
# Display the session id -> issue summary mapping built by the summarisation loop.
issues

{'0': "The patient is seeing the therapist because she drinks alcohol to relax and unwind, however, her drinking has been causing health problems and making her depression worse. The therapist is discussing the patient's alcohol use with her and helping her come up with goals to cut back on her drinking.",
 '1': 'The patient is seeing the therapist to discuss their alcohol consumption and the potential health risks associated with it.',
 '2': 'The patient is seeing the therapist for weight loss.',
 '3': "The patient's wife has been pressuring him to improve his health habits, but he doesn't see the need for it and feels fine.",
 '4': "The patient has recently gotten a lip piercing and the therapist is concerned about how it can affect the patient's teeth and gums.",
 '5': 'The patient is seeing the therapist for help with offending behavior, including drinking and going along with his friends in activities that lead to trouble. He is also trying to make positive changes in his life, su

In [34]:
# Persist the issue summaries so they can be reused without calling the API again.
with open("issues.json", "w") as f:
    f.write(json.dumps(issues))

In [35]:
"""
Split long conversations based on certain max length.

Usage: python3 -m fastchat.data.split_long_conversation \
    --in sharegpt_clean.json \
    --out sharegpt_split.json \
    --model-name-or-path $<model-name>
"""
import argparse
from concurrent.futures import ProcessPoolExecutor
import json
from typing import Dict, Sequence, Optional

import transformers
from tqdm import tqdm


def make_sample(sample, start_idx, end_idx):
    """Build a sub-sample holding conversation turns [start_idx, end_idx).

    The new id is the original id followed by "_" and the start index. The
    span must cover an even number of turns.
    """
    span = end_idx - start_idx
    assert span % 2 == 0
    new_id = "_".join((sample["id"], str(start_idx)))
    turns = sample["conversations"][start_idx:end_idx]
    return {"id": new_id, "conversations": turns}


# Set by split_all() before the worker pool is started; read by split_one_sample().
tokenizer = max_length = None


def split_one_sample(sample):
    """Split one conversation into consecutive chunks that fit `max_length`.

    Turns are handled in (human, gpt) pairs; a trailing unpaired turn is
    ignored. Pairs are accumulated until adding the next one would exceed the
    module-level `max_length`, at which point the accumulated pairs are emitted
    as one sample and a new chunk starts. The final chunk is always emitted, so
    no pair is dropped. A single pair that alone exceeds `max_length` is
    emitted as its own chunk.

    Returns:
        A list of samples built with `make_sample`; empty when the
        conversation has fewer than two turns.
    """
    conversations = sample["conversations"]
    conversations = conversations[: len(conversations) // 2 * 2]
    if len(conversations) < 2:
        return []

    # Token count per turn plus a fixed 6; presumably an allowance for
    # role/separator tokens added by the prompt template (TODO confirm).
    tokenized_lens = [len(tokenizer(c["value"]).input_ids) + 6 for c in conversations]

    new_samples = []
    start_idx = 0
    cur_len = 0
    for i in range(0, len(conversations), 2):
        tmp_len = tokenized_lens[i] + tokenized_lens[i + 1]
        # Close the current chunk only if it already holds at least one pair;
        # this avoids emitting an empty sample when a pair alone is too long.
        if cur_len + tmp_len > max_length and i > start_idx:
            new_samples.append(make_sample(sample, start_idx, i))
            start_idx = i
            cur_len = 0

        cur_len += tmp_len

    # Flush the last chunk. Previously it was lost whenever the length limit
    # was hit on the final pair.
    new_samples.append(make_sample(sample, start_idx, len(conversations)))

    return new_samples


def split_all(content, tokenizer_, max_length_):
    """
    Keep the maximum round of conversations within the max token length constraint

    The tokenizer and length limit are published as module globals before the
    pool is created, because `split_one_sample` reads them from there.
    NOTE(review): this relies on the worker processes seeing these globals
    (e.g. a fork-based start method) - confirm for the target platform.
    """
    global tokenizer, max_length
    tokenizer, max_length = tokenizer_, max_length_

    with ProcessPoolExecutor() as executor:
        per_sample = tqdm(executor.map(split_one_sample, content), total=len(content))
        # Flatten the per-sample lists of chunks into one list.
        new_content = [chunk for chunks in per_sample for chunk in chunks]

    return new_content


def filter_invalid_roles(content):
    """Keep only non-empty samples whose turns alternate human, gpt, human, ..."""
    expected = ("human", "gpt")

    def alternates(turns):
        return all(turn["from"] == expected[pos % 2] for pos, turn in enumerate(turns))

    return [
        item
        for item in content
        if len(item["conversations"]) > 0 and alternates(item["conversations"])
    ]


def main(args):
    """Split long conversations from one JSON file and write the chunks to another.

    Args:
        args: mapping with "in_file", "out_file", "model_name_or_path" (passed
            to `AutoTokenizer.from_pretrained`) and "max_length" (token limit
            per chunk).
    """
    # Context managers close both files deterministically; in particular the
    # output is flushed to disk before the function returns.
    with open(args["in_file"], "r") as in_f:
        content = json.load(in_f)
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        args["model_name_or_path"],
        model_max_length=args["max_length"],
        padding_side="right",
        use_fast=False,
    )
    new_content = split_all(content, tokenizer, args["max_length"])
    new_content = filter_invalid_roles(new_content)

    print(f"total: {len(content)}, new: {len(new_content)}")
    with open(args["out_file"], "w") as out_f:
        json.dump(new_content, out_f, indent=2)


# Split the cleaned transcripts into chunks of at most 2048 tokens.
args = dict(
    in_file="therapy_transcript_prompts-v1.json",
    out_file="therapy_transcript_prompts-v2.json",
    model_name_or_path="openaccess-ai-collective/wizard-mega-13b",
    max_length=2048,
)
main(args)

100%|██████████| 104/104 [00:00<00:00, 390.22it/s]

total: 104, new: 144





In [50]:
# Print one conversation-template registration snippet per session.
for i, issue in issues.items():
    # Render the issue as an escaped, double-quoted string literal so quotes,
    # backslashes or newlines in the model's answer cannot break the generated
    # Python code. ensure_ascii=False keeps non-ASCII characters as they are.
    issue_literal = json.dumps(issue, ensure_ascii=False)
    template = f"""register_conv_template(
    Conversation(
        name="ZenAI-{i}",
        system="A chat between a patient and a therapist. "
        "The therapist is calm and drives the conversation by asking questions to know more about the issues and provide polite answers to the user's questions. "
        {issue_literal},
        roles=("USER", "ASSISTANT"),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.ADD_COLON_TWO,
        sep=" ",
        sep2="</s>",
    )
)"""
    
    print(template)
    print()
    print()

In [46]:
# Load the split transcripts; the context manager closes the file handle.
with open("therapy_transcript_prompts-v2.json", "r") as f:
    data = json.load(f)
len(data)

144

In [48]:
# Attach the session-level issue summary to every chunk. The part of a chunk id
# before the first "_" is used as the key into `issues`.
for sample in data:
    session_id = sample["id"].split("_")[0]
    sample["issue"] = issues[session_id]

In [51]:
# Save the chunks together with their issue summaries.
with open("therapy_transcript_prompts-v3.json", "w") as f:
    f.write(json.dumps(data))

In [82]:
# Load both datasets; the context managers close the file handles.
with open("../psychology_10k_prompts.json", "r") as f:
    data = json.load(f)
with open("therapy_transcript_prompts-v3.json", "r") as f:
    data2 = json.load(f)
len(data), len(data2)

(9846, 144)

In [83]:
# Reshape each instruction-style row in place into the id/conversations layout:
# "input" becomes the human turn and "output" the gpt turn.
# The original keys are deleted, so this cell cannot be run twice on the same data.
for i, row in enumerate(data):
    row["id"] = f"psy_{i}"
    row["conversations"] = [
        {"from": "human", "value": row["input"]},
        {"from": "gpt", "value": row["output"]},
    ]
    for stale_key in ("instruction", "input", "output"):
        del row[stale_key]

In [84]:
# Inspect the first reshaped row.
data[0]


{'id': 'psy_0',
 'conversations': [{'from': 'human',
   'value': "I'm feeling really anxious lately and I don't know why."},
  {'from': 'gpt',
   'value': "It's common to feel anxious at times, and there can be many reasons for it. Have there been any recent changes or stressors in your life that may be contributing to your anxiety? Let's work together to identify any triggers and develop coping strategies to manage your anxiety."}]}

In [78]:
# Inspect the first therapy-transcript chunk for comparison.
data2[0]

{'id': '0_0',
 'conversations': [{'from': 'human', 'value': 'Sure.'},
  {'from': 'gpt',
   'value': "So, let's see. It looks that you put-- You drink alcohol at least four times a week on average-"},
  {'from': 'human', 'value': 'Mm-hmm.'},
  {'from': 'gpt',
   'value': '-and you usually have three to four drinks when you do drink.'},
  {'from': 'human', 'value': 'Usually three drinks and glasses of wine.'},
  {'from': 'gpt', 'value': "Okay. That's at least 12 drinks a week."},
  {'from': 'human', 'value': 'Something like that.'},
  {'from': 'gpt',
   'value': "Okay. Just so you know, my role, um, when we talk about alcohol use, is just to share information about risk and to help patients who want help. This is different than telling them what I think they should do. I don't do that."},
  {'from': 'human', 'value': 'Okay.'},
  {'from': 'gpt',
   'value': 'Uh, what else can you tell me about your drinking.'},
  {'from': 'human',
   'value': "Well, I usually drink when I'm at home trying

In [85]:
# Combined set: the reshaped rows followed by the therapy-transcript chunks.
data3 = [*data, *data2]
len(data3)

9990

In [86]:
# Write the combined set to disk.
with open("../prompts.json", "w") as f:
    f.write(json.dumps(data3))

In [None]:
# import transformers, json

# raw_data = json.load(open("/home/jupyter/therapy-bot/data/prompts.json", "r"))
# sources = [example["conversations"] for example in raw_data]
# issues = []
# for example in raw_data:
#     try:
#         issues.append(example["issue"])
#     except KeyError:
#         issues.append("")


# tokenizer = transformers.AutoTokenizer.from_pretrained(
#     "openaccess-ai-collective/wizard-mega-13b",
#     model_max_length=2048,
#     padding_side="right",
#     use_fast=False,
# )
# tokenizer.pad_token = tokenizer.unk_token

# from fastchat.train.train import preprocess
# data_dict = preprocess(sources, tokenizer, issues)