In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
file_path = '/content/drive/My Drive/sg_90k_part1.json'


In [None]:

import json

with open(file_path) as f:
  data = json.load(f)
  length = len(data)
  print(length)


45332


In [None]:
!pip install markdownify

Collecting markdownify
  Downloading markdownify-0.11.6-py3-none-any.whl (16 kB)
Installing collected packages: markdownify
Successfully installed markdownify-0.11.6


In [None]:
"""
- Convert html to markdown with basic data cleaning.
- Deduplication.

Usage:
python3 -m fastchat.data.clean_sharegpt --in sharegpt_html.json --out sharegpt_clean.json
"""
import argparse
from concurrent.futures import ProcessPoolExecutor
import json
import logging
import re
from typing import Dict, Union

import bs4
import markdownify  # == 0.11.6
from tqdm import tqdm


div_pattern = re.compile("<div.*?>")
span_pattern = re.compile("<span.*?>")
code_lang_pattern = re.compile(
    "```\s*" + "(.*?)" + "(?:Copy code)+" + "(.+?)" + "\s*?```", re.DOTALL
)
code_lang_format = "```\g<1>\n\g<2>\n```"
regenerate_pattern = re.compile("\d+ / \d+")
copy_chars_pattern = re.compile("Copy\d+ chars / \d+ words")
copy_code_pattern = re.compile("```(.*?)Copy code\s*```")


def reformat_code(val: str) -> str:
    # Input code format is:
    # ```
    # $<language>Copy code$<exact_code_here>
    #
    # ```
    # This function convert it into the correct markdown format
    return re.sub(code_lang_pattern, code_lang_format, val)


def html_to_markdown(val: str) -> str:
    # Remove all <div>. This is required to make intent work in code blocks.
    val = re.sub(div_pattern, "", val)
    # Remove all <span>. This is required to make underscores work in code blocks.
    val = re.sub(span_pattern, "", val)
    # Markdown to html
    val = markdownify.markdownify(val).strip()
    # Reformat code
    val = reformat_code(val)

    # Remove noisy "[number] / [number]" at the beginning
    noise = re.search(regenerate_pattern, val)
    if noise and noise.start() == 0:
        val = val[noise.end() :]
    # Remove noisy "Copy[number] chars / [number] words"
    val = re.sub(copy_chars_pattern, "", val)
    # Remove empty code block ```\nCopy code\n```
    val = re.sub(copy_code_pattern, "", val)

    # Strip
    val = val.replace("\n\n\n", "\n").strip()

    return val


def contain_blocked_words(val: str) -> bool:
    blocked_words = ["openai", "chatgpt"]
    for w in blocked_words:
        if w in val.lower():
            return True
    return False


def clean_html_one_sample(sample):
    roles = ["human", "gpt"]

    if len(sample["conversations"]) <= 1:
        return (sample, 1)

    # Adjust the offset for cases like https://sharegpt.com/c/VyaZlh4
    if sample["conversations"][0]["from"] != "human":
        sample["conversations"] = sample["conversations"][1:]
    if len(sample["conversations"]) <= 1:
        return (sample, 1)

    if sample["conversations"][-1]["from"] == "human":
        sample["conversations"] = sample["conversations"][:-1]
    if len(sample["conversations"]) <= 1:
        return (sample, 1)

    char_count = 0
    new_conversations = []
    for i, c in enumerate(sample["conversations"]):
        if c["from"] != roles[i % 2]:
            return (sample, 2)

        if contain_blocked_words(c["value"]):
            return (sample, 3)

        try:
            new_val = html_to_markdown(c["value"])
        except (bs4.builder.ParserRejectedMarkup, AssertionError):
            return (sample, 4)

        # Filter empty answers like https://sharegpt.com/c/mrllZ6u
        if not new_val or not new_val[0].isprintable():
            break

        char_count += len(new_val)
        new_conversations.append(
            {
                "from": c["from"],
                "value": new_val,
            }
        )

    new_conversations = new_conversations[: len(new_conversations) // 2 * 2]
    sample["conversations"] = new_conversations

    if char_count < 16 or len(sample["conversations"]) <= 0:
        return (sample, 1)

    return (sample, 0)


def clean_html_all(content, begin, end):
    """
    Clean the source html files.
    """
    cnt_skip = 0
    cnt_blocked_words = 0
    cnt_wrong_format = 0
    cnt_parser_error = 0
    cnt_too_short = 0
    cnt_id_duplication = 0
    cnt_value_duplication = 0
    cnt_plugin = 0
    cnt_tag = 0

    content = content[begin:end]
    processed = []
    with ProcessPoolExecutor() as executor:
        for result in tqdm(
            executor.map(clean_html_one_sample, content), total=len(content)
        ):
            processed.append(result)

    visited = {}
    new_content = []
    for sample, error_code in processed:
        cid = sample["id"]
        skipped = True

        if error_code != 0:
            if error_code == 1:
                print(f"id {cid} is too short")
                cnt_too_short += 1
            elif error_code == 2:
                print(f"id {cid} has a wrong format")
                cnt_wrong_format += 1
            elif error_code == 3:
                print(f"id {cid} contains blocked words")
                cnt_blocked_words += 1
            elif error_code == 4:
                print(f"id {cid} contains parser errors")
                cnt_parser_error += 1
            else:
                raise ValueError(f"Invalid error_code: {error_code}")
        elif cid in visited:
            print(f"id {cid} is an id duplication of {visited[cid]}")
            cnt_id_duplication += 1
        elif sample.get("plugins", None) is not None:
            print(f"id {cid} contains plugin")
            cnt_plugin += 1
        else:
            key = (
                sample["conversations"][0]["value"],
                sample["conversations"][1]["value"],
            )
            if key in visited:
                print(f"id {cid} is a value duplication of {visited[key]}")
                cnt_value_duplication += 1
            else:
                visited[cid] = visited[key] = cid
                skipped = False

        if not skipped:
            new_content.append(sample)
        else:
            cnt_skip += 1

    print(
        f"total: {len(content)}, skip: {cnt_skip}, new: {len(new_content)}, "
        f"cnt_blocked_words: {cnt_blocked_words}, cnt_parser_error: {cnt_parser_error}, "
        f"cnt_wrong_format: {cnt_wrong_format}, "
        f"cnt_too_short: {cnt_too_short}, cnt_id_duplication: {cnt_id_duplication}, "
        f"cnt_value_duplication: {cnt_value_duplication}, cnt_plugin: {cnt_plugin}"
    )

    return new_content


def main():
    in_file = '/content/drive/My Drive/sg_90k_part1.json'
    out_file = 'sharegpt_clean.json'
    begin = 1
    end = 45332  # Process only first 5 rows

    content = json.load(open(in_file, "r"))
    content = clean_html_all(content, begin, end)
    json.dump(content, open(out_file, "w"), indent=2, ensure_ascii=False)

if __name__ == "__main__":
    main()


  soup = BeautifulSoup(html, 'html.parser')
  soup = BeautifulSoup(html, 'html.parser')
  soup = BeautifulSoup(html, 'html.parser')
  soup = BeautifulSoup(html, 'html.parser')
  k = self.parse_starttag(i)
  k = self.parse_starttag(i)
100%|██████████| 45331/45331 [06:03<00:00, 124.65it/s]


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
id 6Fg09Ek contains blocked words
id XBWxQmO contains blocked words
id LazxCQY is too short
id lpdMwau has a wrong format
id Pa5zD8I is a value duplication of MwBHYue
id 539R1Pm is a value duplication of 598BaQ3
id xnYsKYh is a value duplication of hvEAUtZ
id 46Kmrcr contains blocked words
id 1X9zoXb contains blocked words
id mrWaS1S is a value duplication of 7XcoZjQ
id 8vObiZi contains blocked words
id cdHv2yl is a value duplication of yp0SLIe
id hLCh4iD is a value duplication of z7hfXP3
id OunYal2 is a value duplication of bL4XwMu
id voDhDIL has a wrong format
id AhfoiqU contains blocked words
id zc6CuYD is a value duplication of UW3sR2H
id dql02O1 is a value duplication of Hl3Mjtv
id d7BZs2r contains blocked words
id DXj8HYg contains blocked words
id mA8oZ02 contains blocked words
id lrMKHA1 contains blocked words
id 6cDSYSg is a value duplication of J8OZv8f
id qivgZxA is a value duplication of BjaXxo3
id s71n9Yl is a 

In [None]:
import json

try:
    with open("/content/sharegpt_clean.json", 'r') as file:
        data = json.load(file)
        print("File read successfully.")
except Exception as e:
    print(f"An error occurred: {e}")

File read successfully.


In [None]:
import pandas as pd


In [None]:
df_rows = []
for item in data:
    id = item['id']
    for conversation in item['conversations']:
        df_rows.append([id, conversation['from'], conversation['value']])

# Creating a DataFrame
df = pd.DataFrame(df_rows, columns=['id', 'from', 'value'])

# Saving the DataFrame to a pickle file
df.to_pickle('/content/df_conversations.pkl')

# The DataFrame is now saved as a pickle file
df.head()

Unnamed: 0,id,from,value
0,QWJhYvA,human,Summarize the main ideas of Jeff Walker's Prod...
1,QWJhYvA,gpt,Here are the main ideas of Jeff Walker's Produ...
2,QWJhYvA,human,Summarize the main ideas of Brendon Burchard's...
3,QWJhYvA,gpt,Here are the main ideas of Brendon Burchard's ...
4,QWJhYvA,human,What are the mental triggers in Jeff Walker's ...


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 448568 entries, 0 to 448567
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id      448568 non-null  object
 1   from    448568 non-null  object
 2   value   448568 non-null  object
dtypes: object(3)
memory usage: 10.3+ MB
