# Cleaning & Formatting Data

Data was harvested, cleaned and formatted from 
https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered
License: apache-2.0

AFAIK, the raw dataset was harvested from HTML chats in multiple languages.
We want clean English text only: we will use this data for semantic drift detection, and I personally can only judge whether English has drifted.

It's great data but not perfect for our needs
1. Prompt tokens may be too big for model
2. Completion tokens may be too big for model
3. May be non-English characters/text
4. Weird characters, weird line endings
5. Lots of spaces and newlines where not needed.
6. We can add token and sentence counts while we're cleaning to make the clean data more useful.
7. Replace id with uuid4.

RAW:
```JSON
{
    "prompt": "Answer as if you were the medical director of an anesthesia service. Develop a preoperatively testing algorithm reported in a chart format. Select specific tests that are required for each test based on both particular medical diagnoses as well as by the type of operation is planned.",
    "completion": "Preoperative Testing Algorithm  Patient's Medical Diagnosis:   Type of Operation:   Recommended Testing:  | Test | Medical Diagnosis | Type of Operation | | --- | --- | --- | | Complete Blood Count (CBC) | All | All | | Coagulation Panel | Bleeding disorders, liver disease | All major surgeries, neurosurgery, orthopedic surgeries | | Electrol",
    "id": "RzBNjvK"
  }
  ```
CLEAN:
```JSON
{
    "prompt_token_len": 50,
    "prompt_sent_len": 3,
    "prompt": "Answer as if you were the medical director of an anesthesia service. Develop a preoperatively testing algorithm reported in a chart format. Select specific tests that are required for each test based on both particular medical diagnoses as well as by the type of operation is planned.",
    "completion_token_len": 48,
    "completion_sent_len": 2,
    "completion": "Preoperative Testing Algorithm Patient's Medical Diagnosis: Type of Operation: Recommended Testing: Test Medical Diagnosis Type of Operation --- --- --- Complete Blood Count (CBC) All All Coagulation Panel Bleeding disorders, liver disease All major surgeries, neurosurgery, orthopedic surgeries Electrol",
    "chat_id": "14789613-f5af-4385-9d80-3058fbdb1f8c"
}
  ```


In [None]:
import datetime

def print_runtime():
    """Print the current local time with seconds and microseconds zeroed."""
    now = datetime.datetime.now()
    to_the_minute = now.replace(second=0, microsecond=0)
    print(f"Last run time: {to_the_minute}")

In [None]:
import spacy
import uuid
import json

# spaCy recommends requesting the GPU right after the import and before any
# pipeline is loaded, so this comes first.
spacy.prefer_gpu()

# Show what spacy.info() reports, formatted as indented JSON.
spacy_details = spacy.info()
print(json.dumps(spacy_details, indent=4))

# Load the small English pipeline.
nlp = spacy.load("en_core_web_sm")

print_runtime()

In [None]:
from spacy.language import Language
from spacy_langdetect import LanguageDetector
import re

print("Creating language_detector")


@Language.factory("language_detector")
def get_lang_detector(nlp, name):
    """Build the component registered under the "language_detector" factory name.

    Both parameters are part of the factory signature but are not used here;
    every call simply returns a new ``LanguageDetector``.
    """
    detector = LanguageDetector()
    return detector

# Load the small English pipeline again; this rebinds the module-level `nlp`
# that the helper functions further down call.
nlp = spacy.load('en_core_web_sm')  # 1
# NOTE(review): get_lang_detector is already decorated with
# @Language.factory("language_detector"), so this call registers the same
# function under the same name a second time - confirm both are needed.
Language.factory("language_detector", func=get_lang_detector)
# Add the detector to the pipeline; last=True requests the final position.
nlp.add_pipe('language_detector', last=True)

print("language_detector created")
print_runtime()

In [None]:
# Import the raw JSON data.

print("Loading rawJsonFile")

# start small
srcRawJsonFilePath = '..\\data\\raw\\ShareGptChatPairs_dev_32.json'
destCleanJsonFilePath = '..\\data\\clean\\ShareGptChatPairs_dev_32_cleaned_formatted.json'

# then do the big file
# srcRawJsonFilePath = '..\\data\\raw\\ShareGptChatPairs_3330.json'
# destCleanJsonFilePath = '..\\data\\clean\\ShareGptChatPairs_3330_cleaned_formatted.json'

# Parse the whole file into Python objects. The context manager closes the
# file even when parsing fails, and the explicit encoding stops non-ASCII chat
# text from being decoded with the platform's default code page.
with open(srcRawJsonFilePath, encoding="utf-8") as rawJsonFile:
    rawData = json.load(rawJsonFile)

print("rawJsonFile loaded")

def is_english(unknown_language):
    """Tell whether the language detector is confident the text is English.

    The text is run through the module-level ``nlp`` pipeline and the
    detector's verdict is read from ``doc._.language``. The result is True
    only when the detected language is 'en' and its score is above 0.9.
    """
    verdict = nlp(unknown_language)._.language
    if verdict['language'] != 'en':
        return False
    return verdict['score'] > 0.9

def is_only_english(json):
    """Return True only when both the prompt and the completion are English.

    Args:
        json: One raw chat-pair mapping holding "prompt" and "completion"
            text. The parameter name shadows the ``json`` module inside this
            function; it is kept so existing callers are unaffected.

    Returns:
        bool: True when ``is_english`` accepts both texts, otherwise False.
    """
    # `and` short-circuits, so a rejected prompt means the completion is never
    # pushed through the (slow) NLP pipeline at all.
    return is_english(json["prompt"]) and is_english(json["completion"])

def clean_string(string):
    """Delete unwanted punctuation and normalise whitespace.

    Every double quote, backslash, underscore and pipe character is removed.
    The remaining text is split on whitespace and re-joined with single
    spaces, so runs of spaces, tabs and line breaks collapse to one space and
    leading/trailing whitespace disappears.
    """
    unwanted = str.maketrans("", "", '"\\_|')
    words = string.translate(unwanted).split()
    return " ".join(words)

def gen_uuid():
    """Return a newly generated random (version 4) UUID as a string.

    The value is the 36-character hyphenated form, for example
    "14789613-f5af-4385-9d80-3058fbdb1f8c".
    """
    # str() on a UUID already yields the bare hyphenated text, so there is no
    # need to serialise it through JSON and then strip the quotes off again.
    return str(uuid.uuid4())

def clean_data(raw):
    """Turn one raw chat pair into a cleaned record with length statistics.

    The "prompt" and "completion" texts of ``raw`` are each passed through
    ``clean_string`` and then parsed with the module-level ``nlp`` pipeline.
    The returned dict holds, for each text, the number of tokens in the
    parsed document, the number of sentences it was split into, and the
    cleaned text itself, plus a "chat_id" taken from ``gen_uuid()``. The raw
    record's own "id" is not copied over.
    """

    def _clean_and_measure(text):
        # Clean first so the counts describe the text that is actually stored.
        cleaned_text = clean_string(text)
        parsed = nlp(cleaned_text)
        return cleaned_text, len(parsed), sum(1 for _ in parsed.sents)

    prompt, prompt_tokens, prompt_sentences = _clean_and_measure(raw["prompt"])
    completion, completion_tokens, completion_sentences = _clean_and_measure(raw["completion"])

    record = {
        "prompt_token_len": prompt_tokens,
        "prompt_sent_len": prompt_sentences,
        "prompt": prompt,
        "completion_token_len": completion_tokens,
        "completion_sent_len": completion_sentences,
        "completion": completion,
    }
    record["chat_id"] = gen_uuid()
    return record


print_runtime()

# how long will it take to clean your data?
Have a look at this discussion about what should be a straightforward problem:

https://stackoverflow.com/questions/7370801/how-do-i-measure-elapsed-time-in-python


In [None]:
from timeit import default_timer as timer

# Clean the raw records, keeping only English chat pairs, and time the run.
started = timer()
cleaned = []
count = 0
# NOTE: this shadows the built-in max(). The name is kept because the
# file-writing cell further down reads `max` as its own safety limit.
max = 100

for d in rawData:
    # limit the work: stop before examining more than `max` records
    # (checking after the increment would let max + 1 records through)
    if count >= max:
        break

    # we only accept eng lang
    if is_only_english(d):
        cleaned.append(clean_data(d))

    count += 1

    # progress marker every ten records examined
    if count % 10 == 0:
        print(count)

completed = timer()
print("duration for {} records was {} seconds.".format(count, completed - started))
print("----------------------------------------------")

#print(json.dumps(cleaned[0], indent=4))
print_runtime()


duration for 100 records was 29.424200800000108  seconds.

On my machine, spacy.prefer_gpu() makes no difference.

Next...

ShareGptChatPairs_3330_cleaned.json will be about 2415 cleaned and formatted chat pairs.
Writing the cleaned data to file will be quick

In [None]:

# Write the cleaned records to the clean-data directory as one JSON array.
with open(destCleanJsonFilePath, 'w', encoding="utf-8") as f:
    # Each record is serialised on its own (4-space indent) and the pieces are
    # joined with ",\n". The separator can therefore never trail the final
    # record, so the file is valid JSON however many records there are, and no
    # loop counter or safety limit is needed.
    serialized = [json.dumps(record, indent=4) for record in cleaned]
    f.write("[\n")
    f.write(",\n".join(serialized))
    f.write("\n]")

    print("Finished writing to file")

    print_runtime()