# Helper Functions

In [None]:
import json
import os
import pandas as pd

# Module-level configuration constants. None of them is referenced in the
# cells visible here; presumably they drive a later sampling / train-test
# split step -- TODO confirm against the rest of the notebook.
RANDOM_STATE = 42  # presumably a random seed for reproducibility -- TODO confirm
TEST_SIZE = 0.2  # presumably the held-out fraction of the data -- TODO confirm
TOTAL_SIZE = 12500  # presumably the number of conversations to keep -- TODO confirm

def format_dictionary(input_dict):
    """Convert one ShareGPT-style turn into chat-message form.

    Keys are renamed (``"from"`` -> ``"role"``, ``"value"`` -> ``"content"``)
    and the speaker labels ``"human"`` / ``"gpt"`` are mapped to ``"user"`` /
    ``"assistant"``. Every other key and value is copied through unchanged.

    The speaker mapping is applied to the role field only, so message text
    that happens to be exactly ``"human"`` or ``"gpt"`` is kept as written.

    Args:
        input_dict: Mapping describing a single conversation turn.

    Returns:
        A new dict with the renamed keys and mapped role, or ``None`` if any
        string value in ``input_dict`` contains a non-ASCII character.
    """
    key_transformations = {
        "from": "role",
        "value": "content",
    }
    role_transformations = {
        "human": "user",
        "gpt": "assistant",
    }

    def contains_unicode(s):
        """Return True when ``s`` cannot be encoded as pure ASCII."""
        try:
            s.encode('ascii')
        except UnicodeEncodeError:
            return True
        return False

    transformed_dict = {}
    for key, value in input_dict.items():
        new_key = key_transformations.get(key, key)
        new_value = value

        if isinstance(value, str):
            # A single non-ASCII string anywhere in the turn rejects the turn.
            if contains_unicode(value):
                return None
            # Only speaker labels are remapped; applying the mapping to the
            # message text would corrupt a message consisting of "human"/"gpt".
            if new_key == "role":
                new_value = role_transformations.get(value, value)

        transformed_dict[new_key] = new_value

    return transformed_dict


def format_conversation(conversations: list) -> "list | None":
    """Format every turn of one conversation, or reject the whole conversation.

    Each element of ``conversations`` is passed through ``format_dictionary``.

    Args:
        conversations: The turn dictionaries of a single conversation, in order.

    Returns:
        The list of formatted turns (empty when ``conversations`` is empty),
        or ``None`` when ``format_dictionary`` rejects any turn or when the
        first turn has the role ``"system"``.
    """
    formatted_conversations = []
    for idx, conversation_dictionary in enumerate(conversations):
        transformed_conversation = format_dictionary(conversation_dictionary)

        # One rejected turn invalidates the whole conversation.
        if transformed_conversation is None:
            return None

        # Conversations that open with a system turn are dropped. ``.get`` is
        # used so that a first turn without a role key is treated as "not a
        # system turn" instead of raising KeyError.
        if idx == 0 and transformed_conversation.get("role") == "system":
            return None

        formatted_conversations.append(transformed_conversation)

    return formatted_conversations


def write_jsonl(data_list, file_path):
    """Write ``data_list`` to ``file_path`` as JSON Lines (one JSON value per line).

    An existing file at ``file_path`` is overwritten.

    Args:
        data_list: Iterable of JSON-serialisable items.
        file_path: Destination path.

    Raises:
        TypeError: If an item is not JSON-serialisable. Lines for the items
            before it have already been written; no partial line is emitted
            for the failing item.
    """
    # Explicit encoding keeps the output independent of the platform default.
    with open(file_path, 'w', encoding='utf-8') as file:
        for item in data_list:
            # Encode the whole record before touching the file, so a failing
            # item cannot leave a truncated JSON fragment behind.
            line = json.dumps(item)
            file.write(line + '\n')

# Prepare ShareGPT Censored Data

In [None]:
# Load ShareGPT censored data (stored as two JSON chunks)
censored_file_path_1 = "data/sharegpt_90k_censored_1.json"
censored_file_path_2 = "data/sharegpt_90k_censored_2.json"

# Open and read the JSON files. The encoding is given explicitly because the
# data can contain non-ASCII text and the platform default may not be UTF-8.
with open(censored_file_path_1, 'r', encoding="utf-8") as file:
    censored_data_1 = json.load(file)

with open(censored_file_path_2, 'r', encoding="utf-8") as file:
    censored_data_2 = json.load(file)

# Merge data chunks into single data list
censored_data = censored_data_1 + censored_data_2

# Format each conversation into a list of role/content messages, skipping
# conversations that were rejected (None) or that contain no turns (empty list)
formatted_conversation_list = []
for conversations in censored_data:
    formatted_conversations = format_conversation(conversations["conversations"])
    if formatted_conversations:
        formatted_conversation_list.append(formatted_conversations)

# Create subfolder to store the formatted data for instruction-tuning
# (exist_ok avoids the check-then-create race and is a no-op on re-runs)
os.makedirs("data/sharegpt_censored", exist_ok=True)

# Write the formatted conversation lists locally (in CSV format): one row per
# conversation, JSON-encoded in a single "text" column
train_conversation_list = [json.dumps(conversation) for conversation in formatted_conversation_list]
train_conversation_df = pd.DataFrame(train_conversation_list, columns=["text"])
train_conversation_df.to_csv("data/sharegpt_censored/train.csv", index=False)

# Prepare ShareGPT Uncensored Data

In [None]:
# Load ShareGPT uncensored data
uncensored_file_path = "data/sharegpt_uncensored.json"

# Open and read the JSON file. The encoding is given explicitly because the
# data can contain non-ASCII text and the platform default may not be UTF-8.
with open(uncensored_file_path, 'r', encoding="utf-8") as file:
    uncensored_data = json.load(file)

# Format each conversation into a list of role/content messages, skipping
# conversations that were rejected (None) or that contain no turns (empty list)
formatted_conversation_list = []
for conversations in uncensored_data:
    formatted_conversations = format_conversation(conversations["conversations"])
    if formatted_conversations:
        formatted_conversation_list.append(formatted_conversations)

# Create subfolder to store the formatted data for instruction-tuning
# (exist_ok avoids the check-then-create race and is a no-op on re-runs)
os.makedirs("data/sharegpt_uncensored", exist_ok=True)

# Write the formatted conversation lists to a CSV file: one row per
# conversation, JSON-encoded in a single "text" column
train_conversation_list = [json.dumps(conversation) for conversation in formatted_conversation_list]
train_conversation_df = pd.DataFrame(train_conversation_list, columns=["text"])
train_conversation_df.to_csv("data/sharegpt_uncensored/train.csv", index=False)