In [13]:
import random
from datasets import load_dataset
from datasets import DatasetDict
import re
from transformers import AutoTokenizer

In [14]:
dataset = load_dataset("jerryjalapeno/nart-100k-synthetic")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'conversations'],
        num_rows: 99086
    })
})


In [15]:
# choose 30000 samples from the dataset
dataset['train'] = dataset['train'].shuffle(seed=42).select(range(2000))
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'conversations'],
        num_rows: 2000
    })
})


In [16]:
# create a train_test_val split
train_test_split = dataset["train"].train_test_split(test_size=0.3)
train_val_split = train_test_split['test'].train_test_split(test_size=0.5)

print(train_test_split)
print(train_val_split)

train_test_val_split = DatasetDict({
    "train": train_test_split['train'],
    "test": train_val_split['train'],
    "val": train_val_split['test']
})

print(train_test_val_split)


DatasetDict({
    train: Dataset({
        features: ['id', 'conversations'],
        num_rows: 1400
    })
    test: Dataset({
        features: ['id', 'conversations'],
        num_rows: 600
    })
})
DatasetDict({
    train: Dataset({
        features: ['id', 'conversations'],
        num_rows: 300
    })
    test: Dataset({
        features: ['id', 'conversations'],
        num_rows: 300
    })
})
DatasetDict({
    train: Dataset({
        features: ['id', 'conversations'],
        num_rows: 1400
    })
    test: Dataset({
        features: ['id', 'conversations'],
        num_rows: 300
    })
    val: Dataset({
        features: ['id', 'conversations'],
        num_rows: 300
    })
})


In [17]:
SYSTEM_PROMPT = """You are a helpful and joyous mental therapy assistant. Always answer as helpfully and cheerfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""

def preprocessText(text):
    # remove any occurence of "Alex" in the text using regex
    text = re.sub(r'Alex', '', text)
    # remove any occurence of "Jerry" in the text using regex
    text = re.sub(r'Charlie', '', text)
    return text
    

def transform_dataset(data_row):
    id = data_row['id']
    data_row = data_row['conversations']
    for conv in data_row:
        if conv['from'] == 'human':
            conv['role'] = "user"
        elif conv['from'] == 'gpt':
            conv['role'] = "assistant"
        
        conv['content'] = preprocessText(conv['value'])
        del conv['from']
        del conv['value']
    
    system_dict = {}
    system_dict['role'] = "system"
    system_dict['content'] = SYSTEM_PROMPT
    data_row.insert(0, system_dict)
    # in conversational format the features name is "messages"
    return {"messages": data_row}

train_test_val_split = train_test_val_split.map(transform_dataset, remove_columns=['conversations'])


Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

Map: 100%|██████████| 1400/1400 [00:00<00:00, 2860.16 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 3026.45 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 3041.76 examples/s]


In [18]:
print(train_test_val_split)

DatasetDict({
    train: Dataset({
        features: ['id', 'messages'],
        num_rows: 1400
    })
    test: Dataset({
        features: ['id', 'messages'],
        num_rows: 300
    })
    val: Dataset({
        features: ['id', 'messages'],
        num_rows: 300
    })
})


In [19]:
print(train_test_val_split['train'][0]['messages'])

[{'content': "You are a helpful and joyous mental therapy assistant. Always answer as helpfully and cheerfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.", 'role': 'system'}, {'content': "I'm feeling so tired lately, and it's really starting to affect my health.", 'role': 'user'}, {'content': "I'm sorry to hear that, . Could you tell me more about how your tiredness is impacting your health?", 'role': 'assistant'}, {'content': "Well, I constantly feel drained and exhausted, both physically and emotionally. It's like I have no energy left to put into anything, including taking care of myself.", 'role': 'us

In [20]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-13b-chat-hf")
print(tokenizer.apply_chat_template(train_test_val_split['train'][0]['messages'],tokenize=False))

<s>[INST] <<SYS>>
You are a helpful and joyous mental therapy assistant. Always answer as helpfully and cheerfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
<</SYS>>

I'm feeling so tired lately, and it's really starting to affect my health. [/INST] I'm sorry to hear that, . Could you tell me more about how your tiredness is impacting your health? </s><s>[INST] Well, I constantly feel drained and exhausted, both physically and emotionally. It's like I have no energy left to put into anything, including taking care of myself. [/INST] That sounds really tough, . It must be challenging to navigate daily life 

In [21]:
train_test_val_split.push_to_hub("phr-mental-therapy-dataset-conversational-format-mini")

Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 35.64ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:03<00:00,  3.84s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 46.14ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:01<00:00,  1.60s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 44.85ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:01<00:00,  1.94s/it]
