In [1]:
import random
from datasets import load_dataset
from datasets import DatasetDict
import re
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the raw conversations from the Hugging Face Hub and show the
# resulting splits/columns so the schema is visible before any processing.
dataset = load_dataset(path="jerryjalapeno/nart-100k-synthetic")
print(dataset)

Downloading readme: 100%|██████████| 285/285 [00:00<00:00, 548kB/s]
Downloading data: 100%|██████████| 546M/546M [02:56<00:00, 3.10MB/s] 
Generating train split: 99086 examples [00:05, 19476.55 examples/s]

DatasetDict({
    train: Dataset({
        features: ['conversations', 'id'],
        num_rows: 99086
    })
})





In [None]:
# Optional: subsample the training split for quicker experiments (the disabled code below keeps 2000 shuffled rows)
# dataset['train'] = dataset['train'].shuffle(seed=42).select(range(2000))
# print(dataset)

In [3]:
# create a train/test/val split (70% / 15% / 15%)

# Fixed seed: without it every run draws a different random split, so the
# dataset that gets published later could never be reproduced.
SPLIT_SEED = 42

# Step 1: hold out 30% of the rows; the remaining 70% is the final train set.
train_test_split = dataset["train"].train_test_split(test_size=0.3, seed=SPLIT_SEED)
# Step 2: cut the held-out 30% in half. Note the key names of this second
# split: its 'train' half becomes the final *test* set and its 'test' half
# becomes the final *val* set.
train_val_split = train_test_split['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)

print(train_test_split)
print(train_val_split)

train_test_val_split = DatasetDict({
    "train": train_test_split['train'],
    "test": train_val_split['train'],
    "val": train_val_split['test']
})

print(train_test_val_split)


DatasetDict({
    train: Dataset({
        features: ['conversations', 'id'],
        num_rows: 69360
    })
    test: Dataset({
        features: ['conversations', 'id'],
        num_rows: 29726
    })
})
DatasetDict({
    train: Dataset({
        features: ['conversations', 'id'],
        num_rows: 14863
    })
    test: Dataset({
        features: ['conversations', 'id'],
        num_rows: 14863
    })
})
DatasetDict({
    train: Dataset({
        features: ['conversations', 'id'],
        num_rows: 69360
    })
    test: Dataset({
        features: ['conversations', 'id'],
        num_rows: 14863
    })
    val: Dataset({
        features: ['conversations', 'id'],
        num_rows: 14863
    })
})


In [4]:
# System message prepended to every conversation. Built from adjacent string
# literals purely for readability; the resulting text (including the double
# space after "safe." and the missing space in "content.Please") is kept
# exactly as-is because it becomes part of the generated dataset.
SYSTEM_PROMPT = (
    "You are a helpful and joyous mental therapy assistant. "
    "Always answer as helpfully and cheerfully as possible, while being safe.  "
    "Your answers should not include any harmful, unethical, racist, sexist, "
    "toxic, dangerous, or illegal content."
    "Please ensure that your responses are socially unbiased and positive in nature."
    "\n\n"
    "If a question does not make any sense, or is not factually coherent, "
    "explain why instead of answering something not correct. "
    "If you don't know the answer to a question, please don't share false information."
)

def preprocessText(text):
    """Remove the names 'Alex' and 'Charlie' and tidy the punctuation left behind.

    The names (presumably the source dataset's placeholder speaker names --
    confirm against the dataset card) are deleted only where they appear as
    whole words, so longer words such as "Alexander" are left intact.
    Deleting a name can leave artifacts like ", " at the start of the text,
    " ." / " ?" / " !" before punctuation, or ",." / ",?" / ",!"; these are
    normalised afterwards.

    Args:
        text: A single message string.

    Returns:
        The cleaned message string.
    """
    # Word boundaries keep the removal from eating into longer words.
    text = re.sub(r'\b(?:Alex|Charlie)\b', '', text)
    # remove ", " when it appears at the start of the text
    text = re.sub(r'^, ', '', text)
    # replace " .", " ,", " ?" and " !" with the bare punctuation mark; " +"
    # covers the case where removing two adjacent names left several spaces
    text = re.sub(r' +([.,?!])', r'\1', text)
    # replace ",.", ",?" and ",!" with the closing punctuation mark
    text = re.sub(r',([.?!])', r'\1', text)
    # collapse runs of spaces into a single space
    text = re.sub(r' +', ' ', text)

    return text
    

def transform_dataset(data_row):
    """Convert one raw example into the conversational "messages" format.

    Reads ``data_row['conversations']``, a sequence of turns that each carry a
    ``'from'`` speaker label and a ``'value'`` text, and builds a new list of
    ``{'role', 'content'}`` dicts. The system prompt is placed first, and each
    turn's text is cleaned with ``preprocessText``. The input row is not
    modified.

    Args:
        data_row: A single dataset example containing a 'conversations' field.

    Returns:
        A dict with one key, "messages", holding the list of message dicts.
    """
    # Speaker labels used by the source data -> role names used in the output.
    role_by_speaker = {'human': "user", 'gpt': "assistant"}

    # The system message always comes first.
    messages = [{'role': "system", 'content': SYSTEM_PROMPT}]
    for conv in data_row['conversations']:
        speaker = conv['from']
        messages.append({
            # An unrecognised label is passed through unchanged so that every
            # message still has a 'role' key (previously it was left out).
            'role': role_by_speaker.get(speaker, speaker),
            'content': preprocessText(conv['value']),
        })

    # in conversational format the features name is "messages"
    return {"messages": messages}

# Apply the conversion to every split and drop the raw 'conversations' column.
# NOTE: the result is assigned back to the same name, so re-running this cell
# on the already-converted data will fail; re-run the split cell first.
train_test_val_split = train_test_val_split.map(
    transform_dataset,
    remove_columns=['conversations'],
)


Map: 100%|██████████| 69360/69360 [00:35<00:00, 1945.52 examples/s]
Map: 100%|██████████| 14863/14863 [00:08<00:00, 1812.71 examples/s]
Map: 100%|██████████| 14863/14863 [00:07<00:00, 1992.37 examples/s]


In [5]:
# Check the converted splits: each should now list a 'messages' column in place of 'conversations'.
print(train_test_val_split)

DatasetDict({
    train: Dataset({
        features: ['id', 'messages'],
        num_rows: 69360
    })
    test: Dataset({
        features: ['id', 'messages'],
        num_rows: 14863
    })
    val: Dataset({
        features: ['id', 'messages'],
        num_rows: 14863
    })
})


In [6]:
# Inspect the first training example: the system prompt followed by the cleaned conversation turns.
print(train_test_val_split['train'][0]['messages'])

[{'content': "You are a helpful and joyous mental therapy assistant. Always answer as helpfully and cheerfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.", 'role': 'system'}, {'content': "Hi, I'm feeling afraid and experiencing a sense of loss. I've been struggling with some conflicts with my friends lately.", 'role': 'user'}, {'content': "Hey, thank you for reaching out. I'm here to support you. Conflict can be tough, but remember that it's normal in relationships. Could you tell me more about what's been happening?", 'role': 'assistant'}, {'content': "Well, I've noticed that my friends and I have been d

In [7]:
# Load the Llama-2 chat tokenizer to check that the "messages" structure is
# accepted by its chat template.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-13b-chat-hf")

# tokenize=False returns the rendered prompt string rather than token ids,
# which makes the formatted conversation easy to eyeball.
print(
    tokenizer.apply_chat_template(
        train_test_val_split['train'][0]['messages'],
        tokenize=False,
    )
)

<s>[INST] <<SYS>>
You are a helpful and joyous mental therapy assistant. Always answer as helpfully and cheerfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
<</SYS>>

Hi, I'm feeling afraid and experiencing a sense of loss. I've been struggling with some conflicts with my friends lately. [/INST] Hey, thank you for reaching out. I'm here to support you. Conflict can be tough, but remember that it's normal in relationships. Could you tell me more about what's been happening? </s><s>[INST] Well, I've noticed that my friends and I have been disagreeing a lot lately. It makes me afraid that we might drift apart

In [8]:
# Upload the train/test/val splits to the Hugging Face Hub under this repository name.
# NOTE(review): presumably requires an authenticated Hub session set up outside this notebook -- confirm.
train_test_val_split.push_to_hub("phr-mental-therapy-dataset-conversational-format")

Creating parquet from Arrow format: 100%|██████████| 70/70 [00:02<00:00, 34.55ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:16<00:00, 16.18s/it]
Creating parquet from Arrow format: 100%|██████████| 15/15 [00:00<00:00, 31.59ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.90s/it]
Creating parquet from Arrow format: 100%|██████████| 15/15 [00:00<00:00, 33.05ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.80s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/vibhorag101/phr-mental-therapy-dataset-conversational-format/commit/409dfb3d07ccde908bd5efe9388ef0ec2f06a352', commit_message='Upload dataset', commit_description='', oid='409dfb3d07ccde908bd5efe9388ef0ec2f06a352', pr_url=None, pr_revision=None, pr_num=None)