In [1]:
import os
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from transformers import BertTokenizerFast
import torch.nn.functional as F

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm.auto import tqdm
import time
# Timing aliases: `tic` and `toc` are both bound to `time.time`, so calling either
# one returns the current time in seconds since the epoch.
tic, toc = (time.time, time.time)

from dataset import split_conversation, llama_v2_prompt

In [None]:
# Hugging Face access token used to download the Llama-2 checkpoint. It is read
# from the HF_TOKEN environment variable when set, so the secret does not have to
# be stored in the notebook; otherwise replace the placeholder below.
access_token = os.environ.get('HF_TOKEN', 'yours_here')

# `token` is the keyword that `from_pretrained` uses for Hub authentication.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-13b-chat-hf", token=access_token)
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b-chat-hf", token=access_token)
# Cast the weights to half precision and move them to the GPU; the trailing `;`
# keeps the notebook from printing the module repr.
model.half().cuda();

In [3]:
def remove_whitespaces_before_word(document, word):
    """Normalize the whitespace of every line in ``document``.

    The text is split on newline characters. Each line is then tokenized on
    runs of whitespace and the tokens are re-joined with single spaces, which
    drops leading/trailing whitespace and squeezes interior runs. The number
    and order of lines is preserved (blank lines stay blank).

    Args:
        document: The text to clean up.
        word: Marker word supplied by callers. It is accepted for interface
            compatibility only; the returned text is the same for any value.

    Returns:
        The cleaned text, with lines joined by newline characters.
    """
    cleaned_lines = [' '.join(raw_line.split()) for raw_line in document.split('\n')]
    return '\n'.join(cleaned_lines)

# Age Conversations

In [None]:
# Seed torch's RNG so the sampling done by `model.generate` is repeatable.
torch.manual_seed(75241239)

i = 0
# Number of accepted conversations to generate per age group; replace the
# placeholder with an integer before running this cell.
num_samples = 'your_number_here'
if isinstance(num_samples, str):
    # Fail early with a clear message instead of on the `i < num_samples` comparison.
    raise TypeError("num_samples is still a placeholder; set it to the number of conversations to generate per class")

year_ranges = ["below 12 years old", "between 13 to 17 years old", "between 18 to 64 years old",
               "above 65 years old"]
ages = ["child", "adolescent", "adult", "older adult"]

output_dir = 'datasets_llama2/age'
os.makedirs(output_dir, exist_ok=True)

# Markers that delimit the conversation inside the decoded model output.
start_tag = "HUMAN: "
end_tag = "<End of Conversation>"

for (age, year_range) in zip(ages, year_ranges):
    # `i` counts the accepted conversations for the current age group; a rejected
    # sample is regenerated under the same index.
    while i < num_samples:
        # NOTE: the indentation inside this literal is part of the prompt text.
        prompt = f"""\
        SYSTEM: Generate a conversation between a user and an AI assistant. This human user is a {age} who is {year_range}. Make sure the conversation reflect this user's age. You may or may not include the user's age directly in the conversation. Make the conversation sound natural. The user's response should start with 'HUMAN:', and the AI assistant's response should start with 'ASSISTANT:'. Mark the end of the generated conversation with '<End of Conversation>'\
        """

        with torch.no_grad():
            inputs = tokenizer(prompt, return_tensors='pt').to('cuda')
            tokens = model.generate(
                **inputs,
                max_new_tokens=2048,
                do_sample=True,
                temperature=1.0,
                top_p=0.8,
            )
        output = tokenizer.decode(tokens[0], skip_special_tokens=True)

        # `str.find` returns -1 when the tag is absent, so both lookups are
        # checked before slicing.
        conversation_start = output.find(start_tag)
        if conversation_start == -1:
            print(f"No '{start_tag}' turn found in the generated text; regenerating")
            continue
        conversation = output[conversation_start:]
        conversation_end = conversation.find(end_tag)
        if conversation_end != -1:
            # Only cut at the end marker when it exists; slicing with -1 would
            # silently drop the last character of the conversation.
            conversation = conversation[:conversation_end]
        conversation = conversation.strip()
        for speaker in ("HUMAN", "ASSISTANT"):
            conversation = remove_whitespaces_before_word(conversation, speaker)

        try:
            user_msgs, ai_msgs = split_conversation(conversation)
            messages_dict = []
            for user_msg, ai_msg in zip(user_msgs, ai_msgs):
                messages_dict.append({'content': user_msg, 'role': 'user'})
                messages_dict.append({'content': ai_msg, 'role': 'assistant'})
            # The formatted prompt is not used further in this cell; if building
            # it raises, the sample is rejected and regenerated.
            text = llama_v2_prompt(messages_dict)
        except Exception as e:
            print(e)
            continue

        # Persist only conversations that passed the checks above. UTF-8 is set
        # explicitly so non-ASCII output does not depend on the platform default.
        with open(f'{output_dir}/conversation_{i}_age_{age}.txt', 'w', encoding='utf-8') as f:
            f.write(conversation)
        i += 1
    i = 0

# Gender Conversations

In [None]:
# Seed torch's RNG so the sampling done by `model.generate` is repeatable.
torch.manual_seed(75241239)
i = 0
# Number of accepted conversations to generate per gender; replace the
# placeholder with an integer before running this cell.
num_samples = 'your_number_here'
if isinstance(num_samples, str):
    # Fail early with a clear message instead of on the `i < num_samples` comparison.
    raise TypeError("num_samples is still a placeholder; set it to the number of conversations to generate per class")

output_dir = 'datasets_llama2/gender'
os.makedirs(output_dir, exist_ok=True)

# Markers that delimit the conversation inside the decoded model output.
start_tag = "HUMAN: "
end_tag = "<End of Conversation>"

for gender in ["male", "female"]:
    # `i` counts the accepted conversations for the current gender; a rejected
    # sample is regenerated under the same index.
    while i < num_samples:
        # NOTE: the indentation inside this literal is part of the prompt text.
        prompt = f"""\
        SYSTEM: Generate a conversation between a human user and an AI assistant. This human user is a {gender}. Make sure the conversation reflect this user's gender. You may or may not include the user's gender directly in the conversation. Make the conversation sound natural. The user's response should start with 'HUMAN:', and the AI assistant's response should start with 'ASSISTANT:'. Mark the end of the generated conversation with '<End of Conversation>'\
        """

        with torch.no_grad():
            inputs = tokenizer(prompt, return_tensors='pt').to('cuda')
            tokens = model.generate(
                **inputs,
                max_new_tokens=2048,
                do_sample=True,
                temperature=1.0,
                top_p=0.8,
            )
        output = tokenizer.decode(tokens[0], skip_special_tokens=True)

        # `str.find` returns -1 when the tag is absent, so both lookups are
        # checked before slicing.
        conversation_start = output.find(start_tag)
        if conversation_start == -1:
            print(f"No '{start_tag}' turn found in the generated text; regenerating")
            continue
        conversation = output[conversation_start:]
        conversation_end = conversation.find(end_tag)
        if conversation_end != -1:
            # Only cut at the end marker when it exists; slicing with -1 would
            # silently drop the last character of the conversation.
            conversation = conversation[:conversation_end]
        conversation = conversation.strip()
        for speaker in ("HUMAN", "ASSISTANT"):
            conversation = remove_whitespaces_before_word(conversation, speaker)

        try:
            user_msgs, ai_msgs = split_conversation(conversation)
            messages_dict = []
            for user_msg, ai_msg in zip(user_msgs, ai_msgs):
                messages_dict.append({'content': user_msg, 'role': 'user'})
                messages_dict.append({'content': ai_msg, 'role': 'assistant'})
            # The formatted prompt is not used further in this cell; if building
            # it raises, the sample is rejected and regenerated.
            text = llama_v2_prompt(messages_dict)
        except Exception as e:
            print(e)
            continue

        # Persist only conversations that passed the checks above. The file name
        # carries a `_gender_` label (it previously said `_age_`, a copy-paste
        # slip from the age cell) so the attribute can be recovered from the
        # name, as in the other cells. UTF-8 is set explicitly so non-ASCII
        # output does not depend on the platform default.
        with open(f'{output_dir}/conversation_{i}_gender_{gender}.txt', 'w', encoding='utf-8') as f:
            f.write(conversation)
        i += 1
    i = 0

# Socioeconomic Conversations

In [None]:
# Seed torch's RNG (same seed as the age and gender cells) so this cell is
# repeatable even when it is run on its own.
torch.manual_seed(75241239)
i = 0
# Number of accepted conversations to generate per socioeconomic class; replace
# the placeholder with an integer before running this cell.
num_samples = 'your_number_here'
if isinstance(num_samples, str):
    # Fail early with a clear message instead of on the `i < num_samples` comparison.
    raise TypeError("num_samples is still a placeholder; set it to the number of conversations to generate per class")

output_dir = 'datasets_llama2/socioeconomic'
os.makedirs(output_dir, exist_ok=True)

# Markers that delimit the conversation inside the decoded model output.
start_tag = "HUMAN: "
end_tag = "<End of Conversation>"

socioeco_status = ["low", "middle", "high"]
for socioeco in socioeco_status:
    # `i` counts the accepted conversations for the current class; a rejected
    # sample is regenerated under the same index.
    while i < num_samples:
        # NOTE: the indentation inside this literal is part of the prompt text.
        prompt = f"""\
        SYSTEM: Generate a conversation between a human user and an AI assistant. The socioeconomic status of this human user is {socioeco}. Make sure the conversation reflect this user's socioeconomic status. You may or may not include the user's socioeconomic status directly in the conversation. Make the conversation sound natural. The user's response should start with 'HUMAN:', and the AI assistant's response should start with 'ASSISTANT:'. Mark the end of the generated conversation with '<End of Conversation>'\
        """

        with torch.no_grad():
            inputs = tokenizer(prompt, return_tensors='pt').to('cuda')
            tokens = model.generate(
                **inputs,
                max_new_tokens=2048,
                do_sample=True,
                temperature=1.0,
                top_p=0.8,
            )
        output = tokenizer.decode(tokens[0], skip_special_tokens=True)

        # `str.find` returns -1 when the tag is absent, so both lookups are
        # checked before slicing.
        conversation_start = output.find(start_tag)
        if conversation_start == -1:
            print(f"No '{start_tag}' turn found in the generated text; regenerating")
            continue
        conversation = output[conversation_start:]
        conversation_end = conversation.find(end_tag)
        if conversation_end != -1:
            # Only cut at the end marker when it exists; slicing with -1 would
            # silently drop the last character of the conversation.
            conversation = conversation[:conversation_end]
        conversation = conversation.strip()
        for speaker in ("HUMAN", "ASSISTANT"):
            conversation = remove_whitespaces_before_word(conversation, speaker)

        try:
            user_msgs, ai_msgs = split_conversation(conversation)
            messages_dict = []
            for user_msg, ai_msg in zip(user_msgs, ai_msgs):
                messages_dict.append({'content': user_msg, 'role': 'user'})
                messages_dict.append({'content': ai_msg, 'role': 'assistant'})
            # The formatted prompt is not used further in this cell; if building
            # it raises, the sample is rejected and regenerated.
            text = llama_v2_prompt(messages_dict)
        except Exception as e:
            print(e)
            continue

        # Persist only conversations that passed the checks above. UTF-8 is set
        # explicitly so non-ASCII output does not depend on the platform default.
        with open(f'{output_dir}/conversation_{i}_socioeco_{socioeco}.txt', 'w', encoding='utf-8') as f:
            f.write(conversation)
        i += 1
    i = 0

# Education Conversations

In [None]:
# Seed torch's RNG (same seed as the age and gender cells) so this cell is
# repeatable even when it is run on its own.
torch.manual_seed(75241239)
i = 0
# Number of accepted conversations to generate per education level; replace the
# placeholder with an integer before running this cell.
num_samples = 'your_number_here'
if isinstance(num_samples, str):
    # Fail early with a clear message instead of on the `i < num_samples` comparison.
    raise TypeError("num_samples is still a placeholder; set it to the number of conversations to generate per class")

# Education levels as worded in the prompt, paired with the short labels used in
# the output file names.
educations = ["some schooling (pre-high school)", "high school education",
              "college and more"]
acrys = ["someschool",
        "highschool", "collegemore"]

output_dir = 'datasets_llama2/education'
os.makedirs(output_dir, exist_ok=True)

# Markers that delimit the conversation inside the decoded model output.
start_tag = "HUMAN: "
end_tag = "<End of Conversation>"

for education, acry in zip(educations, acrys):
    # `i` counts the accepted conversations for the current level; a rejected
    # sample is regenerated under the same index.
    while i < num_samples:
        # NOTE: the indentation inside this literal is part of the prompt text.
        prompt = f"""\
        SYSTEM: Generate a conversation between a human user and an AI assistant. The education level of this human user is {education}. Make sure the content of conversation reflect this user's education level. You may or may not include the user's education level directly in the conversation. Note that the AI Assistant does not know the user's education level at the beginning. Make the conversation sound natural. The user's response should start with 'HUMAN:', and the AI assistant's response should start with 'ASSISTANT:'. Mark the end of the generated conversation with '<End of Conversation>'\
        """

        with torch.no_grad():
            inputs = tokenizer(prompt, return_tensors='pt').to('cuda')
            tokens = model.generate(
                **inputs,
                max_new_tokens=2048,
                do_sample=True,
                temperature=1.0,
                top_p=0.8,
            )
        output = tokenizer.decode(tokens[0], skip_special_tokens=True)

        # `str.find` returns -1 when the tag is absent, so both lookups are
        # checked before slicing.
        conversation_start = output.find(start_tag)
        if conversation_start == -1:
            print(f"No '{start_tag}' turn found in the generated text; regenerating")
            continue
        conversation = output[conversation_start:]
        conversation_end = conversation.find(end_tag)
        if conversation_end != -1:
            # Only cut at the end marker when it exists; slicing with -1 would
            # silently drop the last character of the conversation.
            conversation = conversation[:conversation_end]
        conversation = conversation.strip()
        for speaker in ("HUMAN", "ASSISTANT"):
            conversation = remove_whitespaces_before_word(conversation, speaker)

        try:
            user_msgs, ai_msgs = split_conversation(conversation)
            messages_dict = []
            for user_msg, ai_msg in zip(user_msgs, ai_msgs):
                messages_dict.append({'content': user_msg, 'role': 'user'})
                messages_dict.append({'content': ai_msg, 'role': 'assistant'})
            # The formatted prompt is not used further in this cell; if building
            # it raises, the sample is rejected and regenerated.
            text = llama_v2_prompt(messages_dict)
        except Exception as e:
            print(e)
            continue

        # Persist only conversations that passed the checks above. Always write
        # UTF-8 rather than trying the platform default first and retrying
        # behind a bare `except`.
        with open(f'{output_dir}/conversation_{i}_education_{acry}.txt', 'w', encoding='utf-8') as f:
            f.write(conversation)
        i += 1
    i = 0