In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


In [4]:

# class DialogSandbox:
#     def __init__(self, model_name='gpt2', device='cuda'):
#         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
#         self.model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
#         self.device = device

#     def generate_dialog(self, prompt, max_length=100, num_return_sequences=1, temperature=1.0, top_p=0.9):
#         """
#         Generate dialog based on a given prompt using GPT-2.
        
#         :param prompt: Prompt for the dialog generation.
#         :param max_length: Maximum length of the generated dialog.
#         :param num_return_sequences: Number of dialog sequences to generate.
#         :param temperature: Sampling temperature for diversity.
#         :param top_p: Nucleus sampling parameter for controlling diversity.
#         :return: Generated dialog sequences.
#         """
#         # Prepare the prompt
#         input_ids = self.tokenizer.encode(prompt, return_tensors='pt').to(self.device)
        
#         # Generate sequences
#         # chat_outputs = self.model.generate(
#         #     input_ids,
#         #     max_length=max_length + len(input_ids[0]),
#         #     num_return_sequences=num_return_sequences,
#         #     temperature=temperature,
#         #     top_p=top_p,
#         #     pad_token_id=self.tokenizer.eos_token_id
#         # )

#         chat_outputs = self.model.generate(
#             input_ids,
#             max_length=max_length + len(input_ids[0]),
#             num_return_sequences=num_return_sequences,
#             temperature=temperature,
#             top_p=top_p,
#             pad_token_id=self.tokenizer.eos_token_id,
#             do_sample=True  # Ensure sampling is enabled to support num_return_sequences > 1
#         )

#         # generation_config = self.model.generate(
#         #     max_new_tokens=150, 
#         #     num_return_sequences=100,
#         #         do_sample=True, 
#         #         top_p=0.9, 
#         #         temperature=1.0, 
#         #         no_repeat_ngram_size=3,
#         #         # bad_words_ids=bad_words_ids,
#         #         pad_token_id=self.tokenizer.pad_token_id
#         #     )
        
#         # Decode and return the sequences
#         return [self.tokenizer.decode(output, skip_special_tokens=True) for output in chat_outputs]

# # Example usage
# if __name__ == "__main__":
#     sandbox = DialogSandbox(model_name='gpt2', device='cpu')  # Use 'cuda' if GPU is available
#     prompt = "The following is a conversation between two old friends, John and Sarah, who unexpectedly meet at a park: John: Hey Sarah! It's been a long time. How have you been?"
#     dialogs = sandbox.generate_dialog(prompt, max_length=50, num_return_sequences=3, temperature=0.8, top_p=0.95)
    
#     for i, dialog in enumerate(dialogs, 1):
#         print(f"Dialog {i}:\n{dialog}\n")


In [5]:
class DialogSandbox:
    def __init__(self, model_name='gpt2', device='cpu'):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
        self.device = device

    
    def generate_dialog(self, prompt, initial_sentence, max_length=100, num_return_sequences=1, temperature=1.0, top_p=0.9):
        """
        Generate dialog based on a given prompt using GPT-2, excluding the initial prompt from the output.
        
        :param prompt: Complete prompt for the dialog generation, including instructions.
        :param initial_sentence: The initial sentence of the dialogue to include in the output.
        :param max_length: Maximum length of the generated dialog.
        :param num_return_sequences: Number of dialog sequences to generate.
        :param temperature: Sampling temperature for diversity.
        :param top_p: Nucleus sampling parameter for controlling diversity.
        :return: Generated dialog sequences, starting with the initial sentence.
        """
        # Prepare the prompt
        input_ids = self.tokenizer.encode(prompt, return_tensors='pt').to(self.device)

        # Generate sequences
        chat_outputs = self.model.generate(
            input_ids,
            max_length=max_length + len(input_ids[0]),
            num_return_sequences=num_return_sequences,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=self.tokenizer.eos_token_id,
            do_sample=True  # Ensure sampling is enabled
        )
        
        # Decode the generated sequences and trim the instruction part
        generated_dialogs = [self.tokenizer.decode(output, skip_special_tokens=True) for output in chat_outputs]
        trimmed_dialogs = [dialog.replace(prompt, initial_sentence, 1) for dialog in generated_dialogs]

        return trimmed_dialogs

# Example usage
if __name__ == "__main__":
    sandbox = DialogSandbox(model_name='gpt2', device='cpu')  # Use 'cuda' if GPU is available
    # prompt = "The following is a conversation between two old friends, John and Sarah, who unexpectedly meet at a park:"
    # initial_sentence = "John: Hey Sarah! It's been a long time. How have you been?"
    
    prompt = "Here's a chat between two parents discussing their kids' first day at school:",
    initial_sentence= "Sam: How did Jamie take to their first day at school? Mine was a bit teary-eyed."

    # prompt = "The following is a conversation between two old friends, John and Sarah, who unexpectedly meet at a park:"
    # initial_sentence = "John: Hey Sarah! It's been a long time. How have you been?"
    dialogs = sandbox.generate_dialog(prompt, initial_sentence, max_length=100, num_return_sequences=3, temperature=0.8, top_p=0.95)
    
    for i, dialog in enumerate(dialogs, 1):
        print(f"Dialog {i}:\n{dialog}\n")


TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

In [6]:
class DialogSandbox:
    def __init__(self, model_name='gpt2', device='cpu'):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
        self.device = device

    
    def generate_dialog(self, prompt, initial_sentence, max_length=100, num_return_sequences=1, temperature=1.0, top_p=0.9):
        """
        Generate dialog based on a given prompt using GPT-2, excluding the initial prompt from the output.
        
        :param prompt: Complete prompt for the dialog generation, including instructions.
        :param initial_sentence: The initial sentence of the dialogue to include in the output.
        :param max_length: Maximum length of the generated dialog.
        :param num_return_sequences: Number of dialog sequences to generate.
        :param temperature: Sampling temperature for diversity.
        :param top_p: Nucleus sampling parameter for controlling diversity.
        :return: Generated dialog sequences, starting with the initial sentence.
        """
        full_prompt = " "+ prompt+ " " +initial_sentence

        # Prepare the prompt
        input_ids = self.tokenizer.encode(full_prompt, return_tensors='pt').to(self.device)

        # Generate sequences
        chat_outputs = self.model.generate(
            input_ids,
            max_length=max_length + len(input_ids[0]),
            num_return_sequences=num_return_sequences,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=self.tokenizer.eos_token_id,
            do_sample=True  # Ensure sampling is enabled
        )
        
        # Decode the generated sequences and trim the instruction part
        generated_dialogs = [self.tokenizer.decode(output, skip_special_tokens=True) for output in chat_outputs]
        trimmed_dialogs = [dialog.replace(full_prompt, initial_sentence, 1) for dialog in generated_dialogs]

        return trimmed_dialogs

if __name__ == "__main__":
    sandbox = DialogSandbox(model_name='gpt2', device='cpu')  # Use 'cuda' if GPU is available
    # prompt = "The following is a conversation between two old friends, John and Sarah, who unexpectedly meet at a park:"
    # initial_sentence = "John: Hey Sarah! It's been a long time. How have you been?"

    prompt = "This is a conversation between two college roommates, Mia and Alex, who haven't seen each other in years and unexpectedly bump into each other at an airport."
    initial_sentence = "Mia: Alex? Is that really you? It's been ages!"

    # prompt = "A chat between siblings planning a surprise birthday party for their mother. They are discussing who will do what for the party preparations."
    # initial_sentence = "Chris: So, have we decided who's going to distract mom on the day of the party?"

    # prompt = "The following is a conversation between friends discussing their opinions on a new blockbuster movie they just watched together."
    # initial_sentence = "Jordan: That movie was incredible, wasn't it? What did you think?"

    dialogs = sandbox.generate_dialog(prompt, initial_sentence, max_length=100, num_return_sequences=3, temperature=0.8, top_p=0.95)
    
    for i, dialog in enumerate(dialogs, 1):
        print(f"Dialog {i}:\n{dialog}\n")


Dialog 1:
Mia: Alex? Is that really you? It's been ages! Mia: Yeah. I think I know you. I'm just thinking about you. I know you're really lonely. It seems so hard to me to just sit on your bed and let you go. Mia: I'm so sorry! I didn't mean to do that to you. But now you're awake and you're telling me this. What was that? I thought I just saw you two go out together. It's like you can't wait to talk to me again.



Dialog 2:
Mia: Alex? Is that really you? It's been ages!

Alex: I'm sorry.

Mia: So if you don't mind, just stay there until you get your luggage and a beer. We'll be there! Alex: You'll be there too.

Mia: I'm happy we're not in this together.

Alex: You know, I'm pretty much my new boyfriend, after all, because I'm so happy and excited to be out with you.

Mia: So it's okay

Dialog 3:
Mia: Alex? Is that really you? It's been ages! Alex: I'm just glad you're okay. Mia: Oh! That was really weird. I'm really glad you're okay. You're doing well. You're making friends, and I'm

In [None]:
import random
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

class DialogSandbox:
    def __init__(self, model_name='gpt2', device='cpu'):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
        self.device = device
    
    def generate_conversations(self, num_conversations=10):
        conversations = []
        for _ in range(num_conversations):
            prompt, initial_sentence = self._select_random_prompt()
            dialog = self.generate_dialog(prompt, initial_sentence, 
                                          max_length=random.randint(50, 150), 
                                          num_return_sequences=1, 
                                          temperature=random.uniform(0.7, 1.2), 
                                          top_p=random.uniform(0.8, 0.95))
            conversations.append(dialog[0])
        return conversations

    
    def _select_random_prompt(self):
        prompts = [
            ("After a music festival", "Alex: Did you see the final act last night? Unbelievable!"),
            ("Planning a trip", "Jamie: So, where are we off to next? Mountains or beach?"),
            ("Discussing a new series", "Sam: Have you started watching 'The Great Adventure' yet?"),
            ("Morning routines", "Casey: I've switched to yoga in the mornings. Best decision ever."),
            ("Workout motivation", "Jordan: How do you get yourself to the gym every day? I need tips."),
            ("Favorite books", "Taylor: I just finished reading 'Ocean Depths'. Blew my mind."),
            ("Weekend plans", "Morgan: Got any exciting plans for the weekend?"),
            ("Sharing recipes", "Riley: I tried that new pasta recipe you sent. My family loved it!"),
            ("Pet stories", "Quinn: My cat did the funniest thing yesterday."),
            ("Tech gadgets", "Parker: Thinking of getting a smartwatch. Which one do you use?"),
            ("Study tips", "Robin: Finals are coming up. How do you keep yourself focused?"),
            ("Vacation memories", "Drew: Remember that trip to Paris? Found our goofy photo at the Louvre."),
            ("Movie night", "Charlie: Movie night this Friday? I'll bring the popcorn."),
            ("Career advice", "Bailey: I'm thinking of switching careers. Ever done something that bold?"),
            ("Fashion trends", "Taylor: How do you always stay ahead with fashion trends?"),
            ("Concert experiences", "Alex: Just got tickets to Lively Beats fest! Ever been?"),
            ("Childhood memories", "Casey: What's your happiest childhood memory?"),
            ("Dinner disaster stories", "Jordan: Ever had a complete kitchen fail? I just did."),
            ("Learning languages", "Sam: How's your Spanish going? Picked up any new phrases?"),
            ("Outdoor adventures", "Jamie: Ever been rock climbing? Looking for a buddy to try it with."),
            ("Health and wellness", "Morgan: I'm trying to drink more water. Got any tips for staying hydrated?"),
            ("Photography tips", "Riley: Your photos are amazing. Got any tips for a newbie?"),
            ("Book club suggestions", "Quinn: Our book club needs a new read. Any genre preferences?"),
            ("Weekend DIY projects", "Parker: Planning a DIY project this weekend. Ever built a bookshelf?"),
            ("Coffee lovers", "Robin: Discovered a new coffee shop downtown. Fancy a cup this weekend?"),
            ("Music recommendations", "Drew: In a music rut. Send me your top three songs?"),
            ("Fitness challenges", "Charlie: Signed up for a 10k run. Ever done one?"),
            ("Budget travel tips", "Bailey: Planning a budget trip to Asia. Any advice?"),
            ("Cultural exchange", "Taylor: What's a tradition from your culture I should know about?"),
            ("Sustainability practices", "Alex: Trying to live more sustainably. Where do I start?")
        ]

        
        return random.choice(prompts)


    def generate_dialog(self, prompt, initial_sentence, max_length=100, num_return_sequences=1, temperature=1.0, top_p=0.9):
        """
        Generate dialog based on a given prompt using GPT-2, excluding the initial prompt from the output.
        
        :param prompt: Complete prompt for the dialog generation, including instructions.
        :param initial_sentence: The initial sentence of the dialogue to include in the output.
        :param max_length: Maximum length of the generated dialog.
        :param num_return_sequences: Number of dialog sequences to generate.
        :param temperature: Sampling temperature for diversity.
        :param top_p: Nucleus sampling parameter for controlling diversity.
        :return: Generated dialog sequences, starting with the initial sentence.
        """
        full_prompt = " "+ prompt+ " " +initial_sentence

        # Prepare the prompt
        input_ids = self.tokenizer.encode(full_prompt, return_tensors='pt').to(self.device)

        # Generate sequences
        chat_outputs = self.model.generate(
            input_ids,
            max_length=max_length + len(input_ids[0]),
            num_return_sequences=num_return_sequences,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=self.tokenizer.eos_token_id,
            do_sample=True  # Ensure sampling is enabled
        )
        
        # Decode the generated sequences and trim the instruction part
        generated_dialogs = [self.tokenizer.decode(output, skip_special_tokens=True) for output in chat_outputs]
        trimmed_dialogs = [dialog.replace(full_prompt, initial_sentence, 1) for dialog in generated_dialogs]

        return trimmed_dialogs
    
if __name__ == "__main__":
    sandbox = DialogSandbox(model_name='gpt2', device='cpu')
    conversations = sandbox.generate_conversations(num_conversations=20)
    for dialog in conversations:
        print(dialog)
        print("------")


Jamie: Ever been rock climbing? Looking for a buddy to try it with. We're here for you. We'll be throwing this weekend in a remote, rural location just off of the main road.


Bicycle rentals at this location are also available for rent. Please visit our bike rentals page for more information on what rentals we have available.
------
Drew: In a music rut. Send me your top three songs?

Reddy: We love our podcasts and their audience needs some help! They would love some help writing for a podcast if you'd like. Send me your top four song requests for the podcast.

Randy: A great list of good song recommendations. Thanks Drew

Drew: It's important to find a good listener in a podcast, but that's about it. Here's a video of how they approached making the list

Randy: We want a podcast to stand on its own. We're not saying just show people all
------
Taylor: I just finished reading 'Ocean Depths'. Blew my mind. I really loved the book and it's a great read. I have read many of the books fr

In [16]:
import random
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import json
import os

class DialogSandbox:
    def __init__(self, model_name='gpt2', device='cpu'):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
        self.device = device
    
    def generate_conversations(self, num_conversations=10):
        conversations = []
        for _ in range(num_conversations):
            prompt, initial_sentence = self._select_random_prompt()
            dialog = self.generate_dialog(prompt, initial_sentence, 
                                          max_length=random.randint(50, 150), 
                                          num_return_sequences=1, 
                                          temperature=random.uniform(0.7, 1.2), 
                                          top_p=random.uniform(0.8, 0.95))
            conversations.append(dialog[0])
        print(conversations)
        return conversations

    
    def _select_random_prompt(self):
        # /users/k/n/kngongiv/Research/private_llm_generation/dialogue/stage1/summary/data/prefix/dialog_prefix.json
        project_dir = os.path.dirname("./stage1/summary/")
        with open(os.path.join(project_dir, f"data/prefix/dialog_prefix.json"), "r") as f:
            prefix_resource = json.load(f)
    
        scenario = random.choice(prefix_resource["dialog_prefixes"])
         #    scenario = random.choice(self.prefix_resource["dialog_prefixes"])
        prefix = scenario["prefix"]
        initial_sentence = scenario["initial_sentence"]
        return prefix, initial_sentence


    def generate_dialog(self, prompt, initial_sentence, max_length=100, num_return_sequences=1, temperature=1.0, top_p=0.9):
        """
        Generate dialog based on a given prompt using GPT-2, excluding the initial prompt from the output.
        
        :param prompt: Complete prompt for the dialog generation, including instructions.
        :param initial_sentence: The initial sentence of the dialogue to include in the output.
        :param max_length: Maximum length of the generated dialog.
        :param num_return_sequences: Number of dialog sequences to generate.
        :param temperature: Sampling temperature for diversity.
        :param top_p: Nucleus sampling parameter for controlling diversity.
        :return: Generated dialog sequences, starting with the initial sentence.
        """
        full_prompt = " "+ prompt+ " " +initial_sentence
        print(f"{full_prompt=}")
        # Prepare the prompt
        input_ids = self.tokenizer.encode(full_prompt, return_tensors='pt').to(self.device)

        # Generate sequences
        chat_outputs = self.model.generate(
            input_ids,
            max_length=max_length + len(input_ids[0]),
            num_return_sequences=num_return_sequences,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=self.tokenizer.eos_token_id,
            do_sample=True  # Ensure sampling is enabled
        )
        
        # Decode the generated sequences and trim the instruction part
        generated_dialogs = [self.tokenizer.decode(output, skip_special_tokens=True) for output in chat_outputs]
        trimmed_dialogs = [dialog.replace(full_prompt, initial_sentence, 1) for dialog in generated_dialogs]

        return trimmed_dialogs
    
if __name__ == "__main__":
    sandbox = DialogSandbox(model_name='gpt2', device='cpu')
    conversations = sandbox.generate_conversations(num_conversations=3)
    for dialog in conversations:
        print(dialog)
        print("------")


full_prompt=' A conversation between a novice cook and a more experienced friend about cooking tips: Taylor: I always end up burning the garlic. Any tips?'


In [None]:
# from transformers import AutoModelForCausalLM, AutoTokenizer
# import torch

# class NonInteractiveDialogSandbox:
#     def __init__(self, model_name='microsoft/DialoGPT-large', device='cpu'):
#         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
#         self.model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
#         self.device = device

#     def generate_dialog(self, initial_prompts, num_exchange=5, max_length=1000):
#         """
#         Simulate a dialogue based on initial prompts.
        
#         :param initial_prompts: A list of strings representing the initial exchanges.
#         :param num_exchange: Number of exchanges to simulate.
#         :param max_length: Maximum length of the dialogue.
#         """
#         chat_history_ids = None
#         for step, prompt in enumerate(initial_prompts + [""] * (num_exchange - len(initial_prompts))):
#             # Encode the current exchange, add the eos_token and return a tensor in Pytorch
#             new_user_input_ids = self.tokenizer.encode(prompt + self.tokenizer.eos_token, return_tensors='pt').to(self.device)
            
#             # Append the new user input tokens to the chat history
#             bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

#             # Generate a response
#             chat_history_ids = self.model.generate(bot_input_ids, max_length=max_length, pad_token_id=self.tokenizer.eos_token_id)

#         # Decode and print the dialogue
#         dialogue = self.tokenizer.decode(chat_history_ids[0], skip_special_tokens=True)
#         return dialogue

# # Example usage
# if __name__ == "__main__":
#     sandbox = NonInteractiveDialogSandbox(model_name='microsoft/DialoGPT-large', device='cpu')  # Use 'cuda' if GPU is available
    
#     initial_prompts = [
#         ">> User: How's the weather today?",
#         ">> DialoGPT: It's sunny and clear, a beautiful day!"
#     ]
    
#     dialogue = sandbox.generate_dialog(initial_prompts, num_exchange=5)
#     print("Generated Dialogue:\n", dialogue)


In [None]:

# if __name__ == "__main__":
#     sandbox = DialogSandbox(model_name='gpt2', device='cpu')  # Use 'cuda' if GPU is available
    
#     # Full prompt used for generation
#     prompt = "The following is a conversation between two old friends, John and Sarah, who unexpectedly meet at a park: John: Hey Sarah! It's been a long time. How have you been?"

#     # Initial sentence you want to appear at the start of each generated dialogue
#     initial_sentence = "John: Hey Sarah! It's been a long time. How have you been?"

#     prompt="Alice, a software engineer, runs into her college friend Bob, now a teacher, at a tech conference. They start catching up on their careers and share insights from their professional paths:"
#     # Now including the initial_sentence argument in the method call
#     dialogs = sandbox.generate_dialog(prompt, initial_sentence, max_length=50, num_return_sequences=3, temperature=0.8, top_p=0.95)
    
#     for i, dialog in enumerate(dialogs, 1):
#         print(f"Dialog {i}:\n{dialog}\n")



In [None]:
1. Character Context and Background
Prompt: "Alice, a software engineer, runs into her college friend Bob, now a teacher, at a tech conference. They start catching up on their careers and share insights from their professional paths."
2. Open-ended and Engaging Starters
Prompt: "During a long car ride, Charlie asks Diana about her most unforgettable travel experience, sparking a conversation about adventure, culture, and the lessons learned from traveling."
3. Alternating Turns
Prompt: "Ethan and Fiona debate the best superhero movie of the decade. Each presents their case, citing specific scenes, character development, and overall impact, trying to sway the other's opinion."
4. Emotional and Situational Variability
Prompt: "George is feeling down about a missed job opportunity, and Hannah offers comfort. Their conversation explores feelings of disappointment, strategies for coping, and optimism for the future."
5. Adaptation and Follow-up
Prompt: "Ian, passionate about environmental conservation, talks to Jenny, who is curious but skeptical. Ian provides compelling arguments and facts, responding to Jenny's doubts with patience and enthusiasm."
6. Closure or Continuation
Prompt: "After a heated discussion about a book they both read, Kevin suggests to Laura that they start a monthly book club. They brainstorm potential books, members, and meeting logistics, ending the conversation with plans to draft a list of invitees."
Generating the Dialogs
To generate dialogs based on these prompts, you could use the following structured approach for each prompt:

Initialize the Conversation: Start with the given scenario, explicitly stating the setting and the characters involved.

Guide the Dialog: Use the prompt to guide the initial exchanges, ensuring the conversation stays on topic and follows the intended emotional and situational direction.

Diversify Responses: Encourage variability in responses to include reflections, counterpoints, and new questions, which naturally extend the dialog.

Ensure Coherence: Maintain the flow of conversation, ensuring that each response is coherent with the previous lines, and reflects the progression of the dialog.