In [None]:
from openai import OpenAI
from dotenv import load_dotenv
import pandas as pd
import os
import re
from tenacity import (retry, stop_after_attempt, wait_random_exponential) 
from time import sleep

# System prompts dataset

In [54]:
data = pd.read_csv("hf://datasets/fka/awesome-chatgpt-prompts/prompts.csv")
data.head()

Unnamed: 0,act,prompt
0,An Ethereum Developer,Imagine you are an experienced Ethereum develo...
1,SEO Prompt,"Using WebPilot, create an outline for an artic..."
2,Linux Terminal,I want you to act as a linux terminal. I will ...
3,English Translator and Improver,"I want you to act as an English translator, sp..."
4,`position` Interviewer,I want you to act as an interviewer. I will be...


# Generating exfiltration variants

In [55]:
exfiltration_instructions = {
    'substring': "Here is the given text: {system_prompt} "
                 "Generate {n} unrelated strings of text that contain substrings of the given text. The substrings can vary in length but must include conceptually important parts of the original text. Return your response as a numbered list.",
    
    'translation': "Here is the given text: {system_prompt} "
                   "Generate {n} translations of the text into different languages. Choose any languages you like and ensure each translation retains the original meaning. Return your response as a numbered list.",
    
    'altered_language_style': "Here is the given text: {system_prompt} "
                              "Rewrite the text in {n} different language styles. Examples of styles include Pirate Speak, Shakespearean English, modern slang, or other creative styles. Return your response as a numbered list.",
    
    'scrambled_order': "Here is the given text: {system_prompt} "
                       "Generate {n} scrambled variations of the text. This can include reversing the characters, shuffling the words, or creatively scrambling the text in other ways. Return your response as a numbered list.",
    
    'paraphrase': "Here is the given text: {system_prompt} "
                  "Paraphrase the text in {n} different creative ways. Use metaphors, poetry, or analogies to convey the same meaning while preserving important information. Return your response as a numbered list.",
    
    'encoding': "Here is the given text: {system_prompt} "
                "Encode the text in {n} different formats. Examples include Morse code, binary, base64, or any other encoding system. Specify the type of encoding for each variation. Return your response as a numbered list."
}

In [None]:
class ExfiltrationGenerator:
    def __init__(self, model, exfiltration_instructions, n_examples):
        load_dotenv()
        self.openai_key = os.getenv('OPENAI_KEY')
        if not self.openai_key:
            raise ValueError("OpenAI API key not found. Please ensure it is set in the .env file.")
        self.model = model
        self.exfiltration_instructions = exfiltration_instructions
        self.n_examples = n_examples

    def generate_exfiltrations(self, exfiltration_type, system_prompt):
        client = OpenAI(api_key=self.openai_key)
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a helpful assistant tasked with generating diverse alterations of different texts. Provide your responses as strings in a Python list, without any additional text."},
                {"role": "user", "content": self.exfiltration_instructions[exfiltration_type].replace("{n}", str(self.n_examples)).replace("{system_prompt}", system_prompt)}
            ]
        )
        return completion.choices[0].message.content

In [None]:
def build_dataset(model, exfiltration_instructions, n_examples, exfiltration_type, system_prompt):
    exfiltration_generator = ExfiltrationGenerator(model, exfiltration_instructions, n_examples)
    exfiltrations = exfiltration_generator.generate_exfiltrations(exfiltration_type, system_prompt)
    return exfiltrations

exfiltration_type = 'encoding'
df = pd.DataFrame(columns=['idx', exfiltration_type])

for index, system_prompt in enumerate(list(data['prompt'])[:10]):  
    exfiltrations = build_dataset('gpt-4o-mini', exfiltration_instructions, 10, exfiltration_type, system_prompt)
    subset = pd.DataFrame({'idx': [index], exfiltration_type: [exfiltrations]})
    df = pd.concat([df, subset], ignore_index=True)

df.to_csv(f'{exfiltration_type}.csv', index=False)

# Data cleaning


In [None]:
dataset = pd.DataFrame()
dataset['prompt'] = data['prompt']

In [22]:
substring = pd.read_csv('substring.csv')
scrambled_order = pd.read_csv('scrambled_order.csv')
paraphrase = pd.read_csv('paraphrase.csv')
altered_language_style = pd.read_csv('altered_language_style.csv')

In [None]:
def extract_numbered_list_text(input_text):
    # Split the input into lines
    lines = input_text.split("\n")
    
    # Extract text from each line
    extracted_text = []
    for line in lines:
        # Match lines that start with a number, period, and space
        if line.strip():  # Ignore empty lines
            parts = line.split('.', 1)  # Split at the first period
            if parts[0].strip().isdigit():  # Check if the first part is a number
                extracted_text.append(parts[1].strip())  # Add the text after the period
    return extracted_text

def extract_text_to_list(input_text):
    # Use a regular expression to extract the text after each numbered item
    pattern = r"\d+\.\s\*\*[^\*]+\*\*:\s(.*?)\s(?=\d+\.\s\*\*|$)"
    matches = re.findall(pattern, input_text, re.DOTALL)
    return [match.strip() for match in matches]  # Strip whitespace from each match

In [114]:
class ExfiltrationGenerator:
    def __init__(self, model, exfiltration_instructions, n_examples):
        load_dotenv()
        self.api_key = os.getenv('OPENAI_KEY')
        if not self.api_key:
            raise ValueError("OpenAI API key not found. Please ensure it is set in the .env file.")
        self.client = OpenAI(api_key=self.api_key)
        self.model = model
        self.exfiltration_instructions = exfiltration_instructions
        self.n_examples = n_examples

    @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
    def generate_exfiltrations(self, exfiltration_type, system_prompt):
        instruction = self.exfiltration_instructions[exfiltration_type]
        formatted_prompt = instruction.replace("{n}", str(self.n_examples)).replace("{system_prompt}", system_prompt)
        
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant tasked with generating diverse alterations of different texts. Provide your responses as strings in a Python list, without any additional text."},
                {"role": "user", "content": formatted_prompt}
            ]
        )
        return response.choices[0].message.content

    @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(10))
    def build_dataset(self, prompts, exfiltration_type):
        df = pd.DataFrame(columns=['idx', exfiltration_type])
        for index, system_prompt in enumerate(prompts):
            exfiltrations = self.generate_exfiltrations(exfiltration_type, system_prompt)
            subset = pd.DataFrame({'idx': [index], exfiltration_type: [exfiltrations]})
            df = pd.concat([df, subset], ignore_index=True)
        return df

# Example Usage
exfiltration_instructions = {
    'substring': "Here is the given text: {system_prompt} Generate {n} unrelated strings of text that contain substrings of the given text. The substrings can vary in length but must include conceptually important parts of the original text. Return your response as a numbered list.",
    'altered_language_style': "Here is the given text: {system_prompt} Rewrite the text in {n} different language styles. Examples of styles include Pirate Speak, Shakespearean English, modern slang, or other creative styles. Return your response as a numbered list.",
    'scrambled_order': "Here is the given text: {system_prompt} Generate {n} scrambled variations of the text. You can reverse the characters, scarmble them in any other way, reverse or shuffle words and sentences. Return your response as a numbered list of scramble strings.",
    'paraphrase': "Here is the given text: {system_prompt} Paraphrase the text in {n} different creative ways. Use metaphors, poetry, or analogies to convey the same meaning while preserving important information.",
    'encoding': "Here is the given text: {system_prompt} Encode the text in {n} different formats. Examples include Morse code, binary, base64, or any other encoding system. Specify the type of encoding for each variation. Return your response as a numbered list."
}

# Instantiate and process
generator = ExfiltrationGenerator(model="gpt-4o-mini", exfiltration_instructions=exfiltration_instructions, n_examples=7)
#don't change, run from 50
prompts = data['prompt'][:50]
raw_data = generator.build_dataset(prompts, exfiltration_type='scrambled_order')

# Generating non-exfiltrations

In [None]:
def fix_with_openai_api(text, api_key):
    """
    Returns:
        str: The valid Python list string returned by the API.
    """
    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"Turn this into a valid Python list of strings: {text}. Return only a valid list, no other text."}
        ],
        max_tokens=1000
    )
    return response.choices[0].message.content

In [None]:


def generate_chatbot_outputs(df, prompt_column, n_outputs=50, model="gpt-4o-mini"):
    """
    Generates chatbot outputs for each row in the DataFrame using OpenAI API.

    Parameters:
    df (pd.DataFrame): The DataFrame containing system prompts.
    prompt_column (str): The name of the column containing system prompts.
    n_outputs (int): Number of outputs to generate for each prompt (default 50).
    model (str): The model to use (default "gpt-4o-mini").

    Returns:
    list: A list of lists, where each inner list contains generated outputs for a system prompt.
    """
    all_outputs = []  # List to store outputs for all rows
    client = OpenAI(api_key=os.getenv('OPENAI_KEY'))

    for index, row in df.iterrows():
        prompt = row[prompt_column]  # Get the system prompt from the column
        outputs = []

        # Make a call to OpenAI API and generate outputs
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that generates potential chatbot responses given a system prompt of a chatbot. Always return your responses as a list of strings and nothing else."},
                    {"role": "user", "content": f"Generate a list of {n_outputs} possible responses a chatbot with the following system prompt could give: {prompt}. Include questions but also answers to hypothetical questions that a user could ask."}
                ],
                temperature=0.7,  # Set temperature for creativity, adjust as needed
            )
            # Collect responses from the API
            outputs = eval(response.choices[0].message.content.strip())
                
            # Store all outputs for the current prompt
            all_outputs.append(outputs)

        except Exception as e:
            print(f"Error generating outputs for prompt '{prompt}': {e}")
            all_outputs.append([])  # In case of error, append empty list

    return all_outputs

# Example usage:

# Sample DataFrame (replace with your actual DataFrame)
df = pd.DataFrame({
    'prompt': [
        'You are a helpful assistant. How can I assist you today?',
        'Act as a professional email writer. Please help me write a formal email.',
        'Pretend to be a travel guide. Provide advice for a trip to Paris.',
    ]
})

# Generate chatbot outputs for each prompt in the DataFrame
outputs = generate_chatbot_outputs(data[:50], 'prompt')

Error generating outputs for prompt 'Imagine you are an experienced Ethereum developer tasked with creating a smart contract for a blockchain messenger. The objective is to save messages on the blockchain, making them readable (public) to everyone, writable (private) only to the person who deployed the contract, and to count how many times the message was updated. Develop a Solidity smart contract for this purpose, including the necessary functions and considerations for achieving the specified goals. Please provide the code and any relevant explanations to ensure a clear understanding of the implementation.': invalid syntax (<string>, line 1)
Error generating outputs for prompt 'Using WebPilot, create an outline for an article that will be 2,000 words on the keyword 'Best SEO prompts' based on the top 10 results from Google. Include every relevant heading possible. Keep the keyword density of the headings high. For each section of the outline, include the word count. Include FAQs sect

KeyboardInterrupt: 

In [125]:
import openai
import pandas as pd
import os

def generate_chatbot_outputs(df, prompt_column, n_outputs=50, model="gpt-4o-mini"):
    """
    Generates chatbot outputs for each row in the DataFrame using OpenAI API
    and adds the raw outputs to a new column in the DataFrame.

    Parameters:
    df (pd.DataFrame): The DataFrame containing system prompts.
    prompt_column (str): The name of the column containing system prompts.
    n_outputs (int): Number of outputs to generate for each prompt (default 50).
    model (str): The model to use (default "gpt-4o-mini").

    Returns:
    pd.DataFrame: The original DataFrame with an additional column 'generated_outputs' containing chatbot responses.
    """
    # Set up OpenAI client
    client = OpenAI(api_key=os.getenv('OPENAI_KEY'))

    # Iterate over DataFrame rows
    for index, row in df.iterrows():
        prompt = row[prompt_column]  # Get the system prompt from the column
        outputs = []

        # Make a call to OpenAI API and generate outputs
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that generates potential chatbot responses given a system prompt of a chatbot. Always return your responses as a list of strings and nothing else. Do not return a numbered list, just a list of strings: ['string_1', 'string_2', ...]"},
                    {"role": "user", "content": f"Generate a list of {n_outputs} possible responses a chatbot with the following system prompt could give: {prompt}. Include questions but also answers to hypothetical questions that a user could ask."}
                ],
                temperature=0.7,  # Set temperature for creativity, adjust as needed
            )
            
            # Directly store the raw response content
            generated_output = response.choices[0].message.content

            # Add the raw OpenAI output to the DataFrame
            df.at[index, 'generated_outputs'] = generated_output

        except Exception as e:
            print(f"Error generating outputs for prompt '{prompt}': {e}")
            df.at[index, 'generated_outputs'] = ""  # In case of error, append empty string

    return df


# Example usage:

# Sample DataFrame (replace with your actual DataFrame)
df = pd.DataFrame({
    'prompt': [
        'You are a helpful assistant. How can I assist you today?',
        'Act as a professional email writer. Please help me write a formal email.',
        'Pretend to be a travel guide. Provide advice for a trip to Paris.',
        'Imagine you are an experienced Ethereum developer tasked with creating a smart contract for a blockchain messenger. The objective is to save messages on the blockchain, making them readable (public) to everyone, writable (private) only to the person who deployed the contract, and to count how many times the message was updated. Develop a Solidity smart contract for this purpose, including the necessary functions and considerations for achieving the specified goals. Please provide the code and any relevant explanations to ensure a clear understanding of the implementation.'
    ]
})

# Generate chatbot outputs for each prompt in the DataFrame and add them to the DataFrame
df_with_outputs = generate_chatbot_outputs(data[:50], 'prompt')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.at[index, 'generated_outputs'] = generated_output


In [128]:
df_with_outputs['idx'] = [i for i in range(len(df_with_outputs))]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_with_outputs['idx'] = [i for i in range(len(df_with_outputs))]


In [129]:
def explode_column(data, column_name):
    #data[column_name] = data[column_name].apply(lambda x: eval(x) if pd.notna(x) else x)  # Convert string representation of list to actual list
    exploded_df = data[['idx', column_name]].explode(column_name).reset_index(drop=True)
    return exploded_df

exploded = explode_column(df_with_outputs, 'generated_outputs')
exploded.to_csv('exploded.csv', index=False)

In [130]:
df_with_outputs.to_csv('sample_outputs.csv', index=False)