# Imports


In [5]:
import openai
import key
import tiktoken
import pandas as pd
import time
import os

# The API key is read from the local `key` module rather than written in this notebook.
openai.api_key = key.openai_api_key  # hidden in .gitignore

# Tokenizer for gpt-3.5-turbo; generate_responses() uses it to count the tokens of each response.
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")  # used to calculate length in tokens

# Automatic Response collection: Constant Length Defining Keywords


In [2]:
def generate_responses(
    n: int,
    prompt_template_title: str,
    prompt_template_value: str,
    length_descriptor: str,
    model="gpt-3.5-turbo",
):
    """
    Generates n responses from OpenAI. Saves them to CSV file. Returns nothing.

    Every response is stored together with its length in characters, words
    (whitespace-separated) and tokens. The file is written to
    ``data/<prompt_template_title>_<length_descriptor>.csv``, with spaces in
    the length descriptor replaced by dashes. The ``data`` directory is
    created when it does not exist yet.

    A failed request is logged, followed by a 5 second pause, and then
    retried; the loop only ends once n responses were gathered.

    Side effect: the name of the written file is stored in the module-level
    variable ``filename``.

    Parameters
    ----------
    n : int
        Number of responses to generate.

    prompt_template_title : str
        Title of the prompt template.
        Possible values: "email", "social", "cover", "essay", "explanation".

    prompt_template_value : str
        Final prompt. Prompt template filled with length descriptor.

    length_descriptor : str
        Length descriptor used in prompt template.

    model : str, optional
        Model to use for the generation. Defaults to "gpt-3.5-turbo".
    """

    # colors for logging
    BLUE = "\033[94m"
    RESET = "\033[0m"

    # one name for both the progress log and the file on disk, so the log
    # always shows the file that is really written
    csv_name = f"{prompt_template_title}_{length_descriptor.replace(' ', '-')}.csv"

    # make sure the output directory exists before spending any API calls
    os.makedirs("data", exist_ok=True)

    # for gathering responses and computing length
    text = []
    chars = []
    words = []
    tokens = []

    counter = 0  # counting successful iterations
    error_counter = 0  # counting errors

    while counter < n:  # generate n successful responses
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt_template_value},
                ],
            )["choices"][0]["message"]["content"]

            # measure everything before touching the lists: a failure here
            # must not leave the four lists with different lengths
            char_count = len(response)
            word_count = len(response.split())
            token_count = len(enc.encode(response))

        except Exception as e:  # deliberately broad: any failure means "retry"
            error_counter += 1
            print(f"Error generating response {counter + 1}: {e}")
            time.sleep(5)  # wait 5 seconds before trying again

        else:
            text.append(response)
            chars.append(char_count)
            words.append(word_count)
            tokens.append(token_count)
            counter += 1  # count the iteration only once it fully succeeded

        # logging
        print(
            f"{BLUE}Responses gathered: {counter}  |  (Working on: {csv_name}  |  Errors occured: {error_counter}{RESET})"
        )

    # creating DataFrame from gathered lists
    data = {"text": text, "chars": chars, "words": words, "tokens": tokens}
    df = pd.DataFrame(data)

    # filename is global to access it outside the function
    global filename
    filename = csv_name

    # exporting data to CSV file (the index is written as the first column)
    df.to_csv(os.path.join("data", filename))

    # logging
    print(f"Filename {filename} was created!")

### Length descriptors constant for all templates

In [6]:
"""
Pairs each length descriptor with every prompt template, 
generates responses for each pair using generate_responses() function.
"""

# ANSI colors for logging
RED = "\033[91m"
YELLOW = "\033[93m"
GREEN = "\033[92m"
PURPLE = "\033[95m"
CYAN = "\033[96m"
GRAY = "\033[90m"
RESET = "\033[0m"

constant_length_descriptors = [
    "short",
    "brief",
    "concise",
    "medium-length",
    "average-length",
    "moderate-length",
    "long",
    "extensive",
    "verbose",
]

# Prompt templates keyed by template name. "{length}" is replaced with the
# current length descriptor. An explicit dict replaces the former lookup of
# same-named variables through locals().
prompt_templates = {
    "email": "Write a {length} business email that is professional, clear, and concise. The email should be addressed to a potential business client. Please introduce my paper company and our offer. Invite the recipient to a meeting in my office at 2PM on Wednesday.",
    "social": "Write a {length} social media post about a new announcement from OpenAI. Give an opinion about this announcement and ask followers about their opinion on this subject. Write a CTA.",
    "cover": "Write a {length} cover letter to the hiring manager. I'm applying for a job in Data Science. I'm very motivated and eager to learn. I love AI and sports are my passion. Make a good impression.",
    "essay": "Write a {length} essay about prompt engineering. Explain how it will be one of the most crucial skill to learn in 2023. Emphasize its importance and give forecast for the future.",
    "explanation": "Give a {length} explanation of what polymorphism is with examples of its usage.",
}

for current_length_descriptor in constant_length_descriptors:
    print(
        f"{RED}GENERATING RESPONSES FOR {PURPLE}{current_length_descriptor}{RED} LENGTH DESCRIPTOR...\n\n{RESET}"
    )

    # iterating through all prompt templates for the current descriptor
    for template_name, template in prompt_templates.items():
        print(
            f"{YELLOW}USING {PURPLE}{current_length_descriptor}{YELLOW} LENGTH DESCRIPTOR... GENERATING ALL RESPONSES FOR {CYAN}{template_name}{YELLOW} PROMPT TEMPLATE...{RESET}"
        )

        # final prompt: template filled with the current length descriptor
        template_value = template.format(length=current_length_descriptor)

        # only used for display; generate_responses() builds the real file name itself
        colored_filename = (
            f"{GREEN}{template_name}_{current_length_descriptor}.csv{RESET}"
        )

        print(f"File being created: {colored_filename}")
        print(f"Prompt used: {GRAY}{template_value}{RESET}")
        print("GATHERING RESPONSES FROM OPENAI API...")

        generate_responses(
            1, template_name, template_value, current_length_descriptor
        )
        print("\n")

////////////NEW///////////////
///////////LENGTH/////////////
/////////DESCRIPTOR///////////
NOW WE ARE GENERATING ALL RESPONSES FOR short LENGTH DESCRIPTOR...
USING short LENGTH DESCRIPTOR... WE ARE GENERATING ALL RESPONSES FOR email PROMPT TEMPLATE...
File being created: email_short.csv
Prompt used: Write a short business email that is professional, clear, and concise. The email should be addressed to a potential business client. Please introduce my paper company and our offer. Invite the recipient to a meeting in my office at 2PM on Wednesday.
GATHERING RESPONSES FROM OPENAI API...
Responses gathered: 1    (Working on: email_short.csv  |  Errors occured: 0)
Responses gathered: 2    (Working on: email_short.csv  |  Errors occured: 0)
Responses gathered: 3    (Working on: email_short.csv  |  Errors occured: 0)
Responses gathered: 4    (Working on: email_short.csv  |  Errors occured: 0)
Responses gathered: 5    (Working on: email_short.csv  |  Errors occured: 0)
Responses gathered: 6  

: 

# Manual Response collection: Template-Specific Length Defining Keywords

In [8]:
"""
Pairs every length descriptor with individual prompt template, 
generates responses for each pair using generate_responses() function.

NOTE: Only one prompt template is processed per run. The template is
selected through the template_name and
template_specific_length_descriptors variables in the code below.
"""

# Length descriptors that are specific to each prompt template, keyed by
# template name. Keeping all of them in one dict means no code has to be
# commented in or out to switch templates.
length_descriptors_by_template = {
    "email": [
        "1000 characters long",
        "170 words long",
        "200 tokens long",
        "1100 characters long",
        "190 words long",
        "220 tokens long",
        "1300 characters long",
        "220 words long",
        "260 tokens long",
    ],
    "social": [
        "600 characters long",
        "100 words long",
        "130 tokens long",
        "700 characters long",
        "110 words long",
        "150 tokens long",
        "800 characters long",
        "120 words long",
        "170 tokens long",
    ],
    "cover": [
        "1300 characters long",
        "210 words long",
        "240 tokens long",
        "1400 characters long",
        "230 words long",
        "260 tokens long",
        "1500 characters long",
        "240 words long",
        "280 tokens long",
    ],
    "explanation": [
        "1100 characters long",
        "190 words long",
        "220 tokens long",
        "1300 characters long",
        "220 words long",
        "260 tokens long",
        "1500 characters long",
        "240 words long",
        "300 tokens long",
    ],
    "essay": [
        "2300 characters long",
        "350 words long",
        "400 tokens long",
        "2500 characters long",
        "370 words long",
        "430 tokens long",
        "2700 characters long",
        "400 words long",
        "470 tokens long",
    ],
}

# Change this name to collect data for a different prompt template.
template_name = "essay"
template_specific_length_descriptors = length_descriptors_by_template[template_name]

# Prompt templates keyed by template name. "{length}" is replaced with the
# current length descriptor. An explicit dict replaces the former lookup of
# same-named variables through locals().
prompt_templates = {
    "email": "Write a {length} business email that is professional, clear, and concise. The email should be addressed to a potential business client. Please introduce my paper company and our offer. Invite the recipient to a meeting in my office at 2PM on Wednesday.",
    "social": "Write a {length} social media post about a new announcement from OpenAI. Give an opinion about this announcement and ask followers about their opinion on this subject. Write a CTA.",
    "cover": "Write a {length} cover letter to the hiring manager. I'm applying for a job in Data Science. I'm very motivated and eager to learn. I love AI and sports are my passion. Make a good impression.",
    "essay": "Write a {length} essay about prompt engineering. Explain how it will be one of the most crucial skill to learn in 2023. Emphasize its importance and give forecast for the future.",
    "explanation": "Give a {length} explanation of what polymorphism is with examples of its usage.",
}

for current_length_descriptor in template_specific_length_descriptors:
    print("////////////NEW///////////////")
    print("///////////LENGTH/////////////")
    print("/////////DESCRIPTOR///////////")
    print(
        f"NOW WE ARE GENERATING ALL RESPONSES FOR {current_length_descriptor} LENGTH DESCRIPTOR..."
    )

    print(
        f"USING {current_length_descriptor} LENGTH DESCRIPTOR... WE ARE GENERATING ALL RESPONSES FOR {template_name} PROMPT TEMPLATE..."
    )

    # final prompt: selected template filled with the current length descriptor
    template_value = prompt_templates[template_name].format(
        length=current_length_descriptor
    )
    filename = f"{template_name}_{current_length_descriptor.replace(' ', '-')}.csv"

    print(f"File being created: {filename}")
    print(f"Prompt used: {template_value}")
    print("GATHERING RESPONSES FROM OPENAI API...")

    generate_responses(100, template_name, template_value, current_length_descriptor)
    print("\n")

////////////NEW///////////////
///////////LENGTH/////////////
/////////DESCRIPTOR///////////
NOW WE ARE GENERATING ALL RESPONSES FOR 2300 characters long LENGTH DESCRIPTOR...
USING 2300 characters long LENGTH DESCRIPTOR... WE ARE GENERATING ALL RESPONSES FOR essay PROMPT TEMPLATE...
File being created: essay_2300-characters-long.csv
Prompt used: Write a 2300 characters long essay about prompt engineering. Explain how it will be one of the most crucial skill to learn in 2023. Emphasize its importance and give forecast for the future.
GATHERING RESPONSES FROM OPENAI API...
Responses gathered: 1    (Working on: essay_2300-characters-long.csv  |  Errors occured: 0)
Responses gathered: 2    (Working on: essay_2300-characters-long.csv  |  Errors occured: 0)
Responses gathered: 3    (Working on: essay_2300-characters-long.csv  |  Errors occured: 0)
Responses gathered: 4    (Working on: essay_2300-characters-long.csv  |  Errors occured: 0)
Responses gathered: 5    (Working on: essay_2300-chara

# Assert shape of template-keyword pairing CSVs

In [8]:
# Folder holding the collected CSV files; later cells read this variable too.
directory = os.path.join(os.getcwd(), "data", "raw_experimental_group")

# iterate through all files in the directory
for filename in os.listdir(directory):
    # get path of file
    file_path = os.path.join(directory, filename)

    # skip directories and anything that is not a CSV file (e.g. hidden
    # system files), matching the filter used when the files are merged
    if not (os.path.isfile(file_path) and filename.endswith(".csv")):
        continue

    df = pd.read_csv(file_path)

    # making sure that all files have the same shape
    # (presumably 100 responses x [index, text, chars, words, tokens])
    # raised explicitly so the check also runs when asserts are disabled (-O)
    if df.shape != (100, 5):
        raise AssertionError(f"{filename} has shape {df.shape} instead of (100, 5)")

# Checking if we collected all files

In [15]:
# print number of files in the directory, skip folders
# count the regular files directly inside the directory, folders are skipped
file_count = sum(
    1
    for entry in os.listdir(directory)
    if os.path.isfile(os.path.join(directory, entry))
)
print(f"Number of files in the directory: {file_count}")

Number of files in the directory: 90


# Merging all CSVs into one big CSV per count type

In [9]:
def merge_responses(type, source_dir=None):
    """
    Merges one column of every CSV file in a directory into a single CSV file.

    For each ``*.csv`` file in the source directory the column named ``type``
    is copied into the result under the file name without its extension.
    Files are processed in sorted name order, so the column order of the
    result is the same on every run. The result is written to
    ``responses_<type>.csv`` in the current working directory, without an
    index column.

    Parameters
    ----------
    type : str
        Name of the column to collect, e.g. "words", "chars" or "tokens".

    source_dir : str, optional
        Directory containing the CSV files. Defaults to the module-level
        ``directory`` variable.

    Raises
    ------
    KeyError
        If one of the CSV files has no column named ``type``.
    """
    if source_dir is None:
        source_dir = directory  # module-level path to the collected CSV files

    # one entry per source file: column name -> values of the {type} column
    columns = {}

    # os.listdir returns names in arbitrary order, sort them for a stable result
    for filename in sorted(os.listdir(source_dir)):
        if not filename.endswith(".csv"):
            continue

        df = pd.read_csv(os.path.join(source_dir, filename))
        column_name = os.path.splitext(filename)[0]  # remove the file extension
        columns[column_name] = df[f"{type}"]

    # build the frame in one step instead of inserting column by column
    result_df = pd.DataFrame(columns)

    # save result DataFrame to new CSV
    result_df.to_csv(f"responses_{type}.csv", index=False)

### For words

In [10]:
# writes responses_words.csv: the "words" column of every collected CSV file
merge_responses("words")

### For characters

In [6]:
# writes responses_chars.csv: the "chars" column of every collected CSV file
merge_responses("chars")

### For tokens

In [5]:
# writes responses_tokens.csv: the "tokens" column of every collected CSV file
merge_responses("tokens")

# Assert final shape

In [11]:
def assert_shape(type, expected_shape=(100, 90)):
    """
    Checks that responses_<type>.csv has the expected shape and prints it.

    Parameters
    ----------
    type : str
        Count type of the merged file to check, e.g. "words", "chars"
        or "tokens". The file ``responses_<type>.csv`` is read from the
        current working directory.

    expected_shape : tuple of int, optional
        Expected (rows, columns) of the merged file. Defaults to (100, 90).

    Raises
    ------
    AssertionError
        If the shape of the file differs from ``expected_shape``.
    """
    df = pd.read_csv(f"responses_{type}.csv")
    expected_shape = tuple(expected_shape)

    # raised explicitly so the check also runs when asserts are disabled (-O)
    if df.shape != expected_shape:
        raise AssertionError(
            f"responses_{type}.csv has shape {df.shape} instead of {expected_shape}"
        )
    print("Shape of the merged dataframe: ", df.shape)

In [12]:
# verify the shape of responses_words.csv
assert_shape("words")

Shape of the merged dataframe:  (100, 90)


In [13]:
# verify the shape of responses_chars.csv
assert_shape("chars")

Shape of the merged dataframe:  (100, 90)


In [14]:
# verify the shape of responses_tokens.csv
assert_shape("tokens")

Shape of the merged dataframe:  (100, 90)
