# Data Collection - Experimental Group

The experimental group was gathered by collecting responses from ChatGPT (the "gpt-3.5-turbo" model) using the `generate_responses` function from `utils/generate_responses.py`.

This notebook automates data collection by making API requests and fetching 100 responses for each template-keyword combination (elements of `prompt_templates.py` and `length_defining_keywords.py`).

The responses are then quantified in terms of word count, character count, and token count. 

The data was stored in CSV files and later merged for statistical analysis. 

Overall, the experimental group provided the data necessary to examine the influence of length-defining keywords on response length.

# Setup


In [1]:
# LIBRARIES
import pandas as pd
import os
from typing import Dict, List  # type hinting

# FILES
from utils.generate_responses import generate_responses
from utils import length_defining_keywords, prompt_templates

# CONSTANTS
# Number of responses to generate per prompt.
N_RESPONSES: int = 1

# Prompt titles mapped to their templates; each template holds a placeholder
# for a length defining keyword.
PROMPT_TITLE_TEMPLATE: Dict[str, str] = prompt_templates.prompt_title_template

# Length defining keywords that do not depend on the template.
CONSTANT_LENGTH_DESCRIPTORS: List[str] = (
    length_defining_keywords.constant_length_descriptors
)

# Prompt titles mapped to the length defining keywords specific to that template.
TEMPLATE_SPECIFIC_TITLE_DESCRIPTORS: Dict[str, List[str]] = (
    length_defining_keywords.template_specific_title_descriptors
)

# COLORS FOR PRINTING
# ANSI escape sequences; RESET switches the terminal back to its default colour.
GRAY, RED, GREEN, YELLOW = "\033[90m", "\033[91m", "\033[92m", "\033[93m"
PURPLE, CYAN = "\033[95m", "\033[96m"
RESET = "\033[0m"

# Response Collection: Constant Length Defining Keywords

In [2]:
def constant_collection(
    constant_length_descriptors: List[str], prompt_title_template: Dict[str, str]
) -> None:
    """Request responses for every constant descriptor combined with every template.

    The outer loop walks the descriptors and the inner loop walks the templates,
    so all templates are processed for one descriptor before moving to the next.

    Args:
        constant_length_descriptors: Length defining keywords to insert into
            each template.
        prompt_title_template: Prompt titles mapped to template strings with a
            positional ``{}`` placeholder for the keyword.

    Returns:
        None. Progress is printed and ``generate_responses`` is called once per
        (descriptor, template) pair with ``N_RESPONSES`` as the response count.
        NOTE(review): the log line suggests ``generate_responses`` writes
        ``<title>_<descriptor>.csv`` - confirm in ``utils/generate_responses.py``.
    """
    for keyword in constant_length_descriptors:
        descriptor_banner = f"{RED}GENERATING RESPONSES FOR {PURPLE}{keyword}{RED} LENGTH DESCRIPTOR...\n{RESET}"
        print(descriptor_banner)

        for title, template_text in prompt_title_template.items():
            progress_line = f"{YELLOW}USING {PURPLE}{keyword}{YELLOW} LENGTH DESCRIPTOR... GENERATING ALL RESPONSES FOR {CYAN}{title}{YELLOW} PROMPT TEMPLATE...{RESET}"
            print(progress_line)
            print(f"File being created: {GREEN}{title}_{keyword}.csv{RESET}")

            # Fill the template's positional placeholder with the keyword.
            filled_prompt = template_text.format(keyword)
            print(f"Prompt used: {GRAY}{filled_prompt}{RESET}")

            generate_responses(N_RESPONSES, filled_prompt, title, keyword)

In [4]:
# Run the collection for the keywords shared by all templates.
constant_collection(
    constant_length_descriptors=CONSTANT_LENGTH_DESCRIPTORS,
    prompt_title_template=PROMPT_TITLE_TEMPLATE,
)

[91mGENERATING RESPONSES FOR [95mshort[91m LENGTH DESCRIPTOR...
[0m
[93mUSING [95mshort[93m LENGTH DESCRIPTOR... GENERATING ALL RESPONSES FOR [96memail[93m PROMPT TEMPLATE...[0m
File being created: [92memail_short.csv[0m
Prompt used: [90mWrite a short business email that is professional, clear, and concise. The email should be addressed to a potential business client. Please introduce my paper company and our offer. Invite the recipient to a meeting in my office at 2PM on Wednesday.[0m


KeyboardInterrupt: 

# Response Collection: Template-Specific Length Defining Keywords

In [4]:
def template_specific_collection(
    template_specific_title_descriptors: Dict[str, List[str]],
    prompt_title_template: Dict[str, str],
) -> None:
    """Request responses for each template using its own list of descriptors.

    Args:
        template_specific_title_descriptors: Prompt titles mapped to the length
            defining keywords that apply only to that template.
        prompt_title_template: Prompt titles mapped to template strings with a
            positional ``{}`` placeholder for the keyword.

    Returns:
        None. Progress is printed and ``generate_responses`` is called once per
        (template, descriptor) pair with ``N_RESPONSES`` as the response count.

    Raises:
        KeyError: If a title that has at least one descriptor is missing from
            ``prompt_title_template``.
    """
    for title, keywords in template_specific_title_descriptors.items():
        template_banner = f"{RED}GENERATING RESPONSES FOR {PURPLE}{title}{RED} TEMPLATE...\n{RESET}"
        print(template_banner)

        for keyword in keywords:
            progress_line = f"{YELLOW}USING {PURPLE}{keyword}{YELLOW} LENGTH DESCRIPTOR... GENERATING ALL RESPONSES FOR {CYAN}{title}{YELLOW} PROMPT TEMPLATE...{RESET}"
            print(progress_line)
            print(f"File being created: {GREEN}{title}_{keyword}.csv{RESET}")

            # Template is looked up per descriptor, so titles with no
            # descriptors never touch prompt_title_template.
            template_text = prompt_title_template[title]
            filled_prompt = template_text.format(keyword)
            print(f"Prompt used: {GRAY}{filled_prompt}{RESET}")

            generate_responses(N_RESPONSES, filled_prompt, title, keyword)

In [5]:
# Run the collection for the keywords that belong to individual templates.
template_specific_collection(
    template_specific_title_descriptors=TEMPLATE_SPECIFIC_TITLE_DESCRIPTORS,
    prompt_title_template=PROMPT_TITLE_TEMPLATE,
)

[91mGENERATING RESPONSES FOR [95memail[91m TEMPLATE...
[0m
[93mUSING [95m1000 characters long[93m LENGTH DESCRIPTOR... GENERATING ALL RESPONSES FOR [96memail[93m PROMPT TEMPLATE...[0m
File being created: [92memail_1000 characters long.csv[0m
Prompt used: [90mWrite a 1000 characters long business email that is professional, clear, and concise. The email should be addressed to a potential business client. Please introduce my paper company and our offer. Invite the recipient to a meeting in my office at 2PM on Wednesday.[0m
[94mResponses gathered: 1  |  Working on: email_1000 characters long.csv  |  Errors occured: 0[0m
Filename email_1000 characters long.csv was created!

[93mUSING [95m170 words long[93m LENGTH DESCRIPTOR... GENERATING ALL RESPONSES FOR [96memail[93m PROMPT TEMPLATE...[0m
File being created: [92memail_170 words long.csv[0m
Prompt used: [90mWrite a 170 words long business email that is professional, clear, and concise. The email should be addresse

KeyboardInterrupt: 

# Assert shape of all CSV's

If the cell below runs without output, every raw CSV has the expected shape (`N_RESPONSES` rows and 4 columns).

In [15]:
desired_number_of_columns = 4
desired_number_of_responses = N_RESPONSES

# Folder holding the raw per-prompt CSV files; later cells reuse `directory`.
directory = os.getcwd() + "/data/raw_experimental_group"

for filename in os.listdir(directory):
    # Anything that is not a CSV file is ignored.
    if not filename.endswith(".csv"):
        continue

    expected_shape = (desired_number_of_responses, desired_number_of_columns)
    actual_shape = pd.read_csv(os.path.join(directory, filename)).shape

    # making sure that all files have same shape
    assert actual_shape == expected_shape, (
        f"{filename} has shape {actual_shape} instead of "
        f"({desired_number_of_responses}, {desired_number_of_columns})"
    )

# Checking if we collected all files

In [20]:
# Tally the entries of `directory` whose name ends with ".csv".
csv_count = sum(1 for entry in os.listdir(directory) if entry.endswith(".csv"))

# Print the number of CSV files
print("Number of CSV files:", csv_count)

Number of CSV files: 90


# Merging all CSV's to one big CSV per count type

In [21]:
def merge_responses(type, input_dir=None, output_dir="data/preprocessed_experimental_group"):
    """Merge one count column from every raw CSV into a single wide CSV.

    Each ``*.csv`` file in the input folder contributes one column to the
    result. The column is named after the file (without extension) and holds
    that file's ``type`` column. Files are processed in sorted name order so
    the merged column order is reproducible. Rows are aligned on the row index
    and a shorter file is padded with NaN rather than a longer one being cut.

    Args:
        type: Name of the column to extract from every raw CSV (the notebook
            calls this with "words", "chars" and "tokens").
        input_dir: Folder containing the raw CSV files (str). When None, the
            notebook-level ``directory`` variable is used.
        output_dir: Folder that receives ``responses_<type>.csv`` (str). It is
            created if it does not exist.

    Returns:
        None. The merged frame is written to disk and a log line is printed.

    Raises:
        KeyError: If a raw CSV has no ``type`` column; the message names the file.
    """
    # Fall back to the notebook-level folder only when the caller gave none.
    source_dir = directory if input_dir is None else input_dir
    column = str(type)

    merged_columns = {}
    # os.listdir returns entries in arbitrary order; sorting pins the layout.
    for filename in sorted(os.listdir(source_dir)):
        if not filename.endswith(".csv"):
            continue
        df = pd.read_csv(os.path.join(source_dir, filename))
        if column not in df.columns:
            raise KeyError(f"{filename} has no '{column}' column")
        column_name = os.path.splitext(filename)[0]  # remove file extension
        merged_columns[column_name] = df[column]

    # Build the frame in one step instead of inserting columns one at a time.
    result_df = pd.DataFrame(merged_columns)

    # save result DataFrame to new CSV
    os.makedirs(output_dir, exist_ok=True)
    output_path = f"{output_dir}/responses_{type}.csv"
    result_df.to_csv(output_path, index=False)

    # log
    print(f"File created: responses_{type}.csv\n\nDirectory: {output_path}")

### For words

In [22]:
# Merge the word counts of all raw CSV files.
merge_responses(type="words")

File created: responses_words.csv

Directory: data/preprocessed_experimental_group/responses_words.csv


### For characters

In [23]:
# Merge the character counts of all raw CSV files.
merge_responses(type="chars")

File created: responses_chars.csv

Directory: data/preprocessed_experimental_group/responses_chars.csv


### For tokens

In [24]:
# Merge the token counts of all raw CSV files.
merge_responses(type="tokens")

File created: responses_tokens.csv

Directory: data/preprocessed_experimental_group/responses_tokens.csv


# Assert final shape

In [25]:
# Expected shape of every merged file.
# NOTE(review): the row count must equal the N_RESPONSES value used during
# collection; update it here if that setting changes.
desired_number_of_responses = 1
desired_number_of_columns = 90


def assert_shape(type):
    """Check that ``responses_<type>.csv`` has the expected merged shape.

    Args:
        type: Count type used in the merged file name (the notebook calls this
            with "words", "chars" and "tokens").

    Returns:
        None. Prints the shape when the check passes.

    Raises:
        AssertionError: If the file's shape differs from
            ``(desired_number_of_responses, desired_number_of_columns)``.
    """
    df = pd.read_csv(f"data/preprocessed_experimental_group/responses_{type}.csv")
    expected_shape = (desired_number_of_responses, desired_number_of_columns)
    # The message is built from the values actually compared, so the reported
    # expectation cannot drift from the check.
    assert (
        df.shape == expected_shape
    ), f"responses_{type}.csv has shape {df.shape} instead of {expected_shape}"
    print(f"Shape of the merged dataframe: {df.shape}")

In [26]:
# Validate the merged word-count file.
assert_shape(type="words")

Shape of the merged dataframe: (1, 90)


In [27]:
# Validate the merged character-count file.
assert_shape(type="chars")

Shape of the merged dataframe: (1, 90)


In [28]:
# Validate the merged token-count file.
assert_shape(type="tokens")

Shape of the merged dataframe: (1, 90)
