# Sampling the full df

In [30]:
import pandas as pd

# Fixed seed so the saved sample can be regenerated exactly on a re-run.
SAMPLE_SEED = 18
# Number of posts drawn from each class (with / without disclosures).
POSTS_PER_CLASS = 1000

# NOTE(review): read_pickle can execute arbitrary code while loading; only
# point this at pickles produced by this project.
df_full = pd.read_pickle("../data/full_df_posts.pkl")

# Token count = number of single-space-separated pieces of the caption.
df_full["n_tokens"] = df_full["caption"].apply(lambda x: len(x.split(" ")))
# Keep only captions with more than 10 tokens.
df_full = df_full.query("n_tokens > 10")

# Balanced US-only sample: the same number of posts with and without disclosures.
df_sample = pd.concat(
    [
        df_full.query("country == 'US' & has_disclosures").sample(
            POSTS_PER_CLASS, random_state=SAMPLE_SEED
        ),
        df_full.query("country == 'US' & ~has_disclosures").sample(
            POSTS_PER_CLASS, random_state=SAMPLE_SEED
        ),
    ]
)
# to_pickle() returns None, so it must not be chained onto the assignment
# above; otherwise df_sample would end up as None instead of the DataFrame.
df_sample.to_pickle("../data/df_sample.pkl")

KeyboardInterrupt: 

# Data Generation Example

In [1]:
import sys

# Put the parent directory on the import path so the local `instasynth`
# package (imported in the following cell) can be found from this notebook.
sys.path.append("..")

In [2]:
from instasynth.config import Config, logger
from instasynth import data_generation, utils


In [5]:
from pathlib import Path

# Override the package's folder settings with paths relative to this notebook.
# NOTE(review): presumably these are where prompt templates are read from and
# where experiment results are written; confirm against instasynth.config.
Config.PROMPTS_FOLDER = Path("../instasynth/prompts")
Config.RESULTS_FOLDER = Path("../instasynth/results")

In [7]:
from importlib import reload
# Re-execute the local modules so edits to their source files are picked up
# without restarting the kernel.
reload(data_generation)
reload(utils)

<module 'instasynth.utils' from '../instasynth/utils.py'>

In [3]:
import os
from typing import List, Optional

import pandas as pd


# Directory holding the source pickle and the generated sample CSV files.
DATA_PATH = "../data"
# Total size of the balanced sample; get_caption_examples requests half of
# this per class.
SAMPLE_DATASET_SIZE = 1000
# Pickle (inside DATA_PATH) that the sample is drawn from.
ORIGINAL_PICKLE_FILENAME = "df_sample.pkl"


def get_sampled_dataset(
    original_pickle_filename: str,
    sample_size: int,
    fixed_sample: bool = False,
    random_state: int = 18,
    keep_columns: Optional[List[str]] = None,
):
    """Build (or reload) a class-balanced sample of posts.

    The sample holds ``sample_size`` rows where ``has_disclosures`` is true and
    ``sample_size`` rows where it is false, restricted to ``keep_columns``. It
    is written to ``{DATA_PATH}/sample_{sample_size}_{random_state}.csv``.

    Args:
        original_pickle_filename: Name of the pickled DataFrame inside
            ``DATA_PATH`` to sample from.
        sample_size: Number of rows to draw from EACH class.
        fixed_sample: When true and the CSV for this ``sample_size`` /
            ``random_state`` already exists, load it instead of resampling.
        random_state: Seed passed to ``DataFrame.sample`` for both classes.
        keep_columns: Columns to keep in the sample. Defaults to
            ``["caption", "has_disclosures"]``.

    Returns:
        The sampled DataFrame. When loaded from the CSV cache it has a fresh
        default index, because the CSV is written without the index.

    Raises:
        ValueError: If either class has fewer than ``sample_size`` rows.
    """
    # Use a None sentinel rather than a list literal as the default, so no
    # mutable object is shared between calls.
    if keep_columns is None:
        keep_columns = ["caption", "has_disclosures"]

    dataset_filename = f"{DATA_PATH}/sample_{sample_size}_{random_state}.csv"

    # Reuse the cached sample only when the caller asked for a fixed sample.
    if fixed_sample and os.path.exists(dataset_filename):
        logger.info("Loading sample dataset...")
        return pd.read_csv(dataset_filename)

    logger.info("Creating sample dataset...")
    full_df_filename = f"{DATA_PATH}/{original_pickle_filename}"
    # NOTE(review): read_pickle can execute arbitrary code while loading; only
    # use it on pickles produced by this project.
    full_df = pd.read_pickle(full_df_filename)

    # Split into sponsored / unsponsored pools on the 'has_disclosures' column.
    pools = {
        "sponsored": full_df.query("has_disclosures"),
        "unsponsored": full_df.query("~has_disclosures"),
    }

    # Fail early with a message that names the class that is too small
    # (DataFrame.sample would also raise ValueError, but without that detail).
    for label, pool in pools.items():
        if len(pool) < sample_size:
            raise ValueError(
                f"Cannot sample {sample_size} {label} rows from "
                f"{full_df_filename}: only {len(pool)} available."
            )

    sampled_parts = [
        pool.sample(sample_size, random_state=random_state)
        for pool in pools.values()
    ]

    sample_dataset = pd.concat(sampled_parts)[keep_columns]
    sample_dataset.to_csv(dataset_filename, index=False)

    return sample_dataset


def format_examples(examples: list, delimiter: str = "####") -> str:
    """Render examples as numbered ``<POST#i>`` lines joined into one string.

    Each example becomes ``<POST#{i}> {example}`` followed by a newline, with
    ``i`` counting from 0. ``delimiter`` is accepted but currently has no
    effect on the output.
    """
    tagged_lines = []
    for position, text in enumerate(examples):
        tagged_lines.append(f"<POST#{position}> {text}\n")
    return "".join(tagged_lines)


def get_caption_examples(number_examples, sponsored=False):
    """Draw random captions of one class and return them as a formatted block.

    Builds the balanced sample via ``get_sampled_dataset``, keeps the rows
    whose ``has_disclosures`` equals ``sponsored``, randomly picks
    ``number_examples`` captions, strips newlines and quote characters, and
    passes the result through ``format_examples``.
    """
    pool = get_sampled_dataset(
        sample_size=int(SAMPLE_DATASET_SIZE / 2),
        original_pickle_filename=ORIGINAL_PICKLE_FILENAME,
    )

    class_captions = pool.query(f"has_disclosures == {sponsored}")["caption"]
    picked = class_captions.sample(number_examples).tolist()

    # Newlines are removed outright; double and single quotes become spaces.
    scrub_table = str.maketrans({"\n": "", '"': " ", "'": " "})
    cleaned = [caption.translate(scrub_table) for caption in picked]

    return format_examples(cleaned)

In [7]:
# Identifier for this run and the name of the prompt it uses.
# NOTE(review): presumably prompt_name selects a template under
# Config.PROMPTS_FOLDER; confirm in data_generation.
experiment_identifier = "nonsponsored_random_examples_exp_1"
prompt_name = "nonsponsored_random_examples"

# Number of real captions shown as examples, and number of captions requested.
number_of_examples = 5
number_of_captions = 15
# Random non-sponsored captions, already formatted as "<POST#i> ..." lines.
examples = get_caption_examples(number_examples=number_of_examples, sponsored=False)

# Values passed to the experiment; the examples are wrapped in <EXAMPLES> tags
# followed by a lead-in sentence for the instructions.
parameters = {
    "number_of_examples": number_of_examples,
    "number_of_captions": number_of_captions,
    "examples": f"<EXAMPLES> {examples} </EXAMPLES>\n\n Now I will give you instructions: ",
}

# chatgpt_parameters = {"functions": functions, "function_call": function_call}

# Run the experiment with the identifier, prompt name and parameters above.
experiment_output = data_generation.create_experiment(
    experiment_identifier,
    prompt_name,
    parameters,
)

2023-08-21 12:00:57,871 - INFO - Creating sample dataset...
2023-08-21 12:00:57,930 - INFO - Running experiment nonsponsored_random_examples_exp_1 with prompt nonsponsored_random_examples and parameters {'number_of_examples': 5, 'number_of_captions': 15, 'examples': '<EXAMPLES> <POST#0> #BTS of today shoot with @kendalljenner x @ksubi x @dexternavy ⚡️💫✨.👗 @elinsvahn_💇 @hairinel💄 @hungvanngo #KendallJenner\n<POST#1> Bringing it back to one of our favorite pizza joints out there in this beautiful food-filled world: @modernapizza in New Haven, CT. 🍕💯 #DEVOURPOWER\n<POST#2> It has been said that courage is the most important of all the virtues because without courage, you can’t practice any other virtue consistently. Tonight, I am showing up & allowing myself to be seen. \u2063\u2063Reality tv. Media storms. Podcasts. Family feuds. Divorce. Legal battles. All things that I have navigated over the last few years. I’ve kept quiet for the most part & tried to allow the storms to pass. As I ha