# Main Results and Tables

This notebook contains the code to reproduce the main results and tables in the paper.

In [1]:
import sys

sys.path.append("..")

#### `agg_df` is the dataframe containing the results of the experiments. It is created by running the `evaluation.ipynb` notebook.

In [36]:
import pandas as pd

pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

agg_df = pd.read_pickle("aggregated_df.pkl")

## Section 4 -- Selecting the model temperature

In [37]:
experiments = agg_df.index.tolist()
selected_columns = [
    "pct_unique_captions",
    "success_rate",
    "ad_detection_f1",
    "ad_detection_undisclosed_accuracy",
    "top_100_recall_cosine_sim",
    "avg_hashtags_per_post",
    "avg_user_tags_per_post",
    "hashtag_overlap",
    "user_tag_overlap",
    # "jaccard_similarity_3gram",
]

### Base prompt

### Table 1 -- Impact of temperature parameter on dataset characteristics

In [41]:
from instasynth import utils

base_prompts = [k for k in experiments if "base_prompt_v2_temperature_" in k]

column_map = {
    "pct_unique_captions": "Unique Captions (\%)",
    "success_rate": "Success Rate (\%)",
    # "ad_detection_f1": "Ad Detection F1",
    # "ad_detection_undisclosed_accuracy": "Undisc. Ad Detection Acc",
    # "top_100_recall_cosine_sim": "Top-100 Sim. Recall",
    # "avg_hashtags_per_post": "Avg. Hashtags",
    # "avg_user_tags_per_post": "Avg. User Tags",
    "hashtag_overlap": "Hashtag Overlap (\%)",
    "user_tag_overlap": "User Tag Overlap (\%)",
}
index_map = {k: f"{k.split('_')[-1]}" for k in base_prompts}
temperature_table = agg_df.loc[base_prompts][selected_columns].sort_index()

for k in ["pct_unique_captions", "success_rate", "hashtag_overlap", "user_tag_overlap"]:
    temperature_table[k] = temperature_table[k] * 100

utils.generate_latex_table(
    temperature_table.T,
    table_caption="Impact of temperature parameter on dataset characteristics.",
    table_label="tab:temperatures",
    columns_rename_map=index_map,
    index_rename_map=column_map,
)

In [None]:
\begin{table}
\centering
\caption{Impact of temperature parameter on dataset characteristics.}
\label{tab:temperatures}
\begin{tabular}{lccccc}
\toprule
 & \bfseries 0 & \bfseries 0.25 & \bfseries 0.5 & \bfseries 0.7 & \bfseries 1 \\
\midrule
\bfseries Unique Captions (\%) & 42.32 & 92.48 & 97.51 & 99.41 & 100.00 \\
\bfseries Success Rate (\%) & 100.00 & 95.24 & 87.74 & 83.81 & 74.83 \\
\bfseries ad_detection_f1 & 0.65 & 0.64 & 0.64 & 0.64 & 0.64 \\
\bfseries ad_detection_undisclosed_accuracy & 0.78 & 0.80 & 0.78 & 0.74 & 0.76 \\
\bfseries top_100_recall_cosine_sim & 0.10 & 0.10 & 0.10 & 0.11 & 0.10 \\
\bfseries avg_hashtags_per_post & 0.73 & 1.02 & 0.97 & 0.96 & 0.88 \\
\bfseries avg_user_tags_per_post & 0.32 & 0.28 & 0.29 & 0.33 & 0.25 \\
\bfseries Hashtag Overlap (\%) & 0.46 & 0.98 & 0.98 & 0.95 & 0.81 \\
\bfseries User Tag Overlap (\%) & 0.00 & 0.00 & 0.00 & 0.07 & 0.07 \\
\bottomrule
\end{tabular}
\end{table}


In [40]:
selected_base = "base_prompt_v2_temperature_0.7"

### Selecting the datasets for the remaining prompt strategies

Fixed examples

In [10]:
fixed_examples = [k for k in experiments if "fixed_examples" in k]
agg_df.loc[fixed_examples][selected_columns].sort_index()

Unnamed: 0,pct_unique_captions,success_rate,ad_detection_f1,ad_detection_undisclosed_accuracy,top_100_recall_cosine_sim,avg_hashtags_per_post,avg_user_tags_per_post,hashtag_overlap,user_tag_overlap
fixed_examples_ht_v1,1.0,0.810526,0.62158,0.409204,0.091266,0.962709,0.160942,0.010367,0.0
fixed_examples_ht_v2,0.854455,0.877551,0.542289,0.225124,0.105941,1.043564,0.667327,0.010771,0.011782
fixed_examples_post_v1,0.999017,0.86747,0.606127,0.333333,0.098328,1.146509,0.186824,0.013147,0.001423
fixed_examples_post_v2,0.966102,0.94958,0.660793,0.399254,0.124626,1.077767,0.527418,0.011609,0.011236


In [11]:
selected_fixed_examples = "fixed_examples_post_v2"

Random examples

In [12]:
random_examples = [
    k for k in experiments if "random_examples" in k and "imitation" not in k
]
selected_random_examples = "random_examples_post_v2"

Imitation examples

In [13]:

imitation_examples = [k for k in experiments if "imitation" in k]
selected_imitation = "imitation_random_examples_ht_v2_temperature_0.7"

In [14]:
selected_experiments = [selected_base, selected_fixed_examples, selected_random_examples, selected_imitation, "Real"]

In [76]:
agg_df.loc[selected_experiments][selected_columns]

Unnamed: 0,pct_unique_captions,success_rate,ad_detection_f1,ad_detection_undisclosed_accuracy,top_100_recall_cosine_sim,avg_hashtags_per_post,avg_user_tags_per_post,hashtag_overlap,user_tag_overlap,jaccard_similarity_3gram
base_prompt_v2_temperature_0.7,0.994094,0.838095,0.636948,0.74005,0.11122,0.955709,0.333661,0.009516,0.000655,0.009895
fixed_examples_post_v2,0.966102,0.94958,0.660793,0.399254,0.124626,1.077767,0.527418,0.011609,0.011236,0.017144
random_examples_post_v2,0.999004,0.91791,0.709398,0.512438,0.10259,1.053785,0.563745,0.028432,0.045154,0.022285
imitation_random_examples_ht_v2_temperature_0.7,1.0,0.915493,0.707019,0.232587,0.101779,1.16996,0.704545,0.034249,0.040456,0.024766
Real,0.999005,,0.787307,0.442786,,2.775124,1.730348,,,


In [77]:
selected_experiments

['base_prompt_v2_temperature_0.7',
 'fixed_examples_post_v2',
 'random_examples_post_v2',
 'imitation_random_examples_ht_v2_temperature_0.7',
 'Real']

## Section 4 -- Caption Composition Metrics

### Table 2 -- Impact of prompt-engineering strategies on dataset characteristics; values within brackets represent the number of unique entities.

In [42]:
caption_composition_columns = [
    "avg_caption_length",
    "std_caption_length",
    "avg_hashtags_per_post",
    "std_hashtags_per_post",
    # "total_hashtags",
    "n_unique_hashtags",
    "avg_user_tags_per_post",
    "std_user_tags_per_post",
    # "total_user_tags",
    "n_unique_user_tags",
    "avg_emojis_per_post",
    "std_emojis_per_post",
    # "total_emojis",
    "n_unique_emojis",
]

In [43]:
table_df = agg_df.copy()
for k in ["hashtags", "user_tags", "emojis"]:
    avg = f"avg_{k}_per_post"
    std = f"std_{k}_per_post"
    unique = f"n_unique_{k}"
    table_df[f"{k}"] = table_df.apply(
        lambda x: f"{x[avg]:.2f}$\pm${x[std]:.2f} ({x[unique]:.0f})", axis=1
    )
table_df["caption_length"] = table_df.apply(
    lambda x: f"{x['avg_caption_length']:.2f}$\pm${x['std_caption_length']:.2f}", axis=1
)

In [44]:
table_composition = table_df.loc[selected_experiments][
    ["caption_length", "hashtags", "user_tags", "emojis"]
]
column_map = {
    "caption_length": "Caption Length",
    "hashtags": "Hashtags",
    "n_unique_hashtags": "Unique",
    "user_tags": "User Tags",
    "n_unique_user_tags": "Unique",
    "emojis": "Emojis",
    "n_unique_emojis": "Unique",
}
index_map = {
    "base_prompt_v2_temperature_0.7": "Base Prompt",
    "fixed_examples_post_v2": "Fixed Examples",
    "random_examples_post_v2": "Random Examples",
    "imitation_random_examples_ht_v2_temperature_0.7": "Imitation",
}

In [46]:
utils.generate_latex_table(
    table_composition,
    table_caption="Impact of prompt-engineering strategies on dataset characteristics; values within brackets represent the number of unique entities.",
    table_label="tab:composition_metrics",
    columns_rename_map=column_map,
    index_rename_map=index_map,
)

In [None]:
\begin{table}
\centering
\caption{Impact of prompt-engineering strategies on dataset characteristics; values within brackets represent the number of unique entities.}
\label{tab:composition_metrics}
\begin{tabular}{lcccc}
\toprule
 & \bfseries Caption Length & \bfseries Hashtags & \bfseries User Tags & \bfseries Emojis \\
\midrule
\bfseries Base Prompt & 21.92$\pm$10.92 & 0.96$\pm$0.78 (484) & 0.33$\pm$0.48 (293) & 2.04$\pm$1.34 (257) \\
\bfseries Fixed Examples & 33.49$\pm$24.52 & 1.08$\pm$0.82 (591) & 0.53$\pm$1.14 (206) & 2.33$\pm$2.15 (232) \\
\bfseries Random Examples & 28.38$\pm$16.42 & 1.05$\pm$0.83 (685) & 0.56$\pm$0.73 (502) & 2.12$\pm$1.36 (325) \\
\bfseries Imitation & 35.65$\pm$15.65 & 1.17$\pm$0.90 (899) & 0.70$\pm$0.77 (595) & 2.15$\pm$1.44 (355) \\
\bfseries Real & 42.86$\pm$52.12 & 1.97$\pm$3.20 (1348) & 1.38$\pm$1.87 (996) & 1.88$\pm$2.72 (448) \\
\bottomrule
\end{tabular}
\end{table}


## Section 4 -- Content Analysis

### Table 3 -- Content overlap metrics between synthetic and real datasets.

In [48]:
content_analysis_columns = (
    ["hashtag_overlap", "user_tag_overlap"]
    + [c for c in agg_df.columns if "jaccard" in c]
    # + [c for c in agg_df.columns if "pronouns" in c]
)

In [49]:
column_map = {
    **{
        "hashtag_overlap": "Hashtag Overlap (\%)",
        "user_tag_overlap": "User Tag Overlap (\%)",
    },
    **{
        k: f"{k.split('_')[-1][0]}-gram Sim. (\%)"
        for k in content_analysis_columns
        if "jaccard" in k
    },
}

_selected_experiments = [k for k in selected_experiments if k != "Real"]

content_metrics_table = agg_df.loc[_selected_experiments][content_analysis_columns]
for k in content_analysis_columns:
    content_metrics_table[k] = content_metrics_table[k] * 100

utils.generate_latex_table(
    content_metrics_table.T,
    table_caption="Content overlap metrics between synthetic and real datasets.",
    table_label="tab:content_metrics",
    columns_rename_map=index_map,
    index_rename_map=column_map,
)

In [None]:
\begin{table}
\centering
\caption{Content overlap metrics between synthetic and real datasets.}
\label{tab:content_metrics}
\begin{tabular}{lcccc}
\toprule
 & \bfseries Base Prompt & \bfseries Fixed Examples & \bfseries Random Examples & \bfseries Imitation \\
\midrule
\bfseries Hashtag Overlap (\%) & 0.95 & 1.16 & 2.84 & 3.42 \\
\bfseries User Tag Overlap (\%) & 0.07 & 1.12 & 4.52 & 4.05 \\
\bfseries 1-gram Sim. (\%) & 12.72 & 15.16 & 19.75 & 21.24 \\
\bfseries 2-gram Sim. (\%) & 4.70 & 6.01 & 7.36 & 8.36 \\
\bfseries 3-gram Sim. (\%) & 0.99 & 1.71 & 2.23 & 2.48 \\
\bottomrule
\end{tabular}
\end{table}


### Table 4 -- Distribution of emoji skin tones across each dataset.

In [60]:
import emoji

skins_tones = [
    f"{k}_skin_tone" for k in ["light", "medium-light", "medium", "medium-dark", "dark"]
]


def get_skin_tone_counter(df: pd.DataFrame):
    df["emojis"] = df.caption.apply(
        lambda x: [emoji.demojize(k["emoji"]) for k in emoji.emoji_list(x)]
    )
    emoji_counter = Counter()
    df["emojis"].apply(emoji_counter.update)
    return {
        tone: sum((v for k, v in emoji_counter.items() if tone in k))
        for tone in skins_tones
    }

def sample_real(full_df: pd.DataFrame, seed: int):
    spons = full_df.query("sponsorship == 'sponsored'").sample(500, random_state=seed)
    nonspons = full_df.query("sponsorship == 'nonsponsored'").sample(
        500, random_state=seed
    )
    return pd.concat([spons, nonspons]).sample(frac=1)

In [None]:
from collections import defaultdict

bootstrap_skin_tone_counter = defaultdict(int)

for i in range(100):
    for k, v in get_skin_tone_counter(sample_real(full_data, i)).items():
        bootstrap_skin_tone_counter[k] += v

bootstrap_skin_tone_counter = {k: v / 100 for k, v in bootstrap_skin_tone_counter.items()}

In [63]:
skin_tone_counter = {
    f: get_skin_tone_counter(pd.read_pickle(f"../results/{f}/final_df.pkl"))
    for f in selected_experiments
    if f != "Real"
}
skin_tone_counter["Real"] = bootstrap_skin_tone_counter

In [67]:
column_map = {
    "light_skin_tone": "Light",
    "medium-light_skin_tone": "Medium-Light",
    "medium_skin_tone": "Medium",
    "medium-dark_skin_tone": "Medium-Dark",
    "dark_skin_tone": "Dark",
}

skin_tone_table = pd.DataFrame(skin_tone_counter)

utils.generate_latex_table(
    skin_tone_table.T,
    table_caption="Distribution of emoji skin tones across each dataset",
    table_label="tab:skin_tone",
    columns_rename_map=column_map,
    index_rename_map=index_map,
)

In [None]:
\begin{table}
\centering
\caption{Distribution of emoji skin tones across each dataset}
\label{tab:skin_tone}
\begin{tabular}{lccccc}
\toprule
 & \bfseries Light & \bfseries Medium-Light & \bfseries Medium & \bfseries Medium-Dark & \bfseries Dark \\
\midrule
\bfseries Base Prompt & 36.00 & 26.00 & 0.00 & 0.00 & 0.00 \\
\bfseries Fixed Examples & 27.00 & 17.00 & 50.00 & 0.00 & 0.00 \\
\bfseries Random Examples & 62.00 & 35.00 & 8.00 & 1.00 & 1.00 \\
\bfseries Imitation & 52.00 & 36.00 & 2.00 & 3.00 & 3.00 \\
\bfseries Real & 235.08 & 155.82 & 26.60 & 22.43 & 25.63 \\
\bottomrule
\end{tabular}
\end{table}

## Section 4 -- Embedding Similarity

### Table 5 -- Embedding similarity metrics between captions from each synthetic dataset and the real data.

In [72]:
embedding_similarity_columns = [
    c
    for c in agg_df.columns
    if "cosine" in c
    and ("avg" in c or "std" in c or "100_recall" in c)
]

In [73]:
columns_map = {
    "avg_cosine_sim": "Similarity",
    "top_100_recall_cosine_sim": "Top-100 Recall",
    # "synthetic_internal_cosine_sim": "Internal Similarity",
}

embedding_similarity_table = agg_df.loc[_selected_experiments][
    embedding_similarity_columns
].copy()
embedding_similarity_table["avg_cosine_sim"] = embedding_similarity_table.apply(
    lambda x: f"{x['avg_cosine_sim']:.2f}$\pm${x['std_cosine_sim']:.2f}", axis=1
)
embedding_similarity_table.drop(columns=["std_cosine_sim"], inplace=True)

utils.generate_latex_table(
    embedding_similarity_table,
    table_caption="Embedding similarity metrics between captions from each synthetic dataset and the real data.",
    table_label="tab:embedding_similarity",
    columns_rename_map=columns_map,
    index_rename_map=index_map,
)

In [None]:
\begin{table}
\centering
\caption{Embedding similarity metrics between captions from each synthetic dataset and the real data.}
\label{tab:embedding_similarity}
\begin{tabular}{lcc}
\toprule
 & \bfseries Similarity & \bfseries Top-100 Recall \\
\midrule
\bfseries Base Prompt & 0.83$\pm$0.02 & 0.11 \\
\bfseries Fixed Examples & 0.84$\pm$0.05 & 0.12 \\
\bfseries Random Examples & 0.83$\pm$0.03 & 0.10 \\
\bfseries Imitation & 0.83$\pm$0.03 & 0.10 \\
\bottomrule
\end{tabular}
\end{table}


## Section 4 -- Network-based Metrics

### Table 6 -- Network metrics for the three classes of networks analysed: hashtag co-occurrence (HT), usertag co-occurrence (UT), and hashtag-user bipartite network (HU). Columns correspond to the four synthetic datasets we analyse and the real dataset. The values reported are averaged over 100 network instances.

In [81]:
network_columns = [
    c
    for c in agg_df.columns
    if "NW" in c
    and (
        "nodes" in c
        or "edges" in c
        or "clustering" in c
        or "degree" in c
        or "assortativity" in c
    )
]

nw_table = agg_df.loc[selected_experiments][network_columns]
column_map = {
    k: k.replace("NW_", "")
    .replace("hashtag_usertag_", "HU ")
    .replace("hashtag_", "HT ")
    .replace("usertag_", "UT ")
    .replace("number_of_", "\# ")
    .replace("avg_", "Avg. ")
    .replace("nodes", "Nodes")
    .replace("edges", "Edges")
    .replace("degree", "Degree")
    .replace("clustering_coefficient", "Clustering Coeff.")
    .replace("assortativity", "Assortativity")
    .replace("_", "\_")
    for k in nw_table.columns
}

utils.generate_latex_table(
    nw_table.T,
    table_caption="Network metrics for the three classes of networks analysed: hashtag co-occurrence (HT), usertag co-occurrence (UT), and hashtag-user bipartite network (HU). Columns correspond to the four synthetic datasets we analyse and the real dataset. The values reported are averaged over 100 network instances.",
    table_label="tab:network_metrics",
    columns_rename_map=index_map,
    index_rename_map=column_map,
)

In [77]:
\begin{table}
\centering
\caption{Network metrics for the three classes of networks analysed: hashtag co-occurrence (HT), usertag co-occurrence (UT), and hashtag-user bipartite network (HU). Columns correspond to the four synthetic datasets we analyse and the real dataset. The values reported are averaged over 100 network instances.}
\label{tab:network_metrics}
\begin{tabular}{lccccc}
\toprule
 & \bfseries Base Prompt & \bfseries Fixed Examples & \bfseries Random Examples & \bfseries Imitation & \bfseries Real \\
\midrule
\bfseries HT \# Nodes & 484.00 & 591.00 & 685.00 & 899.00 & 1350.22 \\
\bfseries HT \# Edges & 237.00 & 308.00 & 356.00 & 505.00 & 5540.52 \\
\bfseries HT Avg. Clustering Coeff. & 0.02 & 0.05 & 0.08 & 0.12 & 0.74 \\
\bfseries HT Avg. Degree & 0.98 & 1.04 & 1.04 & 1.12 & 8.18 \\
\bfseries HT Assortativity & -0.04 & -0.03 & -0.05 & -0.07 & -0.08 \\
\bfseries UT \# Nodes & 293.00 & 206.00 & 502.00 & 598.00 & 996.62 \\
\bfseries UT \# Edges & 6.00 & 56.00 & 139.00 & 167.00 & 1682.58 \\
\bfseries UT Avg. Clustering Coeff. & 0.01 & 0.07 & 0.11 & 0.12 & 0.43 \\
\bfseries UT Avg. Degree & 0.04 & 0.54 & 0.55 & 0.56 & 3.35 \\
\bfseries UT Assortativity & 1.00 & 0.89 & 0.70 & 0.99 & 0.40 \\
\bfseries HU \# Nodes & 738.00 & 756.00 & 1144.00 & 1395.00 & 2241.48 \\
\bfseries HU \# Edges & 358.00 & 339.00 & 755.00 & 865.00 & 3171.58 \\
\bfseries HU Avg. Clustering Coeff. & 0.01 & 0.01 & 0.00 & 0.01 & 0.09 \\
\bfseries HU Avg. Degree & 0.97 & 0.90 & 1.32 & 1.24 & 2.83 \\
\bfseries HU Assortativity & 0.09 & -0.16 & -0.00 & -0.05 & -0.09 \\
\bottomrule
\end{tabular}
\end{table}


Unnamed: 0,NW_hashtag_number_of_nodes,NW_hashtag_number_of_edges,NW_hashtag_avg_clustering_coefficient,NW_hashtag_avg_degree,NW_hashtag_assortativity,NW_usertag_number_of_nodes,NW_usertag_number_of_edges,NW_usertag_avg_clustering_coefficient,NW_usertag_avg_degree,NW_usertag_assortativity,NW_hashtag_usertag_number_of_nodes,NW_hashtag_usertag_number_of_edges,NW_hashtag_usertag_avg_clustering_coefficient,NW_hashtag_usertag_avg_degree,NW_hashtag_usertag_assortativity
base_prompt_v2_temperature_0.7,484.0,237.0,0.017235,0.979339,-0.039934,293.0,6.0,0.010239,0.040956,1.0,738.0,358.0,0.007483,0.97019,0.091172
fixed_examples_post_v2,591.0,308.0,0.049758,1.042301,-0.028036,206.0,56.0,0.074973,0.543689,0.887895,756.0,339.0,0.005502,0.896825,-0.157903
random_examples_post_v2,685.0,356.0,0.0839,1.039416,-0.045502,502.0,139.0,0.112262,0.553785,0.701561,1144.0,755.0,0.001434,1.31993,-0.001486
imitation_random_examples_ht_v2_temperature_0.7,899.0,505.0,0.121153,1.123471,-0.066379,598.0,167.0,0.122074,0.558528,0.991808,1395.0,865.0,0.007478,1.240143,-0.046293
Real,1350.22,5540.52,0.735354,8.176736,-0.080289,996.62,1682.58,0.431627,3.354371,0.399323,2241.48,3171.58,0.090471,2.825327,-0.087781


## Section 4 - Downstream Task Performance

### Setup

In [84]:
disclosed_data = pd.read_pickle("../data/kim_sample_mini.pkl")
disclosed_data["sponsorship"] = disclosed_data["sponsorship"].apply(
    lambda x: "sponsored" if x == 1 else "nonsponsored"
)
undisclosed_data = pd.read_pickle("../data/ann_sample_ad_detection.pkl")
undisclosed_data["sponsorship"] = "sponsored"
real_data = (
    pd.read_pickle("../data/df_sample.pkl")
    .sample(1000, random_state=42)
    .dropna()
    .query("caption ! = ''")
)

### Table 7 --- Performance of the logistic regression model trained on different datasets. Acc. represents the accuracy in detecting undisclosed ads.

In [89]:
downstream_task_columns = [c for c in agg_df.columns if "ad_detection" in c]
downstream_task_columns.remove("ad_detection_accuracy")

In [91]:
columns_map = {
    "ad_detection_precision": "P",
    "ad_detection_recall": "R",
    "ad_detection_f1": "F1",
    "ad_detection_undisclosed_accuracy": "Undisc. Acc.",
}

downstream_task_table = agg_df.loc[selected_experiments][downstream_task_columns]

utils.generate_latex_table(
    downstream_task_table,
    table_caption="Performance of the logistic regression model trained on different datasets. Acc. represents the accuracy in detecting undisclosed ads.",
    table_label="tab:downstream_task",
    columns_rename_map=columns_map,
    index_rename_map=index_map,
)

In [None]:
\begin{table}
\centering
\caption{Performance of the logistic regression model trained on different datasets. Acc. represents the accuracy in detecting undisclosed ads.}
\label{tab:downstream_task}
\begin{tabular}{lcccc}
\toprule
 & \bfseries P & \bfseries R & \bfseries F1 & \bfseries Undisc. Acc. \\
\midrule
\bfseries Base Prompt & 0.53 & 0.79 & 0.64 & 0.74 \\
\bfseries Fixed Examples & 0.65 & 0.68 & 0.66 & 0.40 \\
\bfseries Random Examples & 0.63 & 0.82 & 0.71 & 0.51 \\
\bfseries Imitation & 0.72 & 0.69 & 0.71 & 0.23 \\
\bfseries Real & 0.66 & 0.88 & 0.76 & 0.49 \\
\bottomrule
\end{tabular}
\end{table}


### Setup for Table 8

In [92]:
from typing import Dict
from collections import defaultdict, Counter
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()


def get_tfidf_vocabulary(df: pd.DataFrame) -> "np.array":
    tfidf = TfidfVectorizer()
    tfidf.fit_transform(df["caption"].str.lower())
    vocab = tfidf.get_feature_names_out()
    return vocab


def get_vocab_per_class(df: pd.DataFrame) -> "np.array":
    return get_tfidf_vocabulary(
        df.query("sponsorship == 'sponsored'")
    ), get_tfidf_vocabulary(df.query("sponsorship == 'nonsponsored'"))


def create_vocabs_dict(
    df: pd.DataFrame, key_id: str = "", per_class: bool = True
) -> Dict[str, "np.array"]:
    vocab_dict = {}
    vocab_dict[f"{key_id}all"] = get_tfidf_vocabulary(df)
    if per_class:
        ad_vocab, non_ad_vocab = get_vocab_per_class(df)
        vocab_dict[f"{key_id}ad"] = ad_vocab
        vocab_dict[f"{key_id}non_ad"] = non_ad_vocab
    return vocab_dict


def compare_vocab_overlap(
    synthetic_data: pd.DataFrame,
    disclosed_data: pd.DataFrame,
    undisclosed_data: pd.DataFrame,
) -> Dict[str, float]:
    synthetic_vocab_dict = create_vocabs_dict(synthetic_data)
    disclosed_vocab_dict = create_vocabs_dict(disclosed_data, "disclosed_")
    undisclosed_vocab_dict = create_vocabs_dict(
        undisclosed_data, "undisclosed_", per_class=False
    )
    overlap_dict = defaultdict(dict)
    for k, v in {**disclosed_vocab_dict, **undisclosed_vocab_dict}.items():
        overlap_dict[k] = len(set(synthetic_vocab_dict["all"]) & set(v)) / len(
            set(synthetic_vocab_dict["all"])
        )
    return overlap_dict

In [93]:
vocab_overlap = {}

for exp in selected_experiments:
    data = (
        pd.read_pickle(f"../results/{exp}/final_df.pkl").dropna().query("caption != ''")
        if exp != "Real"
        else real_data.copy()
    )
    vocab_overlap[exp] = compare_vocab_overlap(
        data, disclosed_data=disclosed_data, undisclosed_data=undisclosed_data
    )

In [None]:
from collections import defaultdict

bootstrap_vocab_overlap = defaultdict(float)

for i in range(100):
    for k, v in compare_vocab_overlap(
        sample_real(full_data, i), disclosed_data, undisclosed_data
    ).items():
        bootstrap_vocab_overlap[k] += v

bootstrap_vocab_overlap = {k: v / 100 for k, v in bootstrap_vocab_overlap.items()}

vocab_overlap["Real"] = bootstrap_vocab_overlap

### Table 8 -- Overlap between unigrams from each dataset and the test sets.

In [97]:
ad_detection_overlap_columns = ["disclosed_ad", "undisclosed_all"]
column_map = {"disclosed_ad": "Disclosed", "undisclosed_all": "Undisclosed"}

ad_detection_overlap_table = pd.DataFrame(vocab_overlap).T[ad_detection_overlap_columns]
for c in ad_detection_overlap_table.columns:
    ad_detection_overlap_table[c] = ad_detection_overlap_table[c] * 100

utils.generate_latex_table(
    ad_detection_overlap_table,
    table_caption="Overlap between unigrams from each dataset and the test sets.",
    table_label="tab:ad_detection_overlap",
    columns_rename_map=column_map,
    index_rename_map=index_map,
)

In [None]:
\begin{table}
\centering
\caption{Overlap between unigrams from each dataset and the test sets.}
\label{tab:ad_detection_overlap}
\begin{tabular}{lcc}
\toprule
 & \bfseries Disclosed & \bfseries Undisclosed \\
\midrule
\bfseries Base Prompt & 56.67 & 48.46 \\
\bfseries Fixed Examples & 55.02 & 48.07 \\
\bfseries Random Examples & 52.77 & 45.98 \\
\bfseries Imitation & 50.25 & 44.61 \\
\bfseries Real & 44.24 & 40.22 \\
\bottomrule
\end{tabular}
\end{table}


In [605]:
_selected_experiments

['base_prompt_v2_temperature_0.7',
 'fixed_examples_post_v2',
 'random_examples_post_v2',
 'imitation_random_examples_ht_v2_temperature_0.7']

### Setup for Table 9

In [None]:
from instasynth import evaluation

disclosed_data = pd.read_pickle("../data/kim_sample_mini.pkl")
undisclosed_data = pd.read_pickle("../data/ann_sample_ad_detection.pkl")


def _evaluate_with_augmented_data(df: pd.DataFrame, real_data: pd.DataFrame):
    aug_data = pd.concat([df, real_data])
    clf_eval = evaluation.ClassificationAnalyser(
        aug_data.copy(),
        evaluation_data=disclosed_data,
        evaluation_data_ann=undisclosed_data,
    )
    return clf_eval.ad_detection_performance()


def evaluate_with_augmented_data(df: pd.DataFrame, full_df: pd.DataFrame):
    performance_dict = defaultdict(float)
    for i in range(100):
        real_data = sample_real(full_df, i)
        for k, v in _evaluate_with_augmented_data(df, real_data).items():
            performance_dict[k] += v
    return {k: v / 100 for k, v in performance_dict.items()}


augmented_performance = {
    exp: evaluate_with_augmented_data(
        pd.read_pickle(f"../results/{exp}/final_df.pkl"), full_data
    )
    for exp in selected_experiments
    if exp != "Real"
}

In [None]:
def sample_real_2k(full_df: pd.DataFrame, seed: int):
    spons = full_df.query("sponsorship == 'sponsored'").sample(1000, random_state=seed)
    nonspons = full_df.query("sponsorship == 'nonsponsored'").sample(
        1000, random_state=seed
    )
    return pd.concat([spons, nonspons]).sample(frac=1)


bootstrap_real_2k_performance = defaultdict(float)

for i in range(100):
    sample = sample_real_2k(full_data, i)
    clf_eval = evaluation.ClassificationAnalyser(
        sample.copy(),
        evaluation_data=disclosed_data,
        evaluation_data_ann=undisclosed_data,
    )
    for k, v in clf_eval.ad_detection_performance().items():
        bootstrap_real_2k_performance[k] += v
bootstrap_real_2k_performance = {
    k: v / 100 for k, v in bootstrap_real_2k_performance.items()
}

In [100]:
augmented_performance["Real"] = bootstrap_real_2k_performance

### Table 9 -- Performance of the logistic regression model trained on augmented datasets. Each synthetic dataset is augmented with a sample of 1k captions from the real data.

In [101]:
columns_map = {
    "ad_detection_precision": "P",
    "ad_detection_recall": "R",
    "ad_detection_f1": "F1",
    "ad_detection_undisclosed_accuracy": "Undisc. Acc.",
}

augment_performance_table = pd.DataFrame(augmented_performance).T.loc[
    selected_experiments
][downstream_task_columns]

utils.generate_latex_table(
    augment_performance_table,
    table_caption="Performance of the logistic regression model trained on augmented datasets. Each synthetic dataset is augmented with a sample of 1k captions from the real data.",
    table_label="tab:ad_detection_aug",
    columns_rename_map=columns_map,
    index_rename_map=index_map,
)

In [None]:
\begin{table}
\centering
\caption{Performance of the logistic regression model trained on augmented datasets. Each synthetic dataset is augmented with a sample of 1k captions from the real data.}
\label{tab:ad_detection_aug}
\begin{tabular}{lcccc}
\toprule
 & \bfseries P & \bfseries R & \bfseries F1 & \bfseries Undisc. Acc. \\
\midrule
\bfseries Base Prompt & 0.63 & 0.90 & 0.74 & 0.57 \\
\bfseries Fixed Examples & 0.68 & 0.89 & 0.77 & 0.47 \\
\bfseries Random Examples & 0.66 & 0.89 & 0.76 & 0.48 \\
\bfseries Imitation & 0.70 & 0.88 & 0.78 & 0.38 \\
\bfseries Real & 0.69 & 0.89 & 0.78 & 0.42 \\
\bottomrule
\end{tabular}
\end{table}
