# Bootstrapping real sample for calculating metrics for evaluation

This notebook contains the code used to generate `metrics_bootstrap_real.pkl`, a file holding the metrics of the real dataset averaged over repeated random samples.

In [1]:
import sys

# Add the parent directory to the import search path so that modules located
# there can be imported (presumably where the `instasynth` package lives — confirm).
sys.path.append("..")

In [2]:
from pathlib import Path
from glob import iglob
from instasynth import evaluation, embedding_generation
import pandas as pd

# Evaluation inputs that are handed unchanged to the analyser for every sample.
test_spons_data = pd.read_pickle("../data/kim_sample_mini.pkl")
ann_data = pd.read_pickle("../data/ann_sample_ad_detection.pkl")
emb_storage = embedding_generation.EmbeddingStorage(
    Path("../embeddings/"), embedding_file_name="embeddings.pkl"
)

# Pool of real posts to draw samples from: rows without missing values, with a
# non-empty caption, restricted to country == 'US', plus a string label derived
# from the `has_disclosures` column.
full_df = (
    pd.read_pickle("../data/full_df_posts.pkl")
    .dropna()
    # The inequality operator must be written `!=`; `! =` (with a space) is
    # not a valid operator and makes the query expression fail to parse.
    .query("caption != '' & country == 'US'")
    # Adding the column inside the chain (instead of assigning onto the
    # filtered frame afterwards) avoids pandas' SettingWithCopyWarning.
    .assign(
        sponsorship=lambda posts: posts.has_disclosures.apply(
            lambda x: "sponsored" if x else "nonsponsored"
        )
    )
)

def sample_real(seed: int, n_per_class: int = 500, data: "pd.DataFrame | None" = None):
    """Draw a balanced, shuffled sample of sponsored and non-sponsored posts.

    Parameters
    ----------
    seed : int
        Random state used for both class-wise draws and for the final shuffle,
        so the same seed always yields the same rows in the same order.
    n_per_class : int, default 500
        Number of rows drawn (without replacement) from each class.
    data : pandas.DataFrame, optional
        Frame to sample from; it must have a ``sponsorship`` column holding
        ``'sponsored'`` / ``'nonsponsored'``. Defaults to the notebook-level
        ``full_df``.

    Returns
    -------
    pandas.DataFrame
        ``2 * n_per_class`` rows in shuffled order, original index labels kept.

    Raises
    ------
    ValueError
        Raised by pandas when a class has fewer than ``n_per_class`` rows.
    """
    source = full_df if data is None else data
    spons = source.query("sponsorship == 'sponsored'").sample(
        n_per_class, random_state=seed
    )
    nonspons = source.query("sponsorship == 'nonsponsored'").sample(
        n_per_class, random_state=seed
    )
    # Seed the shuffle too; without random_state the row order changes from
    # run to run even though the selected rows are identical.
    return pd.concat([spons, nonspons]).sample(frac=1, random_state=seed)

Running 100 bootstrap samples...

In [None]:
# Keyword arguments that are the same for every sample; only the real
# dataset changes between iterations.
shared_analysis_kwargs = dict(
    test_dataset_ads=test_spons_data,
    test_dataset_ads_undisclosed=ann_data,
    embedding_storage=emb_storage,
    analyse_embeddings=True,
    analyse_internal_similarity=True,
    analyse_top_k_recall=False,
)

# One analysis result per sample, in seed order (seeds 0..99).
anls = []
for i in range(100):
    real_data = sample_real(i)
    analyser = evaluation.SingleExperimentAnalyser(real_data)
    anls.append(
        analyser.analyse_experiment(real_dataset=real_data, **shared_analysis_kwargs)
    )

Creating the `metrics_bootstrap_real.pkl` dataframe

In [5]:
import numpy as np

# Average every metric over all collected analysis results; the metric names
# are taken from the first result.
anls_mean = {}
for metric_name in anls[0].keys():
    anls_mean[metric_name] = np.mean([run[metric_name] for run in anls])

# Store the averages as a single-row frame labelled "Real".
real_metrics_df = pd.DataFrame(anls_mean, index=["Real"])
real_metrics_df.to_pickle("../data/metrics_bootstrap_real.pkl")

### Testing statistical significance of the averaged metrics 

In [7]:
import numpy as np

def bootstrap_mean_confidence_interval(single_metric_samples, z_value=1.96):
    """Return a normal-approximation confidence interval for the sample mean.

    The interval is ``mean ± z_value * SE`` where ``SE`` is the sample standard
    deviation (``ddof=1``) divided by ``sqrt(n)``.

    Parameters
    ----------
    single_metric_samples : sequence of float
        Observed values of one metric, one per sample.
    z_value : float, default 1.96
        Critical value of the standard normal distribution; 1.96 corresponds
        to a 95% confidence level.

    Returns
    -------
    tuple
        ``(CI_lower, CI_upper)``. Both are NaN when fewer than two samples are
        given, because the ``ddof=1`` standard deviation is undefined then.
    """
    sample_mean = np.mean(single_metric_samples)
    standard_error = np.std(single_metric_samples, ddof=1) / np.sqrt(
        len(single_metric_samples)
    )
    CI_lower = sample_mean - z_value * standard_error
    CI_upper = sample_mean + z_value * standard_error

    return CI_lower, CI_upper


def is_within_CI(single_metric_samples, value=None):
    """Check whether a value lies inside the confidence interval of the mean.

    Parameters
    ----------
    single_metric_samples : sequence of float
        Observed values of one metric, one per sample.
    value : float, optional
        Value to test against the interval. When omitted, the mean of
        ``single_metric_samples`` itself is tested.

    Returns
    -------
    bool-like
        True when ``CI_lower <= value <= CI_upper``.

    Notes
    -----
    With ``value`` omitted the check compares the sample mean with an interval
    that is centred on that same mean, so it is True for any finite input and
    only turns False when the interval is NaN. Pass an externally obtained
    ``value`` to get an informative result.
    """
    if value is None:
        value = np.mean(single_metric_samples)
    CI_lower, CI_upper = bootstrap_mean_confidence_interval(single_metric_samples)
    return (value >= CI_lower) and (value <= CI_upper)

# For every metric, collect its value from each analysis result and record
# whether `is_within_CI` accepts those values.
metric_within_ci = {}
for metric_name in anls[0].keys():
    metric_samples = np.array([run[metric_name] for run in anls])
    metric_within_ci[metric_name] = is_within_CI(metric_samples)



In [8]:
# Display the per-metric flags computed with `is_within_CI`.
metric_within_ci

{'pct_unique_captions': True,
 'avg_caption_length': True,
 'std_caption_length': True,
 'vocabulary_size': True,
 'avg_emojis_per_post': True,
 'std_emojis_per_post': True,
 'n_unique_emojis': True,
 'avg_hashtags_per_post': True,
 'std_hashtags_per_post': True,
 'total_hashtags': True,
 'n_unique_hashtags': True,
 'avg_user_tags_per_post': True,
 'std_user_tags_per_post': True,
 'total_user_tags': True,
 'n_unique_user_tags': True,
 'avg_fk_grade_level': True,
 'std_fk_grade_level': True,
 'avg_dalle_readability': True,
 'std_dalle_readability': True,
 'avg_1gram_per_post': True,
 'n_unique_1gram': True,
 'avg_2gram_per_post': True,
 'n_unique_2gram': True,
 'avg_3gram_per_post': True,
 'n_unique_3gram': True,
 'pct_first_person_pronouns': True,
 'pct_second_person_pronouns': True,
 'pct_third_person_pronouns': True,
 'jaccard_similarity_1gram': True,
 'jaccard_similarity_2gram': True,
 'jaccard_similarity_3gram': True,
 'hashtag_overlap': True,
 'user_tag_overlap': True,
 'ad_detect