# Section 4 -- Content Analysis

This notebook contains the code for the qualitative analyses in Section 4 (e.g., vocabulary analyses and top n-grams).

In [None]:
# Experiments compared throughout this notebook.
# Every entry except "Real" is interpolated into a "../results/<name>/final_df.pkl"
# path by the cells below; "Real" is handled separately and is loaded from
# "../data/df_sample.pkl" instead.
selected_experiments = [
    "base_prompt_v2_temperature_0.7",
    "fixed_examples_post_v2",
    "random_examples_post_v2",
    "imitation_random_examples_ht_v2_temperature_0.7",
    "Real",
]

In [None]:
import pandas as pd

from instasynth import evaluation

# Vocabulary of the real-post sample, with stop words kept.
# `_ngram_metrics()` is called before `_ngrams` is read; presumably it is what
# populates that attribute -- confirm in instasynth.evaluation.
ta_real = evaluation.TextAnalyser(
    data=pd.read_pickle("../data/df_sample.pkl"), remove_stopwords=False
)
ta_real._ngram_metrics()

# For n = 1, 2, 3: the set of first elements of the entries in `_ngrams[n]`
# (assumed to be (ngram, count) pairs -- TODO confirm).
real_vocab = {n: set([k[0] for k in ta_real._ngrams[n]]) for n in [1, 2, 3]}

In [None]:
from collections import Counter
from nltk import ngrams
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

# English stop words as a set; get_ngram_count() drops tokens found in it.
sw = set(stopwords.words("english"))

# Shared tokenizer instance used for every caption tokenised in this notebook.
tt = TweetTokenizer()


def ngram_overlap(experiment: str):
    """Percentage of an experiment's distinct n-grams that also occur in `real_vocab`.

    Loads ``../results/{experiment}/final_df.pkl``, analyses it with stop words
    kept, and for n = 1, 2, 3 returns
    ``{"<n>gram_overlap": |synthetic ∩ real| / |synthetic| * 100}``.
    Raises ZeroDivisionError if the experiment has no n-grams for some n.
    """
    analyser = evaluation.TextAnalyser(
        data=pd.read_pickle(f"../results/{experiment}/final_df.pkl"),
        remove_stopwords=False,
    )
    # Called before `_ngrams` is read; presumably it fills that attribute.
    analyser._ngram_metrics()

    synthetic_vocab = {}
    for n in [1, 2, 3]:
        synthetic_vocab[n] = {entry[0] for entry in analyser._ngrams[n]}

    overlap = {}
    for n in [1, 2, 3]:
        shared = synthetic_vocab[n].intersection(real_vocab[n])
        overlap[f"{n}gram_overlap"] = len(shared) / len(synthetic_vocab[n]) * 100
    return overlap


def get_ngram_count(df: pd.DataFrame):
    """Count 1-, 2- and 3-grams over the non-empty captions of ``df``.

    Captions are lower-cased and tokenised with the module-level tokenizer
    ``tt``; tokens that are in the stop-word set ``sw`` or are a single
    character long are dropped before n-grams are formed.

    Parameters
    ----------
    df : pd.DataFrame
        Must have a string ``caption`` column. The frame is not modified.

    Returns
    -------
    dict
        ``{1: Counter, 2: Counter, 3: Counter}``, each keyed by n-gram tuple.
    """
    # Work on the caption Series directly instead of writing new columns onto
    # the result of `query`, which triggers SettingWithCopyWarning and stores
    # single-use generator objects inside the frame.
    captions = df.query("caption != ''").caption.str.lower()

    counters = {n: Counter() for n in [1, 2, 3]}
    for caption in captions:
        # Tokenise once per caption and reuse the tokens for every n.
        tokens = [w for w in tt.tokenize(caption) if w not in sw and len(w) > 1]
        for n, counter in counters.items():
            counter.update(ngrams(tokens, n))
    return counters

In [None]:
# N-gram counters per experiment: synthetic experiments are read from their
# results folder, the "Real" entry from the real-post sample.
ngram_counters = dict(
    (name, get_ngram_count(pd.read_pickle(f"../results/{name}/final_df.pkl")))
    for name in selected_experiments
    if name != "Real"
)
ngram_counters["Real"] = get_ngram_count(pd.read_pickle("../data/df_sample.pkl"))

In [None]:
# All real posts with a non-empty caption and country == 'US', labelled as
# "sponsored" when `has_disclosures` is truthy and "nonsponsored" otherwise.
# The label is added with `.assign` so the column is created on a fresh frame
# rather than assigned onto the result of `query` (SettingWithCopyWarning).
full_data = (
    pd.read_pickle("../data/full_df_posts.pkl")
    .query("caption != '' and country == 'US'")
    .assign(
        sponsorship=lambda posts: posts.has_disclosures.apply(
            lambda x: "sponsored" if x else "nonsponsored"
        )
    )
)

In [None]:
def _sample_real(full_df: pd.DataFrame, seed: int):
    spons = full_df.query("sponsorship == 'sponsored'").sample(500, random_state=seed)
    nonspons = full_df.query("sponsorship == 'nonsponsored'").sample(
        500, random_state=seed
    )
    return pd.concat([spons, nonspons]).sample(frac=1)

In [None]:
from collections import defaultdict


def get_tag_count(exp: str, ngram_counters: dict, tag: str = "#"):
    """Return ``{token: count}`` for the unigrams of ``exp`` that start with ``tag``.

    Reads the unigram counter ``ngram_counters[exp][1]`` and walks (at most)
    its 100000 most common entries, so the result is ordered from most to
    least frequent.
    """
    tagged = {}
    for gram, freq in ngram_counters[exp][1].most_common(100000):
        token = gram[0]
        if token.startswith(tag):
            tagged[token] = freq
    return tagged


# Hashtag ("#") and user-tag ("@") frequencies for every experiment other than
# "Real"; the "Real" entries are filled in by a later cell.
hashtag_counter, usertag_counter = (
    {
        name: get_tag_count(name, ngram_counters, prefix)
        for name in selected_experiments
        if name != "Real"
    }
    for prefix in ("#", "@")
)

# Sum hashtag / user-tag counts over 100 balanced samples of the real posts
# (seeds 0..99), then divide by 100 to get the mean count per sample.
bootstrap_tag_counter = defaultdict(dict)

for i in range(100):
    df = _sample_real(full_df=full_data, seed=i)
    for tag, c_tag in (("#", "hashtags"), ("@", "usertags")):
        # Per caption: unigrams of the tokens that carry the tag prefix and are
        # longer than one character.
        df[c_tag] = df.caption.str.lower().apply(
            lambda caption: ngrams(
                [
                    tok
                    for tok in tt.tokenize(caption)
                    if tok.startswith(tag) and len(tok) > 1
                ],
                1,
            )
        )
        tag_counter = Counter()
        for grams in df[c_tag]:
            tag_counter.update(grams)
        # Add this sample's counts to the running totals, most frequent first.
        for gram, freq in tag_counter.most_common(100000):
            token = gram[0]
            totals = bootstrap_tag_counter[c_tag]
            totals[token] = totals.get(token, 0) + freq

bootstrap_tag_counter = {
    kind: {token: total / 100 for token, total in totals_by_token.items()}
    for kind, totals_by_token in bootstrap_tag_counter.items()
}

In [None]:
# Order the bootstrapped real-data tags from most to least frequent and store
# them as the "Real" entry next to the synthetic experiments.
hashtags = bootstrap_tag_counter["hashtags"]
usertags = bootstrap_tag_counter["usertags"]

sorted_hashtags = dict(
    sorted(hashtags.items(), key=lambda item: item[1], reverse=True)
)
sorted_usertags = dict(
    sorted(usertags.items(), key=lambda item: item[1], reverse=True)
)

hashtag_counter["Real"] = sorted_hashtags
usertag_counter["Real"] = sorted_usertags

In [None]:
# Per experiment and n: {"tok1 tok2 ...": count}, most frequent first, leaving
# out n-grams whose first token starts with "#" or "@".
ngram_counter = defaultdict(dict)

for exp, ngs in ngram_counters.items():
    for n, counts in ngs.items():
        plain_counts = {}
        for gram, freq in counts.most_common(100000):
            if gram[0].startswith(("#", "@")):
                continue
            plain_counts[" ".join(gram)] = freq
        ngram_counter[exp][n] = plain_counts

In [None]:
def get_top_n_entities(count_dict, topn=100):
    """Return the first ``topn`` keys of ``count_dict`` as a set.

    "First" means insertion order, so the caller is expected to pass a dict
    that is already ordered from most to least frequent.
    """
    ordered_keys = list(count_dict)
    return set(ordered_keys[:topn])


def get_overlap_top_n_entities(count_dict1, count_dict2, topn=100):
    """Percentage overlap between the top-``topn`` keys of two count dicts.

    The number of shared keys is divided by ``topn`` itself, not by the number
    of keys actually available, and multiplied by 100.
    """
    first = get_top_n_entities(count_dict1, topn)
    second = get_top_n_entities(count_dict2, topn)
    shared = first.intersection(second)
    return len(shared) / topn * 100

In [None]:
# `_selected_experiments` was read below without ever being assigned in this
# notebook, so a fresh "Restart & Run All" stopped here with a NameError.
# NOTE(review): it is taken to be every selected experiment except "Real",
# since each experiment in the loops that use it is compared against "Real" --
# confirm this matches the original intent.
_selected_experiments = [name for name in selected_experiments if name != "Real"]

# For each experiment: share (in %) of its top-100 hashtags, user tags and
# 1/2/3-grams that also appear in the corresponding top-100 of "Real".
top100_overlap = defaultdict(dict)

for exp in _selected_experiments:
    top100_overlap[exp]["hashtags"] = get_overlap_top_n_entities(
        hashtag_counter[exp], hashtag_counter["Real"], topn=100
    )
    top100_overlap[exp]["usertags"] = get_overlap_top_n_entities(
        usertag_counter[exp], usertag_counter["Real"], topn=100
    )
    for n in [1, 2, 3]:
        top100_overlap[exp][f"{n}grams"] = get_overlap_top_n_entities(
            ngram_counter[exp][n], ngram_counter["Real"][n], topn=100
        )

In [None]:
# Pairwise top-100 overlap between experiments, keyed "<exp1> -> <exp2>".
top100_syn_overlap = defaultdict(dict)

# Sorting the names fixes the order in which the pairs are produced.
_selected_experiments = sorted(_selected_experiments)

for i, exp1 in enumerate(_selected_experiments):
    for j, exp2 in enumerate(_selected_experiments):
        if j > i:
            # Each unordered pair is computed once: only positions with
            # j <= i are kept, which includes an experiment paired with itself.
            continue
        key = f"{exp1} -> {exp2}"
        for label, counters in (
            ("hashtags", hashtag_counter),
            ("usertags", usertag_counter),
        ):
            top100_syn_overlap[key][label] = get_overlap_top_n_entities(
                counters[exp1], counters[exp2], topn=100
            )
        for n in [1, 2, 3]:
            top100_syn_overlap[key][f"{n}grams"] = get_overlap_top_n_entities(
                ngram_counter[exp1][n], ngram_counter[exp2][n], topn=100
            )

pd.DataFrame(top100_syn_overlap).T

In [None]:
# Show the ten most frequent hashtags of every experiment, "Real" included.
for f in selected_experiments:
    print(f, get_top_n_entities(hashtag_counter[f], topn=10), sep="\n")

In [None]:
# Entities that are in an experiment's top 100 but not in the top 100 of "Real".
top100_not_in_real = defaultdict(dict)

for exp in _selected_experiments:
    top100_not_in_real[exp]["hashtags"] = get_top_n_entities(
        hashtag_counter[exp], topn=100
    ) - get_top_n_entities(hashtag_counter["Real"], topn=100)
    top100_not_in_real[exp]["usertags"] = get_top_n_entities(
        usertag_counter[exp], topn=100
    ) - get_top_n_entities(usertag_counter["Real"], topn=100)
    for n in [1, 2, 3]:
        top100_not_in_real[exp][f"{n}grams"] = get_top_n_entities(
            ngram_counter[exp][n], topn=100
        ) - get_top_n_entities(ngram_counter["Real"][n], topn=100)

In [None]:
# Entities that are in the top 100 of "Real" but not in an experiment's top 100.
top100_not_in_synthetic = defaultdict(dict)

for exp in _selected_experiments:
    top100_not_in_synthetic[exp]["hashtags"] = get_top_n_entities(
        hashtag_counter["Real"], topn=100
    ) - get_top_n_entities(hashtag_counter[exp], topn=100)
    top100_not_in_synthetic[exp]["usertags"] = get_top_n_entities(
        usertag_counter["Real"], topn=100
    ) - get_top_n_entities(usertag_counter[exp], topn=100)
    for n in [1, 2, 3]:
        top100_not_in_synthetic[exp][f"{n}grams"] = get_top_n_entities(
            ngram_counter["Real"][n], topn=100
        ) - get_top_n_entities(ngram_counter[exp][n], topn=100)

In [None]:
# Up to 15 of each experiment's top-100 3-grams that are missing from the
# top 100 of "Real" (taken from a set, so in no particular order).
for f in _selected_experiments:
    print(f, list(top100_not_in_real[f]["3grams"])[:15], sep="\n")

In [None]:
# Up to 10 of the top-100 3-grams of "Real" that are missing from each
# experiment's top 100 (taken from a set, so in no particular order).
for f in _selected_experiments:
    print(f, list(top100_not_in_synthetic[f]["3grams"])[:10], sep="\n")

In [None]:
# Up to 10 of each experiment's top-100 hashtags that are missing from the
# top 100 of "Real" (taken from a set, so in no particular order).
for f in _selected_experiments:
    print(f, list(top100_not_in_real[f]["hashtags"])[:10], sep="\n")