# Code for reproducing the tables and figures from the paper

## General setup

Import the libraries, load the data, and configure plotting.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib_inline.backend_inline
import plotly.express as px
from plotly import graph_objects as go
from plotly.subplots import make_subplots

# Render inline matplotlib figures as SVG.
matplotlib_inline.backend_inline.set_matplotlib_formats("svg")

# The bundled seaborn styles were renamed to "seaborn-v0_8-*" in matplotlib 3.6
# and the old names were removed afterwards, so pick whichever name this
# matplotlib installation actually provides (newest name first).
for _style_name in ("seaborn-v0_8-darkgrid", "seaborn-darkgrid"):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break

In [None]:
import sys

# Drop duplicate entries while keeping the original search order.
# (Going through set() would shuffle the entries and therefore change which
# copy of a module wins when several locations provide the same name.)
sys.path = list(dict.fromkeys(sys.path))

# Make the parent directory importable; later cells import from `src`,
# which is presumably located there.
if ".." not in sys.path:
    sys.path.append("..")

In [None]:
# Locations of the pickled input frames, relative to the notebook directory.
DF_POSTS_PATH, DF_PROFILES_PATH = (
    "../data/df_posts_lite.pkl",
    "../data/df_profiles.pkl",
)

In [None]:
import pandas as pd

# Load the posts and profiles frames from their pickles.
# NOTE: unpickling can execute arbitrary code, so only point these paths at
# files from a trusted source.
df_posts, df_profiles = (
    pd.read_pickle(pickle_path) for pickle_path in (DF_POSTS_PATH, DF_PROFILES_PATH)
)

In [None]:
import numpy as np

# A post counts as sponsored when it is either explicitly disclosed or flagged
# by the disclosure predictor.
df_posts["is_sponsored"] = (df_posts.predicted_disclosure) | (df_posts.has_disclosures)

# Three-way label; an explicit disclosure takes precedence over a predicted one.
# np.select evaluates the conditions in order and works on whole columns, which
# avoids a Python-level call per row (DataFrame.apply with axis=1).
# astype(bool) keeps the same truthiness rules the row-wise conditionals used.
df_posts["sponsorship_type"] = np.select(
    [
        df_posts["has_disclosures"].astype(bool),
        df_posts["predicted_disclosure"].astype(bool),
    ],
    ["disclosed", "undisclosed"],
    default="non-sponsored",
)

# NOTE(review): the Figure 8 cell groups by a `dt_year_mon` column that is not
# created anywhere in this notebook; it is presumably already stored in the
# pickle. It was previously derived like this (kept for reference):
# df["dt_year_mon"] = df.date.apply(lambda x: f"{x.split()[0].split('-')[0]}/{x.split()[0].split('-')[1]}")

# Engagement is the sum of likes and comments of a post.
df_posts["engagement"] = df_posts.likes + df_posts.comments

In [None]:
# Import the project helper module and reload it right away, so that edits made
# to `src.utils` while the kernel is running are picked up when this cell is
# re-executed (no kernel restart needed).
from src import utils
from importlib import reload

reload(utils)

## Table 1

### Setup

In [None]:
# Number of rows in df_posts for each username.
user_post_counts = df_posts.groupby("username").size()

# Attach the count to each profile; usernames that never appear in df_posts
# have no entry in the lookup and therefore end up as NaN.
df_profiles["n_posts"] = df_profiles["username"].map(user_post_counts)

In [None]:
from src.utils import display_formatted

# Table 1 has one column per (country, size) pair.
country_size_keys = ["country", "size"]

profile_aggregations = {
    "followers": "mean",
    # Share of verified profiles in the group, as a percentage.
    "is_verified": lambda flags: (flags == True).sum() / len(flags) * 100,
}
post_aggregations = {
    # NOTE(review): 50 is presumably the number of profiles per
    # (country, size) group, making this "posts per profile" - confirm.
    "shortcode": lambda codes: codes.count() / 50,
    "likes": "mean",
    "comments": "mean",
}

# Transposed so that the metrics become rows and the groups become columns.
profile_per_country = (
    df_profiles.groupby(country_size_keys).agg(profile_aggregations).transpose()
)
posts_per_country = (
    df_posts.groupby(country_size_keys).agg(post_aggregations).transpose()
)



In [None]:
# Stack the profile metrics on top of the post metrics and render the table.
table_1 = pd.concat([profile_per_country, posts_per_country])
display_formatted(table_1, precision=1)

## Table 2

Statistics for the aggregated dataset (grouped by `size` only, plus an overall "All" column).

In [None]:
# Per-size profile metrics: mean followers and percentage of verified profiles.
size_profile_aggregations = {
    "followers": "mean",
    "is_verified": lambda flags: (flags == True).sum() / len(flags) * 100,
}
agg_profile = df_profiles.groupby("size").agg(size_profile_aggregations).transpose()

# Per-size post metrics.
# NOTE(review): the post count is divided by the same constant (50) as in the
# per-country table even though the groups here span all countries - confirm
# that this is the intended denominator.
size_post_aggregations = {
    "shortcode": lambda codes: codes.count() / 50,
    "likes": "mean",
    "comments": "mean",
}
agg_posts = df_posts.groupby("size").agg(size_post_aggregations).transpose()

# Overall ("All") column: plain means over every profile / every post.
all_column_profile = df_profiles[["followers", "is_verified"]].mean()
all_column_post = df_posts[["likes", "comments"]].mean()
all_column = pd.concat([all_column_profile, all_column_post])

# NOTE(review): this is the raw number of posts, whereas the per-size values
# above are counts divided by 50 - confirm the two are meant to differ.
all_column["shortcode"] = df_posts["shortcode"].count()

# Turn the verified share from a fraction into a percentage.
all_column["is_verified"] *= 100

In [None]:
# Combine profile and post metrics, then append the overall column; the
# assignment aligns `all_column` with the metric names in the row index.
agg_df = pd.concat([agg_profile, agg_posts]).assign(All=all_column)
display_formatted(agg_df, precision=1)

## Figure 8

In [None]:
# Mean engagement per month, split by whether the post has a disclosure.
# The unstack -> fillna -> stack round trip guarantees one row per
# (month, group) pair; pairs without any post get an engagement of 0.
sorted_dt = (
    df_posts.groupby(["dt_year_mon", "has_disclosures"])
    .engagement.mean()
    .sort_index()
    .unstack()
    .fillna(0)
    .stack()
)
dt_freq = (
    sorted_dt.rename("engagement")
    .reset_index()
    .rename(columns={"dt_year_mon": "date"})
)
# Replace the boolean flag with the labels used in the legend.
dt_freq["has_disclosures"] = dt_freq.has_disclosures.apply(
    lambda x: "Sponsored" if x else "Non-disclosed"
)

# Smooth each group's time series with a 3-month rolling mean.
window_size = 3
dt_freq_smoothed = (
    dt_freq.set_index("date")
    .groupby("has_disclosures")
    .rolling(window_size, min_periods=1)
    .mean()
    .reset_index()
)

group_order = {"has_disclosures": ["Non-disclosed", "Sponsored"]}
# Lighter variants of the default colours, used for the unsmoothed lines.
raw_line_colors = {"Non-disclosed": "#838bfb", "Sponsored": "#f48571"}

# Unsmoothed monthly means: dotted, lighter, and hidden from the legend.
raw_lines = px.line(
    dt_freq,
    x="date",
    y="engagement",
    color="has_disclosures",
    labels={
        "date": "Date (year/month)",
        "engagement": "Engagement",
        "has_disclosures": "",
    },
    category_orders=group_order,
)
for trace in raw_lines.data:
    # Colours are looked up by group name so a missing group cannot shift them.
    trace.line.update(dash="dot", color=raw_line_colors.get(trace.name))
    trace.showlegend = False

# Smoothed series: solid lines with the default colours, shown in the legend.
# Built once and added as a whole instead of indexing data[0] / data[1].
smoothed_lines = px.line(
    dt_freq_smoothed,
    x="date",
    y="engagement",
    color="has_disclosures",
    category_orders=group_order,
)

fig = go.Figure()
fig.add_traces(raw_lines.data)
fig.add_traces(smoothed_lines.data)

# `title.font` replaces the deprecated `titlefont` axis attribute.
fig.update_layout(
    legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01, font=dict(size=14)),
    xaxis=dict(
        title=dict(text="Date (year/month)", font=dict(size=18)),
        tickfont=dict(size=14),
    ),
    yaxis=dict(
        title=dict(text="Engagement", font=dict(size=18)),
        tickfont=dict(size=14),
    ),
)
fig.show()
# fig.write_image("../figures/engagement_disc_nonspons.pdf")#, width=1800, height=600)

## Figure 9

In [None]:
import numpy as np

def compute_cdf(data):
    """Return the empirical CDF of ``data``.

    Parameters
    ----------
    data : array-like
        One-dimensional collection of values.

    Returns
    -------
    tuple of numpy.ndarray
        ``(values, cdf)`` where ``values`` is ``data`` sorted in ascending
        order and ``cdf[i]`` equals ``(i + 1) / len(data)``.
    """
    ordered = np.sort(data)
    n_points = len(ordered)
    cumulative_share = np.arange(1, n_points + 1) / n_points
    return ordered, cumulative_share



# Split the engagement values by whether the post has a disclosure.
sponsored_engagements = df_posts[df_posts["has_disclosures"]]["engagement"]
nondisclosed_engagements = df_posts[~df_posts["has_disclosures"]]["engagement"]

# Empirical CDF of each group.
sponsored_sorted, sponsored_cdf = compute_cdf(sponsored_engagements)
nondisclosed_sorted, nondisclosed_cdf = compute_cdf(nondisclosed_engagements)

# One line per group.
cdf_fig = go.Figure()
cdf_fig.add_trace(
    go.Scatter(x=sponsored_sorted, y=sponsored_cdf, mode="lines", name="Sponsored")
)
cdf_fig.add_trace(
    go.Scatter(
        x=nondisclosed_sorted, y=nondisclosed_cdf, mode="lines", name="Non-disclosed"
    )
)

# Axis titles are given as `title=dict(text=..., font=...)`; this replaces the
# deprecated `titlefont` axis attribute.
cdf_fig.update_layout(
    legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01, font=dict(size=14)),
    xaxis=dict(
        type="log",  # logarithmic x-axis
        title=dict(text="Engagement (log scale)", font=dict(size=18)),
        tickfont=dict(size=14),
    ),
    yaxis=dict(
        title=dict(text="CDF", font=dict(size=18)),
        tickfont=dict(size=14),
    ),
)

cdf_fig.show()
# cdf_fig.write_image("../figures/engagement_cdf_disc_nonspons.pdf")#, width=1800, height=600)

## Table 5

In [None]:
# Ten most frequent sponsors among posts of each size class, most frequent
# first; posts with an empty sponsor string are ignored.
top10_brands_mega, top10_brands_micro = (
    pd.Series(
        df_posts.query(f"size == '{size_class}' and content_sponsor != ''")
        .content_sponsor.value_counts()
        .iloc[:10]
        .index,
        name=size_class,
    )
    for size_class in ("mega", "micro")
)

# Side-by-side LaTeX table, micro column first.
top10_by_size = pd.concat([top10_brands_micro, top10_brands_mega], axis=1)
print(top10_by_size.to_latex())

## Table 6

In [None]:
# Posts that name a sponsor (the empty string marks "no sponsor").
has_sponsor = df_posts.content_sponsor != ""

# Ten most frequent sponsors per country, most frequent first.
# Boolean indexing is used instead of an f-string-built query so that country
# values containing quotes cannot break the expression; missing countries are
# skipped rather than being compared against the literal string 'nan'.
top10_brands_per_country = [
    pd.Series(
        df_posts.loc[has_sponsor & (df_posts.country == country), "content_sponsor"]
        .value_counts()
        .iloc[:10]
        .index,
        name=country,
    )
    for country in df_posts.country.dropna().unique()
]
print(pd.concat(top10_brands_per_country, axis=1).to_latex())

## Figure 3

In [None]:
years = sorted(df_posts.year.unique())

# Row masks are computed once instead of re-filtering df_posts four times for
# every (country, year) pair.
disclosed = df_posts.has_disclosures
uses_ht = df_posts.has_sponsored_hashtags
uses_kw = df_posts.has_sponsored_keywords_no_ht
uses_ad = df_posts.is_ad

# Posts that use exactly one of the three disclosure mechanisms.
exclusive_masks = {
    "HT": uses_ht & ~uses_kw & ~uses_ad,
    "KW": ~uses_ht & uses_kw & ~uses_ad,
    "AD": ~uses_ht & ~uses_kw & uses_ad,
}

records = []
for c in df_posts.country.unique():
    in_country = df_posts.country == c
    for y in years:
        in_group = in_country & (df_posts.year == y)
        n_disclosed = int((in_group & disclosed).sum())
        record = {"country": c, "year": y}
        for label, mask in exclusive_masks.items():
            n_exclusive = int((in_group & mask).sum())
            # NOTE(review): the previous code multiplied both the denominator
            # and the ratio by 100, which cancels out, so the plotted values
            # are fractions of the disclosed posts rather than percentages.
            # That scale is kept here - confirm it is the intended one.
            record[label] = n_exclusive / n_disclosed if n_disclosed else 0
        records.append(record)

disclosure_features_year = pd.DataFrame(
    records, columns=["country", "year", "HT", "KW", "AD"]
)

# One subplot per country, two per row. The grid keeps at least two rows and
# grows when there are more than four countries.
countries = disclosure_features_year.country.unique()
n_cols = 2
n_rows = max(2, (len(countries) + n_cols - 1) // n_cols)
fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=list(countries))

first_year_exclusive = 2010  # only years after this one are plotted
for i, k in enumerate(countries):
    row_idx, col_idx = divmod(i, n_cols)
    country_area = px.area(
        disclosure_features_year[
            (disclosure_features_year.country == k)
            & (disclosure_features_year.year > first_year_exclusive)
        ],
        x="year",
        y=["HT", "KW", "AD"],
        labels={
            "value": "Percentage of disclosure type",
            "variable": "Type of disclosure",
            "country": "Country",
        },
    )
    for trace in country_area.data:
        # The legend entries are identical in every subplot; show them once.
        if i != 0:
            trace.showlegend = False
        fig.add_trace(trace, row=row_idx + 1, col=col_idx + 1)

axis_title_font_size = 18
tick_font_size = 18
subplot_title_font_size = 24

# Without row/col selectors these calls apply to every subplot axis.
# `title_font` replaces the deprecated `titlefont` axis attribute.
fig.update_xaxes(
    title_font=dict(size=axis_title_font_size), tickfont=dict(size=tick_font_size)
)
fig.update_yaxes(
    title_font=dict(size=axis_title_font_size), tickfont=dict(size=tick_font_size)
)

# Subplot titles are stored as layout annotations.
for annotation in fig["layout"]["annotations"]:
    annotation["font"]["size"] = subplot_title_font_size

fig.update_layout(
    legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01, font=dict(size=14))
)

# Display the figure, as the other figure cells do.
fig.show()

## Figure 5

In [None]:
fig = make_subplots(
    rows=1, cols=2, subplot_titles=["Non-sponsored posts", "Sponsored posts"]
)

# The posts frame is `df_posts`; there is no `df` in this notebook, so the
# previous `df[...]` lookups raised a NameError on a fresh kernel.
fig_non_sponsored = utils.plot_agg_timeseries(
    df_posts[~df_posts.has_disclosures], "caption_len"
)
# Only affects the standalone figure; the combined figure below copies the
# traces, not the layout.
fig_non_sponsored.update_layout(yaxis_range=[0, 100])
fig_sponsored = utils.plot_agg_timeseries(
    df_posts[df_posts.has_disclosures], "caption_len"
)

# Left panel: non-sponsored posts (these traces provide the legend entries).
for trace in fig_non_sponsored.data:
    fig.add_trace(trace, row=1, col=1)
# Right panel: sponsored posts, with the duplicate legend entries hidden.
for trace in fig_sponsored.data:
    trace.showlegend = False
    fig.add_trace(trace, row=1, col=2)

# NOTE(review): these layout keys address the first subplot's axes only, so the
# right-hand panel gets neither the y-range nor the axis titles - confirm that
# this is intended.
fig.update_layout(
    yaxis_range=[0, 90], xaxis_title="Date", yaxis_title="Avg. caption length", title=""
)
title_font_size = 24
tick_font_size = 18

# Without a row/col selector the update applies to the axes of both subplots.
# `title_font` replaces the deprecated `titlefont` axis attribute.
fig.update_xaxes(
    title_font=dict(size=title_font_size), tickfont=dict(size=tick_font_size)
)
fig.update_yaxes(
    title_font=dict(size=title_font_size), tickfont=dict(size=tick_font_size)
)

# Subplot titles are stored as layout annotations.
for annotation in fig["layout"]["annotations"]:
    annotation["font"]["size"] = title_font_size

fig.update_layout(
    legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01, font=dict(size=18))
)
fig.show()
# fig.write_image("../figures/caption_length_over_time.pdf", width=1800, height=600)