In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import scipy.stats
import collections
import seaborn

seaborn.set_style("darkgrid")
plt.rcParams.update({"font.size": 24})

## Read Data

In [6]:
# read data

raw_df = pd.read_csv("raw_Mar31.csv", index_col=None)
for i, column in enumerate(raw_df.columns):
    end = "\n" if i % 5 == 0 and i > 0 else "\t"
    print(f"{column:30s}", end=end)
print()
print(f"total number of rows = {raw_df.shape[0]}")
n_videos = raw_df["Filename"].str.endswith(".mp4").sum()
n_videos_rp = ((raw_df != -1).all(axis=1) & raw_df["Filename"].str.endswith(".mp4")).sum()
print(f"total number of videos = {n_videos}")
print(f"total number of videos with RP output = {n_videos_rp}")

Filename                      	Advertiser                    	Date                          	Version                       	Country                       	Duration                      
Description                   	Brand/Product                 	ProductAdvertiser             	Title                         	Creative ID                   
Channel                       	Publication                   	Website                       	!!MISSING_KEY!! CreativeLabel_Outlet_PaidSocial	Ad Format                     
Endline                       	video_key                     	frames_analyzed               	(female, [0, 15), [-inf, 1.1))	(female, [0, 15), [1.1, 2.1)) 
(female, [0, 15), [2.1, 3.1)) 	(female, [0, 15), [3.1, 4.1)) 	(female, [0, 15), [4.1, 5.1)) 	(female, [0, 15), [5.1, 6.1)) 	(female, [0, 15), [6.1, 7.1)) 
(female, [0, 15), [7.1, 8.1)) 	(female, [0, 15), [8.1, 9.1)) 	(female, [15, 25), [-inf, 1.1))	(female, [15, 25), [1.1, 2.1))	(female, [15, 25), [2.1, 3.1))
(female, [15, 25), [3

## Filter Data

In [7]:
# filter data

data_df = raw_df.loc[(raw_df != -1).all(axis=1) & raw_df["Filename"].str.endswith(".mp4")].copy()
data_df.index = pd.RangeIndex(0, len(data_df))
print(data_df.shape)

(7648, 148)


## Check Data

In [8]:
# check cross section bins count equals aggregates

def describe(row):
    print(row["Creative ID"])
    rp_columns = [column for column in row.index if re.match(r"\((fe)?male", column)]
    for column in rp_columns:
        if row[column] != 0:
            print(f"\t{column} : {row[column]}")
    print(f"faces : {row.faces}")
    print(f"gender : male = {row.masculine_faces}, female = {row.feminine_faces}")
    print("age : ")
    for j in range(7):
        count = row[f"age_{j + 1}"]
        end = ", " if j < 6 else "\n"
        print(f"age_{j + 1} = {count}", end=end)
    print("monk skin tone : ")
    for k in range(10):
        count = row[f"mst_scale_{k + 1}"]
        end = ", " if k < 9 else "\n"
        print(f"mst_scale_{k + 1} = {count}", end=end)

for _, row in data_df.iterrows():
    rp_data = np.zeros((2, 7, 10), dtype=int)
    for i, gender in enumerate(["male", "female"]):
        for j in range(7):
            for k in range(10):
                age_lb = 0 if j == 0 else 10 * j + 5
                age_ub = 10 * j + 15
                mst_lb = "-inf" if k == 0 else k + 0.1
                mst_ub = k + 1.1
                column_name = f"({gender}, [{age_lb}, {age_ub}), [{mst_lb}, {mst_ub}))"
                if column_name in row:
                    rp_data[i, j, k] = row[column_name]
    assert rp_data.sum() == row["faces"]
    assert rp_data[0].sum() == row["masculine_faces"]
    assert rp_data[1].sum() == row["feminine_faces"]
    for j in range(7):
        assert rp_data[:,j].sum() == row[f"age_{j + 1}"]
    # for k in range(10):
    #     assert rp_data[:, :, k].sum() == row[f"mst_scale_{k + 1}"]

## Metadata Distributions

In [10]:
# metadata

data_df["country"] = data_df["Country"]
data_df["brand"] = data_df["ProductAdvertiser"]
data_df["product type"] = data_df["brand_category"]
data_df.loc[data_df["Brand/Product"].apply(lambda x: re.search("(\s|^)((toiletries)|(toilet))(\s|$)", x) is not None), 
            "product type"] = "toiletries"

def plural(metadata):
    if metadata == "country":
        return "countries"
    elif metadata == "brand":
        return "brands"
    elif metadata == "product type":
        return "product types"
    elif metadata == "product target gender":
        return "product target gender categories"
    else:
        return metadata + "s"

def target_gender(desc):
    desc = str(desc)
    f = re.search(r"(^|\s)(females?)|(wom(a|e)n)(\s|$)", desc.lower()) is not None
    m = re.search(r"(^|\s)((?<!fe)males?)|((?<!wo)m(a|e)n)(\s|$)", desc.lower()) is not None
    u = re.search(r"(^|\s)unisex(\s|$)", desc.lower()) is not None
    if f and not m and not u:
        return "female"
    elif m and not f:
        return "male"
    elif u and not f and not m:
        return "unisex"
    else:
        return "unspecified"

def plot_metadata_distribution(dist, metadata):
    plt.figure(figsize=(22, 10))
    x = [item[0] for item in dist]
    y = [item[1] for item in dist]
    total = sum(y)
    yper = [100*yy/total for yy in y]
    xx = np.arange(len(x))
    bc = plt.bar(xx, y, width=0.8, edgecolor="black", linewidth=2)
    plt.bar_label(bc, [f"{yy:.1f}%" for yy in yper], padding=10, rotation=0)
    bottom, top = plt.ylim()
    plt.ylim(bottom, top + 50)
    xrotation = 90 if len(x) > 12 else 0
    plt.xticks(xx, x, rotation=xrotation, ha="center", fontsize="small")
    plt.yticks(fontsize="small")
    plt.xlabel(metadata.upper())
    plt.ylabel("NUMBER OF ADS")
    metadata_in_title = plural(metadata)
    plt.title(f"DISTRIBUTION OF ADS ACROSS {metadata_in_title.upper()}")
    plt.savefig(f"plots/metadata/{metadata}.png", bbox_inches="tight", pad_inches=0, dpi=300)
    plt.close()

country_dist = sorted(collections.Counter(data_df["country"]).items(), key=lambda x: x[1], reverse=True)
brand_dist = sorted(collections.Counter(data_df["brand"]).items(), key=lambda x: x[1], reverse=True)
producttype_dist = sorted(collections.Counter(data_df["product type"]).items(), key=lambda x: x[1], reverse=True)
data_df["product target gender"] = data_df["Brand/Product"].apply(target_gender)
targetgender_dist = sorted(collections.Counter(data_df["product target gender"]).items(), key=lambda x: x[1], 
                           reverse=True)
total = len(data_df)

print("country")
for i, (country, count) in enumerate(country_dist):
    per = 100*count/total
    end = "\n" if i % 5 == 0 and i > 0 else "\t"
    print(f"{country:10s} : {count:4d} ({per:4.1f}%)", end=end)
plot_metadata_distribution(country_dist, "country")
print("\n")

print("brand")
for i, (brand, count) in enumerate(brand_dist):
    per = 100*count/total
    end = "\n" if i % 5 == 0 and i > 0 else "\t"
    print(f"{brand:10s} : {count:4d} ({per:4.1f}%)", end=end)
plot_metadata_distribution(brand_dist, "brand")
print("\n")

print("product type")
for i, (producttype, count) in enumerate(producttype_dist):
    per = 100*count/total
    end = "\n" if i % 5 == 0 and i > 0 else "\t"
    print(f"{producttype:10s} : {count:4d} ({per:4.1f}%)", end=end)
plot_metadata_distribution(producttype_dist, "product type")
print("\n")

print("product target gender")
for i, (targetgender, count) in enumerate(targetgender_dist):
    per = 100*count/total
    end = "\n" if i % 5 == 0 and i > 0 else "\t"
    print(f"{targetgender:10s} : {count:4d} ({per:4.1f}%)", end=end)
plot_metadata_distribution(targetgender_dist, "product target gender")

country
Germany    : 3202 (41.9%)	UK         : 1715 (22.4%)	France     :  817 (10.7%)	Italy      :  806 (10.5%)	USA        :  701 ( 9.2%)	Spain      :  227 ( 3.0%)
Brazil     :   54 ( 0.7%)	India      :   49 ( 0.6%)	Mexico     :   46 ( 0.6%)	Indonesia  :   30 ( 0.4%)	Canada     :    1 ( 0.0%)


brand
GARNIER    : 1825 (23.9%)	OAP        :  880 (11.5%)	LRP        :  631 ( 8.3%)	YSL        :  613 ( 8.0%)	VICHY      :  511 ( 6.7%)	LANCOME    :  504 ( 6.6%)
KIEHLS     :  429 ( 5.6%)	MNY        :  408 ( 5.3%)	MAYBELLINE :  346 ( 4.5%)	ARMANI     :  337 ( 4.4%)	KERASTASE  :  335 ( 4.4%)
NYX        :  286 ( 3.7%)	CERAVE     :  239 ( 3.1%)	LOP        :  110 ( 1.4%)	PRADA      :   84 ( 1.1%)	BIOTHERM   :   80 ( 1.0%)
REDKEN     :   30 ( 0.4%)	

product type
skincare   : 2429 (31.8%)	makeup     : 1963 (25.7%)	haircare   : 1640 (21.4%)	toiletries :  934 (12.2%)	fragrance  :  682 ( 8.9%)	

product target gender
unspecified : 4844 (63.3%)	female     : 2155 (28.2%)	male       :  460 ( 6.0%)	unisex  

In [21]:
def metadata_x_metadata_stacked(data_df, primary_metadata, secondary_metadata, figsize=(22, 10)):
    assert primary_metadata in ["brand", "country", "product type", "product target gender"]
    assert secondary_metadata in ["brand", "country", "product type", "product target gender"]
    n_primary_metadata_cats = data_df[primary_metadata].unique().size
    primary_metadata_dist = collections.Counter(data_df[primary_metadata].tolist())
    primary_metadata_cats = sorted(primary_metadata_dist.keys(), key=lambda cat: primary_metadata_dist[cat], 
                                   reverse=True)
    n_secondary_metadata_cats = data_df[secondary_metadata].unique().size
    secondary_metadata_dist = collections.Counter(data_df[secondary_metadata].tolist())
    secondary_metadata_cats = sorted(secondary_metadata_dist.keys(), key=lambda cat: secondary_metadata_dist[cat], 
                                   reverse=True)
    metadata_x_metadata_arr = np.zeros((n_primary_metadata_cats, n_secondary_metadata_cats), dtype=int)
    for (primary_metadata_cat, secondary_metadata_cat), df in data_df.groupby([primary_metadata, secondary_metadata]):
        i = primary_metadata_cats.index(primary_metadata_cat)
        j = secondary_metadata_cats.index(secondary_metadata_cat)
        metadata_x_metadata_arr[i, j] = len(df)
    metadata_x_metadata_arr = 100 * metadata_x_metadata_arr / metadata_x_metadata_arr.sum(axis=1, keepdims=True)
    bar_width = 0.8
    bottom = np.zeros(n_primary_metadata_cats, dtype=float)
    x = np.arange(n_primary_metadata_cats)
    plt.figure(figsize=figsize)
    for j, secondary_metadata_cat in enumerate(secondary_metadata_cats):
        height = metadata_x_metadata_arr[:, j]
        plt.bar(x, height, width=bar_width, bottom=bottom, label=secondary_metadata_cat, edgecolor="black", lw=2)
        bottom += height
    plt.legend(bbox_to_anchor=(1, 1))
    xrotation = 90 if n_primary_metadata_cats > 12 else 0
    plt.xticks(x, primary_metadata_cats, fontsize="small", rotation=xrotation)
    plt.xlabel(primary_metadata)
    plt.ylabel("PERCENTAGE OF ADS")
    plt.title(f"{secondary_metadata.upper()} DISTRIBUTION OF {primary_metadata.upper()} ADS")
    plt.savefig(f"plots/metadata_x_metadata/{primary_metadata}_x_{secondary_metadata}.png", bbox_inches="tight", 
                pad_inches=0, dpi=300)
    plt.close()

In [22]:
metadata_x_metadata_stacked(data_df, "brand", "product type")
metadata_x_metadata_stacked(data_df, "country", "product type")
metadata_x_metadata_stacked(data_df, "brand", "country", figsize=(22, 15))
metadata_x_metadata_stacked(data_df, "country", "brand", figsize=(22, 15))

## Create Data

In [25]:
# rp_data rows x gender x age x skin tone

gender_cats = ["male", "female"]
age_cats = [(0, 15), (15, 25), (25, 35), (35, 45), (45, 55), (55, 65), (65, 75)]
age_cats2 = ["<15y", "15y-25y", "25y-35y", "35y-45y", "45-75y"]
skintone_cats = [(-np.inf, 1.1), (1.1, 2.1), (2.1, 3.1), (3.1, 4.1), (4.1, 5.1),
                 (5.1, 6.1), (6.1, 7.1), (7.1, 8.1), (8.1, 9.1), (9.1, 10.1)]
skintone_cats2 = ["light", "medium", "dark"]

rp_data = np.zeros((len(data_df), len(gender_cats), len(age_cats), len(skintone_cats)), dtype=int)

for i, row in data_df.iterrows():
    for j, gender in enumerate(gender_cats):
        for k, age_bin in enumerate(age_cats):
            age_lb, age_ub = age_bin
            for l, skintone_bin in enumerate(skintone_cats):
                skintone_lb, skintone_ub = skintone_bin
                column = f"({gender}, [{age_lb}, {age_ub}), [{skintone_lb}, {skintone_ub}))"
                if column in row.index:
                    rp_data[i, j, k, l] += row[column]

rp_data2 = np.zeros((len(data_df), len(gender_cats), len(age_cats2), len(skintone_cats2)), dtype=int)
rp_data2[:, :, :4, 0] = rp_data[:, :, :4, :3].sum(axis=3)
rp_data2[:, :, :4, 1] = rp_data[:, :, :4, 3:7].sum(axis=3)
rp_data2[:, :, :4, 2] = rp_data[:, :, :4, 7:].sum(axis=3)
rp_data2[:, :, 4, 0] = rp_data[:, :, 4:, :3].sum(axis=(2,3))
rp_data2[:, :, 4, 1] = rp_data[:, :, 4:, 3:7].sum(axis=(2, 3))
rp_data2[:, :, 4, 2] = rp_data[:, :, 4:, 7:].sum(axis=(2,3))

## Demographic Identity vs Metadata

In [28]:
# one metadata vs one rp

def metadata_x_rp(df, rp_data, metadata, rp, confidence=0.05, condition=None):
    assert rp in ["gender", "age", "skin tone"]
    assert metadata in ["country", "brand", "product type", "product target gender"]
    filename = f"{rp}_x_{metadata}"
    if condition is not None:
        filename += f"_{condition}"
    metadata_cat_to_count = collections.Counter(df[metadata])
    metadata_cats = [metadata_cat for metadata_cat, count in metadata_cat_to_count.items() if count >= 30]
    removed_metadata_cats = [x for x in df[metadata].unique() if x not in metadata_cats]
    if len(metadata_cats) <= 1:
        return
    if condition is not None:
        print(condition)
    fw = open(f"text/rp_x_metadata/{filename}.txt", "w")
    title = f"{rp} vs {metadata}"
    if condition is not None:
        title += f" {condition}"
    print(title)
    print(f"{metadata} categories : {metadata_cats}")
    fw.write(f"{title}\n")
    fw.write(f"{metadata} cats = {metadata_cats}\n")
    print(f"removed categories : {removed_metadata_cats}")
    fw.write(f"removed {metadata} cats = {removed_metadata_cats}\n")
    if rp == "gender":
        rp_cats = gender_cats
    elif rp == "age":
        rp_cats = age_cats2
    else:
        rp_cats = skintone_cats2
    rp_arr = np.zeros((df[metadata].isin(metadata_cats).sum(), len(rp_cats)))
    metadata_arr = []

    j = 0
    for i, row in df.iterrows():
        metadata_value = row[metadata]
        if metadata_value in metadata_cats:
            metadata_arr.append(metadata_value)
            if rp == "gender":
                rp_arr[j] = 100 * rp_data[i].sum(axis=(1, 2))/rp_data[i].sum()
            elif rp == "age":
                rp_arr[j] = 100 * rp_data[i].sum(axis=(0, 2))/rp_data[i].sum()
            else:
                rp_arr[j] = 100 * rp_data[i].sum(axis=(0, 1))/rp_data[i].sum()
            j += 1
    metadata_arr = np.array(metadata_arr)

    most_common_metadata_cat = rp_arr.sum(axis=0).argmax()
    metadata_cats = sorted(metadata_cats, key=lambda x: rp_arr[:, most_common_metadata_cat][metadata_arr == x].mean(), 
                           reverse=True)
    
    plt.figure(figsize=(22, 10))
    x = np.arange(len(metadata_cats))
    group_width = 0.8
    bar_width = group_width/len(rp_cats)
    for i, rp_cat in enumerate(rp_cats):
        rp_cat_arr = rp_arr[:, i]
        y = [rp_cat_arr[metadata_arr == metadata_cat].mean() for metadata_cat in metadata_cats]
        bc = plt.bar(x + i * bar_width, y, width=bar_width, label=str(rp_cat), edgecolor="black", linewidth=2)
        plt.bar_label(bc, [f"{yy:.1f}%" for yy in y], padding=5, rotation=90, fontsize="small")
    bottom, top = plt.ylim()
    plt.ylim(bottom, top + 10)
    plt.ylabel("AVERAGE SCREEN TIME")
    xrotation = 90 if len(x) > 12 else 0
    plt.xticks(x + bar_width*(len(rp_cats) - 1)/2, metadata_cats, ha="center", rotation=xrotation)
    plt.yticks(fontsize="small")
    plt.xlabel(metadata.upper())
    condition_title = " (" + condition + ")" if condition is not None else ""
    plt.title(f"DISTRIBUTION OF {rp.upper()} SCREEN TIME ACROSS {plural(metadata).upper()}{condition_title}")
    plt.legend()
    plt.savefig(f"plots/rp_x_metadata/{filename}.png", bbox_inches="tight", pad_inches=0)
    plt.close()

    for i, rp_cat in enumerate(rp_cats):
        rp_cat_arr = rp_arr[:, i]
        metadata_to_rp_arr = {metadata_cat:rp_cat_arr[metadata_arr == metadata_cat] for metadata_cat in metadata_cats}
        metadata_sorted_by_mean = sorted(metadata_cats, key=lambda x: metadata_to_rp_arr[x].mean(), reverse=True)
        pvalues = []
        for j in range(len(metadata_cats) - 1):
            cat1, cat2 = metadata_sorted_by_mean[j], metadata_sorted_by_mean[j + 1]
            arr1, arr2 = metadata_to_rp_arr[cat1], metadata_to_rp_arr[cat2]
            _, pvalue = scipy.stats.ttest_ind(arr1, arr2, equal_var=False, alternative="greater")
            pvalues.append(pvalue)
        significant = np.array(pvalues)/(len(metadata_cats) - 1) < confidence
        print(f"{rp_cat} : ", end="")
        fw.write(f"{rp_cat}\n")
        for j in range(len(metadata_cats) - 1):
            cat = metadata_sorted_by_mean[j]
            sign = ">" if significant[j] else "="
            print(f"{cat} {sign} ", end="")
            fw.write(f"{cat} {sign} ")
        print(metadata_sorted_by_mean[-1])
        fw.write(f"{metadata_sorted_by_mean[-1]}\n")
    print()
    fw.write("\n")
    fw.close()

In [29]:
metadata_vars = ["country", "brand", "product type", "product target gender"]
rp_vars = ["gender", "age", "skin tone"]

for metadata in metadata_vars:
    for rp in rp_vars:
        metadata_x_rp(data_df, rp_data2, metadata, rp)

gender vs country
country categories : ['Mexico', 'Brazil', 'USA', 'France', 'Germany', 'Italy', 'Spain', 'UK', 'India', 'Indonesia']
removed categories : ['Canada']
male : India > UK > France > Germany > USA > Italy > Brazil > Mexico = Spain > Indonesia
female : Indonesia > Spain = Mexico > Brazil > Italy > USA > Germany > France > UK > India

age vs country
country categories : ['Mexico', 'Brazil', 'USA', 'France', 'Germany', 'Italy', 'Spain', 'UK', 'India', 'Indonesia']
removed categories : ['Canada']
<15y : Germany > USA > UK = Italy > Spain > France > Indonesia > Brazil > Mexico > India
15y-25y : Brazil > USA > Germany > France > UK > Italy > Indonesia = Spain > Mexico > India
25y-35y : India > Indonesia > Mexico > Spain > Italy > France > UK > Germany = Brazil > USA
35y-45y : Mexico > India > UK > Spain > France > Germany > Italy = USA > Brazil > Indonesia
45-75y : Germany > UK = France > Spain = Italy > Mexico > USA > India > Brazil > Indonesia

skin tone vs country
country cate

## Demographic Identity Diversity vs Metadata

In [8]:
# metadata vs rp var

def metadata_x_rp_std(df, rp_data, metadata, rp, confidence=0.05):
    assert rp in ["gender", "age", "skin tone"]
    assert metadata in ["country", "advertiser", "product type", "target gender"]
    if metadata == "country":
        column = "Country"
    elif metadata == "advertiser":
        column = "ProductAdvertiser"
    elif metadata == "product type":
        column = "brand_category"
    else:
        column = "targetgender"
    metadata_cat_to_count = collections.Counter(df[column])
    metadata_cats = [metadata_cat for metadata_cat, count in metadata_cat_to_count.items() if count >= 30]
    metadata_cats = sorted(metadata_cats, key=lambda x: metadata_cat_to_count[x], reverse=True)
    removed_metadata_cats = [x for x in df[column].unique() if x not in metadata_cats]
    print(f"{metadata} : {metadata_cats}")
    print(f"removed columns : {removed_metadata_cats}")
    if rp == "gender":
        rp_cats = gender_cats
    elif rp == "age":
        rp_cats = age_cats2
    else:
        rp_cats = skintone_cats2
    rp_arr = np.zeros((df[column].isin(metadata_cats).sum(), len(rp_cats)))
    metadata_arr = []

    j = 0
    for i, row in df.iterrows():
        metadata_value = row[column]
        if metadata_value in metadata_cats:
            metadata_arr.append(metadata_value)
            if rp == "gender":
                rp_arr[j] = rp_data[i].sum(axis=(1, 2))
            elif rp == "age":
                rp_arr[j] = rp_data[i].sum(axis=(0, 2))
            else:
                rp_arr[j] = rp_data[i].sum(axis=(0, 1))
            j += 1
    metadata_arr = np.array(metadata_arr)

    plt.figure(figsize=(25, 10))
    colors = plt.get_cmap("Paired").colors
    x = np.arange(len(metadata_cats))
    bar_width = 0.8
    rv = 2 * np.arange(len(rp_cats)).reshape(1, -1)/(len(rp_cats) - 1)
    n = rp_arr.sum(axis=1)
    mean = (rp_arr * rv).sum(axis=1)/n
    var = ((rp_arr * rv**2).sum(axis=1) - n * mean**2)/(n - 1 + 1e-8)
    stddev = np.sqrt(var)
    y = [stddev[metadata_arr == metadata_cat].mean() for metadata_cat in metadata_cats]
    metadata_cats = np.array(metadata_cats)
    metadata_cats = metadata_cats[np.argsort(y)]
    y = 100*np.sort(y)
    bc = plt.bar(x, y, width=bar_width, color=colors[2])
    plt.bar_label(bc, [f"{yy:.1f}" for yy in y], padding=5)
    bottom, top = plt.ylim()
    plt.ylim(bottom, top + 10)
    plt.ylabel(f"100 * AVERAGE {rp.upper()} STDDEV")
    plt.xticks(x, metadata_cats, fontsize="small")
    plt.xlabel(metadata)
    plt.title(f"DISTRIBUTION OF AVERAGE {rp.upper()} STDDEV OVER {metadata.upper()}")
    plt.savefig(f"plots/{rp}_diversity_x_{metadata}.png")
    plt.close()

    metadata_to_rp_arr = {metadata_cat:stddev[metadata_arr == metadata_cat] for metadata_cat in metadata_cats}
    metadata_sorted_by_mean = sorted(metadata_cats, key=lambda x: metadata_to_rp_arr[x].mean())
    pvalues = []
    for j in range(len(metadata_cats) - 1):
        cat1, cat2 = metadata_sorted_by_mean[j], metadata_sorted_by_mean[j + 1]
        arr1, arr2 = metadata_to_rp_arr[cat1], metadata_to_rp_arr[cat2]
        _, pvalue = scipy.stats.ttest_ind(arr1, arr2, equal_var=False, alternative="less")
        pvalues.append(pvalue)
    significant = np.array(pvalues)/(len(metadata_cats) - 1) < confidence
    for j in range(len(metadata_cats) - 1):
        cat = metadata_sorted_by_mean[j]
        sign = "<" if significant[j] else "="
        print(f"{cat} {sign} ", end="")
    print(metadata_sorted_by_mean[-1])
    print()

In [9]:
metadata_vars = ["country", "advertiser", "product type", "target gender"]
rp_vars = ["gender", "age", "skin tone"]

for metadata in metadata_vars:
    for rp in rp_vars:
        metadata_x_rp_std(data_df, rp_data2, metadata, rp)

country : ['Germany', 'UK', 'France', 'Italy', 'USA', 'Spain', 'Brazil', 'India', 'Mexico', 'Indonesia']
removed columns : ['Canada']
Italy < Brazil < Spain < UK < Germany < France = Mexico < USA < India = Indonesia

country : ['Germany', 'UK', 'France', 'Italy', 'USA', 'Spain', 'Brazil', 'India', 'Mexico', 'Indonesia']
removed columns : ['Canada']
USA < Italy < Germany < UK < France < Spain < Brazil < India < Indonesia < Mexico

country : ['Germany', 'UK', 'France', 'Italy', 'USA', 'Spain', 'Brazil', 'India', 'Mexico', 'Indonesia']
removed columns : ['Canada']
India = Mexico < Indonesia < Germany < Spain < Italy < France < Brazil < USA < UK

advertiser : ['GARNIER', 'OAP', 'LRP', 'YSL', 'VICHY', 'LANCOME', 'KIEHLS', 'MNY', 'MAYBELLINE', 'ARMANI', 'KERASTASE', 'NYX', 'CERAVE', 'LOP', 'PRADA', 'BIOTHERM', 'REDKEN']
removed columns : []
REDKEN < PRADA < OAP < MNY < LANCOME < VICHY < MAYBELLINE < KERASTASE < BIOTHERM < CERAVE < GARNIER < ARMANI < LOP < KIEHLS < NYX < LRP < YSL

advertiser

## Demographic Identity vs Metadata Pair

In [14]:
metadata_columns = ["Country", "ProductAdvertiser", "brand_category", "targetgender"]

for i in range(len(metadata_columns)):
    for j in range(i + 1, len(metadata_columns)):
        col1 = metadata_columns[i]
        col2 = metadata_columns[j]
        gdf = data_df.groupby([col1, col2]).count()["Creative ID"].reset_index().rename(columns={"Creative ID": "n"})
        print(f"{col1} x {col2}")
        n = (gdf.n >= 30).sum()
        gdf1 = gdf[gdf.n >= 30].groupby(col1).count()["n"].reset_index()
        n1 = (gdf1.n > 1).sum()
        cats1 = gdf1.loc[gdf1.n > 1, col1].values
        gdf2 = gdf[gdf.n >= 30].groupby(col2).count()["n"].reset_index()
        n2 = (gdf2.n > 1).sum()
        cats2 = gdf2.loc[gdf2.n > 1, col2].values
        print(f"{n} ({col1}, {col2}) pairs have atleast 30 samples")
        print(f"{n1} {col1}s have atleast 2 {col2}s with atleast 30 samples: {cats1}")
        print(f"{n2} {col2}s have atleast 2 {col1}s with atleast 30 samples: {cats2}")
        print()

Country x ProductAdvertiser
52 (Country, ProductAdvertiser) pairs have atleast 30 samples
6 Countrys have atleast 2 ProductAdvertisers with atleast 30 samples: ['France' 'Germany' 'Italy' 'Spain' 'UK' 'USA']
13 ProductAdvertisers have atleast 2 Countrys with atleast 30 samples: ['ARMANI' 'CERAVE' 'GARNIER' 'KERASTASE' 'KIEHLS' 'LANCOME' 'LOP' 'LRP'
 'MAYBELLINE' 'NYX' 'OAP' 'VICHY' 'YSL']

Country x brand_category
26 (Country, brand_category) pairs have atleast 30 samples
6 Countrys have atleast 2 brand_categorys with atleast 30 samples: ['France' 'Germany' 'Italy' 'Spain' 'UK' 'USA']
4 brand_categorys have atleast 2 Countrys with atleast 30 samples: ['fragrance' 'haircare' 'makeup' 'skincare']

Country x targetgender
20 (Country, targetgender) pairs have atleast 30 samples
6 Countrys have atleast 2 targetgenders with atleast 30 samples: ['France' 'Germany' 'Italy' 'Spain' 'UK' 'USA']
4 targetgenders have atleast 2 Countrys with atleast 30 samples: ['both' 'female' 'male' 'unspecified'

In [29]:
def metadata2_x_rp(df, rp_data, metadata1, metadata2, rp, confidence=0.05):
    assert rp in ["gender", "age", "skin tone"]
    assert metadata1 in ["country", "advertiser", "product type", "target gender"]
    assert metadata2 in ["country", "advertiser", "product type", "target gender"]
    if metadata1 == "country":
        column1 = "Country"
    elif metadata1 == "advertiser":
        column1 = "ProductAdvertiser"
    elif metadata1 == "product type":
        column1 = "brand_category"
    else:
        column1 = "targetgender"
    if metadata2 == "country":
        column2 = "Country"
    elif metadata2 == "advertiser":
        column2 = "ProductAdvertiser"
    elif metadata2 == "product type":
        column2 = "brand_category"
    else:
        column2 = "targetgender"
    metadata1_cats = df[column1].unique()
    metadata2_cats = df[column2].unique()
    
    for metadata1_cat in metadata1_cats:
        index = df[column1] == metadata1_cat
        fdf = df[index].copy()
        fdf.index = pd.RangeIndex(0, len(fdf))
        metadata1_in_condition = "product brand" if metadata1 == "advertiser" else metadata1
        metadata_x_rp(fdf, rp_data[index], metadata=metadata2, rp=rp, confidence=confidence, 
                      condition=f"{metadata1_in_condition}={metadata1_cat}")
    
    for metadata2_cat in metadata2_cats:
        index = df[column2] == metadata2_cat
        fdf = df[index].copy()
        fdf.index = pd.RangeIndex(0, len(fdf))
        metadata2_in_condition = "product brand" if metadata2 == "advertiser" else metadata2
        metadata_x_rp(fdf, rp_data[index], metadata=metadata1, rp=rp, confidence=confidence, 
                      condition=f"{metadata2_in_condition}={metadata2_cat}")

In [32]:
metadata_vars = ["country", "advertiser", "product type", "target gender"]
rp_vars = ["gender", "age", "skin tone"]

for i in range(len(metadata_vars)):
    for j in range(i + 1, len(metadata_vars)):
        for rp in rp_vars:
            metadata2_x_rp(data_df, rp_data2, metadata_vars[i], metadata_vars[j], rp)

country=USA
gender vs advertiser country=USA
advertiser categories : ['YSL', 'OAP', 'GARNIER', 'MAYBELLINE', 'CERAVE', 'ARMANI', 'NYX']
removed categories : ['KIEHLS', 'LANCOME', 'LRP', 'PRADA', 'REDKEN', 'VICHY']
male : CERAVE = YSL > OAP > ARMANI = MAYBELLINE > GARNIER > NYX
female : NYX > GARNIER > MAYBELLINE = ARMANI > OAP > YSL = CERAVE

country=France
gender vs advertiser country=France
advertiser categories : ['LANCOME', 'OAP', 'LRP', 'GARNIER', 'VICHY', 'MAYBELLINE', 'KIEHLS']
removed categories : ['ARMANI', 'BIOTHERM', 'CERAVE', 'KERASTASE', 'LOP', 'NYX', 'PRADA', 'YSL']
male : KIEHLS > LRP = VICHY > GARNIER > LANCOME > OAP > MAYBELLINE
female : MAYBELLINE > OAP > LANCOME > GARNIER > VICHY = LRP > KIEHLS

country=Germany
gender vs advertiser country=Germany
advertiser categories : ['GARNIER', 'MNY', 'LRP', 'VICHY', 'KERASTASE', 'KIEHLS', 'NYX', 'CERAVE', 'LANCOME', 'YSL', 'ARMANI', 'BIOTHERM', 'LOP']
removed categories : ['PRADA', 'REDKEN']
male : VICHY > LRP > ARMANI > NYX > 

In [25]:
print((data_df.groupby("Filename").count()["Creative ID"] > 1).sum())
print((data_df.groupby("Creative ID").count()["Filename"] > 1).sum())
print((data_df.groupby("Description").count()["Filename"] > 1).sum())
print((data_df.groupby(["Channel","frames_analyzed"], dropna=True).count()["Filename"] > 1).sum())

0
12
708
143


In [26]:
(data_df.groupby("frames_analyzed").count()["Filename"] > 1).sum()

261

In [29]:
gdf = data_df.groupby(["Country", "ProductAdvertiser", "brand_category", "targetgender"]).count()
(gdf["Filename"] >= 30).sum()

65

In [11]:
id_df = data_df[["Creative ID", "Description", "frames_analyzed", "Channel"]].copy()
id_df["sl"] = np.arange(len(id_df))
id_df.columns = ["creative_id", "description", "frames", "channel", "sl"]
id_df = id_df[["sl", "creative_id", "description", "frames", "channel"]]
id_df

Unnamed: 0,sl,creative_id,description,frames,channel
0,0,598023314,Fragrance bottle. American actress Adria Arjon...,450,Discovery Channel
1,1,593974614,Men doing pushup. Product demonstration. resul...,602,13 Azteca
2,2,593974914,A man. Product. Computer demonstration.,302,13 Azteca
3,3,188023914,Woman with the product. Close up of the produc...,302,13 Azteca
4,4,138536114,A woman showing 3 fingers. 3 actions. Three wo...,602,7 Azteca
...,...,...,...,...,...
7643,7643,1761423105,Taapsee Pannu - actress. Nourishment of three ...,599,Ramdhenu
7644,7644,116492614,Woman says Hyaluron Acid\'s hydrating power . ...,744,Sony Pal
7645,7645,1778171205,Aishwarya Rai Bachchan (actress). Total Repair...,596,Colors Rishtey
7646,7646,1775317405,Aishwarya Rai Bachchan (actress). Total Repair...,595,PTC Punjabi


In [14]:
cid_df = id_df.join(id_df, how="cross", lsuffix="1", rsuffix="2")
cid_df = cid_df[cid_df["sl1"] < cid_df["sl2"]]
cid_df

Unnamed: 0,sl1,creative_id1,description1,frames1,channel1,sl2,creative_id2,description2,frames2,channel2
1,0,598023314,Fragrance bottle. American actress Adria Arjon...,450,Discovery Channel,1,593974614,Men doing pushup. Product demonstration. resul...,602,13 Azteca
2,0,598023314,Fragrance bottle. American actress Adria Arjon...,450,Discovery Channel,2,593974914,A man. Product. Computer demonstration.,302,13 Azteca
3,0,598023314,Fragrance bottle. American actress Adria Arjon...,450,Discovery Channel,3,188023914,Woman with the product. Close up of the produc...,302,13 Azteca
4,0,598023314,Fragrance bottle. American actress Adria Arjon...,450,Discovery Channel,4,138536114,A woman showing 3 fingers. 3 actions. Three wo...,602,7 Azteca
5,0,598023314,Fragrance bottle. American actress Adria Arjon...,450,Discovery Channel,5,49036214,Silvia Navarro and Sofia Reyes telling why the...,601,5 Televisa
...,...,...,...,...,...,...,...,...,...,...
58468958,7644,116492614,Woman says Hyaluron Acid\'s hydrating power . ...,744,Sony Pal,7646,1775317405,Aishwarya Rai Bachchan (actress). Total Repair...,595,PTC Punjabi
58468959,7644,116492614,Woman says Hyaluron Acid\'s hydrating power . ...,744,Sony Pal,7647,1761419405,No ammonia. Fashion model dressed in pink show...,599,Zee TV HD
58476606,7645,1778171205,Aishwarya Rai Bachchan (actress). Total Repair...,596,Colors Rishtey,7646,1775317405,Aishwarya Rai Bachchan (actress). Total Repair...,595,PTC Punjabi
58476607,7645,1778171205,Aishwarya Rai Bachchan (actress). Total Repair...,596,Colors Rishtey,7647,1761419405,No ammonia. Fashion model dressed in pink show...,599,Zee TV HD


In [22]:
cid_df.dtypes

sl1              int64
creative_id1     int64
description1    object
frames1          int64
channel1        object
sl2              int64
creative_id2     int64
description2    object
frames2          int64
channel2        object
dtype: object

In [37]:
cid_df["creative_id_match"] = (cid_df["creative_id1"].notna() & cid_df["creative_id2"].notna() 
                               & (cid_df["creative_id1"] == cid_df["creative_id2"]))
cid_df["description_match"] = (cid_df["description1"].notna() 
                               & ~(cid_df["description1"].str.strip().str.lower().isin(["", "not available"]))
                               & cid_df["description2"].notna() 
                               & ~(cid_df["description2"].str.strip().str.lower().isin(["", "not available"])) 
                               & (cid_df["description1"].str.strip().str.lower() 
                                  == cid_df["description2"].str.strip().str.lower()))
cid_df["frames_match"] = (cid_df["frames1"].notna() & cid_df["frames2"].notna() 
                          & (cid_df["frames1"] == cid_df["frames2"]))
cid_df["channel_match"] = (cid_df["channel1"].notna() & (cid_df["channel1"].str.strip() != "") 
                           & cid_df["channel2"].notna() & (cid_df["channel2"].str.strip() != "") 
                           & (cid_df["channel1"].str.strip().str.lower() 
                              == cid_df["channel2"].str.strip().str.lower()))
print("creative ids match", cid_df["creative_id_match"].sum(), "times")
print("descriptions match", cid_df["description_match"].sum(), "times")
print("frames match", cid_df["frames_match"].sum(), "times")
print("channels match", cid_df["channel_match"].sum(), "times")

creative ids match 12 times
descriptions match 111097 times
frames match 5987297 times
channels match 4708 times


In [45]:
(data_df["Channel"] == "NaN").sum()

0

In [38]:
print("if creative ids match then descriptions match? ", 
      (cid_df["creative_id_match"] & ~cid_df["description_match"]).sum() == 0)

if creative ids match then descriptions match?  True


In [39]:
print("if descriptions match then frames match? ", (cid_df["description_match"] & ~cid_df["frames_match"]).sum() == 0)

if descriptions match then frames match?  False


In [41]:
(cid_df["description_match"] & cid_df["channel_match"] & ~cid_df["frames_match"]).sum()

39

In [40]:
cid_df[cid_df["description_match"] & ~cid_df["frames_match"]]

Unnamed: 0,sl1,creative_id1,description1,frames1,channel1,sl2,creative_id2,description2,frames2,channel2,creative_id_match,description_match,frames_match,channel_match
333620,43,721813914,Singer Dua Lipa on a bridge setting an eagle f...,602,2 Televisa,4756,654235014,Singer Dua Lipa on a bridge setting an eagle f...,600,SKY CINEMA BEST OF,False,True,False,False
512484,67,1813725205,A woman in the shower applying the product.,447,MTV,68,1813725805,A woman in the shower applying the product.,450,Bandeirantes,False,True,False,False
764903,100,946386614,LOREAL USA INC; ARMANI BEAUTY FRAGRANCES; COSM...,187,,103,947392314,LOREAL USA INC; ARMANI BEAUTY FRAGRANCES; COSM...,395,,False,True,False,False
764904,100,946386614,LOREAL USA INC; ARMANI BEAUTY FRAGRANCES; COSM...,187,,104,947397314,LOREAL USA INC; ARMANI BEAUTY FRAGRANCES; COSM...,1993,,False,True,False,False
764905,100,946386614,LOREAL USA INC; ARMANI BEAUTY FRAGRANCES; COSM...,187,,105,947441114,LOREAL USA INC; ARMANI BEAUTY FRAGRANCES; COSM...,180,,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58392474,7634,1814277105,Taapsee Pannu - actress. Nourishment of three ...,595,Balle Balle,7642,1773900705,Taapsee Pannu - actress. Nourishment of three ...,596,Ramdhenu,False,True,False,False
58392475,7634,1814277105,Taapsee Pannu - actress. Nourishment of three ...,595,Balle Balle,7643,1761423105,Taapsee Pannu - actress. Nourishment of three ...,599,Ramdhenu,False,True,False,False
58446010,7641,1769299505,Taapsee Pannu - actress. Nourishment of three ...,599,Ramdhenu,7642,1773900705,Taapsee Pannu - actress. Nourishment of three ...,596,Ramdhenu,False,True,False,True
58453659,7642,1773900705,Taapsee Pannu - actress. Nourishment of three ...,596,Ramdhenu,7643,1761423105,Taapsee Pannu - actress. Nourishment of three ...,599,Ramdhenu,False,True,False,True


In [42]:
cid_df[cid_df["description_match"] & cid_df["channel_match"] & ~cid_df["frames_match"]]

Unnamed: 0,sl1,creative_id1,description1,frames1,channel1,sl2,creative_id2,description2,frames2,channel2,creative_id_match,description_match,frames_match,channel_match
6180393,808,146842914,Actress Cate Blanchett diving out of a plane a...,240,Canal +,809,141533814,Actress Cate Blanchett diving out of a plane a...,241,Canal +,False,True,False,True
6249234,817,264427408,Australian actress Cate Blanchett wearing diff...,451,TF1 Series Film,818,264424808,Australian actress Cate Blanchett wearing diff...,299,TF1 Series Film,False,True,False,True
6325764,827,747186614,Satisfied customer talks about product that co...,600,6ter,868,264504108,Satisfied customer talks about product that co...,447,6ter,False,True,False,True
7748439,1013,141452514,Actress Julia Roberts out with friends in Pari...,600,TF1 Series Film,1015,141418114,Actress Julia Roberts out with friends in Pari...,900,TF1 Series Film,False,True,False,True
10785105,1410,264209708,Program against street harassment.,2527,M6,1425,558396514,Program against street harassment.,2507,M6,False,True,False,True
12291944,1607,264627908,Eagle flying over bridge. Singer Dua Lipa walk...,599,C News,1608,264622808,Eagle flying over bridge. Singer Dua Lipa walk...,299,C News,False,True,False,True
12483169,1632,616113614,Actress Cate Blanchett on a red Ferris wheel. ...,452,SAT1,1633,616124914,Actress Cate Blanchett on a red Ferris wheel. ...,302,SAT1,False,True,False,True
12911515,1688,197154214,Close-ups. Woman under water touching her skin...,680,VOX,1691,124408214,Close-ups. Woman under water touching her skin...,602,VOX,False,True,False,True
18472336,2415,58202414,Close-up. Woman using this face moisturiser. W...,693,VOX,2416,54261714,Close-up. Woman using this face moisturiser. W...,602,VOX,False,True,False,True
18870085,2467,1676174205,Close-up. Woman washing her hair in shower. Pr...,693,VOX,2469,1662776905,Close-up. Woman washing her hair in shower. Pr...,602,VOX,False,True,False,True


In [43]:
(cid_df["description_match"] & cid_df["frames_match"] & ~cid_df["channel_match"]).sum()

73042

In [46]:
cid_df[cid_df["description_match"] & cid_df["frames_match"] & ~cid_df["channel_match"] & cid_df["channel1"].notna() & cid_df["channel2"].notna()]

Unnamed: 0,sl1,creative_id1,description1,frames1,channel1,sl2,creative_id2,description2,frames2,channel2,creative_id_match,description_match,frames_match,channel_match
6219498,813,264739208,Black &amp; white. Model Aleksandar Rusic jump...,300,L\'equipe,1674,1883966305,Black &amp; white. Model Aleksandar Rusic jump...,300,SKY COMEDY,False,True,True,False
6325736,827,747186614,Satisfied customer talks about product that co...,600,6ter,840,95858814,Satisfied customer talks about product that co...,600,CStar,False,True,True,False
6325737,827,747186614,Satisfied customer talks about product that co...,600,6ter,841,95859814,Satisfied customer talks about product that co...,600,CStar,False,True,True,False
6417534,839,356035014,Women using products rich in vitamin C.,600,6ter,862,264589708,Women using products rich in vitamin C.,600,CStar,False,True,True,False
6601092,863,264511608,Satisfied customer talks about product that co...,447,Cherie 25,868,264504108,Satisfied customer talks about product that co...,447,6ter,False,True,True,False
11500500,1503,95859414,Actress Emma Watson looking at various screens...,900,C News,5556,152666114,Actress Emma Watson looking at various screens...,900,R101 TV,False,True,True,False
12582607,1645,362418514,Bottle. Clock going backwards. British actor R...,902,WELT,1647,361324514,Bottle. Clock going backwards. British actor R...,902,SKY CINEMA FUN,False,True,True,False
12850321,1680,1694904305,Actress Cate Blanchett wearing different red o...,452,RTL TELEVISION TELEVISION,1681,1694923905,Actress Cate Blanchett wearing different red o...,452,SAT1,False,True,True,False
25480682,3331,1809731305,Actress Amanda Seyfried raising question. Spli...,452,VOX,5194,321413914,Actress Amanda Seyfried raising question. Spli...,452,TOP CRIME,False,True,True,False
36493380,4771,361346614,Singer Dua Lipa on a bridge setting an eagle f...,902,RTL TELEVISION TELEVISION,4772,361326614,Singer Dua Lipa on a bridge setting an eagle f...,902,SIXX TELEVISION,False,True,True,False


In [47]:
cid_df[cid_df["frames_match"] & cid_df["channel_match"] & cid_df["channel1"].notna() & cid_df["channel2"].notna() & ~cid_df["description_match"]]

Unnamed: 0,sl1,creative_id1,description1,frames1,channel1,sl2,creative_id2,description2,frames2,channel2,creative_id_match,description_match,frames_match,channel_match
7660,1,593974614,Men doing pushup. Product demonstration. resul...,602,13 Azteca,12,1957265005,Serum in a bottle. Vitamin C. Woman applies th...,602,13 Azteca,False,False,True,True
7663,1,593974614,Men doing pushup. Product demonstration. resul...,602,13 Azteca,15,1714491605,Two men working out in a gym. They get a messa...,602,13 Azteca,False,False,True,True
7676,1,593974614,Men doing pushup. Product demonstration. resul...,602,13 Azteca,28,1661493705,Macarena Garcia (Mexican actress) using the pr...,602,13 Azteca,False,False,True,True
7679,1,593974614,Men doing pushup. Product demonstration. resul...,602,13 Azteca,31,458070714,Split screen. Different women. Woman Using the...,602,13 Azteca,False,False,True,True
7686,1,593974614,Men doing pushup. Product demonstration. resul...,602,13 Azteca,38,1678362005,A woman talking about the product. Close up of...,602,13 Azteca,False,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58331275,7626,2033744505,3 women showcasing their long hair with differ...,899,Goldmines Bollywood,7627,1987064905,Three women showing off their different hair c...,899,Goldmines Bollywood,False,False,True,True
58338924,7627,1987064905,Three women showing off their different hair c...,899,Goldmines Bollywood,7628,2037996705,3 women showcasing their long hair with differ...,899,Goldmines Bollywood,False,False,True,True
58338925,7627,1987064905,Three women showing off their different hair c...,899,Goldmines Bollywood,7629,2004625205,3 women showcasing their long hair with differ...,899,Goldmines Bollywood,False,False,True,True
58400116,7635,1799877605,John Abraham (actor) suggests this hair colour...,444,Raj TV,7636,1775317805,John Abraham (actor). Quick and easy like sham...,444,Raj TV,False,False,True,True
