In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
original_stereo = pd.read_csv('../predoc_info/predoc_toys_w_gender_label.csv')
original_items = original_stereo['item'].unique()

In [3]:
original_stereo

Unnamed: 0,Gender,item
0,BOY,vehicle toys
1,BOY,sport
2,BOY,military toys
3,BOY,race cars
4,BOY,outer space toys
...,...,...
170,NEUTRAL,scooter
171,NEUTRAL,drum set
172,NEUTRAL,puzzles
173,NEUTRAL,board games


In [4]:
azuk = pd.read_csv('../az_uk_query_suggestions.csv')
argos = pd.read_csv('../argos_query_suggestions.csv')
df = pd.concat([azuk, argos])
df

Unnamed: 0,platform,item,suggestions
0,Amazon_UK,vehicle toys,"['vehicle toys for toddlers 1-3', 'vehicle toy..."
1,Amazon_UK,sport,"['sport formula 99', 'sport formula 99 vitamin..."
2,Amazon_UK,military toys,"['military toys for boys age 8-12', 'military ..."
3,Amazon_UK,race cars,"['race cars for kids ages 3-5', 'race cars for..."
4,Amazon_UK,outer space toys,"['outer space toys for kids 5-7', 'outer space..."
...,...,...,...
161,Argos,scooter,"['scooter for adults', 'scooter for', 'lego ic..."
162,Argos,drum set,[]
163,Argos,puzzles,[]
164,Argos,board games,"['board game for kids', 'family board game for..."


In [5]:
stereo_toys = pd.read_csv('../predoc_info/predoc_stereotyped_items.csv', delimiter =',')
stereo_toys

Unnamed: 0,BOY,GIRL,NEUTRAL
0,vehicle toys,doll,toy animals
1,sport,domestic toys,books
2,military toys,educational art,educational teaching
3,race cars,clothes,musical games
4,outer space toys,dollhouses,games
...,...,...,...
67,toy rocket,barbie furniture set,
68,soccer ball,,
69,blue ipad,pink ipad,
70,toy robots,,


In [6]:
set(df['item'].unique()).difference(set(original_stereo['item'].unique()))

{'baby dolls', 'barbie', 'playhouses', 'soldier toys', 'toy vehicles'}

In [7]:
total = df['item'].nunique()
total

166

In [8]:
#df.to_csv('query_sugg_all_plat.csv', index=False)

In [9]:
def gender_association_measure(suggestions):
    gender = {'girl':0, 'boy': 0}
    if len(suggestions)<=2:
        return "no suggestion"
    suggestion_list = suggestions.split(',')
    for suggestion in suggestion_list:
        if 'girl' in suggestion:
                gender['girl'] += 1
        if 'boy' in suggestion:
            gender['boy'] += 1
    return gender

In [10]:
df['gender_association'] = df['suggestions'].apply(lambda x: gender_association_measure(x))
df

Unnamed: 0,platform,item,suggestions,gender_association
0,Amazon_UK,vehicle toys,"['vehicle toys for toddlers 1-3', 'vehicle toy...","{'girl': 0, 'boy': 5}"
1,Amazon_UK,sport,"['sport formula 99', 'sport formula 99 vitamin...","{'girl': 0, 'boy': 0}"
2,Amazon_UK,military toys,"['military toys for boys age 8-12', 'military ...","{'girl': 0, 'boy': 7}"
3,Amazon_UK,race cars,"['race cars for kids ages 3-5', 'race cars for...","{'girl': 1, 'boy': 3}"
4,Amazon_UK,outer space toys,"['outer space toys for kids 5-7', 'outer space...","{'girl': 0, 'boy': 0}"
...,...,...,...,...
161,Argos,scooter,"['scooter for adults', 'scooter for', 'lego ic...","{'girl': 0, 'boy': 0}"
162,Argos,drum set,[],no suggestion
163,Argos,puzzles,[],no suggestion
164,Argos,board games,"['board game for kids', 'family board game for...","{'girl': 0, 'boy': 0}"


In [11]:
def gender_annotate(gender_assoc_dict):
    try:
        if gender_assoc_dict['girl'] > gender_assoc_dict['boy']:
            return 'girl'
        elif gender_assoc_dict['girl'] == gender_assoc_dict['boy']:
            return 'neutral'
        else:
            return 'boy'
    except:
        return

In [12]:
df['gender'] = df['gender_association'].apply(lambda x: gender_annotate(x))
df

Unnamed: 0,platform,item,suggestions,gender_association,gender
0,Amazon_UK,vehicle toys,"['vehicle toys for toddlers 1-3', 'vehicle toy...","{'girl': 0, 'boy': 5}",boy
1,Amazon_UK,sport,"['sport formula 99', 'sport formula 99 vitamin...","{'girl': 0, 'boy': 0}",neutral
2,Amazon_UK,military toys,"['military toys for boys age 8-12', 'military ...","{'girl': 0, 'boy': 7}",boy
3,Amazon_UK,race cars,"['race cars for kids ages 3-5', 'race cars for...","{'girl': 1, 'boy': 3}",boy
4,Amazon_UK,outer space toys,"['outer space toys for kids 5-7', 'outer space...","{'girl': 0, 'boy': 0}",neutral
...,...,...,...,...,...
161,Argos,scooter,"['scooter for adults', 'scooter for', 'lego ic...","{'girl': 0, 'boy': 0}",neutral
162,Argos,drum set,[],no suggestion,
163,Argos,puzzles,[],no suggestion,
164,Argos,board games,"['board game for kids', 'family board game for...","{'girl': 0, 'boy': 0}",neutral


In [13]:
#queries with no gender association
df[df['gender'].isna()]

Unnamed: 0,platform,item,suggestions,gender_association,gender
7,Amazon_UK,doll-humanoid,[],no suggestion,
15,Amazon_UK,bug collection set,[],no suggestion,
19,Amazon_UK,volcano creator,[],no suggestion,
26,Amazon_UK,police station toy,[],no suggestion,
29,Amazon_UK,dragonballz,[],no suggestion,
...,...,...,...,...,...
158,Argos,wood blocks,[],no suggestion,
159,Argos,harry potter books,[],no suggestion,
162,Argos,drum set,[],no suggestion,
163,Argos,puzzles,[],no suggestion,


In [14]:
df.dropna(inplace = True)

In [15]:
#az_rand_num = az_rand.replace({'gender':{'boy':1, 'neutral':0, 'girl':-1}, 'Gender':{'BOY':1, 'NEUTRAL':0, 'GIRL':-1}})

### Match with actual list

## sys generated label

In [16]:
df.groupby('gender')['item'].count()

gender
boy        38
girl       51
neutral    81
Name: item, dtype: int64

In [17]:
rand_df = df[['platform', 'item','gender']]
rand_df = rand_df.reset_index(drop=True)

In [18]:
rand_df.groupby('gender')['item'].count()

gender
boy        38
girl       51
neutral    81
Name: item, dtype: int64

In [19]:
generated_list = rand_df.groupby(['platform', 'gender'])['item'].apply(list)
generated_list

platform   gender 
Amazon_UK  boy        [vehicle toys, military toys, race cars, actio...
           girl       [castle tent, dinosaur toy, sully costume, wwe...
           neutral    [sport, outer space toys, depots, machines, gi...
Argos      neutral    [sport, gears, helicopter, car toys, lego toys...
Name: item, dtype: object

In [20]:
list_item = generated_list.to_frame().reset_index()
list_item

Unnamed: 0,platform,gender,item
0,Amazon_UK,boy,"[vehicle toys, military toys, race cars, actio..."
1,Amazon_UK,girl,"[castle tent, dinosaur toy, sully costume, wwe..."
2,Amazon_UK,neutral,"[sport, outer space toys, depots, machines, gi..."
3,Argos,neutral,"[sport, gears, helicopter, car toys, lego toys..."


In [21]:
def match(row, gender_col, original_df):
    if row[gender_col] == 'boy':
        toys = list(original_df['BOY'].values)
    elif row[gender_col] == 'girl':
        toys = list(original_df['GIRL'].values)
    else:
        return
    union = len(row['item'])+len(toys)
    matched_items = set(row['item']).intersection(set(toys))
    return round(len(matched_items)/union,2)

In [22]:
list_item['count_match'] = list_item.apply(lambda x: match(x, 'gender',stereo_toys), axis = 1)
#list_df['frac_match'] = list_df['count_match']/list_df['total_item']
list_item

Unnamed: 0,platform,gender,item,count_match
0,Amazon_UK,boy,"[vehicle toys, military toys, race cars, actio...",0.26
1,Amazon_UK,girl,"[castle tent, dinosaur toy, sully costume, wwe...",0.34
2,Amazon_UK,neutral,"[sport, outer space toys, depots, machines, gi...",
3,Argos,neutral,"[sport, gears, helicopter, car toys, lego toys...",


## Randomize

In [23]:
def permutation(r_df, iteration, stereo_toys):
    #for i in range(10000):
    sys_label = r_df['gender'].values
    np.random.shuffle(sys_label)
    r_df['rand_gender'] = sys_label
    generated_list = r_df.groupby(['rand_gender'])['item'].apply(list)
    list_df = generated_list.to_frame().reset_index()
    list_df['jaccard_sim'] = list_df.apply(lambda x: match(x, 'rand_gender',stereo_toys), axis = 1)
    list_df['iteration'] = iteration
    list_df.drop(columns=['item'], inplace=True)
    list_df.dropna(inplace=True)
    return list_df

In [24]:
rand_df['platform']

0      Amazon_UK
1      Amazon_UK
2      Amazon_UK
3      Amazon_UK
4      Amazon_UK
         ...    
165        Argos
166        Argos
167        Argos
168        Argos
169        Argos
Name: platform, Length: 170, dtype: object

In [25]:
import warnings
warnings.filterwarnings('ignore')

result_pval = pd.DataFrame()
for i in range(100000):
    result = rand_df.groupby('platform')
    result = result.apply(lambda x: permutation(x, i, stereo_toys))
    result = result.reset_index('platform')
    result_pval = result_pval.append(result, ignore_index = True)

In [26]:
result_pval

Unnamed: 0,platform,rand_gender,jaccard_sim,iteration
0,Amazon_UK,boy,0.15,0
1,Amazon_UK,girl,0.18,0
2,Amazon_UK,boy,0.16,1
3,Amazon_UK,girl,0.14,1
4,Amazon_UK,boy,0.14,2
...,...,...,...,...
199995,Amazon_UK,girl,0.18,99997
199996,Amazon_UK,boy,0.15,99998
199997,Amazon_UK,girl,0.15,99998
199998,Amazon_UK,boy,0.18,99999


In [27]:
plot_pval = result_pval.groupby(['platform', 'rand_gender', 'jaccard_sim'])['iteration'].count()
plot_pval = plot_pval.reset_index()
plot_pval

Unnamed: 0,platform,rand_gender,jaccard_sim,iteration
0,Amazon_UK,boy,0.05,6
1,Amazon_UK,boy,0.06,40
2,Amazon_UK,boy,0.07,146
3,Amazon_UK,boy,0.08,436
4,Amazon_UK,boy,0.09,1161
5,Amazon_UK,boy,0.1,2655
6,Amazon_UK,boy,0.11,5064
7,Amazon_UK,boy,0.12,8120
8,Amazon_UK,boy,0.13,11735
9,Amazon_UK,boy,0.14,14375


In [28]:
def p_val_calc(sys_sim, random_sim, niteration, col_gender):
    gender = sys_sim['gender'].values[0]
    if gender == 'neutral':
        return
    platform = sys_sim['platform'].values[0]
    original_sim = sys_sim['count_match'].values[0]
    #print(sys_sim)
    random_plat_sim = random_sim[(random_sim[col_gender]==gender) & (random_sim['platform']==platform)]
    random_plat_sim = random_plat_sim[random_plat_sim['jaccard_sim']>=original_sim]
    #print(random_plat_sim)
    total = random_plat_sim['iteration'].sum()
    frac = total/niteration
    return frac

In [29]:
p_val = list_item.groupby(['platform','gender']).apply(lambda x: p_val_calc(x, plot_pval, 100000))
p_val = p_val.reset_index()
p_val.dropna(inplace=True)
p_val

TypeError: p_val_calc() missing 1 required positional argument: 'col_gender'

In [None]:
import matplotlib.pyplot as plt
az = plot_pval[(plot_pval['platform']=='Amazon') & (plot_pval['rand_gender']=='boy')]
az.plot.bar(x='jaccard_sim', y='iteration')

In [None]:
list_item.dropna()

In [None]:
df

## Randomize platforms

In [None]:
rand_plat_df = df[['platform', 'item', 'gender']].reset_index(drop=True)
rand_plat_df

In [None]:
import random
def swap(row):
    coin = random.randint(0, 1)
    if coin == 0:
        return row
    else:
        idx = pd.Index(row.index).values
        b, c = row.loc[idx[0]]['gender'], row.loc[idx[1]]['gender']
        temp = row.loc[idx[0]]['gender']
        row.loc[idx[0]]['gender'] = c
        row.loc[idx[1]]['gender'] = temp
    return row

In [None]:
def rand_plat(r_df, iteration, stereo_toys):
    #for i in range(10000):
    #sys_label = r_df['gender'].values
    #np.random.shuffle(sys_label)
    #r_df['rand_gender'] = sys_label
    generated_list = r_df.groupby(['gender'])['item'].apply(list)
    list_df = generated_list.to_frame().reset_index()
    list_df['jaccard_sim'] = list_df.apply(lambda x: match(x, 'gender',stereo_toys), axis = 1)
    list_df['iteration'] = iteration
    list_df.drop(columns=['item'], inplace=True)
    list_df.dropna(inplace=True)
    return list_df

In [None]:
rand_result_pval = pd.DataFrame()
for i in range(100000):
    rand_df = rand_plat_df.groupby('item').apply(lambda x: swap(x))
    rand_df.dropna(inplace=True)
    rand_result = rand_df.groupby('platform').apply(lambda x: rand_plat(x, i, stereo_toys))
    rand_result = rand_result.reset_index('platform')
    rand_result_pval = rand_result_pval.append(rand_result, ignore_index = True)

In [None]:
#rand_result_pval.to_csv('rand_plat.csv', index = False)
rand_result_pval = pd.read_csv('rand_plat.csv')
rand_result_pval

In [None]:
def difference(row):
    val = abs(row['jaccard_sim'].diff().values[1])
    return round(val,2)

In [None]:
diff_pa_val = rand_result_pval.groupby(['gender','iteration']).apply(lambda x: difference(x))
diff_pa_val = diff_pa_val.reset_index()
diff_pa_val

In [None]:
rand_plot_pval = diff_pa_val.groupby(['gender', 0])['iteration'].count()
rand_plot_pval = rand_plot_pval.reset_index()
rand_plot_pval

In [None]:
list_item

In [None]:
def rand_p_val_calc(sys_sim, random_sim, niteration, col_gender):
    gender = sys_sim['gender'].values[0]
    if gender == 'neutral':
        return
    platform = sys_sim['platform'].values[0]
    original_diff = abs(sys_sim['count_match'].diff().values[1])
    #print(sys_sim)
    random_plat_sim = random_sim[random_sim[col_gender]==gender]
    random_plat_sim = random_plat_sim[random_plat_sim[0]>=original_diff]
    #print(random_plat_sim)
    total = random_plat_sim['iteration'].sum()
    frac = total/niteration
    return frac

In [None]:
rand_p_val = list_item.groupby(['gender']).apply(lambda x: rand_p_val_calc(x, rand_plot_pval, 100000, 'gender'))
rand_p_val = rand_p_val.reset_index()
rand_p_val.dropna(inplace=True)
rand_p_val

In [None]:
import matplotlib.pyplot as plt
az = rand_plot_pval[(rand_plot_pval['platform']=='Target') & (rand_plot_pval['gender']=='girl')]
az.plot.bar(x='jaccard_sim', y='iteration')

### item gender for each item

In [None]:
df

In [None]:
new_df = df[['platform', 'item', 'gender_association']]

In [None]:
new_df = pd.concat([new_df.drop(['gender_association'], axis=1), new_df['gender_association'].apply(pd.Series)], axis=1)

In [None]:
#new_df.drop(columns=[0], inplace = True)
#new_df = new_df.dropna()
#new_df

In [None]:
g = new_df.groupby(['item'])['boy'].mean()
g = g.reset_index()
g

In [None]:
g['girl'] = new_df.groupby(['item'])['girl'].mean().values
g

In [None]:
g['diff'] = g['boy']-g['girl']
g

In [None]:
s = g.sort_values('diff', ascending=False).head(40)
s

In [None]:
test_df = stereo_toys.T
test_df['item'] = test_df.values.tolist()
new_df = test_df[['item']]
new_df.reset_index(inplace=True)
new_df.rename(columns={'index':'gender'},inplace=True)
#new_df = new_df.explode('item')
#new_df

In [None]:
def gender_score(row):
    if row == 'BOY':
        return 1
    elif row == 'GIRL':
        return -1
    else:
        return 0

In [None]:
new_df = new_df.explode('item')
new_df['gender_score'] = new_df['gender'].apply(lambda x: gender_score(x))
new_df

In [None]:
merged_df = pd.merge(g[['item', 'diff', 'boy', 'girl']], new_df[['item', 'gender_score']], how = 'outer', on ='item')
merged_df.drop_duplicates(keep='last', inplace=True)
merged_df

In [None]:
merged_df.sort_values(by='diff', ascending=True).head(45)

In [None]:
#merged_df.isna().sum()

In [None]:
#split_df = pd.concat([merged_df.drop(['score'], axis=1), merged_df['score'].apply(pd.Series)], axis=1)

In [None]:
def addcolor(a):

    if a== 1:
        return 'green'
    elif a== 0: 
        return 'black'
    elif a==-1: 
        return'orange'
def addsign(a):

    if a== 1:
        return 'x'
    elif a== 0: 
        return '+'
    elif a== -1: 
        return'o'
    #return a

merged_df['color'] = merged_df['gender_score'].apply(lambda x: addcolor(x))
merged_df['sign'] = merged_df['gender_score'].apply(lambda x: addsign(x))
#print(merged_df)
#val = 0. # this is the value where you want the data to appear on the y-axis.
x = list(merged_df['boy'].values)
#y = np.zeros_like(x)+val
y = list(merged_df['girl'].values)
col = list(merged_df['color'].values)
mark = list(merged_df['sign'].values)
n = list(merged_df['item'].values)
#fig, ax = plt.subplots()

for i, val in enumerate(n):
    x1 = x[i]    
    y1 = y[i]
    c1 = col[i]
    m = mark[i]
    plt.scatter(x1, y1, marker='x', c=c1)

#ticks = [-1, 0, 1]
#labels = ['GIRL', 'NEUTRAL', 'BOY']
#plt.yticks(ticks, labels)
#ticks = [min(x), 0, max(x)]
#plt.xticks(ticks, labels)
xpoints = ypoints = plt.xlim()
#plt.legend(loc='upper left', numpoints=1, ncol=3, fontsize=8, bbox_to_anchor=(0, 0))
#plt.show()
plt.plot(xpoints, ypoints, color='k', lw=1, scalex=False, scaley=False)


#plt.show()
plt.xlabel("Similarity with Boy-Targeted Query")
plt.ylabel("Similarity with Girl-Targeted Query")

plt.savefig('SIGIR_gender_asso_scatter.pdf')

#plt.scatter(x, y, marker = merged_df['sign'].values, c=merged_df['color'])

In [None]:
import matplotlib.pyplot as plt
from numpy.random import random

colors = ['black', 'orange', 'green',]

lo = plt.scatter(random(10), random(10), marker='x', color=colors[0])
ll = plt.scatter(random(10), random(10), marker='x', color=colors[1])
l  = plt.scatter(random(10), random(10), marker='x', color=colors[2])


plt.legend((lo, ll, l,),
           ('Neutral', 'Girl', 'Boy'),
           scatterpoints=1,
           loc='best',
           ncol=3,
           fontsize=10, bbox_to_anchor=(1.1, 1.05))
#ax.legend()

plt.show()

In [None]:
boy = list(g[g['diff']>0]['item'].values)
r_boy = list(stereo_toys['BOY'].values)
inter = len(set(boy).intersection(set(r_boy)))
recall_boy = inter/len(r_boy)
recall_boy

In [None]:
girl = list(g[g['diff']<0]['item'].values)
r_girl = list(stereo_toys['GIRL'].values)
inter = len(set(girl).intersection(set(r_girl)))
recall_girl = inter/len(r_girl)
recall_girl

In [None]:
g.sort_values(by=['diff'], ascending= False)[60:70]

In [None]:
#g.sort_values(['girl', 'boy'], ascending=[True, False]).head(30)

In [None]:
gender_count = df.groupby(['item', 'gender'])['platform'].count()
gender_count = gender_count.reset_index()
gender = gender_count.loc[gender_count.groupby('item')['platform'].idxmax()]
gender

In [None]:
gender_list = gender.groupby('gender')['item'].apply(list)
gender_list

In [None]:
gender_list['neutral']

In [None]:
gender_list['girl']

In [None]:
gender_list['neutral']

### book for gender

## charts

In [None]:
gender_attach = df[['platform', 'item', 'gender_association']]
gender_attach

In [None]:
def gender_asso_count(row):
    try:
        if row['girl'] == 0 and row['boy'] == 0:
            return 'no gender'
        else:
            return 'gendered'
    except:
        return None

In [None]:
gender_attach['rate'] = gender_attach['gender_association'].apply(lambda x: gender_asso_count(x))
gender_attach

In [None]:
gender_attach = gender_attach.dropna()
gender_attach_count = gender_attach.groupby(['platform', 'rate'])['item'].count()
gender_attach_count = gender_attach_count.rename("count")
gender_attach_count = gender_attach_count.reset_index()
gender_attach_count

In [None]:
#frac = gender_attach_count.groupby(['platform']).agg({'count': 'sum'})
# Change: groupby state_office and divide by sum
gender_attach_count['count'] = gender_attach_count.groupby(['platform'])['count'].apply(lambda x:100 * x / total)#float(x.sum()))
gender_attach_count

In [None]:
table1 = pd.pivot_table(gender_attach_count, values='count', index=['platform'],columns=['rate'])
table1 = table1.reset_index()

In [None]:
#table = pd.pivot_table(platform_gender_count, values='count', index=['platform'],columns=['gender'])
#table=table.reset_index()

In [None]:
ax3 = table1.plot(x='platform', kind='bar', stacked=False, title='Gender association with items per platform')
#ax.figure.savefig('gender_platform.pdf')

In [None]:
platform_gender_count = df.groupby(['platform', 'gender'])['item'].count()
platform_gender_count = platform_gender_count.rename("count")
platform_gender_count = platform_gender_count.reset_index()
platform_gender_count

In [None]:
platform_gender_count.groupby('platform')['count'].sum()

In [None]:
table = pd.pivot_table(platform_gender_count, values='count', index=['platform'],columns=['gender'])
table=table.reset_index()

In [None]:
table

In [None]:
import seaborn as sns
ax = sns.barplot(x="platform", y="count", hue="gender", data=platform_gender_count)
ax.set(xlabel=None)
ax.set(ylabel=None)
ax.tick_params(axis='both', which='major', labelsize=12)
ax.legend(fontsize=10)
ax.figure.savefig('SIGIR_gender_platform.pdf')

In [None]:
ax = table.plot(x='platform', kind='bar', stacked=False, title='Gender association with items per platform')
ax.set_xticklabels(table['platform'])
ax.figure.savefig('SIGIR_gender_platform.pdf')

In [None]:
list_table = pd.pivot_table(list_item, values='count_match', index=['platform'],columns=['gender'])
list_table = list_table.reset_index()
list_table

In [None]:
plt_list_item = list_item.dropna()

In [None]:
import seaborn as sns
ax = sns.barplot(x="platform", y="count_match", hue="gender", data=plt_list_item)
ax.set(xlabel=None)
ax.set(ylabel="Similarity Score")
ax.tick_params(axis='both', which='major', labelsize=12)
ax.legend(fontsize=10)
for container in :
    ax.bar_label(container)
#ax.figure.savefig('SIGIR_gender_count.pdf')

In [None]:
ax1 = list_table.plot(x='platform', kind='bar', stacked=False, title='Gender association match per platform')
ax1.figure.savefig('SIGIR_gender_count.pdf')