In [2]:


### Collect and clean all data from study 1
%run study_one_cleaning.py


### Setting framess for slogans ###
df_slogans = df_slogans.set_index('slogan', drop=False)
%run set_up_framing.py
set_up_framings(df_slogans, 'slogan')


157
Loading, formatting, and cleaning done
Contributors: 59 motivations: 59 demographics: 59


In [3]:
# Calculating effect size 
# taken from https://stackoverflow.com/questions/15436702/estimate-cohens-d-for-effect-size?rq=1
def cohens_d(x, y):
#     print(x.var())
#     print(y.var())
    lx = len(x) - 1
    ly = len(y)- 1
    md = abs(x.mean() - y.mean())        ## mean difference (numerator)
    pld_var = lx * x.var() + ly * y.var()
    pld_var = pld_var/(lx + ly)
    pld_var = np.sqrt(pld_var)
    return md/pld_var

def run_anova(x, non_param):
    counts = [x[k] for k in x.keys()]
    if non_param: 
        f, p = stats.kruskal(*counts)
    else: 
        f, p = stats.f_oneway(*counts)
    # degrees of freedom for ANOVA
    anova_btwn = len(x) - 1
    anova_wthn = (len([val for sublist in counts for val in sublist]) - (anova_btwn + 1))
    print('F ( ', anova_btwn, ', ', anova_wthn, ') =', ('%.3f' % f), ' p =', ('%.5f' % p))

# Update: now using PAIRED sample t-test
def run_tukey(motivations_dict):
#     descriptive_stats(motivations_dict, mot)
    # Get the data into correct form for tukey tests
    # The form is all data in a single list, with corrosponding catagory labels 
    # (in this case the slogan ids) in a different list
    unwrapped_data = []
    k_ids = []
    i = 1
    for k in motivations_dict.keys():
        unwrapped_data.extend(motivations_dict[k])
        # from: https://stackoverflow.com/questions/20426313/append-the-same-value-multiple-times-to-a-list
        k_ids.extend([k] * len(motivations_dict[k]))
        i += 1
    # convert to int 
    unwrapped_data = list(map(int, unwrapped_data))
    results = pairwise_tukeyhsd(unwrapped_data, k_ids, alpha=0.001)
    # convert tukey to df 
    # Taken from https://stackoverflow.com/questions/40516810/saving-statmodels-tukey-hsd-into-a-python-panda-dataframe
    df_tukey = pd.DataFrame(data=results._results_table.data[1:], columns=results._results_table.data[0])
    # get only tests where reject is true 
    df_tukey_true = df_tukey[df_tukey['reject'] == True]
    print("------------------------------------------------------")
    print('Tukey results where reject is true:')
    display(df_tukey_true.sort_values(by='meandiff'))
    print("------------------------------------------------------")
    
    t_test_p_vals = [] 
    for i, row in df_tukey_true.iterrows():
        # for multiple testing 
        f, p = stats.ttest_rel(motivations_dict[row.group1], motivations_dict[row.group2])
        degrees_of_freedom = (len(motivations_dict[row.group1]) - 1) + (len(motivations_dict[row.group1]) - 1)
        t_test_p_vals.append(p)
        effect_size = cohens_d(np.array(motivations_dict[row.group1]), np.array(motivations_dict[row.group2]))
        print('--------------------------------------------------------')
        print('$t_{' ,degrees_of_freedom , '} =', ('%.2f' % f), '$', 'p =', '$', ('%.3f' % p), '$','d=', '$', ('%.2f' % effect_size), '$')
        print(row.group1, ': mean =', ('%.2f' % np.mean(motivations_dict[row.group1])), 'std =', ('%.2f' % np.std(motivations_dict[row.group1])))
        print(row.group2, ': mean =', ('%.2f' % np.mean(motivations_dict[row.group2])), 'std =', ('%.2f' % np.std(motivations_dict[row.group2])))
        print('d = ', effect_size)
        print('--------------------------------------------------------')
    # correct for multiple tests
    r, p_vals, sidak, bonferroni = multipletests(t_test_p_vals, method='fdr_bh')
    print('Corrected p-values and rejections')
    print('--------------------------------------------------------')
    print('Reject the null? ', r)
    print('Corrected p values: ', p_vals)
        
    

In [4]:
### Descriptive stats
df_motivations = df_motivations.apply(pd.to_numeric)

### Descriptive Stats on Demographics and Motivations ###
print('Descriptive stats - Motivations')
print('------------------------------------------------' )
print('Mean motivations:')
print(df_motivations.mean())
print('------------------------------------------------' )
print('Std motivations:')
print(df_motivations.astype(int).std())


print('Descriptive stats - Demographics')
print('------------------------------------------------' )
print('Age')
print('------------------------------------------------' )
print('Mean age:', df_demographics['contr_age'].astype(int).mean())
print('Std age:', df_demographics['contr_age'].astype(int).std())
print('------------------------------------------------' )
print('Gender')
print('------------------------------------------------' )
print('Gender breakdown:')
print(df_demographics['gender'].value_counts())
print('------------------------------------------------' )
print('Country')
print('------------------------------------------------' )
print('Country counts:')
print(df_demographics['country'].value_counts())
print('------------------------------------------------' )
print('Education')
print('------------------------------------------------' )
print('Education levels:')
print(df_demographics['contr_edu'].value_counts())
print('------------------------------------------------' )

### Count Slogan Preferences ###

# Make new df for total count of each slogan
df_slogan_ids = df_slogans[['design_id', 'slogan', 'study', 'framing']].copy()
df_slogan_ids['count'] = 0
df_slogan_ids.index = df_slogan_ids['design_id']
df_slogan_ids = df_slogan_ids.drop(['design_id'], axis=1)


Descriptive stats - Motivations
------------------------------------------------
Mean motivations:
fun_motivation          3.830508
bored_motivation        2.779661
compare_motivation      3.372881
science_motivation      4.220339
learning_motivation     3.864407
id                     92.627119
dtype: float64
------------------------------------------------
Std motivations:
fun_motivation          1.191240
bored_motivation        1.451133
compare_motivation      1.244274
science_motivation      0.872329
learning_motivation     1.224148
id                     38.532968
dtype: float64
Descriptive stats - Demographics
------------------------------------------------
Age
------------------------------------------------
Mean age: 28.47457627118644
Std age: 15.568930081064735
------------------------------------------------
Gender
------------------------------------------------
Gender breakdown:
Female    33
Male      26
Name: gender, dtype: int64
------------------------------------------

In [5]:
# Get the count of each slogan for each participant based on Copeland Counting 
# At the end of this frame you should have a dict or df that contains each slogan 
# and an associated list of the counts of how many times each participant picked that slogan

dict_contributors = {k[1].id:[] for k in df_contributors.iterrows()}

# Reset total count
df_slogans['total_choices'] = 0
df_slogan_ids['count'] = 0

# Dict to hold all slogan counts, seperated by study
slogans = {k:[] for k in df_slogans['design_id']}
frames = {k:[] for k in df_slogans['framing']}


# Group contributions by a single user 
grouped_contributions = df_contributions.groupby('owner_id')

# Loop through each contributor:
for contr in df_contributors.iterrows():
#     print(contr[1].id)
    if (contr[1].id == 37) or (contr[1].id == 39):
        print(contr[1])
    try:
        # Get the choices of that contributor
        choices = grouped_contributions.get_group(contr[1].id).comparison.apply(lambda x: x.get('choice_id'))
        opp_choices = grouped_contributions.get_group(contr[1].id).comparison.apply(lambda x: x.get('compared_id'))
        
        # Drop any NaN or none values from both lists, convert opp_choices into ints (was list types)
        choices = [x for x in choices if str(x) != 'nan']
        opp_choices = [x for x in opp_choices if x is not None]
        opp_choices = [x[0] for x in opp_choices]

        # Convert lists both into series 
        choices = pd.Series(choices)
        opp_choices = pd.Series(opp_choices)
        
        # Get value counts of both (count of how many times each id appears)
        choice_counts = choices.value_counts()
        opp_counts = opp_choices.value_counts()
        
        # Concat counts into the same dataframe and fill any empty spots with 0
        df_counts = pd.concat([choice_counts, opp_counts], axis=1)
        df_counts = df_counts.reset_index()
        df_counts.columns = ['design_id','choices', 'opp_choices']
        df_counts = df_counts.fillna(0)
        
        # Get Copeland count of each slogan id, store in 'total' column
        df_counts['total'] = df_counts['choices'] - df_counts['opp_choices']
        
        dict_contributors[contr[1].id] = df_counts['choices'].values 
        
        # This is just number of times people voted for a slogan 
        # does not take into account to what it was compared to
        for i, row in df_counts.iterrows():
            past_count = df_slogan_ids.iloc[int(row.design_id - 1)]['count']
            df_slogan_ids.set_value(int(row.design_id), 'count', past_count + row.choices)

        # Merge with slogan dataframe to get the actual slogan (rather than just the id)
        df_slogan_choices = df_slogans[['id', 'slogan', 'design_id', 'framing']].merge(df_counts, on='design_id')
        
        # add copeland count of each slogan and frame to slogan dict
        for s in df_slogan_choices.iterrows():
            slogans[s[1].design_id].append(s[1].choices);
            frames[s[1].framing].append(s[1].choices);
        
    # Since we took out contributors who didn't finish, have to add this catch in case of a keyerror
    except KeyError:
        continue

    



In [6]:
df_dem_mot = df_motivations.merge(df_demographics[['gender', 'contr_age', 'id']], on='id')
motivations = [df_dem_mot[mot] for mot in list(df_motivations.columns.drop('id'))]


In [9]:
### One-way ANOVAs ###

# Do slogan preferences differ across participants
# Convert dict into list of lists
contrib_pref = [list(dict_contributors[x]) for x in dict_contributors.keys()]

print('Are some slogans more preferred than others?')
run_anova(slogans, False)
run_tukey(slogans)

print('Are some frames more preferred than others?')
run_anova(frames, False)
run_tukey(frames)


Are some slogans more preferred than others?
F (  17 ,  1044 ) = 12.169  p = 0.00000
------------------------------------------------------
Tukey results where reject is true:


Unnamed: 0,group1,group2,meandiff,lower,upper,reject
10,1,12,-2.4237,-3.6299,-1.2176,True
68,5,12,-2.4068,-3.6129,-1.2006,True
91,7,12,-2.2881,-3.4943,-1.082,True
110,9,12,-2.1525,-3.3587,-0.9464,True
101,8,12,-1.7797,-2.9858,-0.5735,True
16,1,18,-1.7627,-2.9689,-0.5565,True
74,5,18,-1.7458,-2.9519,-0.5396,True
4,1,6,-1.6271,-2.8333,-0.421,True
97,7,18,-1.6271,-2.8333,-0.421,True
62,5,6,-1.6102,-2.8163,-0.404,True


------------------------------------------------------
--------------------------------------------------------
$t_{ 116 } = 5.61 $ p = $ 0.000 $ d= $ 1.10 $
1 : mean = 3.37 std = 1.36
3 : mean = 1.85 std = 1.41
d =  1.0987872554331617
--------------------------------------------------------
--------------------------------------------------------
$t_{ 116 } = 5.34 $ p = $ 0.000 $ d= $ 1.12 $
1 : mean = 3.37 std = 1.36
6 : mean = 1.75 std = 1.55
d =  1.1156758386670445
--------------------------------------------------------
--------------------------------------------------------
$t_{ 116 } = 8.69 $ p = $ 0.000 $ d= $ 1.82 $
1 : mean = 3.37 std = 1.36
12 : mean = 0.95 std = 1.29
d =  1.822728603312147
--------------------------------------------------------
--------------------------------------------------------
$t_{ 116 } = 5.62 $ p = $ 0.000 $ d= $ 1.22 $
1 : mean = 3.37 std = 1.36
18 : mean = 1.61 std = 1.53
d =  1.2163118764398813
-------------------------------------------------

AttributeError: 'list' object has no attribute 'keys'

In [8]:
### Making Chart 3
import matplotlib.pyplot as plt


df_slogans['mean_count'] = [np.mean(slogans[x]) for x in slogans.keys()]
df_slogans['std_count'] = [np.std(slogans[x]) for x in slogans.keys()]

grp_frame = df_slogans.groupby('framing')

labels = df_slogans['design_id']

frames = ['Self-learn', 'Fun & Bored', 'Science', 'Compare']

nb_colors = len(plt.rcParams['axes.prop_cycle'])
colors = ['b', 'g', 'r', 'c']
symbols = [(3, 0, 0), '*', '+', (0, 3, 0)]

color_map = {f:colors.pop() for f in df_slogans.framing.unique()}
symbols_map = {f:symbols.pop() for f in df_slogans.framing.unique()}

plt.figure(num=None, figsize=(15, 1), dpi=200)

plots = []
avgs = []
avg_labels = []
for f in df_slogans.framing.unique():
    
    frame_group = grp_frame.get_group(f)
    
    y = [1 for f in frame_group['design_id']]
    x = frame_group['mean_count']
    p = plt.scatter(x, y, c=color_map[f], marker=symbols_map[f])
    
    plots.append(p)
    
plt.legend(plots,
           frames,
           scatterpoints=1,
           loc='lower left',
           bbox_to_anchor=(0,-1),
           ncol=3,
           fontsize=13)
    
    

plt.axes().get_yaxis().set_ticks([])
plt.xlim(left=1, right=4)
plt.xticks(list(plt.xticks()[0]) + avgs, list(plt.xticks()[0]), fontsize=10)
plt.margins(0.05)
plt.xlabel('mean score', fontsize=15)

# Tweak spacing to prevent clipping of tick-labels
plt.subplots_adjust(bottom=0.15)
plt.show()



<Figure size 3000x200 with 1 Axes>