In [1]:
%reload_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.formula.api as smf
import statsmodels.api as sm
from scipy import stats
from lib import utils
from lib import firth

In [2]:
# Load data
df_choice = pd.read_csv('choice_data.csv',sep=',',index_col=0)
df_rabbit = pd.read_csv('rabbit_data.csv',sep=',',index_col=0)

df_qid = df_choice.groupby(['front_ratio','single_amount'])['choice_value'].count().reset_index().reset_index().rename(columns={'index':'qid'}).iloc[:,:3]
df_choice = pd.merge(df_choice,df_qid,on=['front_ratio','single_amount'])

df_choice['part_control'] = (df_choice['group_value'] == False)*df_choice['part_value']
df_choice['part_treatment'] = (df_choice['group_value'] == True)*df_choice['part_value']
df_rabbit['part_control'] = (df_rabbit['group_value'] == False)*df_rabbit['part_value']
df_rabbit['part_treatment'] = (df_rabbit['group_value'] == True)*df_rabbit['part_value']

# Identify the obs with no change in choice
sum_choice_seq = df_choice.groupby('worker_id')['choice_value'].sum().to_frame()
analysis_workers = sum_choice_seq[(sum_choice_seq['choice_value'] != 24) & (sum_choice_seq['choice_value'] != 0)].index
all_seq_workers = sum_choice_seq[(sum_choice_seq['choice_value'] == 24)].index
all_single_workers = sum_choice_seq[(sum_choice_seq['choice_value'] == 0)].index

# Adjust worker_id
# Always choosing sequence: -88
# ALways choosing single: -99
df_choice['all_single'] = df_choice['worker_id'].isin(all_single_workers)
df_choice['all_seq'] = df_choice['worker_id'].isin(all_seq_workers)
df_choice['adj_worker_id'] = df_choice['worker_id'].isin(analysis_workers) * df_choice['worker_id'] + \
                                df_choice['worker_id'].isin(all_single_workers) * (-99) + \
                                df_choice['worker_id'].isin(all_seq_workers) * (-88)

# Create dummies
qid_dummies = pd.get_dummies(df_choice['qid'], prefix='qid')
worker_dummies = pd.get_dummies(df_choice['worker_id'], prefix='worker')
front_ratio_dummies = pd.get_dummies(df_choice['front_ratio'], prefix='front_ratio')
single_amount_dummies = pd.get_dummies(df_choice['single_amount'], prefix='single_amount')
df_choice = pd.concat([df_choice,qid_dummies,worker_dummies,front_ratio_dummies,single_amount_dummies],axis=1)

cols_qid = [i for i in qid_dummies if i!= qid_dummies.columns[0]]
cols_worker = [i for i in worker_dummies if i!= worker_dummies.columns[0]]
cols_front_ratio = [i for i in front_ratio_dummies if i!= front_ratio_dummies.columns[0]]
cols_single_amount = [i for i in single_amount_dummies if i!= single_amount_dummies.columns[0]]

# Covert boolean vairables to numerical variables
bool_cols = df_choice.select_dtypes(include=['bool']).columns
df_choice[bool_cols] = df_choice[bool_cols].astype(int)

df_analysis = df_choice[df_choice['worker_id'].isin(analysis_workers)]

x_cols = ['group_value','part_control','part_treatment']
cols_worker_analysis = [c for c in cols_worker if int(c.split('_')[1]) in analysis_workers]

wrong_workers = df_rabbit['worker_id'][df_rabbit['choice_correct'] == 0].unique()
cols_worker_wrong = [c for c in cols_worker if int(c.split('_')[1]) in wrong_workers]


In [9]:
df_choice['response_time'].max()/1000

349.386

In [5]:
tab_compare_choice = pd.DataFrame({key: [] for key in ['Group','Part','Intertemporal Choice','Count-the-Rabbits',r'$\Chi^2$']})

for g in df_choice['group'].unique():
    for p in df_choice['part'].unique():
        _rabbit = df_rabbit[(df_rabbit['group']==g)&(df_rabbit['part']==p)]['choice_value']
        _choice = df_choice[(df_choice['group']==g)&(df_choice['part']==p)]['choice_value']

        mean_rabbit = round(_rabbit.mean(),3)
        mean_choice = round(_choice.mean(),3)
        _chi = stats.chi2_contingency(pd.crosstab(_rabbit,_choice))
        chi_stat = f'{_chi.statistic:.3f} ($p$={_chi.pvalue:.3f})'
        
        new_row = [g,p,mean_choice,mean_rabbit,chi_stat]
        
        tab_compare_choice.loc[len(tab_compare_choice)] = new_row

tab_compare_choice

Unnamed: 0,Group,Part,Intertemporal Choice,Count-the-Rabbits,$\Chi^2$
0,limit-exposure,question,0.523,0.463,0.024 ($p$=0.878)
1,limit-exposure,no question,0.468,0.474,0.050 ($p$=0.823)
2,full-exposure,question,0.469,0.515,2.923 ($p$=0.087)
3,full-exposure,no question,0.448,0.476,0.424 ($p$=0.515)


In [8]:
tab_compare_choice = pd.DataFrame({key:[] for key in ['task','group','question','no question','chi']})
tab_compare_choice.columns = pd.MultiIndex.from_tuples([('Task', ''), ('Group',''),('Part', 'question'), ('Part', 'no question'),('$chi^2$','')])

task_list = ['Intertemporal Choice', 'Count-the-Rabbits']
group_list = df_choice['group'].unique()

for t in task_list:
    for g in group_list:
        if t == task_list[0]:
            _df = df_choice[df_choice['group']==g]
        else:
            _df = df_rabbit[df_rabbit['group']==g]

        mean_q = _df[_df['part_value']==True]['choice_value'].mean()
        mean_no_q = _df[_df['part_value']==False]['choice_value'].mean()
    
        _chi = stats.chi2_contingency(pd.crosstab(_df['choice_value'],_df['part_value']))
        chi_stat = f'{_chi.statistic:.3f} ($p$={_chi.pvalue:.3f})'
        
        new_row = [t,g,f'{mean_q:.3f}',f'{mean_no_q:.3f}',chi_stat]
        
        tab_compare_choice.loc[len(tab_compare_choice)] = new_row


tab_compare_choice.index = np.repeat('',len(tab_compare_choice))
utils.make_table(tab_compare_choice,'tables/chi_test_choice.tex')

In [4]:
tab_compare_choice

Unnamed: 0_level_0,Task,Group,Part,Part,$chi^2$
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,question,no question,Unnamed: 5_level_1
,Intertemporal Choice,limit-exposure,0.523,0.468,10.303 ($p$=0.001)
,Intertemporal Choice,full-exposure,0.469,0.448,1.470 ($p$=0.225)
,Count-the-Rabbits,limit-exposure,0.47,0.464,0.037 ($p$=0.847)
,Count-the-Rabbits,full-exposure,0.508,0.486,0.770 ($p$=0.380)


In [12]:
# Pooled regression
y = df_choice['choice_value']
X = sm.add_constant(df_choice[x_cols + cols_front_ratio + cols_single_amount])
mod = sm.Logit(y,X, data=df_choice)
result_1 = mod.fit(cov_type='cluster',cov_kwds={'groups':df_choice['worker_id']})
result_1.summary()

Optimization terminated successfully.
         Current function value: 0.680128
         Iterations 4




0,1,2,3
Dep. Variable:,choice_value,No. Observations:,7056.0
Model:,Logit,Df Residuals:,7044.0
Method:,MLE,Df Model:,11.0
Date:,"Wed, 01 May 2024",Pseudo R-squ.:,0.01727
Time:,07:35:03,Log-Likelihood:,-4799.0
converged:,True,LL-Null:,-4883.3
Covariance Type:,cluster,LLR p-value:,2.23e-30

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.2907,0.149,-1.947,0.051,-0.583,0.002
group_value,0.0817,0.189,0.432,0.666,-0.289,0.453
part_control,0.0845,0.059,1.434,0.152,-0.031,0.200
part_treatment,0.2226,0.067,3.305,0.001,0.091,0.355
front_ratio_0.2,-0.2758,0.051,-5.383,0.000,-0.376,-0.175
front_ratio_0.3,-0.3331,0.059,-5.623,0.000,-0.449,-0.217
front_ratio_0.4,-0.2536,0.061,-4.158,0.000,-0.373,-0.134
front_ratio_0.5,-0.1929,0.071,-2.708,0.007,-0.332,-0.053
front_ratio_0.6,-0.4677,0.081,-5.763,0.000,-0.627,-0.309


In [13]:
# Pooled regression
y = df_choice['choice_value']
X = sm.add_constant(df_choice[x_cols + cols_qid])
mod = sm.Logit(y,X, data=df_choice)
result_1 = mod.fit(cov_type='cluster',cov_kwds={'groups':df_choice['worker_id']})
result_1.summary()

Optimization terminated successfully.
         Current function value: 0.678242
         Iterations 4




0,1,2,3
Dep. Variable:,choice_value,No. Observations:,7056.0
Model:,Logit,Df Residuals:,7029.0
Method:,MLE,Df Model:,26.0
Date:,"Wed, 01 May 2024",Pseudo R-squ.:,0.01999
Time:,07:35:03,Log-Likelihood:,-4785.7
converged:,True,LL-Null:,-4883.3
Covariance Type:,cluster,LLR p-value:,7.038000000000001e-28

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.3317,0.158,-2.103,0.035,-0.641,-0.023
group_value,0.0848,0.190,0.446,0.655,-0.288,0.457
part_control,0.0852,0.059,1.438,0.150,-0.031,0.201
part_treatment,0.2183,0.067,3.254,0.001,0.087,0.350
qid_1,0.3461,0.090,3.842,0.000,0.170,0.523
qid_2,0.5239,0.091,5.761,0.000,0.346,0.702
qid_3,0.6188,0.106,5.860,0.000,0.412,0.826
qid_4,-0.2341,0.092,-2.557,0.011,-0.413,-0.055
qid_5,-0.1578,0.084,-1.890,0.059,-0.322,0.006


In [14]:
# Fixed-effect regression
y = df_analysis['choice_value']
X = sm.add_constant(df_analysis[x_cols + cols_qid + cols_worker_analysis])
mod = sm.Logit(y,X)
result_2 = mod.fit(cov_type='cluster',cov_kwds={'groups':df_analysis['worker_id']})
# result_2.summary()

Optimization terminated successfully.
         Current function value: 0.434276
         Iterations 7


In [15]:
# Fixed-effect regression
y = df_choice['choice_value']
X = sm.add_constant(df_choice[x_cols + cols_qid + cols_worker_analysis + ['all_single','all_seq']])
mod = sm.Logit(y,X)
result_3 = mod.fit(cov_type='cluster',cov_kwds={'groups':df_choice['adj_worker_id']})
# result_3.summary()

         Current function value: 0.271792
         Iterations: 35




In [16]:
# Firth regression
y = df_choice['choice_value']
X = sm.add_constant(df_choice[x_cols + cols_qid + cols_worker])
firth_reg_1 = firth.firthLogit(y,X)
firth_reg_1.fit()

  return -(logit.loglike(beta) + 0.5*np.log(np.linalg.det(-logit.hessian(beta))))


iteration: 0 , LL= nan


  return -(logit.loglike(beta) + 0.5*np.log(np.linalg.det(-logit.hessian(beta))))


iteration: 1 , LL= nan


  return -(logit.loglike(beta) + 0.5*np.log(np.linalg.det(-logit.hessian(beta))))


iteration: 2 , LL= nan


  return -(logit.loglike(beta) + 0.5*np.log(np.linalg.det(-logit.hessian(beta))))


iteration: 3 , LL= nan


  return -(logit.loglike(beta) + 0.5*np.log(np.linalg.det(-logit.hessian(beta))))


iteration: 4 , LL= nan
iteration: 5 , LL= 1883.5240110830414


  return -(logit.loglike(beta) + 0.5*np.log(np.linalg.det(-logit.hessian(beta))))


iteration: 6 , LL= 1882.2520165338678


  return -(logit.loglike(beta) + 0.5*np.log(np.linalg.det(-logit.hessian(beta))))


iteration: 7 , LL= nan
iteration: 8 , LL= 1882.9608691697076
iteration: 9 , LL= 1882.339835323914


  return -(logit.loglike(beta) + 0.5*np.log(np.linalg.det(-logit.hessian(beta))))


iteration: 10 , LL= 1882.1751226731328


In [17]:
firth_reg_1.clusterSE(cluster_var=df_choice['adj_worker_id'])
wald_result_1 = firth_reg_1.wald(use_cluster=True)
wald_coef_result_1 = wald_result_1[wald_result_1['var_name'].isin(['const'] + x_cols + cols_front_ratio + cols_single_amount)]
wald_coef_result_1.to_csv('firth_reg_result.csv')

Confidence level:  0.95


  self.cluster_se = np.sqrt(np.diag(V_inv))


In [18]:
def draw_reg_col(result,col_name,var_names=None,digit=3):
    
    if var_names is None:
        var_names = result.params.index
    
    col_result = pd.DataFrame(columns=[col_name])

    for r in range(len(var_names)):
        _var = var_names[r]
        _param = str(round(result.params.loc[_var],digit)) + utils.get_star(result.pvalues.loc[_var])
        _se = '(' + str(round(result.bse.loc[_var],digit)) +')'
        col_result.loc['b_'+_var] = _param
        col_result.loc['se_'+_var] = _se

    col_result.loc['nobs'] = int(result.nobs)
    col_result.loc['aic'] = str(round(result.aic,digit))

    return col_result

def draw_reg_firth(result,col_name,var_names=None,digit=3):
    
    if var_names is None:
        var_names = result.index
    
    col_result = pd.DataFrame(columns=[col_name])

    for r in range(len(var_names)):
        _var = var_names[r]
        _param = str(round(result.coef.loc[_var],digit)) + utils.get_star(result.p_value.loc[_var])
        _se = '(' + str(round(result.bse.loc[_var],digit)) + ')'
        col_result.loc['b_'+_var] = _param
        col_result.loc['se_'+_var] = _se

    return col_result



# Draw regression tables
# var_name_list = result_1.params.index[1:]
var_name_list = x_cols
reg_col_1 = draw_reg_col(result_1,col_name='(1) Pooled',var_names=var_name_list)
reg_col_2 = draw_reg_col(result_2,col_name='(2) FE',var_names=var_name_list)
reg_col_3 = draw_reg_col(result_3,col_name='(3) FE',var_names=var_name_list)

# firth_result = wald_result_1.set_index('var_name')
# reg_col_firth = draw_reg_firth(firth_result,col_name='(3) Firth',var_names=result_1.params.index)

# reg_cols = reg_col_1.join([reg_col_2, reg_col_3])
# reg_cols.at['nobs','(3) Firth'] = len(y)
reg_cols = reg_col_1.join([reg_col_2,reg_col_3], how='outer')
reg_cols = reg_cols.reindex(reg_col_3.index).fillna('')
reg_cols

Unnamed: 0,(1) Pooled,(2) FE,(3) FE
b_group_value,0.085,0.047,-0.096
se_group_value,(0.19),(0.533),(0.154)
b_part_control,0.085,0.301,0.301
se_part_control,(0.059),(0.189),(0.187)
b_part_treatment,0.218$^{***}$,0.53$^{***}$,0.53$^{***}$
se_part_treatment,(0.067),(0.164),(0.163)
nobs,7056,4416,7056
aic,9625.349,4253.529,4259.531


In [19]:
# param_list = ['Group',
#             '',
#             r'Question$\cdot1\{\text{Group}=0\}$',
#             '',
#             r'Question$\cdot1\{\text{Group}=1\}$',
#             '',
#             r'$1\{\rho=0.2\}$',
#             '',
#             r'$1\{\rho=0.3\}$',
#             '',
#             r'$1\{\rho=0.4\}$',
#             '',
#             r'$1\{\rho=0.5\}$',
#             '',
#             r'$1\{\rho=0.6\}$',
#             '',
#             r'$1\{\eta=240\}$',
#             '',
#             r'$1\{\eta=280\}$',
#             '',
#             r'$1\{\eta=320\}$',
#             '',
#             'observations',
#             'aic']


param_list = ['Group',
            '',
            r'Question$\cdot1\{\text{Group}=0\}$',
            '',
            r'Question$\cdot1\{\text{Group}=1\}$',
            '',
            'observations',
            'aic']

reg_cols.index = param_list
utils.make_table(reg_cols,'tables/reg_choice.tex')

In [20]:
# df_analysis = df_choice[df_choice['worker_id'].isin(analysis_workers)]

df_qid_rabbit = df_rabbit.groupby(['front_amount','single_amount','diff_amount'])['choice_value'].count().reset_index().reset_index().rename(columns={'index':'qid'}).iloc[:,:-1]
df_rabbit = pd.merge(df_rabbit,df_qid_rabbit,on=['front_amount','single_amount','diff_amount'])

# Create dummy variables for rabbit data
qid_rabbit_dummies = pd.get_dummies(df_rabbit['qid'], prefix='qid')
worker_dummies = pd.get_dummies(df_rabbit['worker_id'], prefix='worker')
front_amount_dummies = pd.get_dummies(df_rabbit['front_amount'], prefix='front_amount')
single_amount_dummies = pd.get_dummies(df_rabbit['single_amount'], prefix='single_amount')
diff_amount_dummies = pd.get_dummies(df_rabbit['diff_amount'], prefix='diff_amount')
df_rabbit = pd.concat([df_rabbit,qid_rabbit_dummies,worker_dummies,front_amount_dummies,single_amount_dummies,diff_amount_dummies],axis=1)

cols_qid_rabbit = [i for i in qid_rabbit_dummies if i!= qid_rabbit_dummies.columns[0]]
cols_worker = [i for i in worker_dummies if i!= worker_dummies.columns[0]]
cols_front_rabbit = [i for i in front_amount_dummies if i!= front_amount_dummies.columns[0]]
cols_single_rabbit = [i for i in single_amount_dummies if i!= single_amount_dummies.columns[0]]
cols_diff_rabbit = [i for i in diff_amount_dummies if i!= diff_amount_dummies.columns[0]]

# Covert boolean vairables to numerical variables
bool_cols = df_rabbit.select_dtypes(include=['bool']).columns
df_rabbit[bool_cols] = df_rabbit[bool_cols].astype(int)

# Create regression sample
df_rabbit_analysis = df_rabbit[df_rabbit['worker_id'].isin(analysis_workers)]
df_rabbit_wrong = df_rabbit[df_rabbit['worker_id'].isin(wrong_workers)]

In [21]:
y = df_rabbit['choice_value']
X = sm.add_constant(df_rabbit[x_cols + cols_diff_rabbit + cols_front_rabbit + cols_single_rabbit])
reg_rabbit_1 = sm.Logit(y,X).fit(cov_type='cluster',cov_kwds={'groups':df_rabbit['worker_id']})
reg_rabbit_1.summary()

Optimization terminated successfully.
         Current function value: 0.123110
         Iterations 9


0,1,2,3
Dep. Variable:,choice_value,No. Observations:,3504.0
Model:,Logit,Df Residuals:,3496.0
Method:,MLE,Df Model:,7.0
Date:,"Wed, 01 May 2024",Pseudo R-squ.:,0.8222
Time:,07:35:27,Log-Likelihood:,-431.38
converged:,True,LL-Null:,-2426.5
Covariance Type:,cluster,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-4.3312,0.309,-14.026,0.000,-4.936,-3.726
group_value,-0.7296,0.330,-2.208,0.027,-1.377,-0.082
part_control,0.5738,0.226,2.539,0.011,0.131,1.017
part_treatment,0.0624,0.318,0.196,0.844,-0.561,0.686
diff_amount_1,7.7970,0.377,20.681,0.000,7.058,8.536
front_amount_2,0.1060,0.274,0.387,0.699,-0.431,0.643
front_amount_3,-0.6546,0.228,-2.876,0.004,-1.101,-0.209
single_amount_8,0.1451,0.166,0.876,0.381,-0.180,0.470


In [22]:
y = df_rabbit['choice_value']
X = sm.add_constant(df_rabbit[x_cols + cols_qid_rabbit])
reg_rabbit_1 = sm.Logit(y,X).fit(cov_type='cluster',cov_kwds={'groups':df_rabbit['worker_id']})
reg_rabbit_1.summary()

Optimization terminated successfully.
         Current function value: 0.121831
         Iterations 10


0,1,2,3
Dep. Variable:,choice_value,No. Observations:,3504.0
Model:,Logit,Df Residuals:,3489.0
Method:,MLE,Df Model:,14.0
Date:,"Wed, 01 May 2024",Pseudo R-squ.:,0.8241
Time:,07:35:28,Log-Likelihood:,-426.89
converged:,True,LL-Null:,-2426.5
Covariance Type:,cluster,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-5.6187,1.020,-5.506,0.000,-7.619,-3.619
group_value,-0.7318,0.331,-2.211,0.027,-1.380,-0.083
part_control,0.5810,0.228,2.551,0.011,0.135,1.027
part_treatment,0.0704,0.319,0.220,0.826,-0.555,0.696
qid_1,9.5170,1.097,8.673,0.000,7.366,11.668
qid_2,1.1295,1.168,0.967,0.333,-1.159,3.418
qid_3,9.1749,1.081,8.488,0.000,7.056,11.293
qid_4,1.3982,1.129,1.238,0.216,-0.815,3.612
qid_5,9.0060,1.147,7.849,0.000,6.757,11.255


In [23]:
y = df_rabbit['choice_value']
X = sm.add_constant(df_rabbit[x_cols + cols_qid_rabbit + cols_worker])
reg_rabbit_2 = sm.Logit(y,X).fit(cov_type='cluster',cov_kwds={'groups':df_rabbit['worker_id']})
# reg_rabbit_2.summary()

         Current function value: 0.069891
         Iterations: 35




In [24]:
y = df_rabbit_analysis['choice_value']
X = sm.add_constant(df_rabbit_analysis[x_cols + cols_qid_rabbit + cols_worker_analysis])
reg_rabbit_3 = sm.Logit(y,X).fit(cov_type='cluster',cov_kwds={'groups':df_rabbit_analysis['worker_id']})
# reg_rabbit_3.summary()

Optimization terminated successfully.
         Current function value: 0.081914
         Iterations 10


In [25]:
y = df_rabbit_wrong['choice_value']
X = sm.add_constant(df_rabbit_wrong[x_cols + cols_qid_rabbit + cols_worker_wrong])
reg_rabbit_4 = sm.Logit(y,X).fit(cov_type='cluster',cov_kwds={'groups':df_rabbit_wrong['worker_id']})
# reg_rabbit_4.summary()


         Current function value: 0.262088
         Iterations: 35




In [26]:
# var_rabbit_list = reg_rabbit_1.params.index[1:]
var_rabbit_list = x_cols
rabbit_col_1 = draw_reg_col(reg_rabbit_1,col_name='(1) Pooled',var_names=var_rabbit_list)
rabbit_col_2 = draw_reg_col(reg_rabbit_2,col_name='(2) FE',var_names=var_rabbit_list)
rabbit_col_3 = draw_reg_col(reg_rabbit_3,col_name='(3) FE',var_names=var_rabbit_list)
rabbit_col_4 = draw_reg_col(reg_rabbit_4,col_name='(4) FE',var_names=var_rabbit_list)

rabbit_cols = rabbit_col_1.join([rabbit_col_2,rabbit_col_3,rabbit_col_4], how='outer')
rabbit_cols

Unnamed: 0,(1) Pooled,(2) FE,(3) FE,(4) FE
b_group_value,-0.732$^{*}$,-4.466$^{*}$,-0.21,-0.964$^{***}$
se_group_value,(0.331),(1.789),(1.771),(0.312)
b_part_control,0.581$^{*}$,1.357$^{*}$,1.618$^{*}$,2.032$^{**}$
se_part_control,(0.228),(0.606),(0.681),(0.778)
b_part_treatment,0.07,0.225,0.466,0.214
se_part_treatment,(0.319),(0.376),(0.444),(0.352)
nobs,3504,3504,2190,810
aic,883.789,1103.793,752.785,586.582


In [27]:
# param_list = ['Group',
#             '',
#             r'Question$\cdot1\{\text{Group}=0\}$',
#             '',
#             r'Questsion$\cdot1\{\text{Group}=1\}$',
#             '',
#             r'$1\{r_2 + r_3 > r_1\}$',
#             '',
#             r'$1\{r_2 =2\}$',
#             '',
#             r'$1\{r_2=3\}$',
#             '',
#             r'$1\{r_1=8\}$',
#             '',
#             'observations',
#             'aic']


param_list = ['Group',
            '',
            r'Question$\cdot1\{\text{Group}=0\}$',
            '',
            r'Questsion$\cdot1\{\text{Group}=1\}$',
            '',
            'observations',
            'aic']

rabbit_cols.index = param_list
utils.make_table(rabbit_cols,'tables/reg_rabbit.tex')

In [28]:
df_analysis['predict_choice'] = (result_2.predict() > 0.5).astype(int)
# df_analysis['predict_choice'] = firth_reg_1.predict()
outlier_threshold = df_analysis['response_time'].quantile(0.995)
df_response = df_analysis[df_analysis['response_time'] < outlier_threshold]

# create interactions
df_response['i_choice_group'] = df_response['predict_choice']*df_response['group_value']
df_response['i_choice_control'] = df_response['predict_choice']*df_response['part_control']
df_response['i_choice_treat'] = df_response['predict_choice']*df_response['part_treatment']


cols_i_front_ratio = []
cols_i_single_amount = []
cols_i_qid = []

for r in range(len(cols_front_ratio)):
    new_col = 'i_choice_'+ cols_front_ratio[r]
    df_response[new_col] = df_response['predict_choice'] * df_response[cols_front_ratio[r]]
    cols_i_front_ratio += [new_col]

for r in range(len(cols_single_amount)):
    new_col = 'i_choice_'+ cols_single_amount[r]
    df_response[new_col] = df_response['predict_choice'] * df_response[cols_single_amount[r]]
    cols_i_single_amount += [new_col]

for r in range(len(cols_qid)):
    new_col = 'i_choice_'+ cols_qid[r]
    df_response[new_col] = df_response['predict_choice'] * df_response[cols_qid[r]]
    cols_i_qid += [new_col]

  df_analysis['predict_choice'] = (result_2.predict() > 0.5).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_analysis['predict_choice'] = (result_2.predict() > 0.5).astype(int)
  df_response['i_choice_group'] = df_response['predict_choice']*df_response['group_value']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_response['i_choice_group'] = df_response['predict_choice']*df_response['group_value']
  df_response['i_choice_control'] = df_response['predict_choice']*df_response['part_control']
A value is trying to be set on a copy of a slice fro

In [29]:
df_rabbit_analysis['predict_choice'] = (reg_rabbit_3.predict() > 0.5).astype(int)
# df_analysis['predict_choice'] = firth_reg_1.predict()
outlier_threshold = df_rabbit_analysis['response_time'].quantile(0.995)
df_response_rabbit = df_rabbit_analysis[df_rabbit_analysis['response_time'] < outlier_threshold]

# create interactions
df_response_rabbit['i_choice_group'] = df_response_rabbit['predict_choice']*df_response_rabbit['group_value']
df_response_rabbit['i_choice_control'] = df_response_rabbit['predict_choice']*df_response_rabbit['part_control']
df_response_rabbit['i_choice_treat'] = df_response_rabbit['predict_choice']*df_response_rabbit['part_treatment']


cols_i_front_rabbit = []
cols_i_single_rabbit = []
cols_i_diff_rabbit = []
cols_i_qid_rabbit = []

for r in range(len(cols_front_rabbit)):
    new_col = 'i_choice_'+ cols_front_rabbit[r]
    df_response_rabbit[new_col] = df_response_rabbit['predict_choice'] * df_response_rabbit[cols_front_rabbit[r]]
    cols_i_front_rabbit += [new_col]

for r in range(len(cols_single_rabbit)):
    new_col = 'i_choice_'+ cols_front_rabbit[r]
    df_response_rabbit[new_col] = df_response_rabbit['predict_choice'] * df_response_rabbit[cols_single_rabbit[r]]
    cols_i_single_rabbit += [new_col]

for r in range(len(cols_diff_rabbit)):
    new_col = 'i_choice_'+ cols_diff_rabbit[r]
    df_response_rabbit[new_col] = df_response_rabbit['predict_choice'] * df_response_rabbit[cols_diff_rabbit[r]]
    cols_i_diff_rabbit += [new_col]

for r in range(len(cols_qid_rabbit)):
    new_col = 'i_choice_'+ cols_qid_rabbit[r]
    df_response[new_col] = df_response['predict_choice'] * df_response[cols_qid_rabbit[r]]
    cols_i_qid += [new_col]

  df_rabbit_analysis['predict_choice'] = (reg_rabbit_3.predict() > 0.5).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rabbit_analysis['predict_choice'] = (reg_rabbit_3.predict() > 0.5).astype(int)
  df_response_rabbit['i_choice_group'] = df_response_rabbit['predict_choice']*df_response_rabbit['group_value']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_response_rabbit['i_choice_group'] = df_response_rabbit['predict_choice']*df_response_rabbit['group_value']
  df_response_rabbit['i_choice_control'] = df_response_rabbit['predict_choice']*df

In [30]:
y = df_response['response_time']/1000
x_cols_new = x_cols + ['predict_choice','i_choice_group','i_choice_control','i_choice_treat']
# X = sm.add_constant(df_response[x_cols_new + cols_single_amount + cols_i_single_amount + \
#                                 cols_front_ratio + cols_i_front_ratio + \
#                                 cols_worker_analysis])

X = sm.add_constant(df_response[x_cols_new + cols_qid + cols_i_qid + \
                                cols_worker_analysis])

mod_2nd = sm.OLS(y,X).fit(cov_type='cluster',cov_kwds={'groups':df_response['worker_id']})

In [31]:
y = df_response_rabbit['response_time']/1000
x_cols_new = x_cols + ['predict_choice','i_choice_group','i_choice_control','i_choice_treat']
# X = sm.add_constant(df_response_rabbit[x_cols_new + cols_diff_rabbit + cols_i_diff_rabbit + \
#                                 cols_front_rabbit + cols_i_front_rabbit + \
#                                 cols_single_rabbit + cols_i_single_rabbit + \
#                                 cols_worker_analysis])

X = sm.add_constant(df_response_rabbit[x_cols_new + cols_qid_rabbit + cols_i_qid_rabbit + \
                                cols_worker_analysis])

mod_2nd_rabbit = sm.OLS(y,X).fit(cov_type='cluster',cov_kwds={'groups':df_response_rabbit['worker_id']})

In [32]:
mod_2nd.pvalues

const              2.514669e-50
group_value        1.305139e-06
part_control       3.419067e-01
part_treatment     6.782951e-06
predict_choice     1.670285e-02
                      ...      
worker_481844     6.889921e-181
worker_481845     2.001886e-169
worker_481852      2.513898e-15
worker_481904      0.000000e+00
worker_481905      1.088595e-32
Length: 249, dtype: float64

In [33]:
df_response_rabbit.groupby(['choice_value'])['response_time'].mean()

choice_value
0    3797.419611
1    4881.824260
Name: response_time, dtype: float64

In [42]:
stats.mannwhitneyu(df_response_rabbit.response_time[df_response_rabbit['choice_value']==True]/1000,df_response_rabbit.response_time[df_response_rabbit['choice_value']==False]/1000)

MannwhitneyuResult(statistic=739393.5, pvalue=1.4664862408519333e-23)

In [43]:
stats.mannwhitneyu(df_response.response_time[df_response['choice_value']==True]/1000,df_response.response_time[df_response['choice_value']==False]/1000)

MannwhitneyuResult(statistic=2528314.5, pvalue=0.0026604016728641665)

In [36]:
def draw_reg_col2(result,col_name,var_names=None,digit=3):
    
    if var_names is None:
        var_names = result.params.index
    
    col_result = pd.DataFrame(columns=[col_name])

    for r in range(len(var_names)):
        _var = var_names[r]
        _param = str(round(result.params.loc[_var],digit)) + utils.get_star(result.pvalues.loc[_var])
        _se = '(' + str(round(result.bse.loc[_var],digit)) +')'
        col_result.loc['b_'+_var] = _param
        col_result.loc['se_'+_var] = _se

    col_result.loc['nobs'] = int(result.nobs)
    col_result.loc['aic'] = str(round(result.aic,digit))
    col_result.loc['adj_r'] = str(round(result.rsquared_adj,digit))

    return col_result

response_col_choice = draw_reg_col2(mod_2nd,col_name='Intertemporal Choice',var_names=x_cols_new)
response_col_rabbit = draw_reg_col2(mod_2nd_rabbit,col_name='Rabbit',var_names=x_cols_new)
response_cols = response_col_choice.join([response_col_rabbit], how='outer')
response_cols

Unnamed: 0,Intertemporal Choice,Rabbit
b_group_value,-0.684$^{***}$,-0.792$^{***}$
se_group_value,(0.141),(0.144)
b_part_control,-0.165,0.912$^{***}$
se_part_control,(0.174),(0.199)
b_part_treatment,0.457$^{***}$,0.849$^{***}$
se_part_treatment,(0.101),(0.132)
b_predict_choice,0.954$^{*}$,1.291$^{***}$
se_predict_choice,(0.399),(0.456)
b_i_choice_group,-0.762$^{*}$,-1.265$^{***}$
se_i_choice_group,(0.304),(0.229)


In [37]:
param_list = ['Group',
            '',
            r'Question$\cdot1\{\text{Group}=0\}$',
            '',
            r'Question$\cdot1\{\text{Group}=1\}$',
            '',
            'Choice',
            '',
            r'Choice$\times$Group',
            '',
            r'Choice$\times$Question$\cdot1\{\text{Group}=0\}$',
            '',
            r'Choice$\times$Question$\cdot1\{\text{Group}=1\}$',
            '',
            'observations',
            'aic',
            r'adj-$R^2$']


response_cols.index = param_list
utils.make_table(response_cols,'tables/reg_response_time.tex')

In [38]:
df_rabbit.mouseY[df_rabbit['choice_value']==True].mean()

248.06867969212553

In [39]:
df_choice.mouseY[df_choice['choice_value']==False].mean()

218.00845377692937

In [40]:
stats.mannwhitneyu(df_rabbit.mouseY[df_rabbit['choice_value']==True],df_rabbit.mouseY[df_rabbit['choice_value']==False],nan_policy='omit')

MannwhitneyuResult(statistic=2122489.0, pvalue=2.7735038407532964e-90)

In [41]:
stats.mannwhitneyu(df_choice.mouseY[df_choice['choice_value']==True],df_choice.mouseY[df_choice['choice_value']==False],nan_policy='omit')

MannwhitneyuResult(statistic=9397658.5, pvalue=0.0)