In [1]:
%reload_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.formula.api as smf
import statsmodels.api as sm
from firth import firthLogit
import utils

In [68]:
# u_crra = lambda x: (x*10)**0.695
u_crra = lambda x:x*10

In [69]:
# Load data
df_choice = pd.read_csv('intertemporal_choice_obs.csv',sep=',',index_col=0)

# Create dummy variables
pid_dummies = pd.get_dummies(df_choice['pid'], prefix='pid')
rw_dummies = pd.get_dummies(df_choice['b_fixed_rw'], prefix='factor_fixed_rw')
delay_dummies = pd.get_dummies(df_choice['b_delay'], prefix='factor_delay')
df_choice = pd.concat([df_choice,pid_dummies,rw_dummies,delay_dummies],axis=1)

df_choice['b_vary_rw'] = u_crra(df_choice['b_vary_rw'])

cols_pid = [i for i in df_choice.columns if 'pid_' in i and i!= 'pid_1']

# Create interaction terms
factor_cols = [col for col in df_choice.columns if col.startswith('factor_fixed_rw') or col.startswith('factor_delay')]
for factor_col in factor_cols:
    factor_name = factor_col.replace('factor_', '')
    interaction_col = f'I_vary_{factor_name}'
    df_choice[interaction_col] = df_choice[factor_col] * df_choice['b_vary_rw']

df_choice['factor_a_rw'] = (df_choice['a_rw'] == df_choice['a_rw'].max()).astype(int)
df_choice['I_vary_a_rw'] = df_choice['factor_a_rw'] * df_choice['b_vary_rw']


# Specify each question
tab_question = df_choice.groupby(['cond','a_rw','b_fixed_rw','b_delay'])['pid'].mean().reset_index().iloc[:,:-1]
tab_question = tab_question.reset_index().rename(columns={'index':'question_id'})

df_choice = pd.merge(df_choice,tab_question,on=['cond','a_rw','b_fixed_rw','b_delay'])

# # Create independent variables for Immed_Rw_Vary
# x_cols1 = ['b_vary_rw',
#            'factor_a_rw','factor_fixed_rw_7','factor_fixed_rw_9','factor_delay_9','factor_delay_18',
#            'I_vary_a_rw','I_vary_fixed_rw_7','I_vary_fixed_rw_9','I_vary_delay_9','I_vary_delay_18']

x_cols1 = ['b_vary_rw','I_vary_a_rw','I_vary_fixed_rw_7','I_vary_fixed_rw_9','I_vary_delay_9','I_vary_delay_18']


# # Create independent variables for Delayed_Rw_Vary
# x_cols2 = ['b_vary_rw',
#            'factor_a_rw','factor_fixed_rw_7','factor_fixed_rw_9',
#            'I_vary_a_rw','I_vary_fixed_rw_7','I_vary_fixed_rw_9']

x_cols2 = ['b_vary_rw','I_vary_a_rw','I_vary_fixed_rw_7','I_vary_fixed_rw_9']


df_choice_immed = df_choice[df_choice['cond'] == 'Immed_Rw_Vary']
df_choice_delayed = df_choice[df_choice['cond'] == 'Delayed_Rw_Vary']

question_dummies_immed = pd.get_dummies(df_choice_immed['question_id'], prefix='qid')
question_dummies_delayed = pd.get_dummies(df_choice_delayed['question_id'], prefix='qid')

df_choice_immed = pd.concat([df_choice_immed,question_dummies_immed],axis=1)
df_choice_delayed = pd.concat([df_choice_delayed,question_dummies_delayed],axis=1)

cols_question_immed = [i for i in df_choice_immed.columns if 'qid_' in i and i!= 'qid_1']
cols_question_delayed = [i for i in df_choice_delayed.columns if 'qid_' in i and i!= 'qid_1']


# Covert boolean vairables to numerical variables
bool_cols = df_choice_immed.select_dtypes(include=['bool']).columns
df_choice_immed[bool_cols] = df_choice_immed[bool_cols].astype(int)

bool_cols = df_choice_delayed.select_dtypes(include=['bool']).columns
df_choice_delayed[bool_cols] = df_choice_delayed[bool_cols].astype(int)

In [61]:
mean_choice = df_choice.groupby(['a_rw','b_fixed_rw','b_vary_rw','b_delay'])['choice'].mean().reset_index()
df_filter_out_same = mean_choice[(mean_choice['choice'] > 0) & (mean_choice['choice'] < 1)].iloc[:,:-1]

df_choice_immed = pd.merge(df_choice_immed,df_filter_out_same,on=['a_rw','b_fixed_rw','b_vary_rw','b_delay'],how='inner')
df_choice_delayed = pd.merge(df_choice_delayed,df_filter_out_same,on=['a_rw','b_fixed_rw','b_vary_rw','b_delay'],how='inner')

In [74]:
y1 = df_choice_immed['choice']
X1 = sm.add_constant(df_choice_immed[x_cols1+cols_question_immed])
mod = sm.Logit(y1,X1)
result_immed_1 = mod.fit(cov_type='cluster',cov_kwds={'groups':df_choice_immed['pid']})
# result_1.summary()

Optimization terminated successfully.
         Current function value: 0.303009
         Iterations 9


In [75]:
y1 = df_choice_immed['choice']
X1 = sm.add_constant(df_choice_immed[x_cols1 + cols_question_immed + cols_pid])
mod = sm.Logit(y1,X1)
result_immed_2 = mod.fit(cov_type='cluster',cov_kwds={'groups':df_choice_immed['pid']})

Optimization terminated successfully.
         Current function value: 0.175573
         Iterations 10


In [76]:
y2 = df_choice_delayed['choice']
X2 = sm.add_constant(df_choice_delayed[x_cols2 + cols_question_delayed])
mod = sm.Logit(y2,X2)
result_delayed_1 = mod.fit(cov_type='cluster',cov_kwds={'groups':df_choice_delayed['pid']})

Optimization terminated successfully.
         Current function value: 0.222680
         Iterations 9


In [77]:
y2 = df_choice_delayed['choice']
X2 = sm.add_constant(df_choice_delayed[x_cols2 + cols_question_delayed + cols_pid])
mod = sm.Logit(y2,X2)
result_delayed_2 = mod.fit(cov_type='cluster',cov_kwds={'groups':df_choice_delayed['pid']})

Optimization terminated successfully.
         Current function value: 0.096965
         Iterations 11


In [78]:
reg_col1 = utils.draw_reg_col(result_immed_1,col_name='immed_pool',var_names=x_cols1)
reg_col2 = utils.draw_reg_col(result_immed_2,col_name='immed_fe',var_names=x_cols1)
reg_col3 = utils.draw_reg_col(result_delayed_1,col_name='delayed_pool',var_names=x_cols2)
reg_col4 = utils.draw_reg_col(result_delayed_2,col_name='delayed_fe',var_names=x_cols2)
cols = [reg_col1,reg_col2,reg_col3,reg_col4]

all_cols = reg_col1.join(cols[1:], how='outer').fillna('')
all_cols.columns = pd.MultiIndex.from_tuples([('Front-end amount varies', '(1) Pooled'), ('Front-end amount varies','(2) FE'),
                                              ('Back-end amount varires', '(1) Pooled'), ('Back-end amount varires', '(2) FE')])
all_cols

Unnamed: 0_level_0,Front-end amount varies,Front-end amount varies,Back-end amount varires,Back-end amount varires
Unnamed: 0_level_1,(1) Pooled,(2) FE,(1) Pooled,(2) FE
b_b_vary_rw,0.181,0.316$^{***}$,0.139$^{***}$,0.318$^{***}$
se_b_vary_rw,(0.339),(0.044),(0.011),(0.03)
b_I_vary_a_rw,0.022$^{***}$,0.031$^{***}$,0.039$^{***}$,0.071$^{***}$
se_I_vary_a_rw,(0.006),(0.006),(0.007),(0.017)
b_I_vary_fixed_rw_7,-0.047,-0.079$^{*}$,-0.015,-0.024
se_I_vary_fixed_rw_7,(0.032),(0.04),(0.009),(0.023)
b_I_vary_fixed_rw_9,-0.083$^{**}$,-0.135$^{***}$,-0.026$^{**}$,-0.031
se_I_vary_fixed_rw_9,(0.031),(0.041),(0.01),(0.023)
b_I_vary_delay_9,-0.033$^{***}$,-0.058$^{***}$,,
se_I_vary_delay_9,(0.006),(0.012),,


In [67]:
all_cols.index = [r'$u(X_v)$',
                  '',
                #   r'$1\{M=M_{high}\}$',
                #   '',
                #   r'$1\{X_c=X_{mid}\}$',
                #   '',
                #   r'$1\{X_c=X_{high}\}$',
                #   '',
                #   r'$1\{T=T_{mid}\}$',
                #   '',
                #   r'$1\{T=T_{high}\}$',
                #   '',
                  r'$u(X_v)\cdot1\{M=M_{high}\}$',
                  '',
                  r'$u(X_v)\cdot1\{X_c=X_{mid}\}$',
                  '',
                  r'$u(X_v)\cdot1\{X_c=X_{high}\}$',
                  '',
                  r'$u(X_v)\cdot1\{T=T_{mid}\}$',
                  '',
                  r'$u(X_v)\cdot1\{T=T_{high}\}$',
                  '',
                  'observations',
                  'AIC']

utils.make_table(all_cols,'tables/exp1_utility_censor.tex')

In [79]:
all_cols.index = [r'$X_v$',
                  '',
                #   r'$1\{M=M_{high}\}$',
                #   '',
                #   r'$1\{X_c=X_{mid}\}$',
                #   '',
                #   r'$1\{X_c=X_{high}\}$',
                #   '',
                #   r'$1\{T=T_{mid}\}$',
                #   '',
                #   r'$1\{T=T_{high}\}$',
                #   '',
                  r'$X_v\cdot1\{M=M_{high}\}$',
                  '',
                  r'$X_v\cdot1\{X_c=X_{mid}\}$',
                  '',
                  r'$X_v\cdot1\{X_c=X_{high}\}$',
                  '',
                  r'$X_v\cdot1\{T=T_{mid}\}$',
                  '',
                  r'$X_v\cdot1\{T=T_{high}\}$',
                  '',
                  'observations',
                  'AIC']

utils.make_table(all_cols,'tables/exp1_baseline_model.tex')

In [58]:
result_immed_2.summary()

0,1,2,3
Dep. Variable:,choice,No. Observations:,18840.0
Model:,Logit,Df Residuals:,18666.0
Method:,MLE,Df Model:,173.0
Date:,"Mon, 06 May 2024",Pseudo R-squ.:,0.7518
Time:,14:36:19,Log-Likelihood:,-3238.1
converged:,True,LL-Null:,-13048.0
Covariance Type:,cluster,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-13.3872,,,,,
b_vary_rw,1.6051,0.243,6.593,0.000,1.128,2.082
I_vary_a_rw,0.2057,0.031,6.609,0.000,0.145,0.267
I_vary_fixed_rw_7,-0.5018,0.217,-2.312,0.021,-0.927,-0.076
I_vary_fixed_rw_9,-0.8330,0.223,-3.740,0.000,-1.270,-0.396
I_vary_delay_9,-0.2222,0.048,-4.651,0.000,-0.316,-0.129
I_vary_delay_18,-0.3254,0.060,-5.381,0.000,-0.444,-0.207
qid_6,-12.4918,,,,,
qid_7,-0.1079,,,,,


In [9]:
firth_reg_1 = firthLogit(y1,X1)
firth_reg_1.fit()

iteration: 0 , LL= 12751.908641337992
iteration: 1 , LL= 5848.334232711649
iteration: 2 , LL= 4266.579633452862
iteration: 3 , LL= 3606.98158435943
iteration: 4 , LL= 3389.758368902345
iteration: 5 , LL= 3353.1689375316205
iteration: 6 , LL= 3351.5572946751117
iteration: 7 , LL= 3351.5500972578798
iteration: 8 , LL= 3351.5500932456757
iteration: 9 , LL= 3351.5500932438845
iteration: 10 , LL= 3351.550093243884


In [11]:
firth_reg_1.clusterSE(cluster_var=df_choice_immed['pid'])
wald_result_1 = firth_reg_1.wald(use_cluster=True)
wald_coef_result_1 = wald_result_1[wald_result_1['var_name'].isin(x_cols1)]
wald_coef_result_1.to_csv('firth_result_immed.csv')

Confidence level:  0.95


In [5]:
df_choice_immed_q = df_choice_immed[['a_rw','b_fixed_rw','b_vary_rw','b_delay']]
df_choice_immed_q['pred_firth'] = firth_reg_1.predict()
pred_firth_immed = df_choice_immed_q.groupby(['a_rw','b_fixed_rw','b_vary_rw','b_delay']).mean('pred_firth')
pred_firth_immed.reset_index().to_csv('firth_pred_immed.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_choice_immed_q['pred_firth'] = firth_reg_1.predict()


In [6]:
firth_reg_2 = firthLogit(y2,X2)
firth_reg_2.fit()


iteration: 0 , LL= 6293.7021291342935
iteration: 1 , LL= 2618.0610249924903
iteration: 2 , LL= 1691.1154516442043
iteration: 3 , LL= 1224.3363985323476
iteration: 4 , LL= 992.4025445968265
iteration: 5 , LL= 899.3022986420766
iteration: 6 , LL= 876.6519023539877
iteration: 7 , LL= 874.5239159401027
iteration: 8 , LL= 874.4760096565784
iteration: 9 , LL= 874.4756705870433
iteration: 10 , LL= 874.4756687663141
iteration: 11 , LL= 874.475668756779
iteration: 12 , LL= 874.475668756729
iteration: 13 , LL= 874.4756687567289


In [7]:
wald_result_2 = firth_reg_2.wald()
wald_coef_result_2 = wald_result_2[wald_result_2['var_name'].isin(x_cols2)]
wald_coef_result_2.to_csv('firth_result_delayed.csv')

Confidence level:  0.95


In [8]:
df_choice_delayed_q = df_choice_delayed[['a_rw','b_fixed_rw','b_vary_rw','b_delay']]
df_choice_delayed_q['pred_firth'] = firth_reg_2.predict()
pred_firth_delayed = df_choice_delayed_q.groupby(['a_rw','b_fixed_rw','b_vary_rw','b_delay']).mean('pred_firth')
pred_firth_delayed.reset_index().to_csv('firth_pred_delayed.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_choice_delayed_q['pred_firth'] = firth_reg_2.predict()
