<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Read-data" data-toc-modified-id="Read-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Read data</a></span></li><li><span><a href="#Post-double-selection-control-variable-selection" data-toc-modified-id="Post-double-selection-control-variable-selection-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Post-double-selection control variable selection</a></span><ul class="toc-item"><li><span><a href="#Table:-description-of-eligible-control-vars" data-toc-modified-id="Table:-description-of-eligible-control-vars-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Table: description of eligible control vars</a></span></li><li><span><a href="#Table:-control-variables-for-ITT-PDS" data-toc-modified-id="Table:-control-variables-for-ITT-PDS-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Table: control variables for ITT PDS</a></span></li><li><span><a href="#Table:-control-variables-for-LATE-PDS" data-toc-modified-id="Table:-control-variables-for-LATE-PDS-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Table: control variables for LATE PDS</a></span></li></ul></li><li><span><a href="#ITT-coefficient-estimates" data-toc-modified-id="ITT-coefficient-estimates-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>ITT coefficient estimates</a></span><ul class="toc-item"><li><span><a href="#Table:-first-stage" data-toc-modified-id="Table:-first-stage-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Table: first stage</a></span></li><li><span><a href="#Table:-second-stage" data-toc-modified-id="Table:-second-stage-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Table: second stage</a></span></li><li><span><a href="#Spillover-table" data-toc-modified-id="Spillover-table-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Spillover table</a></span></li></ul></li></ul></div>

In [None]:
import re

import numpy as np
import pandas as pd
pd.options.display.max_columns = 50
pd.options.display.max_rows = 100
pd.options.display.max_colwidth = 100

## Read data

In [None]:
# Load pds_coeffs.csv, then report column dtypes and per-column null counts
# before previewing the first rows.
df = pd.read_csv('../data/generated/pds_coeffs.csv')
print(df.dtypes, df.isnull().sum(), sep='\n')
df.head()

In [None]:
# Load itt_coeffs.csv; print its schema and missing-value counts, then preview.
dff = pd.read_csv('../data/generated/itt_coeffs.csv')
print(dff.dtypes, dff.isnull().sum(), sep='\n')
dff.head()

In [None]:
# Load lates_coeffs.csv; print its schema and missing-value counts, then preview.
dflate = pd.read_csv('../data/generated/lates_coeffs.csv')
print(dflate.dtypes, dflate.isnull().sum(), sep='\n')
dflate.head()

In [None]:
# Load pds_iv_coeffs.csv; print its schema and missing-value counts, then preview.
dfiv = pd.read_csv('../data/generated/pds_iv_coeffs.csv')
print(dfiv.dtypes, dfiv.isnull().sum(), sep='\n')
dfiv.head()

## Post-double-selection control variable selection

### Table: description of eligible control vars

In [None]:
# Build and export the table describing every control variable that is
# eligible for selection.

# Publication-ready control name -> description. Insertion order is the row
# order of the exported table.
ctrl_dict = {
    'Midterm 1 score': 'Score on the first midterm',
    'Year = 2019': '1 if course taken in 2019, 0 otherwise',
    'Cumulative GPA': 'Cumulative GPA from prior term, 0 if not observed',
    'No cum. GPA': '1 if Cumulative GPA unobserved, 0 otherwise',
    'Math quiz score': 'Score on a quiz assessing prerequisite math skills',
    'PSET visits': 'Number of PSET visits as of the first midterm',
    'Videos watched': 'Number of unique videos watched as of the first midterm',
    'Hours videos': 'Hours of unique videos watched as of the first midterm',
    'Asian': '1 if ethnicity is Asian, 0 otherwise',
    'Latinx': '1 if ethnicity is Latinx, 0 otherwise',
    'White': '1 if ethnicity is White, 0 otherwise',
    'Female': '1 if female, 0 otherwise',
    'Transfer': '1 if transfer student, 0 otherwise'
}

# Derive the ordered variable list from the dict so the two cannot drift apart.
ctrls = list(ctrl_dict)

# Notes on ethnicity:
# Asian includes Chinese/Chinese-American, Vietnamese, East Indian/Pakistani,
#  Japanese/Japanese American, Korean/Korean American, all other Asian/Asian American
# Latinx includes Mexican/Mexican American, Chicano, and all other Spanish-American/Latino
# White includes White/Caucasian.
# Other ethnicity includes African American/Black, Pacific Islander, not given/declined to state

dfc = pd.DataFrame(data={'Variable': ctrls,
                         'Description': [ctrl_dict.get(x) for x in ctrls]})
display(dfc.head())

# Translate to tex. escape=False keeps LaTeX markup in the cells intact.
# The column format is a raw string: '\l' is not a valid Python escape and
# only worked by accident (it triggers a SyntaxWarning on recent Pythons).
caption = 'Description of control variables eligible for selection a la Belloni et al (2014).'
table0 = dfc.to_latex(index=False, escape=False, longtable=True,
                      label='controlvars_desc', caption=caption,
                      column_format=r'p{0.3\linewidth} p{0.6\linewidth}')

# Wrap the longtable in a single-spaced, centered environment.
table0 = '\\begin{spacing}{1.0}\\centering \n' + table0 + '\\end{spacing}'

# write to tex file
with open('../tex/tables/controlvars_desc.tex', 'w') as tf:
    tf.write(table0)

### Table: control variables for ITT PDS

In [None]:
# Reshape wide: one row per dependent variable, one controls column per model.
dfw = df[['depvar', 'model', 'ctrls']].set_index(['depvar', 'model']).unstack().reset_index()
dfw.columns = dfw.columns.droplevel(0)

# The backslash must be doubled: '\newline' in a normal string literal is a
# line feed followed by 'ewline', which corrupted these headers in the
# exported table.
# NOTE(review): the labels assume the two model columns come out of unstack()
# in the order (fixed effects, all observations) -- confirm against `model`.
dfw.columns = ['Dependent Variable', 'Controls,\\newline Fixed Effects', 'Controls,\\newline All Observations']

# Put "All Observations" before "Fixed Effects"; copy so later column
# assignments act on an independent frame.
dfw = dfw.iloc[:, [0, 2, 1]].copy()

# Keep the raw dependent-variable names; column 0 is relabelled later.
dfw['og'] = dfw.iloc[:, 0].copy()
dfw.head()

In [None]:
# Collect the sorted set of raw control names that appear in either controls
# column (positions 1 and 2 of dfw). Each cell is a space-separated list of
# names; missing cells stringify to 'nan' and are discarded.
# Using a set difference instead of list.remove() means the cell no longer
# raises ValueError when no controls cell is missing.
ctrls = sorted(
    {name
     for col in (2, 1)
     for cell in dfw.iloc[:, col].unique()
     for name in str(cell).split(' ')}
    - {'nan'}
)
ctrls

In [None]:
# Relabel variables with publication-ready names

# Raw variable name -> label used in the exported tables. Covers dependent
# variables and the selectable controls.
name_dict = {'attendance': 'Attendance',
             'duration_final': 'Hours videos by Final',
             'duration_final_u': 'Hours unique videos by Final',
             'duration_mid2': 'Hours videos by Mid. 2',
             'duration_mid2_u': 'Hours unique videos by Mid. 2',
             'duration_u_b': 'Hours unique videos, winter',
             'final_100b': 'Final exam score, winter',
             'finalscorestd': 'Final exam score',
             'gpa_econ_sans100a': 'Term GPA, econ courses ex. 100A',
             'gpa_letter': 'Term GPA',
             'gpa_letter_sans100a': 'Term GPA, ex. 100A',
             'gpa_letter_sansecon': 'Term GPA, ex. econ courses',
             'letter_option': 'Took 100A for letter grade',
             'mid1bscorestd': 'Midterm 1 score, 100B',
             'mid2bscorestd': 'Midterm 2 score, 100B',
             'mid2scorestd': 'Midterm 2 score',
             'nclass_letter': 'Num. classes taken for letter',
             'nclass_np': 'Num. classes not passed',
             'nclass_p': 'Num. classes passed',
             'nclass_pnp': 'Num. classes taken P/NP',
             'nclass_w': 'Num. classes withdrawn',
             'piazza_answers': 'Num. Piazza answers',
             'piazza_daysonline': 'Num. Piazza days online',
             'piazza_questions': 'Num. Piazza questions asked',
             'piazza_views': 'Num. Piazza views',
             'pset_post': 'Num. of PSET visits',
             'took100b': 'Took 100B for a letter grade',
             'units_letter': 'Num. units taken for letter grade',
             'units_pnp': 'Num. units taken P/NP',
             'units_w': 'Num. units withdrawn',
             'videos_final': 'Num. videos before Final',
             'videos_final_u': 'Num. unique videos before Final',
             'videos_mid2': 'Num. videos before Mid. 2',
             'videos_mid2_u': 'Num. unique videos before Mid. 2',
             'videos_u_b': 'Num. unique videos, winter',
             'winter_gpa_econ_sans100a': 'Term GPA, econ courses ex. 100B, winter',
             'winter_gpa_letter': 'Term GPA, winter',
             'winter_gpa_letter_sans100a': 'Term GPA, ex. 100B, winter',
             'winter_gpa_letter_sansecon': 'Term GPA, ex. econ courses, winter',
             'winter_nclass_letter': 'Num. classes taken for letter, winter',
             'winter_nclass_np': 'Num. classes not passed, winter',
             'winter_nclass_p': 'Num. classes passed, winter',
             'winter_nclass_pnp': 'Num. classes taken P/NP, winter',
             'winter_nclass_w': 'Num. classes withdrawn, winter',
             'winter_units_letter': 'Num. units taken for letter grade, winter',
             'winter_units_pnp': 'Num. units taken P/NP, winter',
             'winter_units_w': 'Num. units withdrawn, winter',
             'asian': 'Asian',
             'duration_mid1_u': 'Hours videos',
             'female': 'Female',
             'latx': 'Latinx',
             'mathquizstd': 'Math quiz score',
             'prev_cumgpa': 'Cumulative GPA',
             'prev_cumgpa_unobs': 'No cum. GPA',
             'pset_pre': 'PSET visits',
             'transfer': 'Transfer',
             'videos_mid1_u': 'Videos watched',
            }

# Relabel the first three columns (dependent variable plus both controls
# columns). Each cell is split on spaces, every token is mapped through
# name_dict (unknown tokens, including the 'nan' of an empty cell, become
# 'None'), the labels are sorted, and the result is joined with a LaTeX
# line break.
# The separator is '\\newline ': the previous '\newline ' was a line feed plus
# 'ewline ', so the exported cells contained the literal text "ewline".
# NOTE: this rewrites dfw in place, so the cell is not safe to run twice
# without re-running the reshape cell first.
for i in range(0, 3):
    dfw.iloc[:, i] = dfw.iloc[:, i].apply(
        lambda x: '\\newline '.join(
            sorted(name_dict.get(k) or 'None' for k in str(x).split(' '))))

In [None]:
# Assign every dependent variable to the paper table it is reported in.
#
# Table numbering:
#   1 - first stage
#   2 - second stage
#   3 - spillovers to study methods
#   4 - spillovers to grades
#   5 - spillovers to following quarter

table_dict = {
    'attendance': 3,
    'duration_final': 1,
    'duration_final_u': 1,
    'duration_mid2': 1,
    'duration_mid2_u': 1,
    'duration_u_b': 5,
    'final_100b': 5,
    'finalscorestd': 2,
    'gpa_econ_sans100a': 4,
    'gpa_letter': 4,
    'gpa_letter_sans100a': 4,
    'gpa_letter_sansecon': 4,
    'letter_option': 4,
    'mid1bscorestd': 5,
    'mid2bscorestd': 5,
    'mid2scorestd': 2,
    'nclass_letter': 4,
    'nclass_np': 4,
    'nclass_p': 4,
    'nclass_pnp': 4,
    'nclass_w': 4,
    'piazza_answers': 3,
    'piazza_daysonline': 3,
    'piazza_questions': 3,
    'piazza_views': 3,
    'pset_post': 3,
    'took100b': 5,
    'units_letter': 4,
    'units_pnp': 4,
    'units_w': 4,
    'videos_final': 1,
    'videos_final_u': 1,
    'videos_mid2': 1,
    'videos_mid2_u': 1,
    'videos_u_b': 5,
    'winter_gpa_econ_sans100a': 5,
    'winter_gpa_letter': 5,
    'winter_gpa_letter_sans100a': 5,
    'winter_gpa_letter_sansecon': 5,
    'winter_nclass_letter': 5,
    'winter_nclass_np': 5,
    'winter_nclass_p': 5,
    'winter_nclass_pnp': 5,
    'winter_nclass_w': 5,
    'winter_units_letter': 5,
    'winter_units_pnp': 5,
    'winter_units_w': 5,
}

# Label each row from its raw variable name ('og'); names missing from
# table_dict end up as 'Table None'. Then order rows by table and by the
# first column.
dfw['Table'] = [
    'Table {}'.format(str(table_dict.get(raw_name))) for raw_name in dfw.og
]
dfw = dfw.sort_values(['Table', dfw.columns[0]])

# Show each table label only on the first row of its group and move the
# label to the front so it reads like an index column.
dfw['Table'] = dfw['Table'].mask(dfw['Table'].duplicated(), '')
dfw = dfw.loc[:, ['Table'] + [col for col in dfw.columns if col != 'Table']]
dfw = dfw.reset_index(drop=True)

display(dfw.head())

In [None]:
# TODO: get subset of models in paper

# Export the ITT control-variable table to LaTeX.

# Rows to export (currently every dependent variable). The trailing 'og'
# helper column holding the raw variable names is dropped.
table1vars = dfw.og.unique()  # change this
table1 = dfw.loc[dfw.og.isin(table1vars), dfw.columns[:-1]]

display(table1.head())

# Convert to latex. Raw strings keep the LaTeX backslashes literal; sequences
# such as '\l' and '\h' are invalid Python escapes that only worked by accident.
table1 = table1.to_latex(index=False, escape=False, longtable=True,
                         column_format=r'p{0.1\linewidth} >{\hangindent=1em}p{0.5\linewidth} '
                                       r'p{0.2\linewidth} p{0.2\linewidth}')

# Replace the first \toprule with a dedicated first-page header: caption and
# label, the column titles, \endfirsthead, then a "(continued)" banner that
# opens the header pandas generated (which now serves continuation pages).
caption = ('Control variables selected via the post-double-selection procedure of \\textcite{bch2014a} '
           'in all ITT models. '
           'In the \\textit{All Observations} model, \\textit{Midterm 1 score} and \\textit{Year = 2019} are '
           'additionally included as controls. In the \\textit{Fixed Effects} model, pair fixed effects are included. '
           'All control variables are measured before the start of the experiment, e.g. \\textit{Hours videos} is '
           'the hours of videos watched as of the first midterm.')
appendstring = ('\\caption{' + caption + '} \\label{controlvars_selected_itt}\\\\ \n\\toprule \n'
                'Table & Dependent Variable & Controls,\\newline All Observations & '
                'Controls,\\newline Fixed Effects \\\\ \n'
                '\\midrule \n\\endfirsthead \n'
                '\\multicolumn{4}{r}{{Table \\ref{controlvars_selected_itt} (continued)}} \\\\ \n'
                '\\toprule')
table1 = table1.replace('\\toprule', appendstring, 1)

# Wrap in a single-spaced, centered environment.
table1 = '\\begin{spacing}{1.0}\\centering \n' + table1 + '\\end{spacing}'

# Add a horizontal rule before the first row of Tables 2-5 (first occurrence
# of each label only).
for i in range(2, 6):
    table1 = table1.replace('Table {}'.format(str(i)),
                            '\\midrule \n Table {}'.format(str(i)), 1)

# write to tex file
with open('../tex/tables/controlvars_selected_itt.tex', 'w') as tf:
    tf.write(table1)

### Table: control variables for LATE PDS

In [None]:
# Reshape the IV post-double-selection results wide for the LATE controls table.
dfw = dfiv.copy()

# The last character of `model` identifies the instrumented regressor
# ('d' or 'v', expanded below); strip it and the preceding separator
# character to recover the model name.
dfw['endog'] = dfw.model.apply(lambda x: x[-1])
dfw['model'] = dfw.model.apply(lambda x: x[:-2])

# One row per (dependent variable, instrumented regressor), one controls
# column per model.
dfw = dfw[['depvar', 'model', 'endog', 'ctrls']].set_index(['depvar', 'endog', 'model']).unstack().reset_index()
dfw.columns = dfw.columns.droplevel(0)
# '\\newline' (not '\newline', which is a line feed plus 'ewline') so the
# header carries a real LaTeX line break.
# NOTE(review): the labels assume unstack() orders the model columns as
# (fixed effects, all observations) -- confirm against `model`.
dfw.columns = ['Dependent Variable', 'Instrumented', 'Controls,\\newline Fixed Effects', 'Controls,\\newline All Observations']

# Expand the instrument codes to raw variable names. Assign the result back
# instead of calling replace(inplace=True) on an attribute-accessed column,
# which is not guaranteed to modify dfw.
dfw['Instrumented'] = dfw['Instrumented'].replace({'d': 'duration_mid1_u', 'v': 'videos_mid1_u'})

# Order columns (All Observations before Fixed Effects) and keep the raw
# dependent-variable names for later filtering.
dfw = dfw.iloc[:, [0, 1, 3, 2]].copy()
dfw['og'] = dfw.iloc[:, 0].copy()

# Replace raw names with publication labels in the four display columns:
# split on spaces, map through name_dict (unknown tokens become 'None'), sort,
# and join with a LaTeX line break.
for i in range(0, 4):
    dfw.iloc[:, i] = dfw.iloc[:, i].apply(
        lambda x: '\\newline '.join(
            sorted(name_dict.get(k) or 'None' for k in str(x).split(' '))))

# Sort by publication label: dependent variable descending, instrument
# ascending. `ascending` takes real booleans; the previous string values
# 'False'/'True' were both truthy and are rejected by newer pandas.
dfw = dfw.sort_values(['Dependent Variable', 'Instrumented'], ascending=[False, True])

dfw.head()

In [None]:
# Export the LATE control-variable table to LaTeX.

# Rows to export (currently every dependent variable). The trailing 'og'
# helper column holding the raw variable names is dropped.
tablevars = dfw.og.unique()  # change this
table = dfw.loc[dfw.og.isin(tablevars), dfw.columns[:-1]]

display(table.head())

# Convert to latex. Raw strings keep the LaTeX backslashes literal ('\l' is
# an invalid Python escape that only worked by accident).
table = table.to_latex(index=False, escape=False, longtable=True,
                       column_format=r'p{.25\linewidth} p{0.2\linewidth} '
                                     r'p{0.2\linewidth} p{0.2\linewidth}')

# Replace the first \toprule with a dedicated first-page header: caption and
# label, the column titles, \endfirsthead, then a "(continued)" banner that
# opens the header pandas generated (which now serves continuation pages).
caption = ('Control variables selected via the post-double-selection procedure of \\textcite{bch2014a} '
           'in all LATE models. '
           'In the \\textit{All Observations} model, \\textit{Midterm 1 score} and \\textit{Year = 2019} are '
           'additionally included as controls. In the \\textit{Fixed Effects} model, pair fixed effects are included. '
           'All control variables are measured before the start of the experiment, e.g. \\textit{Hours videos} is '
           'the hours of videos watched as of the first midterm.')
appendstring = ('\\caption{' + caption + '} \\label{controlvars_selected_iv}\\\\ \n\\toprule \n'
                'Dependent Variable & Instrumented & Controls,\\newline All Observations & '
                'Controls,\\newline Fixed Effects \\\\ \n'
                '\\midrule \n\\endfirsthead \n'
                '\\multicolumn{4}{r}{{Table \\ref{controlvars_selected_iv} (continued)}} \\\\ \n'
                '\\toprule')
table = table.replace('\\toprule', appendstring, 1)

# Wrap in a single-spaced, centered environment.
table = '\\begin{spacing}{1.0}\\centering \n' + table + '\\end{spacing}'

# write to tex file
with open('../tex/tables/controlvars_selected_iv.tex', 'w') as tf:
    tf.write(table)

## ITT coefficient estimates

In [None]:
def sig_digs(num, n=3):
    """
    Format 'num' as a string rounded to 'n' significant digits.

    Parameters
    ----------
    num : float
        Value to format. Intended for floats.
    n : int, default 3
        Number of significant digits to keep; must be > 0.

    Returns
    -------
    str
        An integer-looking string when all 'n' significant digits sit left of
        the decimal point (e.g. 31415.9, n=2 -> '31000'), otherwise a
        fixed-point string with exactly enough decimals (e.g. pi, n=3 ->
        '3.14'). Zero is returned as '0.0'; NaN and infinities are returned
        as str(num).

    Raises
    ------
    AssertionError
        If n <= 0.
    """
    assert n > 0, 'n must be > 0'
    if num == 0:
        return str('0.0')
    # log10 is undefined for NaN/inf; pass them through rather than crashing.
    if not np.isfinite(num):
        return str(num)
    # Number of digits left of the decimal point (<= 0 for |num| < 1).
    # floor(...) + 1 rather than ceil(...): the two differ only at exact powers
    # of ten, where ceil under-counts by one (10.0 -> 1) and an extra digit
    # would be printed.
    power = int(np.floor(np.log10(abs(num)))) + 1
    # Rounding can carry into the next decade (9.999 -> 10.0), which adds a
    # digit left of the decimal point.
    if abs(round(num, n - power)) >= 10.0 ** power:
        power += 1
    # return integer if number does not have digits behind decimal
    if power >= n:
        return str(int(round(num, n - power)))
    # return float if number has digits behind decimal
    formatter = '{:.' + str(n - power) + 'f}'
    return formatter.format(num)

# Smoke-test sig_digs across magnitudes and digit counts; one result per line.
print(*(sig_digs(value, digits) for value, digits in [
    (np.pi, 3),
    (np.pi * 10000, 2),
    (np.pi * 10, 2),
    (np.pi * 100, 5),
    (np.pi / 10, 1),
    (np.pi / 100, 4),
    (2.0, 3),
]), sep='\n')

In [None]:
# Shape and format the ITT estimates (two decimals) for the first-stage table.

# Stack the PDS estimates (minus their controls column) on the plain ITT
# estimates. drop(columns=...) replaces the positional axis argument
# drop('ctrls', 1), which pandas 2.0 no longer accepts.
dfb = pd.concat([df.drop(columns='ctrls'), dff])
# The numeric prefixes make the statistics sort in display order.
dfb = dfb.rename(columns={'treatbeta': '1_beta',
                          'stderr': '2_stderr',
                          'meanctrl': '3_mean',
                          'N': '4_N'})

# Significance stars: 1/2/3 stars when |beta| exceeds 1.645/1.96/2.576
# standard errors. Later assignments overwrite earlier ones, so the largest
# threshold passed wins.
dfb['stars'] = 0
dfb.loc[(abs(dfb['1_beta']) - dfb['2_stderr'] * 1.645) > 0, 'stars'] = 1
dfb.loc[(abs(dfb['1_beta']) - dfb['2_stderr'] * 1.96) > 0, 'stars'] = 2
dfb.loc[(abs(dfb['1_beta']) - dfb['2_stderr'] * 2.576) > 0, 'stars'] = 3

# stringify stats
dfb['1_beta'] = dfb['1_beta'].apply(lambda x: '{:.2f}'.format(x))
dfb['1_beta'] = dfb['1_beta'] + dfb.stars.apply(lambda x: '*'*x)
dfb['2_stderr'] = dfb['2_stderr'].apply(lambda x: '({:.2f})'.format(x))
dfb['3_mean'] = dfb['3_mean'].apply(lambda x: '{:.2f}'.format(x))
dfb['4_N'] = dfb['4_N'].apply(lambda x: str(int(x)))
dfb = dfb.drop(columns=['stars'])

# reshape long (stack beta, stderr, mean, N)
dfb = dfb.melt(id_vars=['depvar', 'model']).sort_values(['depvar', 'model', 'variable'])
dfb = dfb.rename(columns={'variable': 'stat'})

# reshape wide (side by side models)
dfb = dfb.pivot(index=['depvar', 'stat'], columns='model').reset_index()
dfb.columns = dfb.columns.droplevel(0)
# NOTE(review): these labels are positional and assume the four model columns
# come out of pivot() in exactly this order -- confirm against `model`.
dfb.columns = ['depvar', 'stat', 'FEs', 'Neyman', 'ITT', 'All']
dfb = dfb[['depvar', 'stat', 'ITT', 'Neyman', 'All', 'FEs']]

# Keep the raw names in 'og' and show publication labels in 'depvar'.
dfb['og'] = dfb.depvar.copy()
dfb['depvar'] = dfb.depvar.apply(lambda x: name_dict.get(x))

# Get one mean per depvar: take the ITT model's control mean and attach it to
# every row of that dependent variable.
means = dfb.loc[dfb.stat == '3_mean', ['depvar', 'ITT']]
means.columns = ['depvar', 'ctrlmean']
dfb = means.merge(dfb, on='depvar', how='inner')

dfb.head()

### Table: first stage

In [None]:
# create table for first stage estimates only

# keep only first stage vars (table 1 in table_dict)
table1_vars = [x for x in table_dict if table_dict.get(x) == 1]
dfsub = dfb.loc[dfb.og.isin(table1_vars)].copy()

# keep only one mean per depvar (drop the mean rows, keep the ctrlmean col)
dfsub = dfsub.loc[dfsub.stat != '3_mean'].copy()

# add exam label (to split by): outcomes measured at the final vs. midterm 2
dfsub['exam'] = 'Midterm 2'
dfsub.loc[dfsub.og.isin(['duration_final', 'duration_final_u',
                         'videos_final', 'videos_final_u']), 'exam'] = 'Final'
# Descending exam puts 'Midterm 2' rows before 'Final' rows.
dfsub.sort_values(['exam', 'depvar', 'stat'], ascending=[False, False, True], inplace=True)

# drop intermediate N within each exam: keep only the last row of each exam
# group as that panel's single "Observations" row
keepidx = dfsub.groupby('exam', as_index=False).stat.nth(-1).index
dfsub = dfsub.loc[(dfsub.stat != '4_N') | (dfsub.index.isin(keepidx))]
dfsub.loc[dfsub.stat == '4_N', 'depvar'] = 'Observations'

# add indents, remove repeats of depvar (label only on the first row of each
# variable), then swap in the shorter panel labels
dfsub.loc[dfsub.depvar != 'Observations', 'depvar'] = dfsub['depvar'].mask(dfsub['depvar'].duplicated(),'')
dfsub.loc[dfsub.og.isin(['videos_mid2', 'videos_final']) & ~dfsub.depvar.isin(['Observations', '']), 'depvar'] = 'Videos'
dfsub.loc[dfsub.og.isin(['videos_mid2_u', 'videos_final_u']) & ~dfsub.depvar.isin(['Observations', '']), 'depvar'] = 'Unique videos'
dfsub.loc[dfsub.og.isin(['duration_mid2', 'duration_final']) & ~dfsub.depvar.isin(['Observations', '']), 'depvar'] = 'Hours of videos'
dfsub.loc[dfsub.og.isin(['duration_mid2_u', 'duration_final_u']) & ~dfsub.depvar.isin(['Observations', '']), 'depvar'] = 'Hours of unique videos'
dfsub.loc[~dfsub.depvar.isin(['Observations', '']), 'depvar'] = dfsub.depvar.apply(lambda x: '\\indentrow{' + x + '} ')

# Show the control mean only on each variable's coefficient row. Selecting by
# stat instead of masking duplicated values keeps the mean when two outcomes
# happen to share the same rounded control mean.
dfsub['ctrlmean'] = dfsub.ctrlmean.where(dfsub.stat == '1_beta', '')
# remove unnecessary cols; drop(columns=...) replaces the positional axis
# argument, which pandas 2.0 no longer accepts
dfsub = dfsub.drop(columns=['stat', 'og', 'exam'])

# append bottom of table info
dfsub.reset_index(drop=True, inplace=True)
dfsub.loc[len(dfsub), :] = ['Treatment assignment controls', '', 'Yes', 'No', 'Yes', 'Yes']
dfsub.loc[len(dfsub), :] = ['Demographic controls', '', 'No', 'No', 'Yes', 'Yes']
dfsub.loc[len(dfsub), :] = ['Pair Fixed Effects', '', 'No', 'No', 'No', 'Yes']

# rename cols
dfsub.rename(columns={'depvar': '', 'ctrlmean': 'Control Mean', 'ITT': '(1)', 'Neyman': '(2)',
                      'All': '(3)', 'FEs': '(4)'}, inplace=True)

display(dfsub)

In [None]:
# Translate the first-stage table to tex and write it to disk.

table = dfsub.to_latex(index=False, escape=False, longtable=False,
                       column_format='m{0.35\\linewidth} *{5}{>{\\centering\\arraybackslash}m{0.1\\linewidth}}')

# Change table to three-part-table and adjust spacing.
# \label comes after \caption: a label placed before the caption attaches to
# whatever counter was stepped last, so \ref would not give the table number.
addendum = ('\\begin{spacing}{1.0} \n \\def\\sym#1{\\ifmmode^{#1}\\else\\(^{#1}\\)\\fi} \n'
            '\\begin{table} \\centering \n'
            ' \\caption{Effects of Grade Incentive on Video Watching} \\label{firststage_table} \n'
            '\\begin{threeparttable} \n')
table = addendum + table

# add notes to bottom of table
note = ('This table reports coefficients on $Incentive_i$ from Equations \\ref{itt_spec} '
        'and TBD. Model (1) contains linear controls midterm 1 score and year; (2) is '
        'the difference in means and standard errors calculated using the repeated sampling '
        'framework of Neyman (1923); (3) and (4) use the post-double-selection (PDS) procedure of '
        '\\textcite{bch2014a} to select control variables then estimate treatment effects and '
        'standard errors. The control variables selected using PDS are listed in Table '
        '\\ref{controlvars_selected_itt}. Models (2) and (4) contain only students whose matched-'
        'pair did not attrite from the experiment. ')
# TODO: optionally add a sentence saying the video outcomes comprise
# course-relevant videos and that, if the same video is watched twice, the
# longest duration is recorded.
note += ('\\textit{Control Mean} is the mean for the '
         'Control students included in models (1) and (3), which is nearly identical '
         'to the mean for the Control students included in models (2) and (4). ')
note = '\\Fignote{' + note + ' \\Regnote} \n \\end{threeparttable} \n \\end{table} \n \\end{spacing}'
table = table + note

# insert \sym{} around stars (longest run first so '***' is not split)
table = table.replace('*** ', '\\sym{***} ')
table = table.replace('** ', '\\sym{**} ')
table = table.replace('* ', '\\sym{*} ')

# add space between variables and table desc
# first, observations
table = table.replace('Observations', '\\customlinespace Observations')
# second, get the position of every indented variable row
idx = [m.start() for m in re.finditer(re.escape('\\indentrow{'), table)]
# The slicing below hard-codes two panels of four variables each.
assert len(idx) == 8, 'expected 8 \\indentrow rows (4 per panel), found {}'.format(len(idx))
# third, add to table in reverse order so the earlier offsets stay valid
for i in idx[:-4:-1]:
    table = table[:i] + '\\customlinespace ' + table[i:]
insert = '\\midrule \n \\multicolumn{6}{l}{\\textbf{Panel B}: By Final Exam} \\\\ \n'
table = table[:idx[4]] + insert + table[idx[4]:]
for i in idx[3:0:-1]:
    table = table[:i] + '\\customlinespace ' + table[i:]
insert = '\\multicolumn{6}{l}{\\textbf{Panel A}: By Midterm 2} \\\\ \n'
table = table[:idx[0]] + insert + table[idx[0]:]

# add \midrule before bottom of table yes/nos
pos = table.find('Treatment assignment controls')
table = table[:pos] + '\\midrule \n ' + table[pos:]

# write to tex file
with open('../tex/tables/firststage.tex', 'w') as tf:
    tf.write(table)

### Table: second stage

In [None]:
# Redo dfb with three-decimal rounding for the second-stage table.

# Stack the PDS estimates (minus their controls column) on the plain ITT
# estimates. drop(columns=...) replaces the positional axis argument
# drop('ctrls', 1), which pandas 2.0 no longer accepts.
dfb = pd.concat([df.drop(columns='ctrls'), dff])
# The numeric prefixes make the statistics sort in display order.
dfb = dfb.rename(columns={'treatbeta': '1_beta',
                          'stderr': '2_stderr',
                          'meanctrl': '3_mean',
                          'N': '4_N'})

# Significance stars: 1/2/3 stars when |beta| exceeds 1.645/1.96/2.576
# standard errors. Later assignments overwrite earlier ones, so the largest
# threshold passed wins.
dfb['stars'] = 0
dfb.loc[(abs(dfb['1_beta']) - dfb['2_stderr'] * 1.645) > 0, 'stars'] = 1
dfb.loc[(abs(dfb['1_beta']) - dfb['2_stderr'] * 1.96) > 0, 'stars'] = 2
dfb.loc[(abs(dfb['1_beta']) - dfb['2_stderr'] * 2.576) > 0, 'stars'] = 3

# stringify stats
dfb['1_beta'] = dfb['1_beta'].apply(lambda x: '{:.3f}'.format(x))
dfb['1_beta'] = dfb['1_beta'] + dfb.stars.apply(lambda x: '*'*x)
dfb['2_stderr'] = dfb['2_stderr'].apply(lambda x: '({:.3f})'.format(x))
dfb['3_mean'] = dfb['3_mean'].apply(lambda x: '{:.3f}'.format(x))
dfb['4_N'] = dfb['4_N'].apply(lambda x: str(int(x)))
dfb = dfb.drop(columns=['stars'])

# reshape long (stack beta, stderr, mean, N)
dfb = dfb.melt(id_vars=['depvar', 'model']).sort_values(['depvar', 'model', 'variable'])
dfb = dfb.rename(columns={'variable': 'stat'})

# reshape wide (side by side models)
dfb = dfb.pivot(index=['depvar', 'stat'], columns='model').reset_index()
dfb.columns = dfb.columns.droplevel(0)
# NOTE(review): these labels are positional and assume the four model columns
# come out of pivot() in exactly this order -- confirm against `model`.
dfb.columns = ['depvar', 'stat', 'FEs', 'Neyman', 'ITT', 'All']
dfb = dfb[['depvar', 'stat', 'ITT', 'Neyman', 'All', 'FEs']]

# Keep the raw names in 'og' and show publication labels in 'depvar'.
dfb['og'] = dfb.depvar.copy()
dfb['depvar'] = dfb.depvar.apply(lambda x: name_dict.get(x))

# Get one mean per depvar: take the ITT model's control mean and attach it to
# every row of that dependent variable.
means = dfb.loc[dfb.stat == '3_mean', ['depvar', 'ITT']]
means.columns = ['depvar', 'ctrlmean']
dfb = means.merge(dfb, on='depvar', how='inner')

dfb.head()

In [None]:
# Shape and format the LATE estimates (three decimals).

# Stack the PDS-IV estimates (minus their controls column) on the plain LATE
# estimates. drop(columns=...) replaces the positional axis argument
# drop('ctrls', 1), which pandas 2.0 no longer accepts.
dfl = pd.concat([dfiv.drop(columns='ctrls'), dflate])
# The numeric prefixes make the statistics sort in display order.
dfl = dfl.rename(columns={'treatbeta': '1_beta',
                          'stderr': '2_stderr',
                          'meanctrl': '3_mean',
                          'N': '4_N'})

# Significance stars: 1/2/3 stars when |beta| exceeds 1.645/1.96/2.576
# standard errors. Later assignments overwrite earlier ones, so the largest
# threshold passed wins.
dfl['stars'] = 0
dfl.loc[(abs(dfl['1_beta']) - dfl['2_stderr'] * 1.645) > 0, 'stars'] = 1
dfl.loc[(abs(dfl['1_beta']) - dfl['2_stderr'] * 1.96) > 0, 'stars'] = 2
dfl.loc[(abs(dfl['1_beta']) - dfl['2_stderr'] * 2.576) > 0, 'stars'] = 3

# stringify stats
dfl['1_beta'] = dfl['1_beta'].apply(lambda x: '{:.3f}'.format(x)) + dfl.stars.apply(lambda x: '*'*x)
dfl['2_stderr'] = dfl['2_stderr'].apply(lambda x: '({:.3f})'.format(x))
dfl['3_mean'] = dfl['3_mean'].apply(lambda x: '{:.3f}'.format(x))
dfl['4_N'] = dfl['4_N'].apply(lambda x: str(int(x)))
dfl = dfl.drop(columns='stars')

# Get instrumented var and models: the last character of `model` is the
# instrument code; dropping the last two characters leaves the model name.
dfl['Instrumented'] = dfl.model.apply(lambda x: x[-1])
dfl['model'] = dfl.model.apply(lambda x: x[:-2])

# reshape long (stack beta, stderr, mean, N)
dfl = dfl.melt(id_vars=['depvar', 'model', 'Instrumented']).\
    sort_values(['depvar', 'Instrumented', 'model', 'variable'], ascending=[False, True, True, True])
dfl = dfl.rename(columns={'variable': 'stat'})

# reshape wide (side by side models)
dfl = dfl.pivot(index=['depvar', 'Instrumented', 'stat'], columns='model').reset_index()
dfl.columns = dfl.columns.droplevel(0)
# NOTE(review): these labels are positional and assume the four model columns
# come out of pivot() in exactly this order -- confirm against `model`.
dfl.columns = ['depvar', 'Instrumented', 'stat', 'Neyman', 'All', 'FEs', 'ITT']
dfl = dfl[['depvar', 'Instrumented', 'stat', 'ITT', 'Neyman', 'All', 'FEs']]

# Keep the raw names in 'og' and show publication labels in 'depvar'.
dfl['og'] = dfl.depvar.copy()
dfl['depvar'] = dfl.depvar.apply(lambda x: name_dict.get(x))

dfl.head()

In [None]:
# Stack the LATE estimates with dfb (ITT estimates) for the second-stage table.

# keep only second stage vars (table 2 in table_dict)
table2_vars = [x for x in table_dict if table_dict.get(x) == 2]
dfsub = dfb.loc[dfb.og.isin(table2_vars)].copy()

# add 'instrumented' col (for stacking), drop ctrlmean.
# 'z' marks the reduced-form rows (see the relabelling below).
dfsub.insert(1, 'Instrumented', 'z')
dfsub = dfsub.drop(columns='ctrlmean')

# Combine the two dfs and sort, drop mean. ignore_index gives every row a
# unique label; the label-based isin() and aligned assignments below would
# otherwise depend on the two frames not sharing any index values.
dfsub = pd.concat([dfsub, dfl], ignore_index=True)
dfsub = dfsub.loc[dfsub.stat != '3_mean']
dfsub = dfsub.sort_values(['depvar', 'Instrumented', 'stat'],
                          ascending=[False, False, True])

# drop intermediate N within each outcome: keep only the last row of each
# dependent variable as its "Observations" row
keepidx = dfsub.groupby('og', as_index=False).stat.nth(-1).index
dfsub = dfsub.loc[(dfsub.stat != '4_N') | (dfsub.index.isin(keepidx))].copy()
dfsub.loc[dfsub.stat == '4_N', 'Instrumented'] = 'Observations'

# Name instruments. Assign the result back instead of calling
# replace(inplace=True) on an attribute-accessed column, which is not
# guaranteed to modify dfsub.
dfsub['Instrumented'] = dfsub['Instrumented'].replace(
    {'z': 'RF: Incentive', 'v': '2SLS: 10 Videos', 'd': '2SLS: 1 Hour of Videos'})
dfsub.loc[dfsub.stat != '4_N', 'Instrumented'] = dfsub.Instrumented.apply(lambda x: '\\indentrow{' + x + '} ')

# remove Instrumented duplicates within each outcome (label only the first
# row of each estimator) and unnecessary cols
for v in dfsub.og.unique():
    dfsub.loc[dfsub.og == v, 'Instrumented'] = dfsub.loc[dfsub.og == v].\
        Instrumented.mask(dfsub.loc[dfsub.og == v].Instrumented.duplicated(),'')
# drop(columns=...) replaces the positional axis argument, which pandas 2.0
# no longer accepts
dfsub = dfsub.drop(columns=['depvar', 'stat', 'og'])

# rename cols
dfsub = dfsub.rename(columns={'Instrumented': '', 'ITT': '(1)', 'Neyman': '(2)',
                              'All': '(3)', 'FEs': '(4)'})

# append bottom of table info
dfsub.reset_index(drop=True, inplace=True)
dfsub.loc[len(dfsub), :] = ['Treatment assignment controls', 'Yes', 'No', 'Yes', 'Yes']
dfsub.loc[len(dfsub), :] = ['Demographic controls', 'No', 'No', 'Yes', 'Yes']
dfsub.loc[len(dfsub), :] = ['Pair Fixed Effects', 'No', 'No', 'No', 'Yes']

dfsub

In [None]:
# Translate the second-stage table to tex and write it to disk.

table = dfsub.to_latex(index=False, escape=False, longtable=False,
                       column_format='m{0.35\\linewidth} *{4}{>{\\centering\\arraybackslash}m{0.1\\linewidth}}')

# Change table to three-part-table and adjust spacing.
# \label comes after \caption: a label placed before the caption attaches to
# whatever counter was stepped last, so \ref would not give the table number.
addendum = ('\\begin{spacing}{1.0} \n \\def\\sym#1{\\ifmmode^{#1}\\else\\(^{#1}\\)\\fi} \n'
            '\\begin{table} \\centering \n'
            ' \\caption{Effect of Videos on Grades} \\label{secondstage_table} \n'
            '\\begin{threeparttable} \n')
table = addendum + table

# add notes to bottom of table
note = ('This table reports coefficients on $Incentive_i$ from Equation \\ref{late_spec} and '
        '$Video_i$ from Equation TBD. Test scores are measured in standard deviation units. '
        'Model (1) contains linear controls midterm 1 score and year; (2) is '
        'the difference in means and standard errors calculated using the repeated sampling '
        'framework of Neyman (1923); (3) and (4) use the post-double-selection (PDS) procedure of '
        '\\textcite{bch2014a} to select control variables then estimate treatment effects and '
        'standard errors. The control variables selected using PDS are listed in Table '
        '\\ref{controlvars_selected_itt}. Models (2) and (4) contain only students whose matched-'
        'pair did not attrite from the experiment. ')
note = '\\Fignote{' + note + ' \\Regnote} \n \\end{threeparttable} \n \\end{table} \n \\end{spacing}'
table = table + note

# insert \sym{} around stars (longest run first so '***' is not split)
table = table.replace('*** ', '\\sym{***} ')
table = table.replace('** ', '\\sym{**} ')
table = table.replace('* ', '\\sym{*} ')

# add space between variables and table desc
# first, observations
table = table.replace('Observations', '\\customlinespace Observations')
# second, get the position of every indented estimator row
idx = [m.start() for m in re.finditer(re.escape('\\indentrow{'), table)]
# The slicing below hard-codes two panels of three estimators each.
assert len(idx) == 6, 'expected 6 \\indentrow rows (3 per panel), found {}'.format(len(idx))
# third, add to table in reverse order so the earlier offsets stay valid
for i in idx[:-3:-1]:
    table = table[:i] + '\\customlinespace ' + table[i:]
insert = '\\midrule \n \\multicolumn{5}{l}{\\textbf{Panel B}: Final Exam Score} \\\\ \n'
table = table[:idx[3]] + insert + table[idx[3]:]
for i in idx[2:0:-1]:
    table = table[:i] + '\\customlinespace ' + table[i:]
insert = '\\multicolumn{5}{l}{\\textbf{Panel A}: Midterm 2 Score} \\\\ \n'
table = table[:idx[0]] + insert + table[idx[0]:]

# add \midrule before bottom of table yes/nos
pos = table.find('Treatment assignment controls')
table = table[:pos] + '\\midrule \n ' + table[pos:]

# write to tex file
with open('../tex/tables/secondstage.tex', 'w') as tf:
    tf.write(table)

### Spillover table

In [None]:
# todo next