# View this notebook here:
https://nbviewer.jupyter.org/github/zagoodman/microeconomics_videos/blob/master/jupyter/assemble_data.ipynb

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Prep" data-toc-modified-id="Prep-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Prep</a></span></li><li><span><a href="#Merge-anonymized-data" data-toc-modified-id="Merge-anonymized-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Merge anonymized data</a></span><ul class="toc-item"><li><span><a href="#Demographic-data" data-toc-modified-id="Demographic-data-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Demographic data</a></span><ul class="toc-item"><li><span><a href="#Concurrent-gpa/units" data-toc-modified-id="Concurrent-gpa/units-2.1.1"><span class="toc-item-num">2.1.1&nbsp;&nbsp;</span>Concurrent gpa/units</a></span></li><li><span><a href="#Next-quarter-gpa/units" data-toc-modified-id="Next-quarter-gpa/units-2.1.2"><span class="toc-item-num">2.1.2&nbsp;&nbsp;</span>Next quarter gpa/units</a></span></li></ul></li><li><span><a href="#Treatment-and-outcome-data" data-toc-modified-id="Treatment-and-outcome-data-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Treatment and outcome data</a></span><ul class="toc-item"><li><span><a href="#Student-level" data-toc-modified-id="Student-level-2.2.1"><span class="toc-item-num">2.2.1&nbsp;&nbsp;</span>Student level</a></span></li></ul></li></ul></li></ul></div>

This notebook takes the anonymized data from the T+LC, cleans them, and writes analysis-ready CSV files to `../data/generated/`. All code in this notebook is written in Python 3.

## Prep

In [1]:
import pandas as pd
import numpy as np
# Raise pandas' display limits to 100 columns and 100 rows.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 100)

In [2]:
import pandas_flavor as pf

@pf.register_dataframe_accessor('z')
class MyFlavor(object):
    """Custom DataFrame accessor, registered under the name ``z`` (``df.z``)."""

    def __init__(self, df):
        # Keep a handle on the frame the accessor was invoked on.
        self._df = df

    def get_unique_students(self, var1 = "id", var2 = "year"):
        """Print and return the number of distinct (var1, var2) combinations."""
        key_pairs = self._df[[var1, var2]].drop_duplicates()
        n_unique = len(key_pairs)
        print("N unique students: {}".format(n_unique))
        return n_unique

    def get_vars(self):
        """Return only the columns that are keys of ``tlcvarnames``, renamed
        to the short names that mapping assigns them."""
        frame = self._df
        # Iterating a DataFrame yields its column labels, in order.
        known_cols = [col for col in frame if col in tlcvarnames]
        return frame[known_cols].rename(columns = tlcvarnames)
    
# Maps raw column labels to the short names used in the analysis frames.
# MyFlavor.get_vars() keeps only columns whose label is a key here and renames
# them to the corresponding value. Lower-case keys match the lower-cased CSV
# headers; the mixed-case keys match the measure names that become column
# labels after the long-to-wide pivot.
# Each key appears exactly once: a repeated key in a dict literal is silently
# overwritten by its last occurrence.
tlcvarnames = {'id': 'id',
               'year': 'year',
               'de id': 'id',
               'previous_cum_gpa': 'prev_cumgpa',
               'term_code_econ': 'term',
               'term_precoursegpa': 'term_pregpa',
               'apct_type_desc': 'apptype',
               'deid': 'id',
               'ethnicity_grouped': 'ethnicity',
               'gender': 'gender',
               'year_econ': 'year',
               'measure names': 'measure',
               'term code courses': 'term',
               'term code econ 100a': 'termecon',
               'year - econ 100a': 'year',
               'measure values': 'values',
               'Class Units - Letter Grade': 'units_letter',
               'Class Units - P/NP': 'units_pnp',
               'Class Units - Withdrawn': 'units_w',
               'GPA - Classes Letter Grade': 'gpa_letter',
               'GPA - Classes Letter Grade - No Econ': 'gpa_letter_sansecon',
               'GPA - Classes Letter Grade - No Econ 100A': 'gpa_letter_sans100a',
               'GPA - Classes Letter Grade - Only Econ - No Econ 100A': 'gpa_econ_sans100a',
               'N Classes - Letter Grade': 'nclass_letter',
               'N Classes - Not Passed': 'nclass_np',
               'N Classes - P/NP': 'nclass_pnp',
               'N Classes - Passed': 'nclass_p',
               'N Classes - Withdrawn': 'nclass_w'
              }

## Merge anonymized data

### Demographic data

In [3]:
# pre 100A (term before) cumulative gpa. Missing for all entering freshmen or first time UCSD students

# Stack the FA18 and FA19 extracts row-wise. `axis` is passed by keyword:
# pandas 2.0 made every concat argument except `objs` keyword-only, so the
# former positional `0` raises a TypeError there.
dfd = pd.concat(
    [
        pd.read_csv("../data/raw/Econ-Goodman-Su20-Pre Course GPA-FA18.csv"),
        pd.read_csv("../data/raw/Econ-Goodman-Su20-Pre Course GPA-FA19.csv"),
    ],
    axis=0,
)
# lower-case the headers so they match the keys of tlcvarnames
dfd.columns = [x.lower() for x in dfd.columns]
dfd = dfd.z.get_vars()
# set the year from the term code
dfd.loc[dfd.term == 'FA18', 'year'] = 2018
dfd.loc[dfd.term == 'FA19', 'year'] = 2019
# reorder and drop term
dfd = dfd[['id', 'year', 'term_pregpa', 'prev_cumgpa']]

print(len(dfd))
dfd.z.get_unique_students(var2='id')
display(dfd.isnull().sum())
dfd.head()

596
N unique students: 596


id             0
year           0
term_pregpa    0
prev_cumgpa    0
dtype: int64

Unnamed: 0,id,year,term_pregpa,prev_cumgpa
0,0A268CBD,2018.0,S218,1.24
1,0C1F46E4,2018.0,S118,2.88
2,0CE1089E,2018.0,S218,3.61
3,0DCDABDA,2018.0,S118,3.04
4,0DE6C596,2018.0,SP18,3.45


In [4]:
# ethnicity, gender, transfer status

df = pd.read_csv("../data/raw/Econ-Goodman-Su20-Student-level.csv")
# lower-case the headers so they match the keys of tlcvarnames
df = df.rename(str.lower, axis=1)
df = df.z.get_vars()
# 1 if the application type is 'Transfer Student', else 0
df['transfer'] = np.array(df.apptype == 'Transfer Student', dtype='int')
# shorten the gender labels to single letters
df['gender'] = df.gender.replace({'Men (Cis & Trans)': 'm',
                                  'Women (Cis & Trans)': 'f',
                                  'Data Unavailable': 'u'})
# A bare expression in the middle of a cell is evaluated and discarded, so the
# year-by-term check has to be displayed explicitly to be visible.
display(pd.crosstab(df.year, df.term))
# drop term and reorder
df = df[['id', 'year', 'ethnicity', 'gender', 'transfer', 'apptype']]

# merge with dfd
dfd = df.merge(dfd, how='outer', on=['id', 'year'])

print(len(dfd))
dfd.z.get_unique_students()
print(dfd.isnull().sum())
dfd.head()

846
N unique students: 846
id               0
year             0
ethnicity        0
gender           0
transfer         0
apptype          0
term_pregpa    250
prev_cumgpa    250
dtype: int64


Unnamed: 0,id,year,ethnicity,gender,transfer,apptype,term_pregpa,prev_cumgpa
0,0A4AB941,2019,"African-American/Black, Pacific Islander, Not ...",m,0,First-Time First Year,SP19,3.7
1,0A268CBD,2018,Chinese/Chinese-American,m,1,Transfer Student,S218,1.24
2,0AF24D80,2019,Chinese/Chinese-American,f,0,First-Time First Year,SP19,1.93
3,0B737A47,2019,Chinese/Chinese-American,f,0,First-Time First Year,S219,3.73
4,0BB2B7C5,2018,Chicanx/Latinx,m,1,Transfer Student,,


#### Concurrent gpa/units

In [5]:
# Stack both student-quarter extracts row-wise. `axis` is passed by keyword:
# pandas 2.0 made every concat argument except `objs` keyword-only, so the
# former positional `0` raises a TypeError there.
df = pd.concat(
    [
        pd.read_csv("../data/raw/Econ-Goodman-Su20-Student-quarter-level_FA18-WI19.csv"),
        pd.read_csv("../data/raw/Econ-Goodman-Su20-Student-quarter-level_FA19-WI20.csv"),
    ],
    axis=0,
)
# lower-case the headers so they match the keys of tlcvarnames
df.columns = [x.lower() for x in df.columns]
df = df.z.get_vars()
print(len(df))
display(pd.crosstab(df.year, df.term))
# for now keep only concurrent data - later bring back following quarter's data to check spillovers
df = df.loc[df.term.isin(["FA18", "FA19"])]
df = df[['id', 'year', 'measure', 'values']]

# reshape wide: one row per (id, year), one column per measure
# NOTE(review): pivot_table aggregates with the mean by default, so repeated
# (id, year, measure) rows would be averaged silently - confirm there are none.
df = df.pivot_table(index=['id', 'year'], columns='measure', values='values').reset_index()
# second pass renames the measure-name columns to their short names
df = df.z.get_vars()

# merge with rest of demographic data
dfd = dfd.merge(df, how='outer', on=['id', 'year'])
dfd.head(1)

19980


term,FA18,FA19,WI19,WI20
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,5136,0,4968,0
2019,0,5016,0,4860


Unnamed: 0,id,year,ethnicity,gender,transfer,apptype,term_pregpa,prev_cumgpa,units_letter,units_pnp,units_w,gpa_letter,gpa_letter_sansecon,gpa_letter_sans100a,gpa_econ_sans100a,nclass_letter,nclass_np,nclass_pnp,nclass_p,nclass_w
0,0A4AB941,2019,"African-American/Black, Pacific Islander, Not ...",m,0,First-Time First Year,SP19,3.7,16.0,,,3.5,3.65,3.766667,4.0,4.0,0.0,0.0,4.0,0.0


In [6]:
# add zeros where needed: a missing unit count is recorded as 0
for unit_col in ('units_pnp', 'units_w'):
    dfd[unit_col] = dfd[unit_col].fillna(0)

# check and explore data

n_rows = len(dfd)
print(n_rows)
dfd.z.get_unique_students()
print(dfd.dtypes)
missing_per_col = dfd.isnull().sum()
print(missing_per_col)

846
N unique students: 846
id                      object
year                     int64
ethnicity               object
gender                  object
transfer                 int32
apptype                 object
term_pregpa             object
prev_cumgpa            float64
units_letter           float64
units_pnp              float64
units_w                float64
gpa_letter             float64
gpa_letter_sansecon    float64
gpa_letter_sans100a    float64
gpa_econ_sans100a      float64
nclass_letter          float64
nclass_np              float64
nclass_pnp             float64
nclass_p               float64
nclass_w               float64
dtype: object
id                       0
year                     0
ethnicity                0
gender                   0
transfer                 0
apptype                  0
term_pregpa            250
prev_cumgpa            250
units_letter             0
units_pnp                0
units_w                  0
gpa_letter               0
gpa_letter_sans

In [7]:
# export the demographic + concurrent-quarter frame (row index omitted)
concurrent_csv = "../data/generated/dem_concurrent.csv"
dfd.to_csv(concurrent_csv, index=False)

#### Next quarter gpa/units

In [8]:
# Stack both student-quarter extracts row-wise. `axis` is passed by keyword:
# pandas 2.0 made every concat argument except `objs` keyword-only, so the
# former positional `0` raises a TypeError there.
df = pd.concat(
    [
        pd.read_csv("../data/raw/Econ-Goodman-Su20-Student-quarter-level_FA18-WI19.csv"),
        pd.read_csv("../data/raw/Econ-Goodman-Su20-Student-quarter-level_FA19-WI20.csv"),
    ],
    axis=0,
)
# lower-case the headers so they match the keys of tlcvarnames
df.columns = [x.lower() for x in df.columns]
df = df.z.get_vars()
print(len(df))
display(pd.crosstab(df.year, df.term))
# keep only following quarter's data to check spillovers
df = df.loc[df.term.isin(["WI19", "WI20"])]
df = df[['id', 'year', 'measure', 'values']]

# reshape wide: one row per (id, year), one column per measure
# NOTE(review): pivot_table aggregates with the mean by default, so repeated
# (id, year, measure) rows would be averaged silently - confirm there are none.
df = df.pivot_table(index=['id', 'year'], columns='measure', values='values').reset_index()
# second pass renames the measure-name columns to their short names
df = df.z.get_vars()
df.head(1)

19980


term,FA18,FA19,WI19,WI20
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,5136,0,4968,0
2019,0,5016,0,4860


measure,id,year,units_letter,units_pnp,units_w,gpa_letter,gpa_letter_sansecon,gpa_letter_sans100a,gpa_econ_sans100a,nclass_letter,nclass_np,nclass_pnp,nclass_p,nclass_w
0,00685C2D,2019,12.0,,,3.7,,3.7,3.7,3.0,0.0,0.0,3.0,0.0


In [9]:
# merge with dfd after dropping units/gpa vars
demographic_cols = [
    'id', 'year', 'ethnicity', 'gender',
    'transfer', 'apptype', 'term_pregpa', 'prev_cumgpa',
]
dfd = dfd[demographic_cols]
dfd = dfd.merge(df, on=['id', 'year'], how='outer')
dfd.head(1)

Unnamed: 0,id,year,ethnicity,gender,transfer,apptype,term_pregpa,prev_cumgpa,units_letter,units_pnp,units_w,gpa_letter,gpa_letter_sansecon,gpa_letter_sans100a,gpa_econ_sans100a,nclass_letter,nclass_np,nclass_pnp,nclass_p,nclass_w
0,0A4AB941,2019,"African-American/Black, Pacific Islander, Not ...",m,0,First-Time First Year,SP19,3.7,8.0,4.0,,3.65,,3.65,3.65,2.0,0.0,1.0,3.0,0.0


In [10]:
# add zeros where needed: missing unit and class counts are recorded as 0
count_cols = (
    'units_letter', 'units_pnp', 'units_w',
    'nclass_letter', 'nclass_np', 'nclass_pnp',
    'nclass_p', 'nclass_w',
)
for count_col in count_cols:
    dfd[count_col] = dfd[count_col].fillna(0)

# check and explore data

n_rows = len(dfd)
print(n_rows)
dfd.z.get_unique_students()
print(dfd.dtypes)
missing_per_col = dfd.isnull().sum()
print(missing_per_col)

846
N unique students: 846
id                      object
year                     int64
ethnicity               object
gender                  object
transfer                 int32
apptype                 object
term_pregpa             object
prev_cumgpa            float64
units_letter           float64
units_pnp              float64
units_w                float64
gpa_letter             float64
gpa_letter_sansecon    float64
gpa_letter_sans100a    float64
gpa_econ_sans100a      float64
nclass_letter          float64
nclass_np              float64
nclass_pnp             float64
nclass_p               float64
nclass_w               float64
dtype: object
id                       0
year                     0
ethnicity                0
gender                   0
transfer                 0
apptype                  0
term_pregpa            250
prev_cumgpa            250
units_letter             0
units_pnp                0
units_w                  0
gpa_letter              28
gpa_letter_sans

In [11]:
# export the demographic + next-quarter frame (row index omitted)
nextquarter_csv = "../data/generated/dem_nextquarter.csv"
dfd.to_csv(nextquarter_csv, index=False)

### Treatment and outcome data

#### Student level

In [12]:
# load the student-year level workbook and rename its DeID column to id
df = pd.read_excel("../data/raw/DeID_all_pid-year-level_data with description.xlsx")
df = df.rename(columns={'DeID': 'id'})
print(len(df))
df.z.get_unique_students()
# move id to the front, keeping the other columns in their existing order
other_cols = [col for col in df.columns if col != 'id']
df = df[['id'] + other_cols]
df.head()

843
N unique students: 841


Unnamed: 0,id,toberandomized,treated,pair,year,mathquiz,mid1score,mid2score,finalscore,assigned_grade,grade_option,videos,relevant,videos_u,relevant_u,duration_all,duration_rel,duration_u,videos_b4_mid1_rel,videos_b4_mid1_relu,videos_b4_mid2_rel,videos_b4_mid2_relu,videos_b,relevant_b,videos_u_b,relevant_u_b,duration_all_b,duration_rel_b,duration_u_b,mid1_100b,mid2_100b,final_100b,lettergrade_100b,took100b,piazza_daysonline,piazza_views,piazza_contributions,piazza_questions,piazza_notes,piazza_answers,piazza_bonus_piazza,pset_pre,pset_post,pset_total,q1start,q2start,q3start,q4start,q5start,q6start,q7start,q8start,q9start,q10start,q2end,q3end,q4end,q5end,q6end,q7end,q8end,q9end,q10end,attend1,attend2,attend3,attend4,attend5,attend6,attend7
0,939B5883,1,1,1,2018,0.25,0.108,,,,,24,24,21,21,12340,12340,11514,7,7,24,21,0,0,1,0,0,0,0,,,,,0,0,0,0,0,0,0,0,0,0,0,15.0,1.0,10.0,1.5,3.0,6.5,0.0,Both Tuesday and Thursday,,,,,,,,,,,,,,,,,,
1,16DF232D,1,1,2,2018,0.15,0.12,0.08,0.124,F,L,121,120,65,64,59838,59832,31243,10,6,29,20,2,2,1,1,618,618,0,,,,,0,19,76,0,0,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,1.0,1.0,0.0,0.0,1.0,1.0,1.0
2,16DF232D,1,1,2,2018,0.15,0.12,0.08,0.124,F,L,121,120,65,64,59838,59832,31243,10,6,29,20,2,2,1,1,618,618,0,,,,,0,19,76,0,0,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,1.0,1.0,0.0,0.0,1.0,1.0,1.0
3,5BC5BEE9,1,0,2,2018,0.0,0.13,,,,,15,15,13,13,9198,9198,7588,13,11,15,13,4,4,3,3,271,271,0,,,,,0,9,52,0,0,0,0,0,0,0,0,20.0,2.0,30.0,5.0,15.0,10.0,1.0,Both Tuesday and Thursday,,,,,,,,,,,,,,,,,,
4,62F8FE53,1,1,3,2018,0.0,0.14,0.07,,F,L,9,8,9,8,2356,1691,2356,1,1,8,8,0,0,0,0,0,0,0,,,,,0,0,0,0,0,0,0,0,0,0,0,100.0,2.0,100.0,100.0,50.0,50.0,1.0,"Tuesday, 10/16",,,,,,,,,,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [13]:
# I think that's all I need for now...may add more later

# drop the two duplicate entries (fully identical rows; first occurrence kept)
print(df.shape[0])
is_repeat = df.duplicated()
df = df.loc[~is_repeat]
print(df.shape[0])
df.z.get_unique_students()

# export
student_level_csv = "../data/generated/id-year_level_data.csv"
df.to_csv(student_level_csv, index=False)

843
841
N unique students: 841
