# View this notebook here:
https://nbviewer.jupyter.org/github/zagoodman/microeconomics_videos/blob/master/jupyter/assemble_data.ipynb

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Prep" data-toc-modified-id="Prep-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Prep</a></span></li><li><span><a href="#Merge-anonymized-data" data-toc-modified-id="Merge-anonymized-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Merge anonymized data</a></span><ul class="toc-item"><li><span><a href="#Demographic-data" data-toc-modified-id="Demographic-data-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Demographic data</a></span><ul class="toc-item"><li><span><a href="#Preceding-term-vars" data-toc-modified-id="Preceding-term-vars-2.1.1"><span class="toc-item-num">2.1.1&nbsp;&nbsp;</span>Preceding term vars</a></span></li><li><span><a href="#Student-level" data-toc-modified-id="Student-level-2.1.2"><span class="toc-item-num">2.1.2&nbsp;&nbsp;</span>Student-level</a></span></li><li><span><a href="#GPA-and-course-units" data-toc-modified-id="GPA-and-course-units-2.1.3"><span class="toc-item-num">2.1.3&nbsp;&nbsp;</span>GPA and course units</a></span><ul class="toc-item"><li><span><a href="#Concurrent-term" data-toc-modified-id="Concurrent-term-2.1.3.1"><span class="toc-item-num">2.1.3.1&nbsp;&nbsp;</span>Concurrent term</a></span></li><li><span><a href="#Following-term" data-toc-modified-id="Following-term-2.1.3.2"><span class="toc-item-num">2.1.3.2&nbsp;&nbsp;</span>Following term</a></span></li></ul></li></ul></li><li><span><a href="#Treatment-and-outcome-data" data-toc-modified-id="Treatment-and-outcome-data-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Treatment and outcome data</a></span><ul class="toc-item"><li><span><a href="#Student-level" data-toc-modified-id="Student-level-2.2.1"><span class="toc-item-num">2.2.1&nbsp;&nbsp;</span>Student level</a></span></li></ul></li></ul></li></ul></div>

This notebook takes anonymized data from the T+LC, cleans it, and writes three analysis-ready CSV files to `../data/generated/`:
1. `id-year_level_data.csv`: treatment and exam score data
2. `dem_concurrent.csv`: demographic data and GPA/courseload during the quarter of the experiment
3. `dem_nextquarter.csv`: demographic data and GPA/courseload during the quarter following treatment

All code in this file is in Python 3.

## Prep

In [1]:
import pandas as pd
import numpy as np
# Let pandas show up to 100 columns and 100 rows before truncating displayed frames.
for _display_opt in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_opt, 100)

In [2]:
import pandas_flavor as pf

@pf.register_dataframe_accessor('z')
class MyFlavor:
    """Helper methods made available on DataFrames through the ``.z`` accessor."""

    def __init__(self, df):
        # Remember the frame the accessor was reached from.
        self._df = df

    def get_unique_students(self, var1 = "id", var2 = "year"):
        """Print and return the number of distinct (var1, var2) combinations.

        Parameters
        ----------
        var1, var2 : str
            Names of the two columns whose distinct value pairs are counted.

        Returns
        -------
        int
            Number of rows left after dropping duplicate (var1, var2) pairs.
        """
        key_pairs = self._df[[var1, var2]]
        n_unique = len(key_pairs.drop_duplicates())
        print("N unique students: {}".format(n_unique))
        return n_unique

    def get_vars(self):
        """Return only the columns named in ``tlcvarnames``, renamed to its values.

        Columns that are not keys of the module-level ``tlcvarnames`` mapping
        are left out of the result.
        """
        frame = self._df
        known_cols = [col for col in frame if col in tlcvarnames]
        return frame[known_cols].rename(columns = tlcvarnames)
    
# Mapping from raw column / measure names to the short names used in this notebook.
# `MyFlavor.get_vars` keeps only columns that appear as keys here and renames them
# to the corresponding values. Each key must appear exactly once: Python silently
# keeps the last value for a repeated key, which would hide a typo.
tlcvarnames = {
    # --- identifiers ---
    'id': 'id',
    'year': 'year',
    'de id': 'id',
    'deid': 'id',
    # --- prior GPA and term codes ---
    'previous_cum_gpa': 'prev_cumgpa',
    'term_code_econ': 'term',
    'term-econ100a': 'term',
    'term code courses': 'term_courses',
    'term_precoursegpa': 'term_pregpa',
    'term code econ 100a': 'termecon',
    # --- student characteristics ---
    'apct_type_desc': 'apptype',
    'ethnicity_grouped': 'ethnicity',
    'gender': 'gender',
    # --- spellings of the year column (all mapped to 'year') ---
    'year_econ': 'year',
    'year - econ 100a': 'year',
    'year - econ100a': 'year',
    'year-econ100a': 'year',
    'year-zack': 'year',
    # --- long-format measure name / value columns ---
    'measure names': 'measure',
    'measure values': 'values',
    # --- measure labels (mixed case; presumably these become column names
    #     once the long data are pivoted wide on 'measure') ---
    'Class Units - Letter Grade': 'units_letter',
    'Class Units - P/NP': 'units_pnp',
    'Class Units - Withdrawn': 'units_w',
    'GPA - Classes Letter Grade': 'gpa_letter',
    'GPA - Classes Letter Grade - No Econ': 'gpa_letter_sansecon',
    'GPA - Classes Letter Grade - No Econ 100A': 'gpa_letter_sans100a',
    'GPA - Classes Letter Grade - Only Econ - No Econ 100A': 'gpa_econ_sans100a',
    'N Classes - Letter Grade': 'nclass_letter',
    'N Classes - Not Passed': 'nclass_np',
    'N Classes - P/NP': 'nclass_pnp',
    'N Classes - Passed': 'nclass_p',
    'N Classes - Withdrawn': 'nclass_w',
}

## Merge anonymized data

### Demographic data

#### Preceding term vars

In [3]:
# pre 100A (term before) cumulative gpa. Missing for all entering freshmen or first time UCSD students

# Stack the FA18 and FA19 pre-course GPA extracts row-wise.
# `axis` is passed by keyword: pandas deprecated positional arguments to
# `pd.concat` (other than the list of frames) in 1.3 and rejects them from 2.0 on.
dfd = pd.read_excel("../data/raw/Econ-Goodman-Su20-Pre Course GPA-FA18.xlsx")
dfd = pd.concat([dfd, pd.read_excel("../data/raw/Econ-Goodman-Su20-Pre Course GPA-FA19.xlsx")], axis=0)

# Lowercase the headers so they match the keys of `tlcvarnames`, then keep and
# rename only the known columns.
dfd.columns = [x.lower() for x in dfd.columns]
dfd = dfd.z.get_vars()

# Set the numeric year from the term code.
dfd.loc[dfd.term == 'FA18', 'year'] = 2018
dfd.loc[dfd.term == 'FA19', 'year'] = 2019
# reorder and drop term
dfd = dfd[['id', 'year', 'term_pregpa', 'prev_cumgpa']]

# Sanity checks: row count, distinct ids, and missing values per column.
print(len(dfd))
# var2='id' makes this count distinct ids only (year is not part of the key here).
dfd.z.get_unique_students(var2='id')
display(dfd.isnull().sum())
#dfd.head()

598
N unique students: 598


id             0
year           0
term_pregpa    0
prev_cumgpa    0
dtype: int64

#### Student-level

In [4]:
# ethnicity, gender, transfer status

# Read the student-level extract, lowercase its headers, and keep the known columns.
df = pd.read_excel("../data/raw/Econ-Goodman-Su20-Student-level.xlsx")
df = df.rename(columns=str.lower)
df = df.z.get_vars()

# 0/1 indicator: 1 when the application type is exactly 'Transfer Student'.
df['transfer'] = np.array(df.apptype == 'Transfer Student', dtype='int')

# Recode the verbose gender labels to single letters; any other value is left untouched.
gender_codes = {'Men (Cis & Trans)': 'm',
                'Women (Cis & Trans)': 'f',
                'Data Unavailable': 'u'}
for label, code in gender_codes.items():
    df.loc[df.gender == label, 'gender'] = code

# drop term and reorder
df = df[['id', 'year', 'ethnicity', 'gender', 'transfer', 'apptype']]

# merge with dfd (outer join, so students without pre-course GPA rows are kept)
dfd = pd.merge(df, dfd, how='outer', on=['id', 'year'])

# Sanity checks: row count, distinct (id, year) pairs, and missing values per column.
print(len(dfd))
dfd.z.get_unique_students()
print(dfd.isnull().sum())

850
N unique students: 850
id               0
year             0
ethnicity        0
gender           0
transfer         0
apptype          0
term_pregpa    252
prev_cumgpa    252
dtype: int64


#### GPA and course units

In [5]:
# Load the two student-quarter-level extracts. For each one, lowercase the
# headers so they match the keys of `tlcvarnames`, then keep and rename only the
# known columns.
df = pd.read_excel("../data/raw/Econ-Goodman-Su20-Student-quarter-level_FA18-WI19.xlsx")
df.columns = [x.lower() for x in df.columns]
df = df.z.get_vars()
df2 = pd.read_excel("../data/raw/Econ-Goodman-Su20-Student-quarter-level_FA19-WI20.xlsx")
df2.columns = [x.lower() for x in df2.columns]
df2 = df2.z.get_vars()

# Stack the two extracts row-wise.
# `axis` is passed by keyword: pandas deprecated positional arguments to
# `pd.concat` (other than the list of frames) in 1.3 and rejects them from 2.0 on.
df = pd.concat([df, df2], axis=0)
print(len(df))
# Cross-check that each year lines up with a single term code.
display(pd.crosstab(df.year, df.term))

21749


term,FA18,FA19
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2018,11076,0
2019,0,10673


##### Concurrent term

In [6]:
## concurrent data

# Keep the rows whose course term is FA18 or FA19, and only the columns needed
# for the reshape.
concurrent_terms = ["FA18", "FA19"]
dfc = df.loc[df.term_courses.isin(concurrent_terms), ['id', 'year', 'measure', 'values']]

# reshape wide: one row per (id, year), one column per measure, then rename the
# measure columns to their short names
dfc = (
    dfc
    .pivot_table(index=['id', 'year'], columns='measure', values='values')
    .reset_index()
    .z.get_vars()
)

# merge with rest of demographic data
dfd = pd.merge(dfd, dfc, how='outer', on=['id', 'year'])

# add zeros where needed: missing P/NP and withdrawn units are set to 0
zero_fill_cols = ['units_pnp', 'units_w']
dfd[zero_fill_cols] = dfd[zero_fill_cols].fillna(0)

# check and explore data
print(len(dfd))
dfd.z.get_unique_students()
print(dfd.dtypes)
print(dfd.isnull().sum())

# export
dfd.to_csv("../data/generated/dem_concurrent.csv", index=False)

850
N unique students: 850
id                      object
year                     int64
ethnicity               object
gender                  object
transfer                 int32
apptype                 object
term_pregpa             object
prev_cumgpa            float64
units_letter           float64
units_pnp              float64
units_w                float64
gpa_letter             float64
gpa_letter_sansecon    float64
gpa_letter_sans100a    float64
gpa_econ_sans100a      float64
nclass_letter          float64
nclass_np              float64
nclass_pnp             float64
nclass_p               float64
nclass_w               float64
dtype: object
id                       0
year                     0
ethnicity                0
gender                   0
transfer                 0
apptype                  0
term_pregpa            252
prev_cumgpa            252
units_letter             0
units_pnp                0
units_w                  0
gpa_letter               2
gpa_letter_sans

##### Following term

In [7]:
## following quarter's data

# Keep the rows whose course term is WI19 or WI20, and only the columns needed
# for the reshape.
dff = df.loc[df.term_courses.isin(["WI19", "WI20"])]
dff = dff[['id', 'year', 'measure', 'values']]

# reshape wide: one row per (id, year), one column per measure
# (pivot_table's default aggregation is the mean if a cell has several values)
dff = dff.pivot_table(index=['id', 'year'], columns='measure', values='values').reset_index()
dff = dff.z.get_vars()

# merge with rest of demographic data
# Select the demographic columns by name rather than by position, so this step
# does not depend on how many columns earlier cells added or on their order.
demographic_cols = ['id', 'year', 'ethnicity', 'gender', 'transfer', 'apptype',
                    'term_pregpa', 'prev_cumgpa']
dfd = dfd[demographic_cols]
dfd = dfd.merge(dff, how='outer', on=['id', 'year'])

# add zeros where needed: missing unit and class counts are set to 0
# (GPA columns are left missing)
for x in ['units_letter', 'units_pnp', 'units_w',
          'nclass_letter', 'nclass_np', 'nclass_pnp',
          'nclass_p', 'nclass_w']:
    dfd.loc[dfd[x].isnull(), x] = 0

# check and explore data
print(len(dfd))
dfd.z.get_unique_students()
print(dfd.dtypes)
print(dfd.isnull().sum())

# export
dfd.to_csv("../data/generated/dem_nextquarter.csv", index=False)

850
N unique students: 850
id                      object
year                     int64
ethnicity               object
gender                  object
transfer                 int32
apptype                 object
term_pregpa             object
prev_cumgpa            float64
units_letter           float64
units_pnp              float64
units_w                float64
gpa_letter             float64
gpa_letter_sansecon    float64
gpa_letter_sans100a    float64
gpa_econ_sans100a      float64
nclass_letter          float64
nclass_np              float64
nclass_pnp             float64
nclass_p               float64
nclass_w               float64
dtype: object
id                       0
year                     0
ethnicity                0
gender                   0
transfer                 0
apptype                  0
term_pregpa            252
prev_cumgpa            252
units_letter             0
units_pnp                0
units_w                  0
gpa_letter              29
gpa_letter_sans

### Treatment and outcome data

#### Student level

In [8]:
# Load the treatment/outcome sheet and give the identifier column the name 'id'.
df = pd.read_excel("../data/raw/DeID_all_pid-year-level_data with description.xlsx")
df = df.rename(columns={'DeID': 'id'})

# Sanity checks: row count and distinct (id, year) pairs.
print(len(df))
df.z.get_unique_students()

# Move 'id' to the front; the remaining columns keep their existing order.
other_cols = [col for col in df.columns if col != 'id']
df = df[['id'] + other_cols]

850
N unique students: 850


In [9]:
# I think that's all I need for now...may add more later

# Remove any fully duplicated rows, printing the row count before and after
# (the recorded run shows 850 both times, i.e. nothing was dropped).
n_before = len(df)
print(n_before)
df = df.drop_duplicates()
n_after = len(df)
print(n_after)
df.z.get_unique_students()

# export
df.to_csv("../data/generated/id-year_level_data.csv", index=False)

850
850
N unique students: 850
