# This file: Cleans CA school enrollment data at school level
# Dependencies: Raw data from CA DOE website
# Outputs: enrollment (school-level enrollment data, district info, county pop)
# Last updated: 2/21/2019

In [2]:
import os
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

  from pandas.core import datetools


In [5]:
# Folder holding the raw CA DOE enrollment text files (and pubschls.txt).
# NOTE: this is an absolute, machine-specific path; adjust before running elsewhere.
directory = '/Users/tsengtammy/Dropbox/3 TT/4. Thesis/build/data/input/CA_school_districts'
# The working directory is switched here because the merge loop below lists
# `directory` and then opens each file by its bare filename.
os.chdir(directory)

# Merge the raw CA public school enrollment files

Documentation:

- 'Year' is the first year in the academic calendar - e.g. 1981 means data from the 1981-82 AY.
- Enrollment by racial/ethnic designation was not collected during the 1982–83 and 1983–84 data collection.
- Ethnic group names were revised in 1998-99 to meet federal standards.

In [123]:
# Build the combined enrollment table `sd` from every raw enrollment file in
# `directory`.
#
# Two kinds of tab-delimited files are recognised:
#   * single-year files: any 9-character '*txt' name whose characters [3:5]
#     are the 2-digit starting year of the academic year;
#   * the multi-year "81 to 92" file: any other '*txt' name starting with 'enr',
#     which already carries a YEAR column.
#
# Frames are collected in a list and concatenated once. DataFrame.append was
# removed in pandas 2.0, and appending inside a loop re-copies the growing
# table on every iteration.
frames = []

# sorted() makes the row order reproducible; os.listdir order is arbitrary.
for filename in sorted(os.listdir(directory)):

    if not filename.endswith('txt'):
        continue

    # Join with `directory` so the read does not depend on the current working directory.
    filepath = os.path.join(directory, filename)

    if len(filename) == 9:
        year = filename[3:5]

        # Two-digit year pivot: values below 20 are read as 20xx, the rest as 19xx.
        if int(year) < 20:
            append_year = int('20' + year)
        else:
            append_year = int('19' + year)

        # CDS_CODE is kept as a string so the identifier is not coerced to a number.
        df = pd.read_csv(filepath, delimiter='\t', dtype={'CDS_CODE': object})
        df['YEAR'] = append_year

        frames.append(df)

    # 81 to 92 file
    elif filename.startswith('enr'):
        df = pd.read_csv(filepath, delimiter='\t',
                         dtype={'CDS_CODE': object, 'COUNTY': object, 'DISTRICT': object,
                                'DistrictName': object, 'SCHOOL': object, 'SchoolName': object})
        # YEAR in this file is rebuilt as 19 + its first two characters
        # (the first year of the academic year, per the documentation above).
        df['YEAR'] = df['YEAR'].apply(lambda x: int('19' + str(x)[:2]))
        frames.append(df)

# Per-file row indices are kept (no ignore_index), as with the previous
# append-based loop. sort=True keeps the alphabetical column order when the
# files do not share identical columns. An empty folder yields an empty frame.
sd = pd.concat(frames, sort=True) if frames else pd.DataFrame()

In [124]:
# Preview the first ten rows of the combined enrollment table.
sd.iloc[:10]

Unnamed: 0,ADULT,CDS_CODE,COUNTY,DISTRICT,DistrictName,ENR_TOTAL,ETHNIC,GENDER,GR_1,GR_10,...,GR_6,GR_7,GR_8,GR_9,KDGN,SCHOOL,SchoolName,UNGR_ELM,UNGR_SEC,YEAR
0,0,1100170130401,,,,1,1,M,0,0,...,0,0,1,0,0,,,0,0,2000
1,0,1100170130401,,,,1,2,F,0,0,...,0,0,0,0,0,,,0,0,2000
2,0,1100170130401,,,,23,2,M,0,5,...,0,0,0,4,0,,,0,0,2000
3,0,1100170130401,,,,3,3,M,0,0,...,0,1,0,0,0,,,0,0,2000
4,0,1100170130401,,,,2,4,M,0,0,...,0,0,0,0,0,,,0,0,2000
5,0,1100170130401,,,,9,5,F,0,4,...,0,0,0,3,0,,,0,0,2000
6,0,1100170130401,,,,77,5,M,0,15,...,0,1,3,14,0,,,0,0,2000
7,0,1100170130401,,,,46,6,F,0,10,...,0,1,4,8,0,,,0,0,2000
8,0,1100170130401,,,,262,6,M,0,54,...,2,2,15,32,0,,,0,0,2000
9,0,1100170130401,,,,6,7,F,0,2,...,0,0,1,0,0,,,0,0,2000


In [125]:
# Numeric ETHNIC codes -> category labels, applied to rows with YEAR > 1992.
ethnic_dict_93 = {
    1: 'American Indian or Alaska',
    2: 'Asian',
    3: 'Pacific Islander',
    4: 'Filipino',
    5: 'Hispanic or Latino',
    6: 'Black, not Hispanic',
    7: 'White, not Hispanic'
}

# Letter ETHNIC codes -> category labels, applied to rows with YEAR <= 1992.
# The previous version listed 'H' twice (Hispanic and White). In a dict literal
# the later entry wins, so every 'H' row was labelled 'White, not Hispanic' and
# no row could be labelled 'Hispanic or Latino'.
# NOTE(review): 'W' is assumed to be the code for White; the letter codes still
# need to be checked against the CA DOE file documentation.
ethnic_dict_81 = {
    'I': 'American Indian or Alaska',
    'A': 'Asian',
    'P': 'Pacific Islander',
    'F': 'Filipino',
    'H': 'Hispanic or Latino',
    'B': 'Black, not Hispanic',
    'W': 'White, not Hispanic'
}

In [126]:
# Label each row's ethnicity code: rows with YEAR > 1992 use the numeric
# mapping, earlier rows use the letter mapping. Codes missing from the chosen
# mapping become None.
# A single pass over the two columns replaces the row-wise DataFrame.apply,
# which builds a Series for every one of the millions of rows; the resulting
# values are the same.
sd['ETHNIC_CAT'] = [
    ethnic_dict_93.get(code) if year > 1992 else ethnic_dict_81.get(code)
    for code, year in zip(sd['ETHNIC'], sd['YEAR'])
]

In [127]:
# Write the combined enrollment table to the intermediate folder (row index omitted).
intermediate_csv = '/Users/tsengtammy/Dropbox/3 TT/4. Thesis/build/data/intermediate/school_districts_clean.csv'
sd.to_csv(intermediate_csv, index=False)

# Merge w district information

In [129]:
# Reload the intermediate enrollment file.
# The identifier/name columns are read as strings, matching how the 81-92
# loader reads them, so codes with leading zeros survive the CSV round trip
# instead of being re-inferred as numbers. low_memory=False makes pandas infer
# each remaining column's type from the whole file rather than chunk by chunk,
# which avoids the mixed-type warning.
sd = pd.read_csv('/Users/tsengtammy/Dropbox/3 TT/4. Thesis/build/data/intermediate/school_districts_clean.csv',
                 dtype={'CDS_CODE': object, 'COUNTY': object, 'DISTRICT': object,
                        'DistrictName': object, 'SCHOOL': object, 'SchoolName': object},
                 low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [130]:
# 'CD Code' = first 7 characters of CDS_CODE.
# The .str accessor slices the whole column at once and leaves missing codes as
# NaN, where the element-wise lambda would raise on a non-string value.
sd['CD Code'] = sd['CDS_CODE'].str[:7]

In [132]:
# Load the public-schools directory file (tab-separated); the CDSCode
# identifier is read as a string rather than a number.
pubschls_path = '/Users/tsengtammy/Dropbox/3 TT/4. Thesis/build/data/input/CA_school_districts/pubschls.txt'
dist = pd.read_csv(pubschls_path, sep='\t', dtype={'CDSCode': object})

In [136]:
# Preview the first five rows of the school directory.
dist.head(n=5)

Unnamed: 0,CDS_CODE,NCESDist,NCESSchool,StatusType,County,District,School,Street,StreetAbr,City,...,AdmFName1,AdmLName1,AdmEmail1,AdmFName2,AdmLName2,AdmEmail2,AdmFName3,AdmLName3,AdmEmail3,LastUpDate
0,1100170000000,691051,No Data,Active,Alameda,Alameda County Office of Education,No Data,313 West Winton Avenue,313 West Winton Ave.,Hayward,...,L Karen,Monroe,lkmonroe@acoe.org,No Data,No Data,No Data,No Data,No Data,No Data,02/02/2017
1,1100170109835,691051,10546,Closed,Alameda,Alameda County Office of Education,FAME Public Charter,"39899 Balentine Drive, Suite 335","39899 Balentine Dr., Ste. 335",Newark,...,No Data,No Data,No Data,No Data,No Data,No Data,No Data,No Data,No Data,09/01/2015
2,1100170112607,691051,10947,Active,Alameda,Alameda County Office of Education,Envision Academy for Arts & Technology,1515 Webster Street,1515 Webster St.,Oakland,...,Eve,Gordon,eve@envisionacademy.org,No Data,No Data,No Data,No Data,No Data,No Data,07/26/2017
3,1100170118489,691051,12283,Closed,Alameda,Alameda County Office of Education,Aspire California College Preparatory Academy,2125 Jefferson Avenue,2125 Jefferson Ave.,Berkeley,...,No Data,No Data,No Data,No Data,No Data,No Data,No Data,No Data,No Data,07/01/2015
4,1100170123968,691051,12844,Active,Alameda,Alameda County Office of Education,Community School for Creative Education,2111 International Boulevard,2111 International Blvd.,Oakland,...,Monique,Brinson,moniqueb@communityschoolforcreativeeducation.org,No Data,No Data,No Data,No Data,No Data,No Data,08/16/2017


In [137]:
# Give the directory's key column the same name as in the enrollment data,
# then keep only the identifying fields used in the merge below.
directory_cols = ['CDS_CODE', 'NCESDist', 'NCESSchool', 'StatusType', 'County',
                  'District', 'School', 'DOC', 'DOCType']
dist = dist.rename(columns={'CDSCode': 'CDS_CODE'})[directory_cols]

In [138]:
# For the enrollment table and then the directory: row count followed by the
# number of distinct CDS_CODE values (equal numbers mean the key is unique).
for frame in (sd, dist):
    print(len(frame))
    print(frame['CDS_CODE'].nunique())

3534277
14737
18004
18004


In [139]:
# Attach directory information to every enrollment row. The left join keeps
# enrollment rows that have no match in the directory; their directory columns
# are NaN.
# validate='many_to_one' makes pandas raise if CDS_CODE is ever duplicated in
# `dist`, instead of silently multiplying enrollment rows.
merged = sd.merge(dist, on='CDS_CODE', how='left', validate='many_to_one')

In [140]:
# Preview the first five rows of the enrollment + directory merge.
merged.head(n=5)

Unnamed: 0,ADULT,CDS_CODE,COUNTY,DISTRICT,DistrictName,ENR_TOTAL,ETHNIC,GENDER,GR_1,GR_10,...,ETHNIC_CAT,CD Code,NCESDist,NCESSchool,StatusType,County,District,School,DOC,DOCType
0,0,1100170130401,,,,1,1,M,0,0,...,American Indian or Alaska,110017,691051,9264,Active,Alameda,Alameda County Office of Education,Alameda County Juvenile Hall/Court,0.0,County Office of Education (COE)
1,0,1100170130401,,,,1,2,F,0,0,...,Asian,110017,691051,9264,Active,Alameda,Alameda County Office of Education,Alameda County Juvenile Hall/Court,0.0,County Office of Education (COE)
2,0,1100170130401,,,,23,2,M,0,5,...,Asian,110017,691051,9264,Active,Alameda,Alameda County Office of Education,Alameda County Juvenile Hall/Court,0.0,County Office of Education (COE)
3,0,1100170130401,,,,3,3,M,0,0,...,Pacific Islander,110017,691051,9264,Active,Alameda,Alameda County Office of Education,Alameda County Juvenile Hall/Court,0.0,County Office of Education (COE)
4,0,1100170130401,,,,2,4,M,0,0,...,Filipino,110017,691051,9264,Active,Alameda,Alameda County Office of Education,Alameda County Juvenile Hall/Court,0.0,County Office of Education (COE)


In [141]:
# Enrollment rows that found no match in the directory (County is missing).
county_missing = merged['County'].isna()
merged.loc[county_missing]

Unnamed: 0,ADULT,CDS_CODE,COUNTY,DISTRICT,DistrictName,ENR_TOTAL,ETHNIC,GENDER,GR_1,GR_10,...,ETHNIC_CAT,CD Code,NCESDist,NCESSchool,StatusType,County,District,School,DOC,DOCType
2689,0,01612596001788,,,,14,2,F,2,0,...,Asian,0161259,,,,,,,,
2690,0,01612596001788,,,,12,2,M,1,0,...,Asian,0161259,,,,,,,,
2691,0,01612596001788,,,,10,3,F,2,0,...,Pacific Islander,0161259,,,,,,,,
2692,0,01612596001788,,,,17,3,M,4,0,...,Pacific Islander,0161259,,,,,,,,
2693,0,01612596001788,,,,2,4,F,0,0,...,Filipino,0161259,,,,,,,,
2694,0,01612596001788,,,,1,4,M,0,0,...,Filipino,0161259,,,,,,,,
2695,0,01612596001788,,,,257,5,F,58,0,...,Hispanic or Latino,0161259,,,,,,,,
2696,0,01612596001788,,,,284,5,M,51,0,...,Hispanic or Latino,0161259,,,,,,,,
2697,0,01612596001788,,,,314,6,F,52,0,...,"Black, not Hispanic",0161259,,,,,,,,
2698,0,01612596001788,,,,316,6,M,51,0,...,"Black, not Hispanic",0161259,,,,,,,,


# Clean the county populations

In [6]:
# 1980s: county population estimates for 1981-1990
# Check header, skipfooter (they are specific to the layout of this workbook)
start = 1980
year_cols = [str(start + offset) for offset in range(1, 11)]

countypop_1980 = pd.read_excel(
    '/Users/tsengtammy/Dropbox/3 TT/4. Thesis/build/data/input/county_populations/e4_popest_1981_1990.xls',
    header=5, skipfooter=716)
# Discard any row that has a missing cell.
countypop_1980 = countypop_1980.dropna()

# First column is the county name, the remaining ones are the ten years.
countypop_1980.columns = ['County'] + year_cols

# Wide -> long: one row per (County, YEAR).
countypop_1980 = countypop_1980.melt(id_vars='County', value_vars=year_cols,
                                     var_name='YEAR', value_name='County_Population')

In [7]:
# Preview the first five rows of the long-format 1981-1990 populations.
countypop_1980.head(n=5)

Unnamed: 0,County,YEAR,County_Population
0,Alameda,1981,1117800.0
1,Alpine,1981,1090.0
2,Amador,1981,19800.0
3,Butte,1981,146800.0
4,Calaveras,1981,21350.0


In [8]:
# 1990s: county population estimates for 1991-2000
# Check header, skipfooter (they are specific to the layout of this workbook)
start = 1990
year_cols = [str(start + offset) for offset in range(1, 11)]

countypop_1990 = pd.read_excel(
    '/Users/tsengtammy/Dropbox/3 TT/4. Thesis/build/data/input/county_populations/e4_popest_1991_2000.xls',
    header=2, skipfooter=1)
# Discard any row that has a missing cell.
countypop_1990 = countypop_1990.dropna()

# drop extraneous cols: the second column and the last column
extraneous = [countypop_1990.columns[1], countypop_1990.columns[-1]]
countypop_1990 = countypop_1990.drop(columns=extraneous)

# First column is the county name, the remaining ones are the ten years.
countypop_1990.columns = ['County'] + year_cols

# Wide -> long: one row per (County, YEAR).
countypop_1990 = countypop_1990.melt(id_vars='County', value_vars=year_cols,
                                     var_name='YEAR', value_name='County_Population')

In [9]:
# 2000s: county population estimates for 2001-2010 (second sheet of the workbook)
# Check header, skipfooter (they are specific to the layout of this workbook)
start = 2000
year_cols = [str(start + offset) for offset in range(1, 11)]

countypop_2000 = pd.read_excel(
    '/Users/tsengtammy/Dropbox/3 TT/4. Thesis/build/data/input/county_populations/e4_popest_2001_2010.xls',
    sheet_name=1, header=3, skipfooter=1)
# Discard any row that has a missing cell.
countypop_2000 = countypop_2000.dropna()

# drop extraneous cols: the second column and the last column
extraneous = [countypop_2000.columns[1], countypop_2000.columns[-1]]
countypop_2000 = countypop_2000.drop(columns=extraneous)

# First column is the county name, the remaining ones are the ten years.
countypop_2000.columns = ['County'] + year_cols

# Wide -> long: one row per (County, YEAR).
countypop_2000 = countypop_2000.melt(id_vars='County', value_vars=year_cols,
                                     var_name='YEAR', value_name='County_Population')

In [10]:
# 2010s: county population estimates for 2011-2018 (second sheet of the workbook)
# Now want to get county population levels in order to use Boustan's specification
# Check header, skipfooter (they are specific to the layout of this workbook)
start = 2010
year_cols = [str(start + offset) for offset in range(1, 9)]

countypop_2010 = pd.read_excel(
    '/Users/tsengtammy/Dropbox/3 TT/4. Thesis/build/data/input/county_populations/e4_popest_2011_2018.xls',
    sheet_name=1, header=3, skipfooter=1)
# Discard any row that has a missing cell.
countypop_2010 = countypop_2010.dropna()

# drop extraneous cols: only the second column in this file
countypop_2010 = countypop_2010.drop(columns=countypop_2010.columns[1])

# First column is the county name, the remaining ones are the eight years.
countypop_2010.columns = ['County'] + year_cols

# Wide -> long: one row per (County, YEAR).
countypop_2010 = countypop_2010.melt(id_vars='County', value_vars=year_cols,
                                     var_name='YEAR', value_name='County_Population')

In [11]:
# AGGREGATED
# Stack the four decade tables into one long (County, YEAR, County_Population)
# table. pd.concat replaces the chained DataFrame.append calls, which were
# removed in pandas 2.0. Each table keeps its own row index, as before.
countypop = pd.concat([countypop_1980, countypop_1990, countypop_2000, countypop_2010])

In [12]:
# Normalise the merge keys: trim surrounding whitespace from county names and
# turn the YEAR labels (strings from the melt step) into integers.
countypop['County'] = countypop['County'].map(str.strip)
countypop['YEAR'] = countypop['YEAR'].astype(int)

In [13]:
# Preview the first five rows of the combined county populations.
countypop.head(n=5)

Unnamed: 0,County,YEAR,County_Population
0,Alameda,1981,1117800.0
1,Alpine,1981,1090.0
2,Amador,1981,19800.0
3,Butte,1981,146800.0
4,Calaveras,1981,21350.0


In [14]:
# Number of (County, YEAR) rows in the combined population table.
countypop.shape[0]

2204

# Merge schools with the county populations

In [225]:
# Add the county population for each row's County and YEAR. The left join
# keeps rows with no matching population; County_Population is NaN for them.
# validate='many_to_one' makes pandas raise if a (County, YEAR) pair is ever
# duplicated in `countypop`, instead of silently multiplying enrollment rows.
final = merged.merge(countypop, on=['County', 'YEAR'], how='left', validate='many_to_one')

In [226]:
# Preview the first five rows of the final table.
final.head(n=5)

Unnamed: 0,ADULT,CDS_CODE,COUNTY,DISTRICT,DistrictName,ENR_TOTAL,ETHNIC,GENDER,GR_1,GR_10,...,CD Code,NCESDist,NCESSchool,StatusType,County,District,School,DOC,DOCType,County_Population
0,0,1100170130401,,,,1,1,M,0,0,...,110017,691051,9264,Active,Alameda,Alameda County Office of Education,Alameda County Juvenile Hall/Court,0.0,County Office of Education (COE),1437136.0
1,0,1100170130401,,,,1,2,F,0,0,...,110017,691051,9264,Active,Alameda,Alameda County Office of Education,Alameda County Juvenile Hall/Court,0.0,County Office of Education (COE),1437136.0
2,0,1100170130401,,,,23,2,M,0,5,...,110017,691051,9264,Active,Alameda,Alameda County Office of Education,Alameda County Juvenile Hall/Court,0.0,County Office of Education (COE),1437136.0
3,0,1100170130401,,,,3,3,M,0,0,...,110017,691051,9264,Active,Alameda,Alameda County Office of Education,Alameda County Juvenile Hall/Court,0.0,County Office of Education (COE),1437136.0
4,0,1100170130401,,,,2,4,M,0,0,...,110017,691051,9264,Active,Alameda,Alameda County Office of Education,Alameda County Juvenile Hall/Court,0.0,County Office of Education (COE),1437136.0


In [227]:
# Row count of the final table.
final.shape[0]

3534277

In [228]:
# How many rows ended up without a county population.
int(final['County_Population'].isna().sum())

74444

# Open to-do: Why is county NA?
## There are some districts in the enrollment data which are not in the identifying file, so we can't get their district and county names.

# Deliverable Enrollment data

In [232]:
# Write the deliverable enrollment table to the output folder (row index omitted).
enrollment_csv = '/Users/tsengtammy/Dropbox/3 TT/4. Thesis/build/data/output/enrollment.csv'
final.to_csv(enrollment_csv, index=False)