# School demographics

This notebook loads and cleans school enrollment and demographic data from the annual enrollment snapshots produced by the New York City Department of Education.

## Import Python libraries and set working directories

In [1]:
import os
import feather
import numpy as np
import pandas as pd

In [2]:
# All project data folders live under ../data, i.e. in the parent of the
# directory the notebook is run from.
data_root = os.path.join(os.path.dirname(os.getcwd()), 'data')

input_dir = os.path.join(data_root, 'input')                # raw source files
intermediate_dir = os.path.join(data_root, 'intermediate')  # cleaned, per-topic frames
output_dir = os.path.join(data_root, 'output')              # final outputs

## Load data and select relevant variables

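The [raw file](http://schools.nyc.gov/NR/rdonlyres/77954FB0-FD24-476B-AB81-3E9BBE8655D9/213559/DemographicSnapshot201213to201617Public_FINAL1.xlsx) is published by the NYC Department of Education (NYCDOE) and is available [here](http://schools.nyc.gov/Accountability/data/default.htm). The cell below reads the workbook's `School` sheet, keeps only the 2013-14 school year, and drops the grade-level, count (`#`), and index columns.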

In [6]:
# Read the school-level sheet of the NYCDOE demographic snapshot workbook.
# `sheet_name` is the supported keyword; the older `sheetname` spelling was
# deprecated in pandas 0.21 and removed in later releases.
demographics_raw = pd.read_excel(
    os.path.join(input_dir, 'DemographicSnapshot201213to201617Public_FINAL1.xlsx'),
    sheet_name = 'School'
)

# Normalize column names: lower case, underscores instead of spaces, and
# 'perc' instead of '%'. regex = False treats the patterns as literal text
# regardless of the pandas version's default.
demographics_raw.columns = (
    demographics_raw.columns
    .str.lower()
    .str.replace(' ', '_', regex = False)
    .str.replace('%', 'perc', regex = False)
)

# Drop every column whose name mentions a grade, a count ('#') or an index.
unused_cols = [c for c in demographics_raw.columns if ('grade' in c) or ('#' in c) or ('index' in c)]
demographics = demographics_raw.drop(columns = unused_cols)

# Percentage columns, identified by name before any renaming below.
percent_vars = [c for c in demographics.columns if 'perc' in c]

# Keep a single school year. Filtering before scaling avoids touching rows
# that are discarded anyway.
snapshot_year = '2013-14'
demographics = demographics.loc[demographics['year'] == snapshot_year]

# Fail loudly rather than silently saving an empty frame if the year label
# is missing or formatted differently in the workbook.
if demographics.empty:
    raise ValueError("No rows found for year '{}' in the 'School' sheet".format(snapshot_year))

# Multiply the percentage columns by 100, then rename, drop the now-constant
# 'year' column and renumber the rows. Everything returns a new frame, so the
# filtered slice is never mutated in place (no SettingWithCopyWarning).
demographics = (
    demographics
    .assign(**{var: demographics[var] * 100 for var in percent_vars})
    .rename(columns = {'perc_multiple_race_categories_not_represented': 'perc_multiple_other',
                       'perc_poverty': 'perc_free_lunch'})
    .drop(columns = ['year'])
    .reset_index(drop = True)
)

## Save data

Save the `demographics` dataframe to a [feather](https://blog.cloudera.com/blog/2016/03/feather-a-fast-on-disk-format-for-data-frames-for-r-and-python-powered-by-apache-arrow/) file in the `data/intermediate` folder.

In [7]:
# Destination of the cleaned frame inside the data/intermediate folder.
demographics_path = os.path.join(intermediate_dir, 'df_demographics.feather')

# Persist the cleaned demographics frame in feather format.
demographics.to_feather(demographics_path)