## Importing Libraries

In [17]:
import os
import warnings
from datetime import datetime

import googlemaps
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")

## Loading Data

In [18]:
#######################################
#### Public School database California
#######################################

#Source: https://www.cde.ca.gov/ds/si/ds/pubschls.asp
public = pd.read_excel('pubschls.xlsx', header = 5)

#Enrollment 
#Source: https://www.cde.ca.gov/ds/sd/sd/filesenr.asp [For year 2016-17]
enrollment = pd.read_csv("enrollment1617.txt", sep='\t')

#Ranking
#souce: http://www.ccsa.org/2017/10/2018-19-ccsa-decile-ranks-spreadsheet.html
ranking = pd.read_excel('2018-19 CCSA Decile Ranks Spreadsheet (110218).xls', header = 2,  
                        names = ['CDS', 'school', 'sw_rank_2108', 'sw_rank_2017', 'sw_rank_2016',
                                'ss_rank_2018', 'ss_rank_2017', 'ss_rank_2016', 'county', 'district',
                                'authorizer', 'charter', 'school_type', 'parent', 'DASS_18_19', 'test_taker' ])

############################
#### Private School database
############################
#Source: https://www.cde.ca.gov/ds/si/ps/
private = pd.read_excel('privateschools1718.xlsx', header = 2)

###################
## Additional Data
###################

#Bay area counties list 
BA_counties = ['Alameda', 'Contra Costa', 'Marin', 'Napa', 'San Francisco', 'San Mateo', 'Santa Clara', 'Solano', 'Sonoma']

## Data cleaning 

### Public school database

In [19]:
#Filterning Bay area counties in database 
mask_public = public.County.isin(BA_counties)
BA_public = public[mask_public]

#Filter columns with the information needed only
info = ['CDSCode', 'NCESDist', 'NCESSchool', 'StatusType', 'County', 'District',
       'School', 'Street', 'City', 'Zip', 'State', 'OpenDate', 'ClosedDate', 'Charter', 'CharterNum',
       'SOC', 'SOCType', 'EILCode', 'EILName', 'GSoffered', 'GSserved', 'Virtual', 'Latitude', 'Longitude', 'LastUpDate']

BA_public = BA_public.loc[:,info]

######################
#Deleted observations
######################

#1. Delete Closed and pending schools 
closed_school = (BA_public.StatusType == 'Closed') | (BA_public.StatusType == 'Pending')
BA_public = BA_public[~closed_school]

#2. Delete Rows representing districs
school_id = BA_public.CDSCode.str.split(r'\d{7}', n = 1, expand = True) #Gets the string after the 7th position
BA_public['CDSCodeID'] = school_id[1] 
BA_public = BA_public[BA_public['CDSCodeID']!='0000000'] #Filters out all observations 0000000 that represent an distric (not a shcool)

#3. Closing date less than 2010 (Travel survey was in 2011) [Scools that are closed to to merge to other school]
BA_public.ClosedDate = [np.nan if x=='No Data' else x for x in BA_public.ClosedDate]
mask_closingDate = BA_public.ClosedDate.dt.year<=2010
BA_public = BA_public[~mask_closingDate]

#4a. Deleting Adult Schools
BA_public = BA_public[BA_public.EILName != 'Adult']
BA_public = BA_public[BA_public.SOCType != 'Adult Education Centers']

# #4b Deliting Virtual School 
# mask_virtual = (BA_public.Virtual == 'V') | (BA_public.Virtual == 'F')
# BA_public = BA_public[~mask_virtual]

#######################
## Structuring Data
#######################

#1. Infering grades served. Creates a colum for each grade >> if a school serves a grade x, then 1, o.w 0. (K< X <12)
BA_public.GSserved = [np.nan if x == 'No Data' else x for x in BA_public.GSserved] #Replace 'No Data' values with NaN
BA_public.GSserved.fillna(BA_public.GSoffered, inplace=True) #Replace NaN values in GSServe with values in GSoffered

grade_range = BA_public.GSserved.str.split('-', n = 1, expand = True) # Gets min and max grade served in a school
grade_range = grade_range.replace(['K', 'P', None], [0, 0, np.nan]) #Zero represents K grade. 
grade_range = grade_range.astype('float') # All values are a float. Before they were str

# NaN values in colum 1 are replace by the value in column 0
# That happens when there is only one grade
grade_range[1].fillna(grade_range[0], inplace=True) # NaN values in colum 1 are replace by the value in column 0

for grade in range(13):
    col_name = 'grade_' + str(grade)
    BA_public[col_name] = [1 if (grade >= row[0]) & (grade <= row[1]) else 0 for index, row in grade_range.iterrows()]
    
#2. Type: Public school 
BA_public['type'] = 'public'

BA_public.head()

# CDSCode from str to int 64
BA_public.CDSCode = BA_public.CDSCode.astype('int64')

##########################
## Adding enrollment data
##########################

#Adding enrollment per school per grade. 
enrollment = enrollment.groupby(['CDS_CODE', 'COUNTY', 'DISTRICT', 'SCHOOL']).sum().reset_index(drop=False)

#Merging Public School database with enrollment
BA_public = BA_public.merge(enrollment, how='left', left_on='CDSCode', right_on='CDS_CODE')
BA_public.columns

#Checking null enrollment after merge [94 schools total. 43 were open after 2017]
BA_public.OpenDate = pd.to_datetime(BA_public.OpenDate) #changing formating to datetime
mask = BA_public.OpenDate.dt.year <2017 #Filter school open before 2017
null_school = BA_public[(BA_public.GR_1.isnull()) & (mask)] 
# null_school.loc[:,['CDSCode', 'SOCType', 'OpenDate', 'GSoffered', 'GSserved']]

############################
## Adding Performance Index 
############################

#Filtering Bay Area Counties 
mask_county = ranking.county.isin(BA_counties)
ranking = ranking[mask_county]

#Merging Performance Index [302 NAN]
BA_public = BA_public.merge(ranking, how = 'left', left_on='CDSCodeID', right_on='CDS')

print(BA_public.shape)



(1828, 78)


In [20]:
# Randomly select 30 observations with no rank information.
# a[a.sw_rank_2016_y.isnull()].loc[:,['CDSCodeID','School','Street','ENR_TOTAL']].sample(30)

In [21]:
# import urllib.request
# urllib.request.urlopen('https://www.caschooldashboard.org/reports/01611430131177/2018').read()

# import requests

# link = "https://www.caschooldashboard.org/reports/01611430131177/2018"
# f = requests.get(link)
# print(f.text)

### Private school dataset

In [22]:
#Filtering Bay Area private school 
mask_private = private.County.isin(BA_counties)
BA_private = private[mask_private]

#Filtering the columns with the information needed only
info_private = ['County', 'CDS Code', 'School ', 'Street', 'City',
       'State', 'Zip', 'Public District','School Type',
       'Low Grade', 'High Grade', 'Kindergarten Enrollment',
       'Grade 1 Enrollment', 'Grade 2 Enrollment', 'Grade 3 Enrollment',
       'Grade 4 Enrollment', 'Grade 5 Enrollment', 'Grade 6 Enrollment',
       'Grade 7 Enrollment', 'Grade 8 Enrollment', 'Grade 9 Enrollment',
       'Grade 10 Enrollment', 'Grade 11 Enrollment', 'Grade 12 Enrollment',
       'Ungraded Elementary Enrollmnet', 'Ungraded Secondary Enrollment',
       'Total Enrollment']
BA_private = BA_private.loc[:,info_private]

#Creating type >> All school in this dataset are private 
BA_private['type'] = 'private'

#######################
## Structuring Data
#######################

#1. Creates a colum for each grade >> if a school serves a grade x, then 1, o.w 0. (K< X <12)
grade_range_private = BA_private.loc[:,['Low Grade', 'High Grade']]
grade_range_private = grade_range_private.replace(['K'], [0])
grade_range_private = grade_range_private.astype('float') 

for grade in range(13):
    col_name = 'grade_' + str(grade)
    BA_private[col_name] = [1 if (grade >= row[0]) & (grade <= row[1]) else 0 for index, row in grade_range_private.iterrows()]

#Google API to get latitude and longitude information

################ MISSING, BUT ALREADY ON THE TO DO LIST #############3

BA_private.shape

(726, 41)

### Google API for geocoding private school latitude and longitude based on their address. 

In [23]:
#Google API Request: Input >> School address, Output >> School lat and lng coordinates. 
school_address = [row[4]+', '+row[5]+', '+row[7] for index, row in BA_private.iterrows()] #Address required by google API
gmaps = googlemaps.Client(key='AIzaSyCIaoRDeVqLyaXgBi5-vd5USdSNaI6gakA') #This is my personal key. DO NOT USE IT!! 
json_coor = [gmaps.geocode(x) for x in school_address] #Google API query

In [24]:
#Creating a list of latitude and longitude 
lat = [json_coor[x][0]['geometry']['location']['lat'] for x in range(len(json_coor))]
lng = [json_coor[x][0]['geometry']['location']['lng'] for x in range(len(json_coor))]

#Appending the list to the BA_private dataframe
BA_private['Latitude'] = lat
BA_private['Longitude'] = lng

In [25]:
# Sanity check: same number of rows as before geocoding, plus the two new
# columns (Latitude, Longitude).
BA_private.shape

(726, 43)

## Joining public and private school dataset

In [27]:
# Selection common columns 
BA_public = BA_public.loc[:,['CDSCode', 'School', 'District','County', 'Street', 'City', 'Zip', 'State', 'Latitude', 'Longitude',
                 'type','grade_0', 'grade_1', 'grade_2', 'grade_3','grade_4', 'grade_5', 'grade_6', 'grade_7', 
                 'grade_8', 'grade_9', 'grade_10', 'grade_11', 'grade_12', 'KDGN', 'GR_1', 'GR_2', 'GR_3', 'GR_4',
                 'GR_5', 'GR_6', 'GR_7', 'GR_8', 'UNGR_ELM', 'GR_9', 'GR_10', 'GR_11', 'GR_12', 'UNGR_SEC', 'ENR_TOTAL',
                 'sw_rank_2108', 'sw_rank_2017', 'sw_rank_2016', 'ss_rank_2018_', 'ss_rank_2017', 'ss_rank_2016']]

BA_private = BA_private.loc[:,['CDS Code', 'School ', 'Public District', 'County', 'Street', 'City', 'Zip', 'State', 'type',
                  'grade_0','grade_1', 'grade_2', 'grade_3', 'grade_4', 'grade_5', 'grade_6', 'grade_7', 
                  'grade_8', 'grade_9', 'grade_10', 'grade_11', 'grade_12', 'Kindergarten Enrollment', 
                  'Grade 1 Enrollment', 'Grade 2 Enrollment','Grade 3 Enrollment', 'Grade 4 Enrollment', 
                  'Grade 5 Enrollment','Grade 6 Enrollment', 'Grade 7 Enrollment', 'Grade 8 Enrollment',
                  'Grade 9 Enrollment', 'Grade 10 Enrollment', 'Grade 11 Enrollment', 'Grade 12 Enrollment', 
                  'Ungraded Elementary Enrollmnet', 'Ungraded Secondary Enrollment', 'Total Enrollment', 'Latitude', 'Longitude' ]]

#Unifying columns names 
BA_public.columns = ['CDSCode', 'School', 'District', 'County', 'Street', 'City', 'Zip','State', 
                     'Latitude', 'Longitude', 'type', 'grade_0', 'grade_1','grade_2', 'grade_3', 
                     'grade_4', 'grade_5', 'grade_6', 'grade_7', 'grade_8', 'grade_9', 'grade_10', 
                     'grade_11', 'grade_12', 'Kindergarten Enrollment', 'Grade 1 Enrollment', 
                     'Grade 2 Enrollment','Grade 3 Enrollment', 'Grade 4 Enrollment', 'Grade 5 Enrollment',
                     'Grade 6 Enrollment', 'Grade 7 Enrollment', 'Grade 8 Enrollment','Ungraded Elementary Enrollmnet',
                     'Grade 9 Enrollment', 'Grade 10 Enrollment', 'Grade 11 Enrollment', 'Grade 12 Enrollment', 
                     'Ungraded Secondary Enrollment','Total Enrollment', 'sw_rank_2108', 'sw_rank_2017', 
                     'sw_rank_2016','ss_rank_2018_', 'ss_rank_2017', 'ss_rank_2016']

BA_private.columns = ['CDSCode', 'School', 'District', 'County', 'Street', 'City', 'Zip', 'State', 'type',
                  'grade_0','grade_1', 'grade_2', 'grade_3', 'grade_4', 'grade_5', 'grade_6', 'grade_7', 
                  'grade_8', 'grade_9', 'grade_10', 'grade_11', 'grade_12', 'Kindergarten Enrollment', 
                  'Grade 1 Enrollment', 'Grade 2 Enrollment','Grade 3 Enrollment', 'Grade 4 Enrollment', 
                  'Grade 5 Enrollment','Grade 6 Enrollment', 'Grade 7 Enrollment', 'Grade 8 Enrollment',
                  'Grade 9 Enrollment', 'Grade 10 Enrollment', 'Grade 11 Enrollment', 'Grade 12 Enrollment', 
                  'Ungraded Elementary Enrollmnet', 'Ungraded Secondary Enrollment', 'Total Enrollment', 'Latitude', 'Longitude']


#####################################
## Creating a Unique SchoolData Base 
####################################

schools = pd.concat([BA_public, BA_private], sort= False)


## Getting node ID for each school

In [50]:
#Small netwrok 
nodessmall = pd.read_csv(d + 'bay_area_tertiary_strongly_nodes.csv') \
            .set_index('osmid')
edgessmall = pd.read_csv(d + 'bay_area_tertiary_strongly_edges.csv')
netsmall = pdna.Network(nodessmall.x, nodessmall.y, edgessmall.u,
                                edgessmall.v, edgessmall[['length']],
                                twoway=False)
netsmall.precompute(25000)

def node_id_small(school, netsmall):
    """ Return the node ID for each school"""
    idssmall_shcool = netsmall.get_node_ids(school.Longitude, school.Latitude)
    return idssmall_shcool

# Store, for every school, the node id the small network returns for its coordinates.
# NOTE(review): Latitude/Longitude are only cast to float in a later cell —
# confirm they are already numeric at this point.
schools['nodeID'] = node_id_small(schools, netsmall)

## Getting the parcel ID for each school

In [51]:
#Setting a new Directory
os.chdir('/home/juan/ual_model_workspace/spring-2019-models')
orca.run(['initialize_network_small', 'initialize_network_walk'])

Running step 'initialize_network_small'
Time to execute step 'initialize_network_small': 0.04 s
Running step 'initialize_network_walk'
Time to execute step 'initialize_network_walk': 0.38 s
Total time to execute iteration 1 with iteration value None: 0.42 s


In [52]:
# Fetch the orca table registered as 'parcels' and convert it with to_frame().
# Later cells read its `x`/`y` columns and use its index as the parcel id.
parcels = orca.get_table('parcels').to_frame()

In [72]:
schools['Latitude'] = schools['Latitude'].astype(float)
schools['Longitude'] = schools['Longitude'].astype(float)

In [73]:
# Convert coordinates from degrees to radians for the haversine BallTree built below.
# Column order is (y, x) for parcels and (Latitude, Longitude) for schools, so
# both arrays list their columns in the same order.
# NOTE(review): assumes parcels.y / parcels.x hold latitude / longitude in degrees — confirm.
parcels_rad = np.deg2rad(parcels[['y', 'x']])
school_rad = np.deg2rad(schools[['Latitude', 'Longitude']])

In [74]:
#Retriving parcel ID for each school
tree = BallTree(parcels_rad, metric='haversine')

dists, idxs = tree.query(school_rad, return_distance=True)
schools['parcel_id_school'] = parcels.iloc[idxs[:,0]].index

In [73]:
#Cheking no repeted schools 
schools = schools.drop_duplicates(subset = 'CDSCode', keep = 'first', inplace = False)

In [79]:
path='/home/juan/ual_model_workspace/spring-2019-models/notebooks-juan/'
schools.to_csv(path + 'schools.csv') #Exporting the database to excel