# Imports

In [1]:
import pandas as pd
import requests as rq
import re
import json
#pd.set_option('display.max_rows', 300)

# Functions

In [2]:
def get_borough_name(borough_code):
    if borough_code == 'K':
        return 'Brooklyn'
    elif borough_code == 'Q':
        return 'Queens'
    elif borough_code == 'M':
        return 'Manhattan'
    elif borough_code == 'X':
        return 'The Bronx'
    else:
        return 'Staten Island'

def separate_dbn_or_bn(dbn_or_bn_code):
    split_string = re.split('(X)|(K)|(Q)|(M)|(R)', dbn_or_bn_code)
    split_string = tuple(filter(None, split_string))
    if len(split_string) == 2:
        borough, number = split_string
        borough = get_borough_name(borough)
        return [borough, int(number)]
    else:
        district, borough, number = split_string
        borough = get_borough_name(borough)
        return [int(district), str(borough), int(number)]


def fill_name_and_district(borough, number, directory):
    idx = (borough, number)
    if idx in directory.index:
        name = directory.loc[idx, 'School Name']
        district = directory.loc[idx, 'District']
        print(int(district))
        return [district, name]
    else:
        return [None, None]
    

# Cleaning Charter School Data

In [3]:
charter_list = pd.read_excel('Data/nyc_charter_school_list.xlsx')
charter_list.head();
charter_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292 entries, 0 to 291
Data columns (total 14 columns):
DBN                           236 non-null object
Bedscode                      292 non-null int64
School Name                   292 non-null object
School District               292 non-null object
NYC                           292 non-null object
CSD                           234 non-null float64
Network Name                  153 non-null object
Network Type                  119 non-null object
Conversion                    292 non-null object
Unionized                     292 non-null object
2018-19 Years of Operation    292 non-null int64
Grades                        292 non-null object
Charter End Date              292 non-null datetime64[ns]
Authorizer                    292 non-null object
dtypes: datetime64[ns](1), float64(1), int64(2), object(10)
memory usage: 32.0+ KB


In [4]:
col_list = ['DBN', 'School Name', 'NYC', 'CSD', 'Grades', '2018-19 Years of Operation']
charter_list = charter_list[col_list]
charter_list.head();

In [5]:
charter_list = charter_list[charter_list['NYC'] == 'Yes'].reset_index(drop = True)
charter_list = charter_list.drop('NYC', axis = 1)
charter_list.head();

In [6]:
charter_list[['District', 'Borough', 'Number']] = charter_list.apply(
    lambda row: pd.Series(separate_dbn_or_bn(row['DBN'])), axis = 1)

charter_list = charter_list.drop('DBN', axis = 1)
charter_list.head();

In [7]:
charter_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 236 entries, 0 to 235
Data columns (total 7 columns):
School Name                   236 non-null object
CSD                           234 non-null float64
Grades                        236 non-null object
2018-19 Years of Operation    236 non-null int64
District                      236 non-null int64
Borough                       236 non-null object
Number                        236 non-null int64
dtypes: float64(1), int64(3), object(3)
memory usage: 13.0+ KB


In [8]:
nan_rows = charter_list[charter_list['CSD'].isnull()]
nan_rows;

In [9]:
charter_list.at[122, 'CSD'] = 23.0
charter_list.at[137, 'CSD'] = 31.0
charter_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 236 entries, 0 to 235
Data columns (total 7 columns):
School Name                   236 non-null object
CSD                           236 non-null float64
Grades                        236 non-null object
2018-19 Years of Operation    236 non-null int64
District                      236 non-null int64
Borough                       236 non-null object
Number                        236 non-null int64
dtypes: float64(1), int64(3), object(3)
memory usage: 13.0+ KB


In [10]:
charter_list['District'] = charter_list['CSD'].astype('int64')
charter_list = charter_list.drop('CSD', axis = 1)
charter_list.head();

In [11]:
charter_list['Opening Year'] = 2019 - charter_list['2018-19 Years of Operation']
charter_list = charter_list[charter_list['Opening Year'] <= 2017]
charter_list = charter_list.drop('2018-19 Years of Operation', axis = 1)
charter_list.head();

In [12]:
col_list = ['Borough', 'District', 'Number', 'School Name', 'Grades', 'Opening Year']
charter_list = charter_list[col_list]
charter_list.head();

In [13]:
charter_list = charter_list.set_index(['Borough', 'District', 'Opening Year']).sort_index()
charter_list.loc[('Brooklyn', 13, 2000)]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Number,School Name,Grades
Borough,District,Opening Year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Brooklyn,13,2000,702,Community Partnership Charter School,"PK,0K,01,02,03,04,05,06,07,08"


In [14]:
keys = [(i, j) for i in range(1, 33) for j in range(1999, 2018)]
school_counter = {key:0 for key in keys}
for index, row in charter_list.iterrows():
    b, d, y = index
    school_counter[(d, y)] += 1

In [15]:
new_school_counter = {'District' : [], 'Year' : [], 'Num Charters' : []}
for key, value in school_counter.items():
    district, year = key
    new_school_counter['District'].append(district)
    new_school_counter['Year'].append(year)
    new_school_counter['Num Charters'].append(value)
    
num_charters_df = pd.DataFrame(new_school_counter).set_index('District').sort_index()
num_charters_df['Num Charters'] = num_charters_df['Num Charters'].groupby('District').cumsum()
num_charters_df = num_charters_df[num_charters_df['Year'] >= 2005]
num_charters_df = num_charters_df.reset_index().set_index(['District', 'Year'])
num_charters_df.head(15);

# Cleaning Public School Data

In [16]:
row_limit = '10000'
cols = 'bn,IndicatorOverall_1,IndicatorOverall_2,IndicatorOverall_3,IndicatorOverall_4,IndicatorOverall_5,overall_rating'
SoQL_query = '&$select=' + cols
r = rq.get('https://data.cityofnewyork.us/resource/9n9z-hh9p.json?$limit=' + row_limit + SoQL_query)
public_school_ratings = pd.DataFrame(r.json())
public_school_ratings.head()




Unnamed: 0,IndicatorOverall_1,IndicatorOverall_2,IndicatorOverall_3,IndicatorOverall_4,IndicatorOverall_5,bn,overall_rating
0,P,WD,WD,WD,P,M307,P
1,P,P,WD,P,P,M459,P
2,WD,WD,WD,WD,WD,M560,WD
3,WD,WD,WD,WD,P,M294,WD
4,P,P,P,P,P,M299,P


In [17]:
SoQL_query = '&$select=dbn,school_name AS Name'
r = rq.get('https://data.cityofnewyork.us/resource/h7rb-945c.json?' + SoQL_query)
hs_directory_2018 = pd.DataFrame(r.json())
hs_directory_2018.head();

In [18]:
hs_directory_2018[['District', 'Borough', 'Number']] = hs_directory_2018.apply(
    lambda row: pd.Series(separate_dbn_or_bn(row['dbn'])), axis = 1)

hs_directory_2018 = hs_directory_2018.drop('dbn', axis = 1)
hs_directory_2018.head();

In [19]:
SoQL_query = '&$select=schooldbn,printedschoolname AS Name'
r = rq.get('https://data.cityofnewyork.us/resource/6kcb-9g8d.json?' + SoQL_query)
ms_directory_2018 = pd.DataFrame(r.json())
ms_directory_2018.head();

In [20]:
ms_directory_2018[['District', 'Borough', 'Number']] = ms_directory_2018.apply(
    lambda row: pd.Series(separate_dbn_or_bn(row['schooldbn'])), axis = 1)

ms_directory_2018 = ms_directory_2018.drop('schooldbn', axis = 1)
ms_directory_2018.head();

In [21]:
school_directory_2018 = hs_directory_2018.append(ms_directory_2018)
school_directory_2018.columns = ['School Name', 'District', 'Borough', 'Number']
school_directory_2018 = school_directory_2018.set_index(['Borough', 'Number']).sort_index()
school_directory_2018

Unnamed: 0_level_0,Unnamed: 1_level_0,School Name,District
Borough,Number,Unnamed: 2_level_1,Unnamed: 3_level_1
Brooklyn,2,Parkside Preparatory Academy,17
Brooklyn,8,Robert Fulton (P.S./M.S. 8),13
Brooklyn,14,Shell Bank (J.H.S. 14),22
Brooklyn,30,Mary White Ovington (P.S./I.S. 30),20
Brooklyn,35,Stephen Decatur Middle School 35,16
Brooklyn,41,Walter Francis White (P.S. 41),23
Brooklyn,45,Horace E. Greene (P.S./I.S. 45K),32
Brooklyn,50,John D. Wells (M.S. 50),14
Brooklyn,51,William Alexander Middle School (M.S. 51),15
Brooklyn,57,Ron Brown Academy (M.S. 57),16


In [22]:
public_school_ratings = public_school_ratings.drop(75).reset_index(drop = True)
public_school_ratings['bn'] = public_school_ratings['bn'].str.upper()

public_school_ratings[['Borough', 'Number']] = public_school_ratings.apply(  
    lambda row: pd.Series(separate_dbn_or_bn(row['bn'])), axis = 1)

public_school_ratings.drop('bn', axis = 1)

Unnamed: 0,IndicatorOverall_1,IndicatorOverall_2,IndicatorOverall_3,IndicatorOverall_4,IndicatorOverall_5,overall_rating,Borough,Number
0,P,WD,WD,WD,P,P,Manhattan,307
1,P,P,WD,P,P,P,Manhattan,459
2,WD,WD,WD,WD,WD,WD,Manhattan,560
3,WD,WD,WD,WD,P,WD,Manhattan,294
4,P,P,P,P,P,P,Manhattan,299
5,WD,WD,WD,WD,P,WD,Manhattan,407
6,P,WD,WD,WD,P,WD,Manhattan,519
7,P,P,WD,WD,P,P,Manhattan,304
8,WD,WD,WD,WD,WD,WD,Manhattan,449
9,P,P,WD,P,P,P,Manhattan,570


In [23]:
public_school_ratings.head()

Unnamed: 0,IndicatorOverall_1,IndicatorOverall_2,IndicatorOverall_3,IndicatorOverall_4,IndicatorOverall_5,bn,overall_rating,Borough,Number
0,P,WD,WD,WD,P,M307,P,Manhattan,307
1,P,P,WD,P,P,M459,P,Manhattan,459
2,WD,WD,WD,WD,WD,M560,WD,Manhattan,560
3,WD,WD,WD,WD,P,M294,WD,Manhattan,294
4,P,P,P,P,P,M299,P,Manhattan,299


In [24]:
public_school_ratings[['District', 'School Name']] = public_school_ratings.apply(
    lambda row: pd.Series(fill_name_and_district(row['Borough'], row['Number'], school_directory_2018)), axis = 1)

public_school_ratings

3
3
2
2
3


TypeError: ("cannot convert the series to <class 'int'>", 'occurred at index 5')

In [None]:
public_school_ratings.iloc[5]

In [25]:
public_school_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8063 entries, 0 to 8062
Data columns (total 9 columns):
IndicatorOverall_1    8063 non-null object
IndicatorOverall_2    8063 non-null object
IndicatorOverall_3    8063 non-null object
IndicatorOverall_4    8063 non-null object
IndicatorOverall_5    8063 non-null object
bn                    8063 non-null object
overall_rating        8063 non-null object
Borough               8063 non-null object
Number                8063 non-null int64
dtypes: int64(1), object(8)
memory usage: 567.0+ KB
