In [1]:
import pandas as pd
import numpy as np
from tabula import read_pdf

In [2]:
# Two-letter state abbreviation; used to filter the federal data and to name the output CSV.
abbr = "MI"

In [3]:
# Workbook holding the state graduation data; read from ../../data/state_data_raw/.
file = "michigan2018.xlsx"

Read in the federal-level data.

In [4]:
# Load the federal fiscal file as SAS7BDAT, decoding text columns as ISO-8859-1.
fiscal = pd.read_sas(
    '../../data/fiscal2018',
    format='sas7bdat',
    encoding='iso-8859-1',
)

Select this state's districts from the federal data, keeping only rows whose `GSHI` value is `'12'`.

In [5]:
# Keep federal rows for this state whose GSHI field equals '12'.
stabbr_matches = fiscal['STABBR'] == abbr
gshi_is_12 = fiscal['GSHI'] == '12'
state_fiscal = fiscal[stabbr_matches & gshi_is_12]

In [6]:
# Number of federal rows kept for this state (shown as the cell output).
len(state_fiscal)

710

In [7]:
# Preview the first rows of the filtered federal data.
state_fiscal.head()

Unnamed: 0,LEAID,CENSUSID,FIPST,CONUM,CSA,CBSA,NAME,STNAME,STABBR,SCHLEV,...,FL_66V,FL_W01,FL_W31,FL_W61,FL_V95,FL_V02,FL_K14,FL_CE1,FL_CE2,FL_CE3
6791,2600001,N,26,26065,330,29620,Michigan Department of Corrections,Michigan,MI,2,...,N,N,N,N,N,N,N,N,N,N
6792,2600004,N,26,26065,330,29620,Michigan Department of Human Services,Michigan,MI,3,...,N,N,N,N,N,N,N,N,N,N
6793,2600005,23501300600000,26,26025,310,12980,Battle Creek Public Schools,Michigan,MI,3,...,R,R,R,R,R,M,M,I,R,M
6794,2600006,23502700100000,26,26053,N,N,Bessemer Area School District,Michigan,MI,3,...,R,R,R,R,R,M,M,R,R,M
6795,2600007,23503202900000,26,26063,N,N,Harbor Beach Community Schools,Michigan,MI,3,...,R,R,R,R,R,M,M,R,R,M


Read in the state-level data.

In [8]:
# Load the graduation/dropout sheet from the state workbook.
state_grads = pd.read_excel(
    f'../../data/state_data_raw/{file}',
    sheet_name='2018 4-Yr Grad Drop',
)

In [9]:
# Preview the raw sheet; the real column labels sit in the row labelled 1, not in the header.
state_grads.head()

Unnamed: 0,State of Michigan 2018 Cohort 4-Year Graduation and Dropout Rate Report,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,End of Row
0,Final Results,,,,,,,,,,,,,,,,End of Row
1,District / Building Name (Code),Totals - \nFirst Time 9th Grade in Fall 2014,Totals - \n (+)Transfers In,Totals - \n (-)Transfers Out & Exempt,Totals - \nCohort Size,Cohort Status - \nNumber of On Time Graduates,Cohort Status - \nNumber of Dropouts,Cohort Status - \nNumber Continuing in School,Cohort Status - \nNumber of Other Completers,Rates - \nGraduation Rate,Rates - \nDropout Rate,ISD Code,District Code,Building Code,ISD Name,District Name,Building Name
2,State,119026,6888,3670,122244,98583,10668,11693,1300,80.64%,8.73%,,00000,00000,,Statewide,State
3,Adams Township School District (31020),36,6,1,41,40,< 10,< 10,< 10,97.56%,0.00%,31,31020,,Copper Country ISD,Adams Township School District (31020),
4,Jeffers High School (01893),36,6,1,41,40,< 10,< 10,< 10,97.56%,0.00%,31,31020,01893,Copper Country ISD,Adams Township School District (31020),Jeffers High School (01893)


Reset columns.

In [10]:
# Promote the row labelled 1 (the real column labels) to the header,
# then keep only the rows from label 2 onward.
header_row = state_grads.loc[1]
state_grads.columns = header_row
state_grads = state_grads.loc[2:]

In [11]:
# Preview the table after the header row has been promoted.
state_grads.head()

1,District / Building Name (Code),Totals - \nFirst Time 9th Grade in Fall 2014,Totals - \n (+)Transfers In,Totals - \n (-)Transfers Out & Exempt,Totals - \nCohort Size,Cohort Status - \nNumber of On Time Graduates,Cohort Status - \nNumber of Dropouts,Cohort Status - \nNumber Continuing in School,Cohort Status - \nNumber of Other Completers,Rates - \nGraduation Rate,Rates - \nDropout Rate,ISD Code,District Code,Building Code,ISD Name,District Name,Building Name
2,State,119026,6888,3670,122244,98583,10668,11693,1300,80.64%,8.73%,,0,0.0,,Statewide,State
3,Adams Township School District (31020),36,6,1,41,40,< 10,< 10,< 10,97.56%,0.00%,31.0,31020,,Copper Country ISD,Adams Township School District (31020),
4,Jeffers High School (01893),36,6,1,41,40,< 10,< 10,< 10,97.56%,0.00%,31.0,31020,1893.0,Copper Country ISD,Adams Township School District (31020),Jeffers High School (01893)
5,Addison Community Schools (46020),67,5,10,62,55,< 10,< 10,< 10,88.71%,1.61%,46.0,46020,,Lenawee ISD,Addison Community Schools (46020),
6,Addison High School (00023),67,5,10,62,55,< 10,< 10,< 10,88.71%,1.61%,46.0,46020,23.0,Lenawee ISD,Addison Community Schools (46020),Addison High School (00023)


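Filter results: keep only the rows whose "District / Building Name (Code)" equals "District Name", i.e. the district-level rows.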

In [12]:
# Keep rows whose combined name/code label is identical to the district name.
is_district_row = state_grads['District / Building Name (Code)'] == state_grads['District Name']
state_grads = state_grads[is_district_row]

Select and rename columns.

In [13]:
# Narrow the table to the district name, cohort size and graduation rate columns.
state_grads = state_grads[
    [
        'District Name',
        'Totals - \nCohort Size',
        'Rates - \nGraduation Rate',
    ]
]

In [14]:
# Replace the three column labels with short names, in positional order.
state_grads.columns = [
    'District Name',
    'Total',
    'Graduation Rate',
]

In [15]:
# Display the filtered, renamed state table.
state_grads

Unnamed: 0,District Name,Total,Graduation Rate
3,Adams Township School District (31020),41,97.56%
5,Addison Community Schools (46020),62,88.71%
7,Adrian Public Schools (46010),269,86.62%
11,Airport Community Schools (58020),231,81.39%
17,Akron-Fairgrove Schools (79010),27,96.30%
...,...,...,...
1948,Tuscola ISD (79000),11,
1951,Van Buren ISD (80000),17,5.88%
1954,Washtenaw ISD (81000),14,7.14%
1958,West Shore Educational Service District (53000),< 10,


Convert data types.

In [16]:
# Cast both columns to text, then blank out the '< 10' marker, the percent sign,
# and the 'nan'/'NaN' text left by missing values so only numeric text remains.
state_grads['Total'] = (
    state_grads['Total']
    .astype(str)
    .str.replace('< 10', '')
    .str.replace('nan', '')
)
state_grads['Graduation Rate'] = (
    state_grads['Graduation Rate']
    .astype(str)
    .str.replace('%', '')
    .str.replace('NaN', '')
    .str.replace('nan', '')
)

In [17]:
# Parse the cleaned text as numbers; the graduation rate is stored as a fraction (percent / 100).
total_numeric = pd.to_numeric(state_grads['Total'])
rate_numeric = pd.to_numeric(state_grads['Graduation Rate'])
state_grads['Total'] = total_numeric
state_grads['Graduation Rate'] = rate_numeric.div(100)

Check for matches and non-matches in the two lists

In [18]:
# Normalise the state district names so they can be compared with the federal NAME column.
# `regex` is passed explicitly on every call: the default of Series.str.replace changed
# from regex to literal matching in pandas 2.0, which would silently leave the
# "(31020)"-style codes in place. Patterns are raw strings to avoid invalid-escape warnings.
state_grads['District Name'] = (
    state_grads['District Name']
    .astype(str)
    # Drop the parenthesised numeric code, e.g. " (31020)".
    .str.replace(r'\s\([0-9]+\)', '', regex=True)
    # Drop " *" markers.
    .str.replace(r' \*', '', regex=True)
    # Drop " #" markers; without MULTILINE only the `$` alternative can match,
    # so in practice this removes a trailing " #".
    .str.replace(r' \#(^\D|$)', '', regex=True)
    # Commas are removed literally.
    .str.replace(',', '', regex=False)
)


# state_fiscal['NAME'] = state_fiscal['NAME'].astype(str).str.replace(',', '')

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until


In [19]:
# State district names that also appear in the federal data, sorted alphabetically.
# The federal names are put in a set once so each membership test is a hash lookup
# rather than a scan of a freshly built list.
fiscal_names = set(state_fiscal['NAME'])
Matches = sorted(name for name in state_grads['District Name'] if name in fiscal_names)
len(Matches)

679

In [20]:
# State district names with no exact match in the federal data, sorted alphabetically.
# The federal names are put in a set once so each membership test is a hash lookup
# rather than a scan of a freshly built list.
fiscal_names = set(state_fiscal['NAME'])
A = sorted(name for name in state_grads['District Name'] if name not in fiscal_names)
A

['Adrian Public Schools',
 'Allen Academy',
 'Arenac Eastern School District',
 'Blanche Kelso Bruce Academy',
 'Blue Water Learning Academy',
 'Brandon School District in the Counties of Oakland and Lapeer',
 'Burton Glen Charter Academy',
 'Calhoun Intermediate School District',
 'Crescent Academy',
 'Dollar Bay-Tamarack City Area  K-12 School',
 'Dream Academy',
 'Eastpointe Community Schools',
 'Education Achievement Authority of Michigan',
 'Excel Charter Academy',
 'Experiencia Preparatory Academy',
 'Flat River Academy',
 'FlexTech High School - Novi',
 'Grand Rapids Ellington Academy of Arts & Technology',
 'Mackinac Preparatory Academy',
 'Michigan Department of Community Health',
 'Michigan School for the Arts',
 'NexTech High School',
 'NexTech High School of Lansing',
 'NexTech High School of Metro Detroit',
 'Plymouth Educational Center Charter School',
 'Presque Isle Academy',
 'Schools for the Future Detroit',
 'Whiteford Agricultural School District of the Counties of L

In [21]:
# Federal district names with no exact match in the state data, sorted alphabetically.
# The state names are put in a set once so each membership test is a hash lookup
# rather than a scan of a freshly built list.
grads_names = set(state_grads['District Name'])
B = sorted(name for name in state_fiscal['NAME'] if name not in grads_names)
B

['Adrian School District of the City of',
 'American International Academy',
 'Brandon School District in the Counties of Oakland and Lapee',
 'Calhoun ISD',
 'Cornerstone Jefferson-Douglass Academy',
 'David Ellis Academy',
 'David Ellis Academy West',
 'Detroit Collegiate High School',
 'Dollar Bay-Tamarack City Area Schools',
 'East Detroit Public Schools',
 'Genesee STEM Academy',
 'Gogebic-Ontonagon ISD',
 'Grattan Academy',
 'Highpoint Virtual Academy of Michigan',
 'Hope of Detroit Academy',
 'Huron Academy',
 'Innocademy',
 'Kent ISD',
 'Keys Grace Academy',
 'Livingston Classical Cyber Academy',
 'Macomb Academy',
 'New Paradigm College Prep',
 'Nexus Academy of Grand Rapids',
 'Nexus Academy of Lansing',
 'Nexus Academy of Royal Oak',
 'Oakland FlexTech Academy',
 'Oakland Schools',
 'Presque Isle Academy II',
 'Southwest Detroit Lighthouse Charter Academy',
 'Tipton Academy',
 'Whiteford Agricultural School District of the Counties of Le']

Rename the remaining unmatched federal district names for which a matching state name can be found.

In [22]:
# Maps a federal (state_fiscal) district name to the spelling used in the state data.
# Keys come from the federal-only list `B`; the commented-out keys are federal names
# that are left unmapped and therefore drop out of the inner merge.
# NOTE(review): 'East Detroit Public Schools' (federal) and 'Eastpointe Community Schools'
# (state) may be the same district under a newer name; TODO confirm before mapping it.
state_fiscal_rename = {
    'Adrian School District of the City of' : 'Adrian Public Schools',
#     'American International Academy',
    'Brandon School District in the Counties of Oakland and Lapee' : 'Brandon School District in the Counties of Oakland and Lapeer',
    'Calhoun ISD' : 'Calhoun Intermediate School District',
#     'Cornerstone Jefferson-Douglass Academy',
#     'David Ellis Academy',
#     'David Ellis Academy West',
#     'Detroit Collegiate High School',
    'Dollar Bay-Tamarack City Area Schools' : 'Dollar Bay-Tamarack City Area  K-12 School',
#     'East Detroit Public Schools',
#     'Genesee STEM Academy',
#     'Gogebic-Ontonagon ISD',
#     'Grattan Academy',
#     'Highpoint Virtual Academy of Michigan',
#     'Hope of Detroit Academy',
#     'Huron Academy',
#     'Innocademy',
#     'Kent ISD',
#     'Keys Grace Academy',
#     'Livingston Classical Cyber Academy',
#     'Macomb Academy',
#     'New Paradigm College Prep',
#     'Nexus Academy of Grand Rapids',
#     'Nexus Academy of Lansing',
#     'Nexus Academy of Royal Oak',
#     'Oakland FlexTech Academy',
#     'Oakland Schools',
    'Presque Isle Academy II' : 'Presque Isle Academy',
#     'Southwest Detroit Lighthouse Charter Academy',
#     'Tipton Academy',
    'Whiteford Agricultural School District of the Counties of Le' : 'Whiteford Agricultural School District of the Counties of Lenawee and Monroe'
}

In [23]:
# Apply the name mapping to the NAME column only. A frame-wide replace would rewrite any
# cell in any column that happens to equal one of the keys; only NAME is used as the merge key.
state_fiscal = state_fiscal.assign(
    NAME=state_fiscal['NAME'].replace(state_fiscal_rename)
)

Merge federal and state data, keeping only matches between the two. 

In [24]:
# Inner join on district name: only districts present in both tables are kept.
state_grads_merged = state_fiscal.merge(
    state_grads,
    how='inner',
    left_on='NAME',
    right_on='District Name',
)

Save cleaned data. 

In [25]:
# Write the merged table to <abbr>.csv in the merged-data directory, without the index column.
state_grads_merged.to_csv(
    f'../../data/state_data_merged/{abbr}.csv',
    index=False,
)