In [1]:
import pandas as pd
import numpy as np
from tabula import read_pdf

In [2]:
abbr = 'MO'

In [3]:
file = 'missouri2018.xlsx'

Read in federal level data

In [4]:
fiscal = pd.read_sas('../../data/fiscal2018', format = 'sas7bdat', encoding='iso-8859-1')

Generate list of districts in the state in the federal data

In [5]:
state_fiscal = fiscal[(fiscal['STABBR'] == abbr) & (fiscal['GSHI'] == '12')]

In [6]:
len(state_fiscal)

467

In [7]:
state_fiscal.head()

Unnamed: 0,LEAID,CENSUSID,FIPST,CONUM,CSA,CBSA,NAME,STNAME,STABBR,SCHLEV,...,FL_66V,FL_W01,FL_W31,FL_W61,FL_V95,FL_V02,FL_K14,FL_CE1,FL_CE2,FL_CE3
8456,2900001,26506300100000,29,29125,N,N,MARIES CO. R-II,Missouri,MO,03,...,R,R,R,R,R,R,R,M,M,M
8457,2900002,26504100700000,29,29081,N,N,SOUTH HARRISON CO. R-II,Missouri,MO,03,...,R,R,R,R,R,R,R,M,M,M
8458,2900003,26507240100000,29,29143,N,N,PORTAGEVILLE,Missouri,MO,03,...,R,R,R,R,R,R,R,M,M,M
8459,2900004,26507201200000,29,29143,N,N,NEW MADRID CO. R-I,Missouri,MO,03,...,R,R,R,R,R,R,R,M,M,M
8460,2900005,N,29,29159,N,42740,STATE FAIR COMMUNITY COLLEGE,Missouri,MO,N,...,M,M,M,M,M,M,M,M,M,M


Read in state level data

In [8]:
state_grads = pd.read_excel('../../data/state_data_raw/' + file)

In [9]:
state_grads

Unnamed: 0,YEAR,COUNTY_DISTRICT_CODE,DISTRICT_NAME,IS_K_12_DISTRICT,IS_K_8_ONLY_DISTRICT,BEG_GRADE,END_GRADE,DIST_EXIST_3_YRS,IS_GROWING_A_HIGH_SCHOOL,S1_ELA_CURR_MPI,...,S5_PYR3_6YR_GRAD_RATE,S5_PROG_6YR_GRAD_PRIOR_2YR_AVG,S5_PROG_6YR_GRAD_CURR_2YR_AVG,S5_AVERAGE_6YR_GRAD_RATE,S5_CURR_7YR_GRAD_RATE,S5_PRIOR_7YR_GRAD_RATE,S5_PYR3_7YR_GRAD_RATE,S5_PROG_7YR_GRAD_PRIOR_2YR_AVG,S5_PROG_7YR_GRAD_CURR_2YR_AVG,S5_AVERAGE_7YR_GRAD_RATE
0,2019,1090,ADAIR CO. R-I,Y,N,K,12,Y,N,327.6,...,85.2,92.6,97.4,93.3,100.0,85.2,93.1,89.2,92.6,92.8
1,2019,1091,KIRKSVILLE R-III,Y,N,PK,12,Y,N,340.6,...,94.4,94.5,93.0,93.5,94.6,94.4,94.9,94.7,94.5,94.6
2,2019,1092,ADAIR CO. R-II,Y,N,K,12,Y,N,336.3,...,100.0,94.1,94.1,96.1,88.2,100.0,100.0,100.0,94.1,96.1
3,2019,2089,NORTH ANDREW CO. R-VI,Y,N,K,12,Y,N,357.2,...,95.8,94.8,94.9,95.2,93.8,95.8,84.4,90.1,94.8,91.3
4,2019,2090,AVENUE CITY R-IX,N,Y,K,8,Y,N,401.0,...,,,,,0.0,0.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
548,2019,115924,LAFAYETTE PREPARATORY ACADEMY,N,N,K,7,Y,N,352.8,...,,,,,0.0,0.0,,,,
549,2019,115925,HAWTHORN LEADERSHIP SCHL GIRLS,N,Y,06,10,Y,Y,295.2,...,0.0,,,,0.0,0.0,0.0,,,
550,2019,115926,THE BIOME,N,N,K,4,Y,N,246.4,...,,,,,0.0,0.0,,,,
551,2019,115928,LA SALLE CHARTER SCHOOL,N,Y,05,8,Y,N,260.0,...,,,,,0.0,0.0,,,,


Filter results.

In [10]:
state_grads = state_grads[state_grads['END_GRADE'] == 12]

Select and rename columns.

In [11]:
state_grads['Total'] = np.full_like(state_grads['DISTRICT_NAME'], '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [12]:
state_grads = state_grads[['DISTRICT_NAME', 'Total', 'S5_PRIOR_4YR_GRAD_RATE']]

In [13]:
state_grads.columns = ['District Name', 'Total', 'Graduation Rate']

In [14]:
state_grads

Unnamed: 0,District Name,Total,Graduation Rate
0,ADAIR CO. R-I,,90.5
1,KIRKSVILLE R-III,,92.0
2,ADAIR CO. R-II,,100.0
3,NORTH ANDREW CO. R-VI,,85.3
5,SAVANNAH R-III,,94.8
...,...,...,...
536,ST. LOUIS CITY,,78.2
537,LIFT FOR LIFE ACADEMY,,93.8
539,CONFLUENCE ACADEMIES,,88.6
545,GATEWAY SCIENCE ACAD/ST LOUIS,,95.7


Convert data types.

In [15]:
state_grads['Total'] = pd.to_numeric(state_grads['Total'])
state_grads['Graduation Rate'] = pd.to_numeric(state_grads['Graduation Rate']) / 100

Check for matches and non-matches in the two lists. 
Names all capitalized to catch as many matches as possible.

In [16]:
# state_grads['District Name'] = state_grads['District Name'].astype(str).str.upper()
# state_fiscal['NAME'] = state_fiscal['NAME'].astype(str).str.upper()

In [17]:
# state_fiscal['NAME'] = state_fiscal['NAME'].astype(str).str.replace('.', '')
# state_fiscal['NAME'] = state_fiscal['NAME'].astype(str).str.replace(',', '')
# state_fiscal['NAME'] = state_fiscal['NAME'].astype(str).str.replace(' DISTRICT', '')
# state_fiscal['NAME'] = state_fiscal['NAME'].astype(str).str.replace(' DIST', '')
# state_fiscal['NAME'] = state_fiscal['NAME'].astype(str).str.replace(' PUBLIC', '')
# state_fiscal['NAME'] = state_fiscal['NAME'].astype(str).str.replace(' SCHOOLS', '')
# state_fiscal['NAME'] = state_fiscal['NAME'].astype(str).str.replace(' SCHOOL', '')

# state_grads['District Name'] = state_grads['District Name'].astype(str).str.replace('.', '')
# state_grads['District Name'] = state_grads['District Name'].astype(str).str.replace(',', '')
# state_grads['District Name'] = state_grads['District Name'].astype(str).str.replace(' DISTRICT', '')
# state_grads['District Name'] = state_grads['District Name'].astype(str).str.replace(' DIST', '')
# state_grads['District Name'] = state_grads['District Name'].astype(str).str.replace(' PUBLIC', '')
# state_grads['District Name'] = state_grads['District Name'].astype(str).str.replace(' SCHOOLS', '')
# state_grads['District Name'] = state_grads['District Name'].astype(str).str.replace(' SCHOOL', '')

In [18]:
matches = [name for name in list(state_grads['District Name']) if name in list(state_fiscal['NAME'])]
matches.sort()
len(matches)

456

In [19]:
A = [name for name in list(state_grads['District Name']) if name not in list(state_fiscal['NAME'])]
A.sort()
A

['EWING MARION KAUFFMAN SCHOOL', 'ST. LOUIS COLLEGE PREP']

In [20]:
B = [name for name in list(state_fiscal['NAME']) if name not in list(state_grads['District Name'])]
B.sort()
B

['CROWDER COLLEGE',
 'DIVISION OF YOUTH SERVICE',
 'GRAND CENTER ARTS ACADEMY',
 'JEFFERSON COLLEGE',
 'MO SCHLS FOR THE SEV DISABLED',
 'MO SCHOOL FOR THE BLIND',
 'MO SCHOOL FOR THE DEAF',
 'MO VIRTUAL INSTRUCTION PROGRAM',
 'OZARKS TECHNICAL COMM COLLEGE',
 'PEMISCOT CO. SPEC. SCH. DIST.',
 'STATE FAIR COMMUNITY COLLEGE']

No remaining matches I can find. 

In [21]:
#state_fiscal_rename = {}

In [22]:
#state_fiscal = state_fiscal.replace(state_fiscal_rename)

Merge federal and state data, keeping only matches between the two. 

In [23]:
state_grads_merged = pd.merge(state_fiscal, state_grads, how='inner', left_on='NAME', right_on='District Name')

Save cleaned data. 

In [24]:
state_grads_merged.to_csv('../../data/state_data_merged/' + abbr + '.csv', index=False)