In [1]:
import pandas as pd
import numpy as np

In [2]:
# Two-letter state abbreviation: matched against the federal STABBR column
# and used as the file name of the merged CSV written at the end.
abbr = 'KY'

In [3]:
# File name of the state's raw graduation workbook; it is read from
# ../../data/state_data_raw/ further down.
file = 'kentucky2019.xlsx'

Read in federal level data

In [4]:
# Federal fiscal table, stored as a SAS7BDAT file without an extension
# (hence the explicit format) and decoded as ISO-8859-1 text.
fiscal_path = '../../data/fiscal2018'
fiscal = pd.read_sas(fiscal_path, format='sas7bdat', encoding='iso-8859-1')

Select this state's districts from the federal data, keeping only rows whose `GSHI` value is '12'.

In [5]:
# Rows belonging to the state selected in `abbr`.
in_state = fiscal['STABBR'] == abbr
# GSHI is stored as text; presumably '12' marks districts whose highest
# grade offered is 12 -- confirm against the federal file documentation.
gshi_is_12 = fiscal['GSHI'] == '12'
state_fiscal = fiscal[in_state & gshi_is_12]

In [6]:
# Number of federal rows retained for this state.
state_fiscal.shape[0]

171

In [7]:
# Preview the first five federal rows for the state.
state_fiscal.head(n=5)

Unnamed: 0,LEAID,CENSUSID,FIPST,CONUM,CSA,CBSA,NAME,STNAME,STABBR,SCHLEV,...,FL_66V,FL_W01,FL_W31,FL_W61,FL_V95,FL_V02,FL_K14,FL_CE1,FL_CE2,FL_CE3
5656,2100030,18500100100000,21,21001,N,N,Adair County,Kentucky,KY,3,...,M,R,R,R,R,R,R,R,R,M
5657,2100070,18500200100000,21,21003,150,14540,Allen County,Kentucky,KY,3,...,M,R,R,R,R,R,R,R,R,M
5658,2100081,18508900300000,21,21177,N,16420,Muhlenberg County,Kentucky,KY,3,...,M,R,R,R,R,R,R,R,R,M
5660,2100094,N,21,21111,350,31140,Kentucky School for the Blind,Kentucky,KY,3,...,N,N,N,N,N,N,N,N,N,N
5661,2100095,N,21,21021,N,19220,Kentucky School for the Deaf,Kentucky,KY,3,...,N,N,N,N,N,N,N,N,N,N


Read in state level data

In [8]:
# Raw state graduation workbook, located via the `file` name set above.
raw_workbook_path = f'../../data/state_data_raw/{file}'
state_grads = pd.read_excel(raw_workbook_path)

In [9]:
# Preview the first five rows of the raw state workbook.
state_grads.head(n=5)

Unnamed: 0,Code,District Name,School Name,Gender,Race/Ethnicity,English Learners,Economically Disadvantaged,Students With Disabilities,Cohort Population,Number of Graduates,4 Year Cohort Rate
0,1,Adair County,,---,---,---,No,No,63,63.0,100.0
1,1,Adair County,,---,---,---,No,Yes,1,,
2,1,Adair County,,---,---,---,Yes,No,99,98.0,99.0
3,1,Adair County,,---,---,---,Yes,Yes,13,7.0,53.8
4,1,Adair County,,---,---,No,---,No,160,159.0,99.4


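Keep only rows with a blank school name and no gender, race/ethnicity, or English-learner breakdown, for students marked 'No' under both Economically Disadvantaged and Students With Disabilities.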

In [10]:
# Exact cell value each column must hold for a row to be kept. The raw
# workbook pads its values with whitespace, so the padding is part of the
# match.
required_values = {
    'School Name': '  ',
    'Gender': '--- ',
    'Race/Ethnicity': '--- ',
    'English Learners': '--- ',
    'Economically Disadvantaged': 'No ',
    'Students With Disabilities': 'No ',
}

# NOTE(review): the last two conditions keep only the 'No '/'No ' subgroup
# (students flagged neither economically disadvantaged nor with
# disabilities), yet later cells label the resulting figures 'Total' and
# 'Graduation Rate'. Confirm that this subgroup is the intended population.
row_mask = pd.Series(True, index=state_grads.index)
for column, value in required_values.items():
    row_mask &= state_grads[column] == value

state_grads = state_grads[row_mask]

In [11]:
# Show the filtered rows; pandas elides the middle of a long frame.
state_grads

Unnamed: 0,Code,District Name,School Name,Gender,Race/Ethnicity,English Learners,Economically Disadvantaged,Students With Disabilities,Cohort Population,Number of Graduates,4 Year Cohort Rate
0,1,Adair County,,---,---,---,No,No,63,63,100
490,5,Allen County,,---,---,---,No,No,106,106,100
904,11,Anderson County,,---,---,---,No,No,133,131,98.5
1542,12,Ashland Independent,,---,---,---,No,No,119,118,99.2
1940,13,Augusta Independent,,---,---,---,No,No,9,,
...,...,...,...,...,...,...,...,...,...,...,...
97157,906,ED COOP - OVEC,,---,---,---,No,No,2400,2342,97.6
97617,907,ED COOP - SESC,,---,---,---,No,No,1909,1861,97.5
98049,908,ED COOP - WKEC,,---,---,---,No,No,2222,2168,97.6
98481,909,ED COOP - JEFF CO,,---,---,---,No,No,3386,2796,82.6


Select and rename columns.

In [12]:
# Only the district name, cohort size and four-year rate are carried forward.
kept_columns = ['District Name', 'Cohort Population', '4 Year Cohort Rate']
state_grads = state_grads[kept_columns]

In [13]:
# Positional relabelling: relies on the three columns selected in the
# previous cell being in this exact order.
output_column_names = ['District Name', 'Total', 'Graduation Rate']
state_grads.columns = output_column_names

In [14]:
# Show the frame after column selection and renaming.
state_grads

Unnamed: 0,District Name,Total,Graduation Rate
0,Adair County,63,100
490,Allen County,106,100
904,Anderson County,133,98.5
1542,Ashland Independent,119,99.2
1940,Augusta Independent,9,
...,...,...,...
97157,ED COOP - OVEC,2400,97.6
97617,ED COOP - SESC,1909,97.5
98049,ED COOP - WKEC,2222,97.6
98481,ED COOP - JEFF CO,3386,82.6


In [15]:
# Inspect dtypes and non-null counts before any type conversion.
state_grads.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 178 entries, 0 to 98991
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   District Name    178 non-null    object
 1   Total            178 non-null    object
 2   Graduation Rate  178 non-null    object
dtypes: object(3)
memory usage: 5.6+ KB


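Clean up blank cells and stray whitespace, then convert the count and rate columns to numbers (the rate is rescaled from a percentage to a fraction).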

In [16]:
# Replace cells that are exactly two spaces with empty strings, presumably
# so the numeric conversion below treats them as missing -- TODO confirm.
state_grads = state_grads.replace(to_replace='  ', value='')

In [17]:
# Coerce names to text, then trim surrounding whitespace so they can be
# compared with the federal NAME column.
district_names = state_grads['District Name'].astype(str)
state_grads['District Name'] = district_names.str.strip()

In [18]:
# The source rate is expressed in percent (e.g. 98.5); store it as a
# fraction of 1.
rate_percent = pd.to_numeric(state_grads['Graduation Rate'])
state_grads['Graduation Rate'] = rate_percent / 100

# Cohort size as a numeric column.
state_grads['Total'] = pd.to_numeric(state_grads['Total'])

Check which district names match, and which do not, between the state and federal data.

In [19]:
# State-file district names that appear verbatim in the federal NAME column.
# The federal names are collected into a set once, instead of rebuilding a
# list for every membership test as the earlier version did.
fiscal_names = set(state_fiscal['NAME'])
Matches = sorted(name for name in state_grads['District Name'] if name in fiscal_names)
len(Matches)

163

In [20]:
# State-file district names with no exact counterpart in the federal data.
# The federal names are collected into a set once so each lookup is O(1),
# rather than rebuilding a list for every name.
fiscal_names = set(state_fiscal['NAME'])
A = sorted(name for name in state_grads['District Name'] if name not in fiscal_names)
A

['ED COOP - CKEC',
 'ED COOP - GRREC',
 'ED COOP - JEFF CO',
 'ED COOP - KEDC',
 'ED COOP - KVEC',
 'ED COOP - NKCES',
 'ED COOP - OVEC',
 'ED COOP - SESC',
 'ED COOP - WKEC',
 'Larue County',
 'Mccracken County',
 'Mccreary County',
 'Mclean County',
 'Raceland-Worthington Independe',
 'State']

In [21]:
# Federal district names with no exact counterpart in the state file.
# The state names are collected into a set once so each lookup is O(1),
# rather than rebuilding a list for every name.
grads_names = set(state_grads['District Name'])
B = sorted(name for name in state_fiscal['NAME'] if name not in grads_names)
B

['Kentucky School for the Blind',
 'Kentucky School for the Deaf',
 'Kentucky Tech System',
 'LaRue County',
 'McCracken County',
 'McCreary County',
 'McLean County',
 'Raceland-Worthington Independent']

In [22]:
# (state-file spelling, federal-file spelling) pairs for districts that
# differ only in capitalisation or truncation between the two sources.
name_fixes = [
    ('Larue County', 'LaRue County'),
    ('Mccracken County', 'McCracken County'),
    ('Mccreary County', 'McCreary County'),
    ('Mclean County', 'McLean County'),
    ('Raceland-Worthington Independe', 'Raceland-Worthington Independent'),
]
state_grads_rename = dict(name_fixes)

In [23]:
# Swap each state-file spelling for its federal spelling. The mapping is
# applied to every cell of the frame that equals one of its keys.
state_grads = state_grads.replace(to_replace=state_grads_rename)

Merge federal and state data, keeping only matches between the two. 

In [24]:
# Inner join on district name: rows present in only one of the two sources
# are dropped from the result.
state_grads_merged = state_fiscal.merge(
    state_grads,
    how='inner',
    left_on='NAME',
    right_on='District Name',
)

Save cleaned data. 

In [25]:
# One CSV per state, named after its abbreviation; the row index is omitted.
output_path = f'../../data/state_data_merged/{abbr}.csv'
state_grads_merged.to_csv(output_path, index=False)