In [1]:
import pandas as pd
import numpy as np

In [2]:
# Two-letter state code; compared against STABBR and used in the output file name.
abbr = "KS"

In [3]:
# File name of the raw state workbook (joined to the raw-data directory when it is read).
file = "kansas2018.xls"

Read in the federal-level data.

In [4]:
# Load the federal fiscal file. The path carries no extension, so the SAS
# format is stated explicitly instead of being inferred from the file name.
fiscal = pd.read_sas(
    '../../data/fiscal2018',
    format='sas7bdat',
    encoding='iso-8859-1',
)

Select this state's districts from the federal data (rows where `STABBR` equals `abbr` and `GSHI` equals `'12'`).

In [5]:
# Keep only the rows for this state whose GSHI value is the string '12'.
# NOTE(review): GSHI is presumably the highest grade offered - confirm against the data dictionary.
state_fiscal = fiscal.loc[fiscal['STABBR'].eq(abbr) & fiscal['GSHI'].eq('12')]

In [6]:
# Number of federal rows retained for this state.
state_fiscal.shape[0]

311

In [7]:
# Preview the first five federal rows for the state.
state_fiscal.head(n=5)

Unnamed: 0,LEAID,CENSUSID,FIPST,CONUM,CSA,CBSA,NAME,STNAME,STABBR,SCHLEV,...,FL_66V,FL_W01,FL_W31,FL_W61,FL_V95,FL_V02,FL_K14,FL_CE1,FL_CE2,FL_CE3
5335,2000001,17508501400000,20,20169,N,41460,Southeast Of Saline,Kansas,KS,3,...,R,R,R,R,R,R,M,R,R,R
5336,2000002,17505702200000,20,20113,N,32700,Smoky Valley,Kansas,KS,3,...,R,R,R,R,R,R,M,R,R,R
5337,2000003,17507503400000,20,20149,358,31740,Wamego,Kansas,KS,3,...,R,R,R,R,R,R,M,R,R,R
5338,2000004,17507503600000,20,20149,358,31740,Rock Creek,Kansas,KS,3,...,R,R,R,R,R,R,M,I,R,R
5339,2000006,17500701800000,20,20013,N,N,Hiawatha,Kansas,KS,3,...,R,R,R,R,R,R,M,R,R,R


Read in the state-level data.

In [8]:
# Load the raw state workbook; title and header rows are still embedded in the data at this point.
state_grads = pd.read_excel(io='../../data/state_data_raw/' + file)

In [9]:
# Show enough leading rows to locate the real header row and the first data row.
state_grads.head(n=20)

Unnamed: 0,State Totals,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33
0,,,,,,,,,,,...,,,,,,,,,,
1,2017-2018,,,,,,,,,,...,,,,,,,,,,
2,STATE GRADUATION RATE - FOUR-YEAR ADJUSTED CO...,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,"BY DISTRICT, RACE AND GENDER",,,,,,,,,,...,,,,,,,,,,
5,ALL SCHOOLS,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,TOTAL,,,WHITE,,...,,,REDUCED-PRICE LUNCH,,,SPECIAL EDUC.,,,,
8,ORG. #,ORGANIZATION NAME,COUNTY NAME,COHORT TYPE,,ALL,MALE,FEM.,MALE,FEM.,...,,FEM.,MALE,,FEM.,MALE,FEM.,LEP,MIGRANT,HOME\nLESS
9,D0101,Erie-Galesburg,Neosho,Graduation Rate,,98.3,100.0,95.8,100.0,95.2,...,,92.3,100.0,,100.0,100.0,100.0,100.0,0.0,100.0


Reset columns and rows.

In [10]:
# Row 8 of the raw sheet carries the actual column headers (ORG. #, ORGANIZATION NAME, ...),
# so promote it to be the frame's column labels.
state_grads.columns = state_grads.iloc[8, :]

In [11]:
# Drop the title/header rows by keeping positions 9 through 307, then keep only
# the first six columns (ORG. # through ALL).
state_grads = state_grads.iloc[9:308].iloc[:, :6]

In [12]:
# Inspect the last ten rows to confirm where the sliced data ends.
state_grads.tail(n=10)

8,ORG. #,ORGANIZATION NAME,COUNTY NAME,COHORT TYPE,NaN,ALL
298,Z0009,Independence Bible School,Montgomery,Graduation Rate,,80.0
299,Z0013,St John's Military School,Saline,Graduation Rate,,0.0
300,Z0029,Kansas City Catholic Diocese,Wyandotte,Graduation Rate,,97.4
301,Z0030,Salina Catholic Diocese,Saline,Graduation Rate,,96.6
302,Z0031,Wichita Catholic Diocese,Sedgwick,Graduation Rate,,98.7
303,Z0032,Lakemary Center Paola,Miami,Graduation Rate,,22.2
304,Z0058,Kickapoo Nation School,Brown,Graduation Rate,,83.3
305,Z0060,Accelerated Schools,Johnson,Graduation Rate,,88.2
306,Z0066,Word of Life Traditional School,Sedgwick,Graduation Rate,,94.5
307,Z0071,Riverbend International School,Atchison,Graduation Rate,,0.0


Select and rename columns.

In [13]:
# Keep the district name, the COHORT TYPE column (repurposed below as a placeholder)
# and the overall (ALL) rate.
state_grads = state_grads.loc[:, ['ORGANIZATION NAME', 'COHORT TYPE', 'ALL']]

In [14]:
# Positional rename of the three remaining columns.
state_grads.columns = pd.Index(['District Name', 'Total', 'Graduation Rate'])

In [15]:
# Blank out 'Total': the column was relabelled from COHORT TYPE and holds no counts.
# A scalar is broadcast to every row, so this no longer depends on the dtype of
# 'District Name' the way np.full_like(<that column>, '') did.
state_grads['Total'] = ''

In [16]:
# Display the frame to check the three renamed columns and the blanked 'Total' column.
state_grads

Unnamed: 0,District Name,Total,Graduation Rate
9,Erie-Galesburg,,98.3
10,Cimarron-Ensign,,97.1
11,Cheylin,,100.0
12,Rawlins County,,95.8
13,Western Plains,,87.5
...,...,...,...
303,Lakemary Center Paola,,22.2
304,Kickapoo Nation School,,83.3
305,Accelerated Schools,,88.2
306,Word of Life Traditional School,,94.5


In [17]:
# Report row count, non-null counts and dtypes before the numeric conversion below.
state_grads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 9 to 307
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   District Name    299 non-null    object
 1   Total            299 non-null    object
 2   Graduation Rate  299 non-null    object
dtypes: object(3)
memory usage: 7.1+ KB


Convert data types.

# Swap the '****' placeholder for an empty string anywhere it appears as a whole cell value.
state_grads = state_grads.replace(to_replace='****', value='')

In [18]:
# Convert both columns to numeric. errors='coerce' maps anything non-numeric
# (for example a leftover '****' placeholder) to NaN instead of raising, so this
# cell works whether or not the placeholder replacement was run beforehand.
state_grads['Total'] = pd.to_numeric(state_grads['Total'], errors='coerce')
# Source values look like 98.3; dividing by 100 stores them as fractions (0.983).
state_grads['Graduation Rate'] = pd.to_numeric(state_grads['Graduation Rate'], errors='coerce') / 100

Check for matches and non-matches in the two lists

In [19]:
# State district names that also appear in the federal NAME column, sorted.
# isin() does the membership test in one vectorized pass instead of rebuilding
# and scanning list(state_fiscal['NAME']) once per district name.
Matches = sorted(
    state_grads.loc[state_grads['District Name'].isin(state_fiscal['NAME']), 'District Name']
)
len(Matches)

286

In [20]:
# State district names with no exact match in the federal NAME column, sorted.
# isin() replaces the per-name rebuild and linear scan of list(state_fiscal['NAME']).
A = sorted(
    state_grads.loc[~state_grads['District Name'].isin(state_fiscal['NAME']), 'District Name']
)
A

['Accelerated Schools',
 'Department of Corrections',
 'Ft Leavenworth',
 'Heartspring',
 'Independence Bible School',
 'Kansas City Catholic Diocese',
 'Kickapoo Nation School',
 'Lakemary Center Paola',
 'Riverbend International School',
 'Salina Catholic Diocese',
 "St John's Military School",
 'Wichita Catholic Diocese',
 'Word of Life Traditional School']

In [21]:
# Federal NAME values with no exact match among the state district names, sorted.
# isin() replaces the per-name rebuild and linear scan of list(state_grads['District Name']).
B = sorted(
    state_fiscal.loc[~state_fiscal['NAME'].isin(state_grads['District Name']), 'NAME']
)
B

['ANW Special Education Cooperative',
 'Brown Co KS Special Ed Coop',
 'Butler Co Special Education Interlocal',
 'Copeland',
 'Doniphan Co Education Coop',
 'East Central KS Coop in Educ',
 'Grinnell Public Schools',
 'Haviland',
 'High Plains Educational Cooperative',
 'Lansing Correctional Facility',
 'Lewis',
 'Marion County Special Education',
 'Northeast KS Education Serv Cntr',
 'Northwest KS Educational Serv Cntr',
 'Reno County Education Cooperative',
 'Sedgwick Co Area Educational Servs',
 'South Central KS Education Serv Cnt',
 'South Central KS Spec Ed Coop',
 'Southeast KS Education Serv Center',
 'Southeast Kansas Special Education Interlocal',
 'Southwest Kansas Area Cooperative',
 'Southwest Plains Regional Svc Ctr',
 'Sumner Co Educational Services',
 'Three Lakes Educational Cooperative',
 'Tri County Special Education Coop']

No additional matches could be found between the two lists.

Merge federal and state data, keeping only matches between the two. 

In [22]:
# Inner join on the exact district name: rows are kept only when the federal NAME
# equals the state 'District Name'.
state_grads_merged = state_fiscal.merge(
    right=state_grads,
    how='inner',
    left_on='NAME',
    right_on='District Name',
)

Save cleaned data. 

In [23]:
# Write the merged frame to <abbr>.csv in the merged-data directory, without the index column.
state_grads_merged.to_csv(
    path_or_buf='../../data/state_data_merged/' + abbr + '.csv',
    index=False,
)