In [25]:
import pandas as pd
import numpy as np

In [26]:
abbr = 'NE'

In [27]:
file = 'nebraska2018.csv'

Read in federal level data

In [28]:
fiscal = pd.read_sas('../../data/fiscal2018', format = 'sas7bdat', encoding='iso-8859-1')

Generate list of districts in the state in the federal data

In [29]:
state_fiscal = fiscal[(fiscal['STABBR'] == abbr) & (fiscal['GSHI'] == '12')]

In [30]:
len(state_fiscal)

255

In [31]:
state_fiscal.head()

Unnamed: 0,LEAID,CENSUSID,FIPST,CONUM,CSA,CBSA,NAME,STNAME,STABBR,SCHLEV,...,FL_66V,FL_W01,FL_W31,FL_W61,FL_V95,FL_V02,FL_K14,FL_CE1,FL_CE2,FL_CE3
9514,3100002,28501206800000,31,31023,N,N,DAVID CITY PUBLIC SCHOOLS,Nebraska,NE,3,...,R,R,R,R,M,M,M,M,M,M
9515,3100003,28501280100000,31,31023,N,N,EAST BUTLER PUBLIC SCHOOLS,Nebraska,NE,3,...,R,R,R,R,M,M,M,M,M,M
9516,3100004,28501403800000,31,31027,N,N,LAUREL-CONCORD-COLERIDGE SCHOOL,Nebraska,NE,3,...,R,R,R,R,M,M,M,M,M,M
9517,3100006,28502006600000,31,31039,N,N,WEST POINT PUBLIC SCHOOLS,Nebraska,NE,3,...,R,R,R,R,M,M,M,M,M,M
9518,3100008,28502006700000,31,31039,N,N,WISNER-PILGER PUBLIC SCHOOLS,Nebraska,NE,3,...,R,R,R,R,M,M,M,M,M,M


Read in state level data

In [54]:
state_grads = pd.read_csv('../../data/state_data_raw/' + file)

In [55]:
state_grads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139371 entries, 0 to 139370
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Type               139371 non-null  object 
 1   School Year        139371 non-null  int64  
 2   County             139371 non-null  int64  
 3   District           139371 non-null  int64  
 4   School             139371 non-null  int64  
 5   Agency Name        139371 non-null  object 
 6   Graduation Cohort  139371 non-null  int64  
 7   Cohort Year        139371 non-null  int64  
 8   Description        139371 non-null  object 
 9   Graduation Count   139371 non-null  int64  
 10  Graduation Pct     139371 non-null  float64
 11  DataAsOf           139371 non-null  object 
dtypes: float64(1), int64(7), object(4)
memory usage: 12.8+ MB


In [56]:
state_grads

Unnamed: 0,Type,School Year,County,District,School,Agency Name,Graduation Cohort,Cohort Year,Description,Graduation Count,Graduation Pct,DataAsOf
0,ST,20172018,0,0,0,STATE OF NEBRASKA,2018,4,All students,20503,0.89,2018-12-13 00:00:00.000
1,ST,20172018,0,0,0,STATE OF NEBRASKA,2018,4,Male,10312,0.86,2018-12-13 00:00:00.000
2,ST,20172018,0,0,0,STATE OF NEBRASKA,2018,4,Female,10191,0.91,2018-12-13 00:00:00.000
3,ST,20172018,0,0,0,STATE OF NEBRASKA,2018,4,Students eligible for free and reduced lunch,7269,0.81,2018-12-13 00:00:00.000
4,ST,20172018,0,0,0,STATE OF NEBRASKA,2018,4,Special Education Students,1815,0.69,2018-12-13 00:00:00.000
...,...,...,...,...,...,...,...,...,...,...,...,...
139366,SC,20132014,93,96,1,HEARTLAND COMMUNITY HIGH SCH,2011,7,Ethnic7 - Asian,-1,-1.00,2015-08-11 00:00:00.000
139367,SC,20132014,93,96,1,HEARTLAND COMMUNITY HIGH SCH,2011,7,Ethnic7 - Black or African American,-1,-1.00,2015-08-11 00:00:00.000
139368,SC,20132014,93,96,1,HEARTLAND COMMUNITY HIGH SCH,2011,7,Ethnic7 - Native Hawaiian or Other Pacific Isl...,-1,-1.00,2015-08-11 00:00:00.000
139369,SC,20132014,93,96,1,HEARTLAND COMMUNITY HIGH SCH,2011,7,Ethnic7 - White,29,0.94,2015-08-11 00:00:00.000


Filter results.

In [57]:
state_grads = state_grads[(state_grads['Type'] == 'DI') &
                         (state_grads['Graduation Cohort'] == 2018) &
                         (state_grads['Cohort Year'] == 4) &
                         (state_grads['Description'] == 'All students')]

Select and rename columns.

In [59]:
state_grads['Total'] = np.where(state_grads['Graduation Pct'] == -1, 
                                np.nan, 
                                np.round((1. / state_grads['Graduation Pct']) * state_grads['Graduation Count']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [61]:
state_grads = state_grads[['Agency Name', 'Total', 'Graduation Pct']]

In [62]:
state_grads.columns = ['District Name', 'Total', 'Graduation Rate']

In [63]:
state_grads

Unnamed: 0,District Name,Total,Graduation Rate
272,KENESAW PUBLIC SCHOOLS,24.0,1.00
816,HASTINGS PUBLIC SCHOOLS,279.0,0.82
1360,ADAMS CENTRAL PUBLIC SCHOOLS,73.0,1.00
1904,SILVER LAKE PUBLIC SCHOOLS,11.0,1.00
2448,NELIGH-OAKDALE SCHOOLS,26.0,0.92
...,...,...,...
136651,BLUE HILL PUBLIC SCHOOLS,26.0,1.00
137195,WHEELER CENTRAL SCHOOLS,10.0,1.00
137739,YORK PUBLIC SCHOOLS,121.0,0.98
138283,MC COOL JUNCTION PUBLIC SCHS,21.0,1.00


Convert data types.

In [64]:
state_grads = state_grads.replace(-1, '')

In [65]:
state_grads['Total'] = pd.to_numeric(state_grads['Total'])
state_grads['Graduation Rate'] = pd.to_numeric(state_grads['Graduation Rate'])

Check for matches and non-matches in the two lists. 

In [16]:
# state_grads['District Name'] = state_grads['District Name'].astype(str).str.upper()
# state_fiscal['NAME'] = state_fiscal['NAME'].astype(str).str.upper()

In [17]:
# state_fiscal['NAME'] = state_fiscal['NAME'].astype(str).str.replace('.', '')
# state_fiscal['NAME'] = state_fiscal['NAME'].astype(str).str.replace(',', '')
# state_fiscal['NAME'] = state_fiscal['NAME'].astype(str).str.replace(' DISTRICT', '')
# state_fiscal['NAME'] = state_fiscal['NAME'].astype(str).str.replace(' DIST', '')
# state_fiscal['NAME'] = state_fiscal['NAME'].astype(str).str.replace(' PUBLIC', '')
# state_fiscal['NAME'] = state_fiscal['NAME'].astype(str).str.replace(' SCHOOLS', '')
# state_fiscal['NAME'] = state_fiscal['NAME'].astype(str).str.replace(' SCHOOL', '')

# state_grads['District Name'] = state_grads['District Name'].astype(str).str.replace('.', '')
# state_grads['District Name'] = state_grads['District Name'].astype(str).str.replace(',', '')
# state_grads['District Name'] = state_grads['District Name'].astype(str).str.replace(' DISTRICT', '')
# state_grads['District Name'] = state_grads['District Name'].astype(str).str.replace(' DIST', '')
# state_grads['District Name'] = state_grads['District Name'].astype(str).str.replace(' PUBLIC', '')
# state_grads['District Name'] = state_grads['District Name'].astype(str).str.replace(' SCHOOLS', '')
# state_grads['District Name'] = state_grads['District Name'].astype(str).str.replace(' SCHOOL', '')

In [67]:
matches = [name for name in list(state_grads['District Name']) if name in list(state_fiscal['NAME'])]
matches.sort()
len(matches)

249

In [68]:
A = [name for name in list(state_grads['District Name']) if name not in list(state_fiscal['NAME'])]
A.sort()
A

[]

In [69]:
B = [name for name in list(state_fiscal['NAME']) if name not in list(state_grads['District Name'])]
B.sort()
B

['EDUCATIONAL SERVICE UNIT 02',
 'EDUCATIONAL SERVICE UNIT 03',
 'EDUCATIONAL SERVICE UNIT 04',
 'EDUCATIONAL SERVICE UNIT 08',
 'EDUCATIONAL SERVICE UNIT 09',
 'EDUCATIONAL SERVICE UNIT 13']

No remaining matches I can find. 

In [21]:
#state_fiscal_rename = {}

In [22]:
#state_fiscal = state_fiscal.replace(state_fiscal_rename)

Merge federal and state data, keeping only matches between the two. 

In [23]:
state_grads_merged = pd.merge(state_fiscal, state_grads, how='inner', left_on='NAME', right_on='District Name')

Save cleaned data. 

In [24]:
state_grads_merged.to_csv('../../data/state_data_merged/' + abbr + '.csv', index=False)