In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Two-letter state abbreviation; compared against STABBR when filtering the federal
# data and used as the file name of the merged output.
abbr = 'WV'

In [4]:
# File name of the state's raw graduation-rate CSV; it is appended to
# '../../data/state_data_raw/' when the state data is read.
file = 'west_virginia2018.csv'

Read in federal-level data.

In [5]:
# Load the federal fiscal file (SAS7BDAT format, decoded as ISO-8859-1).
fiscal = pd.read_sas(
    '../../data/fiscal2018',
    format='sas7bdat',
    encoding='iso-8859-1',
)

Generate list of districts in the state in the federal data

In [6]:
# Keep the federal rows for this state whose GSHI code is '12'.
# Take an explicit copy: later cells assign to state_fiscal['NAME'], and writing into
# the result of a boolean filter is chained assignment (pandas cannot guarantee
# whether the write lands on a view of `fiscal` or on a temporary).
state_fiscal = fiscal.loc[(fiscal['STABBR'] == abbr) & (fiscal['GSHI'] == '12')].copy()

In [7]:
# Number of federal district rows kept for this state.
state_fiscal.shape[0]

57

In [8]:
# Preview the first five federal rows for this state.
state_fiscal.head(n=5)

Unnamed: 0,LEAID,CENSUSID,FIPST,CONUM,CSA,CBSA,NAME,STNAME,STABBR,SCHLEV,...,FL_66V,FL_W01,FL_W31,FL_W61,FL_V95,FL_V02,FL_K14,FL_CE1,FL_CE2,FL_CE3
18121,5400030,49500100100000,54,54001,N,N,BARBOUR COUNTY SCHOOLS,West Virginia,WV,3,...,R,R,R,R,R,R,R,R,R,R
18122,5400060,49500200100000,54,54003,548,25180,BERKELEY COUNTY SCHOOLS,West Virginia,WV,3,...,R,R,R,R,R,R,R,R,R,R
18123,5400090,49500300100000,54,54005,170,16620,BOONE COUNTY SCHOOLS,West Virginia,WV,3,...,R,R,R,R,R,R,R,R,R,R
18124,5400120,49500400100000,54,54007,N,N,BRAXTON COUNTY SCHOOLS,West Virginia,WV,3,...,R,R,R,R,R,R,R,R,R,R
18125,5400150,49500500100000,54,54009,430,48260,BROOKE COUNTY SCHOOLS,West Virginia,WV,3,...,R,R,R,R,R,R,R,R,R,R


Read in state-level data.

In [21]:
# Load the state's raw graduation-rate CSV named by `file`.
state_grads = pd.read_csv(
    filepath_or_buffer='../../data/state_data_raw/' + file,
)

In [22]:
# Preview the first five rows of the raw state data.
state_grads.head(n=5)

Unnamed: 0,County Number,County Name,All,White,Black,Hispanic,Asian,Indian,Multi-Racial,Pacific Islander,Male,Female,ELL,Low SES,Spec Ed
0,2,Barbour ...,0.9023,0.9113,,,,,0.7778,,0.8875,0.9245,,0.9023,0.8571
1,4,Berkeley ...,0.941,0.9428,0.9286,0.9697,1.0,0.75,0.9057,,0.9302,0.9512,0.9333,0.9144,0.8246
2,6,Boone ...,0.8796,0.8771,1.0,1.0,1.0,,1.0,,0.8492,0.925,1.0,0.8822,0.7907
3,8,Braxton ...,0.8652,0.8623,,,1.0,,1.0,,0.8228,0.9194,,0.8652,0.8
4,10,Brooke ...,0.9389,0.9378,,1.0,,,1.0,,0.931,0.9469,,0.943,0.8205


Reset columns.

Filter results.

In [11]:
# Left disabled for this state: the filter below refers to 'OrganizationLevel',
# 'StudentGroup' and 'Cohort', none of which appear in the column list of the
# state file previewed above (it has one row per county and one column per group).
# state_grads = state_grads[(state_grads['OrganizationLevel'] == 'District') &
#                          (state_grads['StudentGroup'] == 'All Students') &
#                          (state_grads['Cohort'] == 'Four Year')]

Select and rename columns.

In [24]:
# The state file has no cohort-size column, so add 'Total' as an all-NaN placeholder.
# A scalar NaN is broadcast to a float column regardless of the dtype of 'All';
# np.full_like would inherit that dtype and cannot store NaN if it is ever integer.
state_grads['Total'] = np.nan

In [25]:
# Keep only the district name, the placeholder total and the overall ('All') rate.
state_grads = state_grads.loc[:, ['County Name', 'Total', 'All']]

In [26]:
# Relabel the three remaining columns, by position, with the common output names.
state_grads = state_grads.set_axis(
    ['District Name', 'Total', 'Graduation Rate'],
    axis='columns',
)

In [27]:
# Show column dtypes and non-null counts for the three selected columns.
state_grads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   District Name    56 non-null     object 
 1   Total            0 non-null      float64
 2   Graduation Rate  56 non-null     float64
dtypes: float64(2), object(1)
memory usage: 1.4+ KB


Convert data types.

In [14]:
# Left disabled for this state: per the .info() output above, 'Graduation Rate'
# is already float64 and 'Total' is an all-NaN placeholder, so there are no
# '<' or '%' characters to strip.
# state_grads['Total'] = state_grads['Total'].astype(str).str.replace('<', '')
# state_grads['Graduation Rate'] = state_grads['Graduation Rate'].astype(str).str.replace('%', '')

In [15]:
# Left disabled for this state: both columns are already float64 (see the
# .info() output above) and the displayed rates are already fractions (e.g. 0.9023),
# so no numeric conversion or division by 100 is applied.
# state_grads['Total'] = pd.to_numeric(state_grads['Total'])
# state_grads['Graduation Rate'] = pd.to_numeric(state_grads['Graduation Rate']) / 100

Check for matches and non-matches in the two lists. 

In [31]:
# Normalise both name columns (cast to str, upper-case, trim surrounding whitespace)
# so the two sources can be compared by exact string equality.
# .assign returns new frames instead of writing into objects that came out of a
# filter / column selection, so the cell never performs chained assignment and
# gives the same result every time it is re-run.
state_fiscal = state_fiscal.assign(
    NAME=state_fiscal['NAME'].astype(str).str.upper().str.strip()
)
state_grads = state_grads.assign(
    **{'District Name': state_grads['District Name'].astype(str).str.upper().str.strip()}
)

In [32]:
# Suffix clean-up for the state names is not applied here (kept for reference):
# state_grads['District Name'] = state_grads['District Name'].astype(str).str.replace(r'\sSu$', '')
# state_grads['District Name'] = state_grads['District Name'].astype(str).str.replace(r'\sSd$', '')

# Drop the literal ' COUNTY SCHOOLS' suffix so federal names reduce to the bare county name.
# regex=False is explicit because the default of `regex` differs between pandas
# versions; the pattern is a plain substring, not a regular expression.
state_fiscal = state_fiscal.assign(
    NAME=state_fiscal['NAME'].astype(str).str.replace(' COUNTY SCHOOLS', '', regex=False)
)

In [33]:
# State district names that also appear in the federal data, sorted.
# The federal names are collected into a set once, instead of rebuilding a list
# for every element of the comprehension.
fiscal_names = set(state_fiscal['NAME'])
matches = sorted(name for name in state_grads['District Name'] if name in fiscal_names)
len(matches)

54

In [34]:
# State district names with no counterpart in the federal data, sorted.
# The federal names are collected into a set once, instead of rebuilding a list
# for every element of the comprehension.
fiscal_names = set(state_fiscal['NAME'])
A = sorted(name for name in state_grads['District Name'] if name not in fiscal_names)
A

['BOONE', 'STATE TOTAL']

In [35]:
# Federal district names with no counterpart in the state data, sorted.
# The state names are collected into a set once, instead of rebuilding a list
# for every element of the comprehension.
grads_names = set(state_grads['District Name'])
B = sorted(name for name in state_fiscal['NAME'] if name not in grads_names)
B

['BOONE COUNTY  SCHOOLS',
 'INSTITUTIONAL EDUCATIONAL PROGRAMS',
 'WV SCHOOLS FOR THE DEAF AND THE BLIND']

Make any additional matches I can find.

In [22]:
# Manual fix-ups: federal name -> state name, for entries the automatic clean-up missed.
# The Boone key has two spaces between 'COUNTY' and 'SCHOOLS', so the suffix strip
# did not match it.
state_fiscal_rename = dict([
    ('BOONE COUNTY  SCHOOLS', 'BOONE'),
])

In [23]:
# Apply the manual name fix-ups to the NAME column only.
# DataFrame.replace with a dict would rewrite matching values in every column of
# the federal data; the mapping is meant for district names, so scope it to NAME.
state_fiscal = state_fiscal.assign(
    NAME=state_fiscal['NAME'].replace(state_fiscal_rename)
)

Merge federal and state data, keeping only matches between the two. 

In [24]:
# Inner join: keep only districts whose cleaned federal NAME equals a state 'District Name'.
state_grads_merged = state_fiscal.merge(
    state_grads,
    how='inner',
    left_on='NAME',
    right_on='District Name',
)

Save cleaned data. 

In [25]:
# Write the merged table to <abbr>.csv in the merged-data folder, without the index column.
merged_path = '../../data/state_data_merged/' + abbr + '.csv'
state_grads_merged.to_csv(merged_path, index=False)