In [1]:
import pandas as pd
import numpy as np
from tabula import read_pdf

Read in federal level data

In [2]:
fiscal = pd.read_sas('../../data/fiscal2018', format = 'sas7bdat', encoding='iso-8859-1')

Generate list of districts in the state in the federal data

In [3]:
fiscal_DE = fiscal[(fiscal['STNAME'] == 'Delaware') & (fiscal['GSHI'] == '12')]

In [4]:
len(fiscal_DE)

27

In [5]:
fiscal_DE.head()

Unnamed: 0,LEAID,CENSUSID,FIPST,CONUM,CSA,CBSA,NAME,STNAME,STABBR,SCHLEV,...,FL_66V,FL_W01,FL_W31,FL_W61,FL_V95,FL_V02,FL_K14,FL_CE1,FL_CE2,FL_CE3
2895,1000004,N,10,10003,428,37980,Charter School of Wilmington,Delaware,DE,2,...,R,M,M,M,R,R,M,R,R,R
2896,1000005,N,10,10001,428,20100,Positive Outcomes Charter School,Delaware,DE,2,...,R,M,M,M,R,R,M,I,R,R
2900,1000011,N,10,10005,N,41540,Sussex Academy,Delaware,DE,3,...,R,M,M,M,R,R,M,R,R,R
2902,1000015,N,10,10003,428,37980,Newark Charter School,Delaware,DE,3,...,R,M,M,M,R,R,M,R,R,R
2903,1000016,N,10,10003,428,37980,Delaware Military Academy,Delaware,DE,2,...,R,M,M,M,R,R,M,R,R,R


Read in state level data

In [6]:
DE_pdf = read_pdf('../../data/state_data_raw/delaware2018.pdf', pages=5)
DE = DE_pdf[0]

In [7]:
DE.head(10)

Unnamed: 0,District/Charter,2016,2017,2018
0,Appoquinimink School District,89.82%,94.61%,>95.00%
1,Brandywine School District,89.68%,84.81%,84.99%
2,Caesar Rodney School District,84.82%,89.46%,91.37%
3,Cape Henlopen School District,88.85%,87.35%,91.01%
4,Capital School District,80.71%,84.41%,77.08%
5,Christina School District,69.39%,69.51%,72.73%
6,Colonial School District,70.20%,79.85%,79.18%
7,Delaware Academy of Public Safety and Security,81.63%,80.70%,84.31%
8,Delmar School District,82.24%,91.08%,92.36%
9,Design Thinking Academy,,,86.54%


Drop and rename columns.

In [8]:
DE['Total'] = np.full_like(DE['2018'], '')

In [9]:
DE = DE[['District/Charter', 'Total', '2018']]

In [10]:
DE.columns = ['District Name', 'Total', 'Graduation Rate']

In [11]:
DE

Unnamed: 0,District Name,Total,Graduation Rate
0,Appoquinimink School District,,>95.00%
1,Brandywine School District,,84.99%
2,Caesar Rodney School District,,91.37%
3,Cape Henlopen School District,,91.01%
4,Capital School District,,77.08%
5,Christina School District,,72.73%
6,Colonial School District,,79.18%
7,Delaware Academy of Public Safety and Security,,84.31%
8,Delmar School District,,92.36%
9,Design Thinking Academy,,86.54%


In [12]:
DE.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   District Name    27 non-null     object
 1   Total            27 non-null     object
 2   Graduation Rate  27 non-null     object
dtypes: object(3)
memory usage: 776.0+ bytes


Convert data types.

In [13]:
DE['Graduation Rate'] = DE['Graduation Rate'].astype(str).str.replace('>', '')
DE['Graduation Rate'] = DE['Graduation Rate'].astype(str).str.replace('%', '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [14]:
DE['Total'] = pd.to_numeric(DE['Total'])
DE['Graduation Rate'] = pd.to_numeric(DE['Graduation Rate']) / 100

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Check for matches and non-matches in the two lists

In [15]:
Matches = [name for name in list(DE['District Name']) if name.lower() in [low.lower() for low in fiscal_DE['NAME']]]
Matches.sort()
Matches

['Appoquinimink School District',
 'Brandywine School District',
 'Caesar Rodney School District',
 'Cape Henlopen School District',
 'Capital School District',
 'Christina School District',
 'Colonial School District',
 'Delaware Academy of Public Safety and Security',
 'Delmar School District',
 'Indian River School District',
 'Lake Forest School District',
 'Laurel School District',
 'Milford School District',
 'Newark Charter School',
 'POLYTECH School District',
 'Positive Outcomes Charter School',
 'Seaford School District',
 'Smyrna School District',
 'Sussex Academy',
 'Sussex Technical School District',
 'Woodbridge School District']

In [16]:
A = [name for name in list(DE['District Name']) if name.lower() not in [low.lower() for low in fiscal_DE['NAME']]]
A.sort()
A

['Design Thinking Academy',
 'Early College High School at Del State',
 'First State Military Academy',
 'MOT Charter School',
 'New Castle County Vocational-Technical School',
 'Red Clay Consolidated School District\r(includes Charter School of Wilmington and Delaware Military Academy)']

In [17]:
B = [name for name in list(fiscal_DE['NAME']) if name.lower() not in [low.lower() for low in DE['District Name']]]
B.sort()
B

['Charter School of Wilmington',
 'Delaware Design-Lab High School',
 'Delaware Military Academy',
 'Early College High School at Delaware State University',
 'New Castle County Vocational-Technical School District',
 'Red Clay Consolidated School District']

Replace the names I can find matches for.

In [18]:
DE_rename = {
    'Design Thinking Academy' : 'Delaware Design-Lab High School',
    'Early College High School at Del State' : 'Early College High School at Delaware State University',
    'First State Military Academy' : 'Delaware Military Academy',
    'New Castle County Vocational-Technical School' : 'New Castle County Vocational-Technical School District',
    'Red Clay Consolidated School District\r(includes Charter School of Wilmington and Delaware Military Academy)' : 'Red Clay Consolidated School District'
}

In [19]:
DE = DE.replace(DE_rename)

Merge federal and state data, keeping only matches between the two. 

In [20]:
DE_merged = pd.merge(fiscal_DE, DE, how='inner', left_on='NAME', right_on='District Name')

Save cleaned data. 

In [21]:
DE_merged.to_csv('../../data/state_data_merged/DE.csv', index=False)