In [1]:
import pandas as pd
import numpy as np
from tabula import read_pdf

Read in federal level data

In [2]:
fiscal = pd.read_sas('../../data/fiscal2018', format = 'sas7bdat', encoding='iso-8859-1')

Generate list of districts in the state in the federal data

In [3]:
fiscal_FL = fiscal[(fiscal['STNAME'] == 'Florida') & (fiscal['GSHI'] == '12')]

In [4]:
len(fiscal_FL)

77

In [5]:
fiscal_FL.head()

Unnamed: 0,LEAID,CENSUSID,FIPST,CONUM,CSA,CBSA,NAME,STNAME,STABBR,SCHLEV,...,FL_66V,FL_W01,FL_W31,FL_W61,FL_V95,FL_V02,FL_K14,FL_CE1,FL_CE2,FL_CE3
3016,1200002,N,12,12095,422,36740,FL VIRTUAL,Florida,FL,03,...,M,M,M,M,M,M,M,M,M,M
3018,1200030,10500100100000,12,12001,264,23540,ALACHUA,Florida,FL,03,...,R,R,R,R,R,R,R,R,R,R
3019,1200060,10500200100000,12,12003,300,27260,BAKER,Florida,FL,03,...,R,R,R,R,R,R,R,R,R,R
3020,1200080,N,12,12105,N,29460,LAKE WALES CHARTER SCHOOLS,Florida,FL,N,...,M,M,M,M,M,M,M,M,M,M
3021,1200081,N,12,12099,370,33100,SOUTH TECH ACADEMY,Florida,FL,N,...,M,M,M,M,M,M,M,M,M,M


Read in state level data

In [6]:
FL_pdf = read_pdf('../../data/state_data_raw/florida2018.pdf', pages=[3, 4])

In [7]:
FL = pd.concat((FL_pdf[1], FL_pdf[2]))

In [8]:
FL

Unnamed: 0.1,Unnamed: 0,District,2013-14,2014-15,2015-16,2016-17,2017-18
0,,FLORIDA,76.1%,77.9%,80.7%,82.3%,86.1%
1,1.0,Alachua,72.2%,74.3%,78.4%,82.7%,88.0%
2,2.0,Baker,75.2%,81.8%,79.4%,81.0%,75.5%
3,3.0,Bay,70.8%,70.6%,81.0%,78.0%,81.1%
4,4.0,Bradford,71.3%,76.9%,83.7%,78.9%,89.0%
...,...,...,...,...,...,...,...
34,71.0,FL Virtual,74.9%,70.9%,66.6%,67.3%,81.4%
35,72.0,FAU Lab School,100.0%,100.0%,100.0%,100.0%,100.0%
36,73.0,FSU Lab School,88.0%,97.1%,98.6%,98.7%,96.6%
37,74.0,FAMU Lab School,76.7%,97.1%,94.9%,89.3%,94.3%


Drop and rename columns.

In [9]:
FL['Total'] = np.full_like(FL['2017-18'], '')

In [10]:
FL = FL[['District', 'Total', '2017-18']]

In [11]:
FL.columns = ['District Name', 'Total', 'Graduation Rate']

In [12]:
FL

Unnamed: 0,District Name,Total,Graduation Rate
0,FLORIDA,,86.1%
1,Alachua,,88.0%
2,Baker,,75.5%
3,Bay,,81.1%
4,Bradford,,89.0%
...,...,...,...
34,FL Virtual,,81.4%
35,FAU Lab School,,100.0%
36,FSU Lab School,,96.6%
37,FAMU Lab School,,94.3%


In [13]:
FL.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 75 entries, 0 to 38
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   District Name    75 non-null     object
 1   Total            75 non-null     object
 2   Graduation Rate  75 non-null     object
dtypes: object(3)
memory usage: 2.3+ KB


Convert data types.

In [14]:
FL['Graduation Rate'] = FL['Graduation Rate'].astype(str).str.replace('%', '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [15]:
FL['Total'] = pd.to_numeric(FL['Total'])
FL['Graduation Rate'] = pd.to_numeric(FL['Graduation Rate']) / 100

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Check for matches and non-matches in the two lists

In [16]:
Matches = [name for name in list(FL['District Name']) if name.lower() in [low.lower() for low in fiscal_FL['NAME']]]
Matches.sort()
len(Matches)

68

In [17]:
A = [name for name in list(FL['District Name']) if name.lower() not in [low.lower() for low in fiscal_FL['NAME']]]
A.sort()
A

['FAMU Lab School',
 'FAU Lab School',
 'FLORIDA',
 'FLORIDA',
 'FSU Lab School',
 'Miami-Dade',
 'UF Lab School']

In [18]:
B = [name for name in list(fiscal_FL['NAME']) if name.lower() not in [low.lower() for low in FL['District Name']]]
B.sort()
B

['DADE',
 'FAMU LAB SCH',
 'FAU LAB SCH',
 'FSU LAB SCH',
 'LAKE WALES CHARTER SCHOOLS',
 'SOUTH TECH ACADEMY',
 'UF LAB SCH',
 'UNITED CEREBRAL PALSY SYSTEM',
 'WASH SPECIAL']

Replace the names I can find matches for.

In [19]:
FL_rename = {
    'DADE' : 'Miami-Dade',
    'FAMU LAB SCH' : 'FAMU Lab School',
    'FAU LAB SCH' : 'FAU Lab School',
    'FSU LAB SCH' : 'FSU Lab School',
    'UF LAB SCH' : 'UF LAB SCH',
}

In [20]:
fiscal_FL = fiscal_FL.replace(FL_rename)

In [21]:
fiscal_FL['NAME'] = fiscal_FL['NAME'].astype(str).str.upper()
FL['District Name'] = FL['District Name'].astype(str).str.upper()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Merge federal and state data, keeping only matches between the two. 

In [22]:
FL_merged = pd.merge(fiscal_FL, FL, how='inner', left_on='NAME', right_on='District Name')

Save cleaned data. 

In [23]:
FL_merged.to_csv('../../data/state_data_merged/FL.csv', index=False)