In [1]:
import pandas as pd
import numpy as np

Read in federal level data

In [2]:
# Load the federal fiscal file. The path carries no extension, so the SAS
# format is passed explicitly instead of being inferred from the filename.
fiscal = pd.read_sas(
    '../../data/fiscal2018',
    format='sas7bdat',
    encoding='iso-8859-1',
)

Generate list of districts in the state in the federal data

In [3]:
# Keep the Connecticut rows whose GSHI value is the string '12'
# (presumably "highest grade offered" — confirm against the federal codebook).
is_connecticut = fiscal['STNAME'] == 'Connecticut'
has_gshi_12 = fiscal['GSHI'] == '12'
fiscal_CT = fiscal[is_connecticut & has_gshi_12]

In [4]:
# Row count of the Connecticut subset of the federal data.
fiscal_CT.shape[0]

146

In [5]:
# Preview the first rows of the Connecticut subset of the federal data.
fiscal_CT.head()

Unnamed: 0,LEAID,CENSUSID,FIPST,CONUM,CSA,CBSA,NAME,STNAME,STABBR,SCHLEV,...,FL_66V,FL_W01,FL_W31,FL_W61,FL_V95,FL_V02,FL_K14,FL_CE1,FL_CE2,FL_CE3
2689,900002,N,9,9007,278,25540,Connecticut Technical High Sc,Connecticut,CT,02,...,N,N,N,N,N,N,N,N,N,N
2690,900003,N,9,9003,278,25540,UNIFIED SCHOOL DISTRICT #1,Connecticut,CT,N,...,N,N,N,N,N,N,N,N,N,N
2691,900004,N,9,9003,278,25540,UNIFIED SCHOOL DISTRICT #2,Connecticut,CT,N,...,N,N,N,N,N,N,N,N,N,N
2692,900005,07500740100000,9,9013,278,25540,REGIONAL SCHOOL DISTRICT 19,Connecticut,CT,02,...,R,R,R,R,R,R,R,M,M,M
2693,900007,N,9,9003,278,25540,JUMOKE ACADEMY DISTRICT,Connecticut,CT,03,...,R,R,R,R,R,R,R,M,M,M


Read in state level data

In [6]:
# Load the raw Connecticut state workbook with pandas' default sheet and header handling.
CT = pd.read_excel(io='../../data/state_data_raw/connecticut2018.xlsx')

In [7]:
# Preview the first rows of the raw state data.
CT.head()

Unnamed: 0,FallOfYear,RptngDistrictName,ReportingDistrictCode,SchoolName,SchoolCode,SchoolOrgType,SchoolLowGrade,SchoolHighGrade,SchoolTitleIType,Category,...,Distinction,DistinctionCategory,Perform,ELA_ALL_YN,Math_ALL_YN,ELA_HN_YN,Math_HN_YN,Improve,SupportType,schoolyear
0,2017,Capital Preparatory Harbor School District,2970013,District,0,District,District,District,,DistrictTot,...,,,.,.,.,.,.,.,,2017-18
1,2017,Unified School District #1,3360015,District,0,District,District,District,,DistrictTot,...,,,.,.,.,.,.,.,,2017-18
2,2017,Norwich School District,1040011,District,0,District,District,District,,DistrictTot,...,,,.,.,.,.,.,.,,2017-18
3,2017,Path Academy District,2930013,District,0,District,District,District,,DistrictTot,...,,,.,.,.,.,.,.,,2017-18
4,2017,Great Oaks Charter School District,2940013,District,0,District,District,District,,DistrictTot,...,,,.,.,.,.,.,.,,2017-18


Filter for only the district-level, total-population rows (rows whose `SchoolName` is `District`).

In [8]:
# Keep only the rows whose SchoolName is 'District'.
# .copy() makes CT an independent frame, so the column assignments in later
# cells do not write into a filtered slice (SettingWithCopyWarning).
CT = CT.loc[CT['SchoolName'] == 'District'].copy()

In [9]:
# Number of district-level rows in the state data.
CT.shape[0]

202

Drop and rename columns.

In [10]:
# Add a 'Total' column filled with empty strings.
# assign() broadcasts the scalar and returns a new frame, so nothing is written
# into the filtered slice produced by the previous cell.
CT = CT.assign(Total='')

In [11]:
# Narrow the frame to the three columns that are carried forward.
kept_columns = ['RptngDistrictName', 'Total', 'Ind8Rate']
CT = CT[kept_columns]

In [12]:
# Relabel the three remaining columns positionally.
CT = CT.set_axis(
    ['District Name', 'Total', 'Graduation Rate'],
    axis='columns',
)

In [13]:
# Display the trimmed frame (pandas elides the middle rows in the rendering).
CT

Unnamed: 0,District Name,Total,Graduation Rate
0,Capital Preparatory Harbor School District,,.
1,Unified School District #1,,0.010256
2,Norwich School District,,.
3,Path Academy District,,0.088235
4,Great Oaks Charter School District,,.
...,...,...,...
197,Sharon School District,,.
198,Shelton School District,,0.924433
199,Franklin School District,,.
200,Glastonbury School District,,0.979339


Convert data types and remove placeholder values.

In [14]:
# The state file marks missing rates with '.'; map that placeholder to NaN
# explicitly and convert the column to a numeric dtype in a single pass.
CT['Graduation Rate'] = pd.to_numeric(CT['Graduation Rate'].replace('.', np.nan))

Check for non-matches in the two lists

In [15]:
# State district names that also appear (case-insensitively) in the federal list.
# The lowercase lookup set is built once instead of once per state name.
fiscal_names_lower = {fed_name.lower() for fed_name in fiscal_CT['NAME']}
Matches = sorted(
    name for name in CT['District Name'] if name.lower() in fiscal_names_lower
)
Matches

['Amistad Academy District',
 'Ansonia School District',
 'Avon School District',
 'Berlin School District',
 'Bethel School District',
 'Bloomfield School District',
 'Bolton School District',
 'Branford School District',
 'Bridgeport School District',
 'Bristol School District',
 'Brookfield School District',
 'Canton School District',
 'Cheshire School District',
 'Clinton School District',
 'Colchester School District',
 'Coventry School District',
 'Cromwell School District',
 'Danbury School District',
 'Darien School District',
 'Derby School District',
 'East Granby School District',
 'East Haddam School District',
 'East Hampton School District',
 'East Hartford School District',
 'East Haven School District',
 'East Lyme School District',
 'East Windsor School District',
 'Ellington School District',
 'Enfield School District',
 'Explorations District',
 'Fairfield School District',
 'Farmington School District',
 'Glastonbury School District',
 'Granby School District',
 'Gr

In [16]:
# State district names with no case-insensitive match in the federal list.
# The lowercase lookup set is built once instead of once per state name.
federal_lower = {fed_name.lower() for fed_name in fiscal_CT['NAME']}
A = sorted(
    name for name in CT['District Name'] if name.lower() not in federal_lower
)
A

['Achievement First Bridgeport Academy District',
 'Achievement First Hartford Academy District',
 'Andover School District',
 'Area Cooperative Educational Services',
 'Ashford School District',
 'Barkhamsted School District',
 'Bethany School District',
 'Booker T. Washington Academy District',
 'Bozrah School District',
 'Brass City Charter School District',
 'Brooklyn School District',
 'Canaan School District',
 'Canterbury School District',
 'Capital Preparatory Harbor School District',
 'Capitol Region Education Council',
 'Chaplin School District',
 'Chester School District',
 'Colebrook School District',
 'Columbia School District',
 'Common Ground High School District',
 'Connecticut Technical Education and Career System',
 'Cooperative Educational Services',
 'Cornwall School District',
 'Deep River School District',
 'Eastern Connecticut Regional Educational Service Center (EASTCONN)',
 'Eastford School District',
 'Easton School District',
 'EdAdvance',
 'Elm City College 

In [17]:
# Federal district names with no case-insensitive match in the state list.
# The lowercase lookup set is built once instead of once per federal name.
state_lower = {state_name.lower() for state_name in CT['District Name']}
B = sorted(
    name for name in fiscal_CT['NAME'] if name.lower() not in state_lower
)
B

['AREA COOPERATIVE EDUCATIONAL',
 'Achievement First Hartford Ac',
 'Bridgeport Achievement First',
 'CAPITOL REGION EDUCATION COUN',
 'COMMON GROUND HIGH SCHOOL DIS',
 'COOPERATIVE EDUCATIONAL SERVI',
 'Capital Preparatory Harbor Sc',
 'Connecticut Technical High Sc',
 'DEPARTMENT OF MENTAL HEALTH A',
 'EASTERN CONNECTICUT REGIONAL',
 'EDUCATION CONNECTION',
 'Elm City College Preparatory',
 'Highville Charter School',
 'NORTH BRANFORD SCHOOL DISTRIC',
 'NORTH STONINGTON SCHOOL DISTR',
 'NORWICH FREE ACADEMY',
 'Stamford Academy',
 'THE GILBERT SCHOOL',
 'WOODSTOCK ACADEMY']

Replace the names I can find matches for.

In [18]:
# Map federal NAME values (several are cut off mid-word in the fiscal file) to
# the full district names used in the state file.
# NOTE(review): 'WOODSTOCK ACADEMY' maps to 'Woodstock School District', whereas the
# other academies here map to '<name> District' — confirm this is the intended target.
# NOTE(review): 'EDUCATION CONNECTION' and 'DEPARTMENT OF MENTAL HEALTH A' from the
# unmatched federal list have no entry here and will not be renamed.
CT_fiscal_rename = {
    'AREA COOPERATIVE EDUCATIONAL': 'Area Cooperative Educational Services',
    'Achievement First Hartford Ac': 'Achievement First Hartford Academy District',
    'Bridgeport Achievement First': 'Achievement First Bridgeport Academy District',
    'CAPITOL REGION EDUCATION COUN': 'Capitol Region Education Council',
    'COMMON GROUND HIGH SCHOOL DIS': 'Common Ground High School District',
    'COOPERATIVE EDUCATIONAL SERVI': 'Cooperative Educational Services',
    'Capital Preparatory Harbor Sc': 'Capital Preparatory Harbor School District',
    'Connecticut Technical High Sc': 'Connecticut Technical Education and Career System',
    'EASTERN CONNECTICUT REGIONAL': 'Eastern Connecticut Regional Educational Service Center (EASTCONN)',
    'Elm City College Preparatory': 'Elm City College Preparatory School District',
    'Highville Charter School': 'Highville Charter School District',
    'NORTH BRANFORD SCHOOL DISTRIC': 'North Branford School District',
    'NORTH STONINGTON SCHOOL DISTR': 'North Stonington School District',
    'NORWICH FREE ACADEMY': 'Norwich Free Academy District',
    'Stamford Academy': 'Stamford Academy District',
    'THE GILBERT SCHOOL': 'The Gilbert School District',
    'WOODSTOCK ACADEMY': 'Woodstock School District',
}

In [19]:
# Apply the name mapping to the NAME column only; a frame-wide replace would
# also rewrite any other column that happened to contain one of these strings.
fiscal_CT = fiscal_CT.assign(NAME=fiscal_CT['NAME'].replace(CT_fiscal_rename))

Set both lists of names to be in uppercase to make sure they match when merging. 

In [20]:
# Upper-case the join keys on both sides so the merge is not case-sensitive.
fiscal_CT = fiscal_CT.assign(NAME=fiscal_CT['NAME'].astype(str).str.upper())
CT = CT.assign(**{'District Name': CT['District Name'].astype(str).str.upper()})

Examine data types and missing values.

In [21]:
# Summarize dtypes and non-null counts per column.
CT.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 202 entries, 0 to 201
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   District Name    202 non-null    object 
 1   Total            202 non-null    object 
 2   Graduation Rate  138 non-null    float64
dtypes: float64(1), object(2)
memory usage: 6.3+ KB


Change column names for consistency across states. 

Merge federal and state data, keeping only matches between the two. 

In [22]:
# Inner-join the cleaned federal subset with the state data on district name.
# The left side must be fiscal_CT, not a fresh slice of `fiscal`: the renamed and
# upper-cased NAME values exist only on fiscal_CT, and the state names are
# upper-cased, so raw federal names would fail to match.
CT_merged = pd.merge(
    fiscal_CT,
    CT,
    how='inner',
    left_on='NAME',
    right_on='District Name',
)

Save cleaned data. 

In [23]:
# Write the merged frame to disk without the row index.
CT_merged.to_csv(
    path_or_buf='../../data/state_data_merged/CT.csv',
    index=False,
)