In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

In [3]:
abbr = 'DC'

In [4]:
file = 'dc2018.xlsx'

Read in federal level data

In [5]:
fiscal = pd.read_sas('../../data/fiscal2018', format = 'sas7bdat', encoding='iso-8859-1')

Generate list of districts in the state in the federal data

In [6]:
state_fiscal = fiscal[(fiscal['STABBR'] == abbr) & (fiscal['GSHI'] == '12')]

In [7]:
len(state_fiscal)

22

In [8]:
state_fiscal

Unnamed: 0,LEAID,CENSUSID,FIPST,CONUM,CSA,CBSA,NAME,STNAME,STABBR,SCHLEV,...,FL_66V,FL_W01,FL_W31,FL_W61,FL_V95,FL_V02,FL_K14,FL_CE1,FL_CE2,FL_CE3
2945,1100005,N,11,11001,548,47900,Cesar Chavez PCS for Public Policy,District of Columbia,DC,3,...,R,R,R,R,R,R,R,R,R,M
2946,1100008,N,11,11001,548,47900,Friendship PCS,District of Columbia,DC,3,...,R,R,R,R,R,R,R,R,R,M
2950,1100013,N,11,11001,548,47900,IDEA PCS,District of Columbia,DC,2,...,R,R,R,R,I,R,I,R,R,M
2951,1100014,N,11,11001,548,47900,Maya Angelou PCS,District of Columbia,DC,2,...,R,R,R,R,R,R,R,R,R,M
2955,1100019,N,11,11001,548,47900,Washington Latin PCS,District of Columbia,DC,3,...,R,R,R,R,R,R,R,R,R,M
2957,1100022,N,11,11001,548,47900,SEED PCS of Washington DC,District of Columbia,DC,3,...,R,R,R,R,R,R,R,R,R,M
2958,1100026,N,11,11001,548,47900,Washington Mathematics Science Technology PCHS,District of Columbia,DC,2,...,M,M,M,M,M,M,M,M,M,M
2960,1100030,09200100130100,11,11001,548,47900,District of Columbia Public Schools,District of Columbia,DC,3,...,R,R,R,R,R,R,R,R,R,M
2961,1100031,N,11,11001,548,47900,KIPP DC PCS,District of Columbia,DC,3,...,R,R,R,R,R,R,R,I,R,M
2963,1100034,N,11,11001,548,47900,Thurgood Marshall Academy PCS,District of Columbia,DC,2,...,R,R,R,R,R,R,R,R,R,M


Read in state level data

In [9]:
state_grads = pd.read_excel('../../data/state_data_raw/' + file, sheet_name = '2017-18 Rate by School')

In [10]:
state_grads.head()

Unnamed: 0,"DC 2018 4-year Adjusted Cohort Graduation Rates,",Unnamed: 1,Unnamed: 2,Unnamed: 3
0,by School,,,
1,,,,
2,School,Graduates,Cohort Total,2017 ACGR
3,All,3330,4863,68.5%
4,Charter Cohort,1053,1454,72.4%


Reset columns.

In [11]:
state_grads.columns = state_grads.loc[2]
state_grads = state_grads.loc[4:28]

In [12]:
state_grads

2,School,Graduates,Cohort Total,2017 ACGR
4,Charter Cohort,1053,1454,72.4%
5,BASIS DC PCS,16,16,100.0%
6,Capital City PCS - High School,61,71,85.9%
7,Cesar Chavez PCS for Public Policy - Capitol Hill,56,74,75.7%
8,Cesar Chavez PCS for Public Policy - Chavez Prep,n<10,n<10,
9,Cesar Chavez PCS for Public Policy - Parkside ...,56,65,86.2%
10,E.L. Haynes PCS - High School,93,109,85.3%
11,Friendship PCS - Collegiate Academy,142,188,75.5%
12,Friendship PCS - Technology Preparatory High S...,41,48,85.4%
13,Goodwill Excel Center PCS,2,29,6.9%


Filter results.

In [13]:
# state_grads = state_grads[(state_grads['SCHOOL_NAME'] == '[Districtwide]') &
#                          (state_grads['GROUP_BY_VALUE'] == 'All Students') &
#                          (state_grads['TIMEFRAME'] == '4-Year rate') &
#                          (state_grads['COMPLETION_STATUS'] == 'Completed - Regular High School Diploma')]

Select and rename columns.

In [14]:
state_grads.drop(columns='Graduates', inplace=True)

In [15]:
state_grads.head()

2,School,Cohort Total,2017 ACGR
4,Charter Cohort,1454,72.4%
5,BASIS DC PCS,16,100.0%
6,Capital City PCS - High School,71,85.9%
7,Cesar Chavez PCS for Public Policy - Capitol Hill,74,75.7%
8,Cesar Chavez PCS for Public Policy - Chavez Prep,n<10,


In [16]:
state_grads.columns = ['District Name', 'Total', 'Graduation Rate']

In [17]:
state_grads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 4 to 28
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   District Name    25 non-null     object
 1   Total            25 non-null     object
 2   Graduation Rate  23 non-null     object
dtypes: object(3)
memory usage: 732.0+ bytes


Convert data types.

In [18]:
state_grads['Total'] = state_grads['Total'].astype(str).str.replace('n<10', '')
state_grads['Total'] = state_grads['Total'].astype(str).str.replace('nan', '')
state_grads['Graduation Rate'] = state_grads['Graduation Rate'].astype(str).str.replace('%', '')
state_grads['Graduation Rate'] = state_grads['Graduation Rate'].astype(str).str.replace('nan', '')
state_grads = state_grads.fillna('')

In [19]:
state_grads['Total'] = pd.to_numeric(state_grads['Total'])
state_grads['Graduation Rate'] = pd.to_numeric(state_grads['Graduation Rate']) / 100

For DC, two charter systems have high schools reporting their graduation rates individually, so I will add them manually. Because Chavez Prep's numbers are < 10, they are not reported and will not be included in the sum.

In [20]:
addon = pd.DataFrame({'District Name' : {0 : 'Cesar Chavez PCS for Public Policy', 1 : 'Friendship PCS'},
             'Total' : {0 : (74 + 65), 1 : (188 + 48)},
             'Graduation Rate' : {0 : ((56 + 56) / (74 + 65)), 1 : ((142 + 41) / (188 + 48))}})

In [28]:
state_grads = pd.concat([state_grads, addon], ignore_index=True)

Check for matches and non-matches in the two lists. 

In [22]:
# state_fiscal['NAME'] = state_fiscal['NAME'].astype(str).str.upper().str.strip()
# state_grads['District Name'] = state_grads['District Name'].astype(str).str.upper().str.strip()

In [23]:
# state_grads['District Name'] = state_grads['District Name'].astype(str).str.replace(r'\sSu$', '')
# state_grads['District Name'] = state_grads['District Name'].astype(str).str.replace(r'\sSd$', '')

# state_fiscal['NAME'] = state_fiscal['NAME'].astype(str).str.replace(' County School District', '')
# state_fiscal['NAME'] = state_fiscal['NAME'].astype(str).str.replace(' # ', ' #')

In [29]:
matches = [name for name in list(state_grads['District Name']) if name in list(state_fiscal['NAME'])]
matches.sort()
len(matches)

12

In [30]:
A = [name for name in list(state_grads['District Name']) if name not in list(state_fiscal['NAME'])]
A.sort()
A

['BASIS DC PCS',
 'Capital City PCS - High School',
 'Cesar Chavez PCS for Public Policy - Capitol Hill',
 'Cesar Chavez PCS for Public Policy - Chavez Prep',
 'Cesar Chavez PCS for Public Policy - Parkside High School',
 'Charter Cohort',
 'DCPS Cohort',
 'E.L. Haynes PCS - High School',
 'Friendship PCS - Collegiate Academy',
 'Friendship PCS - Technology Preparatory High School',
 'KIPP DC - College Preparatory Academy PCS',
 'Maya Angelou PCS - High School',
 'Paul PCS - International High School',
 'Washington Latin PCS - Upper School',
 'Washington Leadership Academy PCS']

In [31]:
B = [name for name in list(state_fiscal['NAME']) if name not in list(state_grads['District Name'])]
B.sort()
B

['Basis DC PCS',
 'Capital City PCS',
 'DYRS',
 'District of Columbia Public Schools',
 'E.L. Haynes PCS',
 'KIPP DC PCS',
 'Maya Angelou PCS',
 'Paul PCS',
 'St. Coletta Special Education PCS',
 'Washington Latin PCS']

Make any additional matches.

In [27]:
state_grads_rename = {
    'BASIS DC PCS' : 'Basis DC PCS',
    'Capital City PCS - High School' : 'Capital City PCS',
#     'Cesar Chavez PCS for Public Policy - Capitol Hill',
#     'Cesar Chavez PCS for Public Policy - Chavez Prep',
#     'Cesar Chavez PCS for Public Policy - Parkside High School',
#     'Charter Cohort',
    'DCPS Cohort' : 'District of Columbia Public Schools',
    'E.L. Haynes PCS - High School' : 'E.L. Haynes PCS - High School',
    'Friendship PCS - Collegiate Academy',
    'Friendship PCS - Technology Preparatory High School',
    'KIPP DC - College Preparatory Academy PCS',
    'Maya Angelou PCS - High School',
    'Paul PCS - International High School',
    'Washington Latin PCS - Upper School',
    'Washington Leadership Academy PCS'
}

SyntaxError: invalid syntax (<ipython-input-27-9b37f2bb7a1e>, line 4)

In [None]:
# state_fiscal = state_fiscal.replace(state_fiscal_rename)

Merge federal and state data, keeping only matches between the two. 

In [None]:
state_grads_merged = pd.merge(state_fiscal, state_grads, how='inner', left_on='NAME', right_on='District Name')

Save cleaned data. 

In [None]:
state_grads_merged.to_csv('../../data/state_data_merged/' + abbr + '.csv', index=False)