In [1]:
import pandas as pd
import numpy as np

Read in federal level data

In [2]:
fiscal = pd.read_sas('../../data/fiscal2018', format = 'sas7bdat', encoding='iso-8859-1')

Generate list of districts in the state in the federal data

In [3]:
fiscal_CA = fiscal[(fiscal['STNAME'] == 'California') & (fiscal['GSHI'] == '12')]

In [4]:
len(fiscal_CA)

633

In [5]:
fiscal_CA.head()

Unnamed: 0,LEAID,CENSUSID,FIPST,CONUM,CSA,CBSA,NAME,STNAME,STABBR,SCHLEV,...,FL_66V,FL_W01,FL_W31,FL_W61,FL_V95,FL_V02,FL_K14,FL_CE1,FL_CE2,FL_CE3
1253,600001,05501909400000,6,6037,348,31080,Acton-Agua Dulce Unified,California,CA,3,...,M,R,R,R,M,M,M,M,M,M
1254,600002,N,6,6001,488,41860,California School for the Blind (State Special...,California,CA,3,...,N,N,N,N,N,N,N,N,N,N
1255,600003,N,6,6001,488,41860,California School for the Deaf-Fremont (State ...,California,CA,3,...,N,N,N,N,N,N,N,N,N,N
1257,600007,N,6,6065,348,40140,California Sch for the Deaf-Riverside (State S...,California,CA,3,...,N,N,N,N,N,N,N,N,N,N
1258,600009,05504201200000,6,6083,N,42200,Cuyama Joint Unified,California,CA,3,...,M,R,R,R,M,M,M,M,M,M


Read in state level data

In [6]:
CA = pd.read_csv('../../data/state_data_raw/california2018.txt', delimiter='\t')

In [7]:
CA.head()

Unnamed: 0,AcademicYear,AggregateLevel,CountyCode,DistrictCode,SchoolCode,CountyName,DistrictName,SchoolName,CharterSchool,DASS,...,SPED Certificate (Count),SPED Certificate (Rate),GED Completer (Count),GED Completer (Rate),Other Transfer (Count),Other Transfer (Rate),Dropout (Count),Dropout (Rate),Still Enrolled (Count),Still Enrolled (Rate)
0,2017-18,C,1,,,Alameda,,,All,All,...,51,0.6,3,0.0,55,0.6,472,5.5,274,3.2
1,2017-18,C,1,,,Alameda,,,All,All,...,90,1.0,7,0.1,74,0.8,828,9.2,368,4.1
2,2017-18,C,1,,,Alameda,,,All,All,...,28,0.7,1,0.0,12,0.3,104,2.5,66,1.6
3,2017-18,C,1,,,Alameda,,,All,All,...,30,1.5,2,0.1,11,0.6,219,11.2,148,7.6
4,2017-18,C,1,,,Alameda,,,All,All,...,0,0.0,0,0.0,1,1.4,14,19.2,11,15.1


Filter for only the district level, total population samples.

In [8]:
CA = CA[(CA['DASS'] == 'All') &
        (CA['CharterSchool'] == 'All') &
        (CA['ReportingCategory'] == 'TA') & 
        (CA['AggregateLevel'] == 'D')]

Drop and rename columns.

In [9]:
CA = CA[['DistrictName', 'CohortStudents', 'Regular HS Diploma Graduates (Rate)']]

In [10]:
CA.columns = ['District Name', 'Total', 'Graduation Rate']

Remove placeholder values.

In [11]:
CA['Total'] = CA['Total'].astype(str).str.replace('\*', '')
CA['Graduation Rate'] = CA['Graduation Rate'].astype(str).str.replace('\*', '')

  """Entry point for launching an IPython kernel.
  


Check for non-matches in the two lists

In [12]:
A = [name for name in list(CA['District Name']) if name not in fiscal_CA['NAME']]
A.sort()
A

['ABC Unified',
 'Acalanes Union High',
 'Acton-Agua Dulce Unified',
 'Adelanto Elementary',
 'Alameda County Office of Education',
 'Alameda Unified',
 'Albany City Unified',
 'Alhambra Unified',
 'Alpaugh Unified',
 'Alpine County Office of Education',
 'Alvord Unified',
 'Amador County Office of Education',
 'Amador County Unified',
 'Anaheim Union High',
 'Anderson Union High',
 'Anderson Valley Unified',
 'Antelope Valley Union High',
 'Antioch Unified',
 'Apple Valley Unified',
 'Arcadia Unified',
 'Arcata Elementary',
 'Arena Union Elementary',
 'Armona Union Elementary',
 'Aromas - San Juan Unified',
 'Atascadero Unified',
 'Azusa Unified',
 'Baker Valley Unified',
 'Baldwin Park Unified',
 'Banning Unified',
 'Barstow Unified',
 'Bassett Unified',
 'Bear Valley Unified',
 'Beaumont Unified',
 'Bellflower Unified',
 'Benicia Unified',
 'Berkeley Unified',
 'Beverly Hills Unified',
 'Big Oak Flat-Groveland Unified',
 'Big Pine Unified',
 'Big Sur Unified',
 'Big Valley Joint Uni

In [13]:
B = [name for name in fiscal_CA['NAME'] if name not in list(CA['District Name'])]
B.sort()
B

['Alpine County Unified',
 'Amador County ROP',
 'Antelope Valley ROP',
 'Arena Union Elementary/Point Arena Joint Union High',
 'Aromas/San Juan Unified',
 'Baldy View ROP',
 'Bellevue Union',
 'Butte County ROP',
 'CA Advancing Pathways for Students in Los Angeles Co ROC/P',
 'California Education Authority (CEA) Headquarters',
 'California Sch for the Deaf-Riverside (State Special Schl)',
 'Central Orange County CTE Partnership (CTEp)',
 'Central Sierra ROP',
 'Coastline ROP',
 'College and Career Advantage',
 'Colton-Redlands-Yucaipa ROP',
 'Columbia Elementary',
 'Compton Unified ROP',
 'Contra Costa County ROP',
 'Del Norte County ROP',
 'East San Gabriel Valley ROP',
 'Eden Area ROP',
 'Enterprise Elementary',
 'Forty-Niner ROP',
 'Fresno ROP',
 'Glenn County ROP',
 'Hart ROP',
 'Humboldt County ROP',
 'Imperial Valley ROP',
 'Kern County ROP',
 'Kern High ROC',
 'Kings County ROP',
 'La Puente Valley ROP',
 'Lake County ROP',
 'Lassen ROP',
 'Long Beach Unified ROP',
 'Los Ange

Replace the names I can find matches for.

In [14]:
CA_fiscal_rename = {
 'Alpine County Unified' : 'Alpine County Office of Education',
 'Arena Union Elementary/Point Arena Joint Union High' :  'Arena Union Elementary',
 'Aromas/San Juan Unified' :  'Aromas - San Juan Unified',
 'California Sch for the Deaf-Riverside (State Special Schl)' :  'California School for the Deaf-Riverside (State Special Schl)',
 'Modesto City Schools' : 'Modesto City High',
 'Nuview Union' : 'Nuview Union ',
 'Santa Cruz City Elementary/High' :  'Santa Cruz City High',
 'Santa Rosa City Schools' : 'Santa Rosa High',
 'Sierra County Office of Education' : 'Sierra Unified',
}

In [15]:
fiscal_CA = fiscal_CA.replace(CA_fiscal_rename)

Examine data types and missing values.

In [16]:
CA.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 558 entries, 7749 to 55864
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   District Name    558 non-null    object
 1   Total            558 non-null    object
 2   Graduation Rate  558 non-null    object
dtypes: object(3)
memory usage: 17.4+ KB


Change column names for consistency across states. 

Change data types.

In [17]:
CA['Graduation Rate'] = pd.to_numeric(CA['Graduation Rate']) / 100

In [18]:
CA['Total'] = pd.to_numeric(CA['Total'])

Merge federal and state data, keeping only matches between the two. 

In [19]:
CA_merged = pd.merge(fiscal[fiscal['STABBR'] == 'CA'], CA, how='inner', left_on='NAME', right_on='District Name')

Save cleaned data. 

In [20]:
CA_merged.to_csv('../../data/state_data_merged/CA.csv', index=False)