# Create 2016-2017 School Datasets
**This program uses all raw datasets to create the flattened datasets within the NCEA repository.**
1. This notebook reads raw dataset .csv files directly from the \EducationDataNC\2017\Raw Datasets folder.
2. Each raw dataset is transformed to contain only one record per public school campus or unique unit_code.
3. Many raw datasets have more than one record per campus, per year.  In these instances, table pivots are used to create new columns from row-level entries and reduce each dataset to one record per school.  This adds many new columns to the flattened dataset.  (See the code below for more details.)

In [15]:
#import required Libraries
import pandas as pd
import numpy as np
import os
import string

#**********************************************************************************
# Set the following variables before running this code!!!
#
# Either edit the default paths below, or leave them alone and set the matching
# environment variable before starting the notebook:
#   NCEA_RAW_DATA_DIR  - overrides dataDir
#   NCEA_OUTPUT_DIR    - overrides outputDir
#**********************************************************************************

#Location where copies of the raw data files will be read in from csv files.
dataDir = (os.environ.get('NCEA_RAW_DATA_DIR')
           or 'C:/Users/Jake/Documents/GitHub/EducationDataNC/2017/Raw Datasets/')

#Location where the new school datasets will be created.
outputDir = (os.environ.get('NCEA_OUTPUT_DIR')
             or 'C:/Users/Jake/Documents/GitHub/EducationDataNC/2017/School Datasets/')

#File names are appended to these folders with plain string concatenation
#(e.g. dataDir + 'profile.csv'), so each folder must end with a path separator.
if not dataDir.endswith(('/', '\\')):
    dataDir += '/'
if not outputDir.endswith(('/', '\\')):
    outputDir += '/'

#All raw data files are processed for the year below
schoolYear = 2017

## Read in the Raw Data Files
**This section reads raw data files directly from the \EducationDataNC\2017\Raw Datasets folder.**  
* The file input location is specified by the *dataDir* parameter.
* The file output location is specified by the *outputDir* parameter.
* The *schoolYear* parameter is used to specify the correct school year to process.

In [16]:
#Read in raw data files
#Every table is loaded from dataDir with the same options; unit_code is read with
#dtype object instead of letting pandas infer a type for it.

def readRawTable(fileName):
    """Read one raw csv file located in dataDir, keeping unit_code as an object column."""
    return pd.read_csv(dataDir + fileName, low_memory=False, dtype={'unit_code': object})

profile        = readRawTable('profile.csv')                #Profile Table
profileMetric  = readRawTable('profile-metrics.csv')        #Profile Metric Table
funding        = readRawTable('funding.csv')                #Funding Table
spg            = readRawTable('spg.csv')                    #School Performance Grade (SPG) Table
accDrillDown   = readRawTable('accDrillDown.csv')           #READY Accountability Drill Down
rta            = readRawTable('rta.csv')                    #Read To Achieve (RTA)
pTargets       = readRawTable('participation-targets.csv')  #Participation Targets Overall
schoolInds     = readRawTable('school-indicators.csv')      #School Indicators Table
sce            = readRawTable('sce.csv')                    #Specialized Course Enrollment
collegeEnroll  = readRawTable('college-enrollment.csv')     #College Enrollment Table
environment    = readRawTable('environment.csv')            #Environment Table
personnel      = readRawTable('personnel.csv')              #Personnel Table
yoe            = readRawTable('yoe.csv')                    #Educator Experience Table (YOE)
effectiveness  = readRawTable('effectiveness.csv')          #Educator Effectiveness Table
#Statistical Profiles - Student Body Racial Compositions at the School Level
ec_pupils      = readRawTable('ec_pupils.csv')

#***************New Data for 2017

readiness      = readRawTable('acc-student-readiness.csv')  #Student Readiness Table
edRates        = readRawTable('acc-ed-rates.csv')           #Economically Disadvantaged
concentrations = readRawTable('cte-concentrations.csv')     #Career and Technical Education Table Concentrations
credentials    = readRawTable('cte-credentials.csv')        #Career and Technical Education Table Credentials

## Reshape tables as needed to one record per school 

In [17]:
#***********************************************************************
#Profile Metric Table Reshape
#***********************************************************************

#Keep campus rows only: drop the 'NC-SEA' record and every unit_code containing "LEA"
#(this information is also in the school level records)
metricUnitCodes = profileMetric['unit_code']
profileMetric = profileMetric[metricUnitCodes.ne('NC-SEA')
                              & metricUnitCodes.str.contains("LEA").eq(False)]
#One row per unit_code / school campus, one column per 'level' value, filled from 'size'
profileMetric = profileMetric.pivot_table(values='size', index=['unit_code'], columns=['level'])
#Rename every pivoted column to <level>_Size
profileMetric.columns = [levelName + '_Size' for levelName in profileMetric.columns]
#Turn the unit_code index back into a regular column for merges later
profileMetric = profileMetric.reset_index(level=0)

#******  Could go back and add size - district and size - state features here!!! 

In [18]:
#***********************************************************************
#READY Accountability Drill Down Reshape
#***********************************************************************
#Shorten Standard column name for clean code
accDrillDown = accDrillDown.rename(
    columns={'Standard (CCR - Level 4 & 5, GLP - Level 3 & Above)':'Standard'})

#Shorten Standard row values before table pivot
standardCodes = {'College and Career Ready'     :'CACR'
                ,'Grade Level Proficient'       :'GLP'
                ,'Standard (4 Year)'            :'4yr'
                ,'Extended (5 year)'            :'5yr'
                ,'Met The ACT Benchmark'        :'ACTBenchmark'
                ,'Met UNC Minimum'              :'UNCMin'
                ,'Percent of Benchmarks Met'    :'BenchmarksMet'
                ,'Silver or Better Certificate' :'SilverPlus'
                }
#Also map each short code onto itself. Without this, running the cell a second time
#would send the already-shortened values through map() and turn every one into NaN.
standardCodes.update({code: code for code in list(standardCodes.values())})
#NOTE(review): any Standard value not listed above still becomes NaN here, and such rows
#are presumably left out of the pivots below - confirm no standards are lost this way.
accDrillDown['Standard'] = accDrillDown['Standard'].map(standardCodes)

#Shorten Subject field Names before table pivot.
#The edits are applied in this order on purpose ('Grades' must be handled before 'Grade').
#The vectorised .str.replace leaves missing values as they are instead of failing on them.
subjectEdits = [('Grades', 'Gr')
               ,('Grade',  'Gr')
               ,('The ',   '')
               ,('All ',   '')
               ,(' - ',    '')
               ,(' ',      '')]
for longText, shortText in subjectEdits:
    accDrillDown['Subject'] = accDrillDown['Subject'].str.replace(longText, shortText)


def pivotDrillDownSubgroup(drillDown, valueColumn, columnSuffix):
    """Pivot one subgroup column of the drill down table to one record per unit_code.

    drillDown    -- frame holding 'unit_code', 'Subject', 'Standard' and valueColumn
    valueColumn  -- name of the subgroup column to pivot (e.g. 'All Students')
    columnSuffix -- text appended to every new column name (e.g. 'All')

    Returns a frame with a 'unit_code' column plus one column per Subject / Standard
    pair, named <Subject>_<Standard>_<columnSuffix>.
    """
    pivoted = pd.pivot_table(drillDown, values=valueColumn
                             ,index=['unit_code'],columns=['Subject', 'Standard'])
    #concatenate multiindex column names using a list comprehension.
    pivoted.columns = ['_'.join(col) + '_' + columnSuffix for col in pivoted.columns]
    #Make our index a column for merges later
    return pivoted.reset_index(level=0)


#One pivoted table per student subgroup; the variable names are used by the merge step.
accDrillDownAll            = pivotDrillDownSubgroup(accDrillDown, 'All Students',      'All')
accDrillDownFemale         = pivotDrillDownSubgroup(accDrillDown, 'Female',            'Female')
accDrillDownMale           = pivotDrillDownSubgroup(accDrillDown, 'Male',              'Male')
accDrillDownAmericanIndian = pivotDrillDownSubgroup(accDrillDown, 'American Indian',   'AmericanIndian')
accDrillDownAsian          = pivotDrillDownSubgroup(accDrillDown, 'Asian',             'Asian')
accDrillDownBlack          = pivotDrillDownSubgroup(accDrillDown, 'Black',             'Black')
accDrillDownHispanic       = pivotDrillDownSubgroup(accDrillDown, 'Hispanic',          'Hispanic')
accDrillDownTwoorMoreRaces = pivotDrillDownSubgroup(accDrillDown, 'Two or More Races', 'TwoorMoreRaces')
accDrillDownWhite          = pivotDrillDownSubgroup(accDrillDown, 'White',             'White')
accDrillDownEDS            = pivotDrillDownSubgroup(accDrillDown, 'EDS',               'EDS')
accDrillDownLEP            = pivotDrillDownSubgroup(accDrillDown, 'LEP',               'LEP')
accDrillDownSWD            = pivotDrillDownSubgroup(accDrillDown, 'SWD',               'SWD')
accDrillDownAIG            = pivotDrillDownSubgroup(accDrillDown, 'AIG',               'AIG')

In [19]:
#***********************************************************************
#Read To Achieve (RTA) Reshape
#***********************************************************************
#Keep campus rows only: drop the 'NC-SEA' record and every unit_code containing "LEA"
#(this information is also in the school level records)
rtaUnitCodes = rta['unit_code']
rta = rta[rtaUnitCodes.ne('NC-SEA') & rtaUnitCodes.str.contains("LEA").eq(False)]
#One row per unit_code / school campus, one 'pct' column per 'metrics' value
rta = rta.pivot_table(values=['pct'], index=['unit_code'], columns=['metrics'])
#Each pivoted column label is a tuple; glue its parts together with '_'
rta.columns = ['_'.join(labelParts) for labelParts in rta.columns]
#Turn the unit_code index back into a regular column for merges later
rta = rta.reset_index(level=0)

In [20]:
#***********************************************************************
#Participation Targets Overall Table Reshape
#***********************************************************************

#get rid of state and district level records (this information is also in the school level records)
#.copy() makes the filtered rows an independent frame, so the column assignment below
#does not write into a slice of the raw table (SettingWithCopyWarning).
pTargets = pTargets[  (pTargets['unit_code'] != 'NC-SEA')
                    & (pTargets['unit_code'].str.contains("LEA") == False)].copy()

#Shorten Subject field Names before table pivot.
#The edits are applied in this order on purpose ('Grades' must be handled before 'Grade').
#The vectorised .str.replace leaves missing values as they are instead of failing on them.
targetNameEdits = [('Grades',      'Gr')
                  ,('Grade',       'Gr')
                  ,('The ',        '')
                  ,('Mathematics', 'Math')
                  ,(' through ',   '-')
                  ,(' and ',       '&')
                  ,('Science',     'Sci')
                  ,(' ',           '')]
for longText, shortText in targetNameEdits:
    pTargets['Part_Targets'] = pTargets['Part_Targets'].str.replace(longText, shortText)

#Pivot table creating one record per unit_code / school campus
pTargets = pd.pivot_table(pTargets, values='percent_met',index=['unit_code'],columns=['Part_Targets'])
#Rename every pivoted column to <Part_Targets>_pTarget_PctMet
pTargets.columns = [col + '_pTarget_PctMet' for col in pTargets.columns]
#Make our index a column for merges later
pTargets = pTargets.reset_index(level=0)

#**********************  Could go back and add targets assigned and targets met counts

In [21]:
#***********************************************************************
#College Enrollment Table Reshape
#***********************************************************************

#Keep campus rows only: drop the 'NC-SEA' record and every unit_code containing "LEA"
#(this information is also in the school level records)
enrollUnitCodes = collegeEnroll['unit_code']
collegeEnroll = collegeEnroll[enrollUnitCodes.ne('NC-SEA')
                              & enrollUnitCodes.str.contains("LEA").eq(False)]
#One row per unit_code / school campus, one 'sch_percent_enrolled' column per
#subgroup / subgroup_name / status combination
collegeEnroll = collegeEnroll.pivot_table(values='sch_percent_enrolled'
                                          ,index=['unit_code']
                                          ,columns=['subgroup', 'subgroup_name', 'status'])
#Each pivoted column label is a tuple; glue its parts together with '_' and add '_sch_pct'
collegeEnroll.columns = ['_'.join(labelParts) + '_sch_pct' for labelParts in collegeEnroll.columns]
#Turn the unit_code index back into a regular column for merges later
collegeEnroll = collegeEnroll.reset_index(level=0)

#******  Could go back and add size - district and size - state features here!!! 

In [22]:
#***********************************************************************
#Educator Experience Table (YOE) Reshape
#***********************************************************************

#Keep campus rows only: drop the 'NC-SEA' record and every unit_code containing "LEA"
#(this information is also in the school level records)
yoeUnitCodes = yoe['unit_code']
yoe = yoe[yoeUnitCodes.ne('NC-SEA') & yoeUnitCodes.str.contains("LEA").eq(False)]

#Teachers: one row per unit_code / school campus, one 'pct_tch' column per Experience value
yoeTch = yoe.pivot_table(values='pct_tch', index=['unit_code'], columns=['Experience'])
#Rename every pivoted column to <Experience>_Exp_Pct_Tch
yoeTch.columns = [experience + '_Exp_Pct_Tch' for experience in yoeTch.columns]
#Turn the unit_code index back into a regular column for merges later
yoeTch = yoeTch.reset_index(level=0)

#Principals: same pivot, using the 'lea_pct_prin' column
yoePrin = yoe.pivot_table(values='lea_pct_prin', index=['unit_code'], columns=['Experience'])
#Rename every pivoted column to <Experience>_LEA_Exp_Pct_Prin
yoePrin.columns = [experience + '_LEA_Exp_Pct_Prin' for experience in yoePrin.columns]
#Turn the unit_code index back into a regular column for merges later
yoePrin = yoePrin.reset_index(level=0)

#******  Could go back and add pct_tch - district and pct_tch - state features here!!! 

In [23]:
#***********************************************************************
#Educator Effectiveness Reshape
#***********************************************************************
#It does not appear that this table reports values of 0 for missing categories, imputing NA to 0 for all missing values
#The filled column is assigned back explicitly. Calling fillna(..., inplace=True) on
#effectiveness.percent is chained assignment: it acts on the Series returned by the
#attribute lookup and, with pandas copy-on-write, does not update the DataFrame.
effectiveness['percent'] = effectiveness['percent'].fillna(0)
#Pivot table creating one record per unit_code / school campus
effectiveness = pd.pivot_table(effectiveness, values='percent',index=['unit_code'],columns=['level','Role','STANDARD'])
#Each pivoted column label is a tuple; join its parts with '_' and add '_Pct'
effectiveness.columns = ['_'.join(col) + '_Pct' for col in effectiveness.columns]
#Make our index a column for merges later
effectiveness = effectiveness.reset_index(level=0)

In [24]:
#***********************************************************************
#Student Readiness Reshape
#***********************************************************************

#Keep campus rows only: drop the 'NC-SEA' record and every unit_code containing "LEA"
#(this information is also in the school level records)
readinessUnitCodes = readiness['unit_code']
readiness = readiness[readinessUnitCodes.ne('NC-SEA')
                      & readinessUnitCodes.str.contains("LEA").eq(False)]
#One row per unit_code / school campus, one 'pct_prof' column per 'grade' value
readiness = readiness.pivot_table(values='pct_prof', index=['unit_code'], columns=['grade'])
#Rename every pivoted column to Gr_<grade>_Pct_Prof (grade is converted to text first)
readiness.columns = ['Gr_' + str(gradeLabel) + '_Pct_Prof' for gradeLabel in readiness.columns]
#Turn the unit_code index back into a regular column for merges later
readiness = readiness.reset_index(level=0)

In [25]:
#***********************************************************************
#Career and Technical Education Table Concentrations Reshape
#***********************************************************************

#One row per unit_code / school campus, one 'Counts' column per 'Concentrator' value
concentrations = concentrations.pivot_table(values='Counts', index=['unit_code']
                                            ,columns=['Concentrator'])
#Rename every pivoted column to <Concentrator>_Concentrator_Ct
concentrations.columns = [concentrator + '_Concentrator_Ct' for concentrator in concentrations.columns]
#Turn the unit_code index back into a regular column for merges later
concentrations = concentrations.reset_index(level=0)

In [26]:
#***********************************************************************
# Statistical Profiles - Student Body Racial Compositions at the School Level Reshape
#
# Statistical Profiles data are already one record per public school but must be converted to percentages
# Creates a new dataset - ec_pupils_pct.csv
#
#***********************************************************************

#Race labels exactly as they appear in the raw column names
#('Two or  More' really does contain two spaces).
raceLabels = ['Indian', 'Asian', 'Hispanic', 'Black', 'White', 'Pacific Island', 'Two or  More']

#Racial Composition summary variables: <race> = <race> Male + <race> Female
for race in raceLabels:
    ec_pupils[race] = ec_pupils[race + ' Male'] + ec_pupils[race + ' Female']

#The original total field is corrupted with non-printable characters and will not convert to int or float
ec_pupils = ec_pupils.drop(['Total'], axis=1)
#Rebuild the total by adding the race summary fields one after another,
#stored as float64 for the divisions later
enrollmentTotal = ec_pupils[raceLabels[0]]
for race in raceLabels[1:]:
    enrollmentTotal = enrollmentTotal + ec_pupils[race]
ec_pupils['Total'] = enrollmentTotal.astype(np.float64)

#Minority summary variables: every race except White, per gender and overall
minorityRaces = [race for race in raceLabels if race != 'White']
for gender in ['Male', 'Female']:
    genderSubtotal = ec_pupils[minorityRaces[0] + ' ' + gender]
    for race in minorityRaces[1:]:
        genderSubtotal = genderSubtotal + ec_pupils[race + ' ' + gender]
    ec_pupils['Minority ' + gender] = genderSubtotal
ec_pupils['Minority'] = ec_pupils['Minority Male'] + ec_pupils['Minority Female']

#Student Body Racial Composition PERCENTAGES at the School Level.
#pctPrefix maps a source column group to the prefix used in the output column name.
pctPrefix = {'Indian'         : 'Indian'
            ,'Asian'          : 'Asian'
            ,'Hispanic'       : 'Hispanic'
            ,'Black'          : 'Black'
            ,'White'          : 'White'
            ,'Pacific Island' : 'PacificIsland'
            ,'Two or  More'   : 'TwoOrMore'
            ,'Minority'       : 'Minority'}
#Output column order per block. The female block lists Minority before Pacific Island,
#which is the order the output file has always used.
overallOrder = raceLabels + ['Minority']
femaleOrder = ['Indian', 'Asian', 'Hispanic', 'Black', 'White', 'Minority', 'Pacific Island', 'Two or  More']

pctColumns = {'unit_code'   : ec_pupils['unit_code']
            , 'School Name' : ec_pupils['___School Name___']}
for gender, groupOrder in [('', overallOrder), ('Male', overallOrder), ('Female', femaleOrder)]:
    for group in groupOrder:
        sourceColumn = group + ' ' + gender if gender else group
        pctColumns[pctPrefix[group] + gender + 'Pct'] = ec_pupils[sourceColumn] / ec_pupils['Total']
ec_pupils_pct = pd.DataFrame(pctColumns)

#Save the racial composition percentage data to disk 
ec_pupils_pct.to_csv(dataDir + 'ec_pupils_pct.csv', sep=',', index=False)

## Merge all datasets to one master dataset with one record per school 
**Starting with the profiles table we left outer join on unit_code, merging data from each reshaped table into one master record.**

In [27]:
#Remove state and district level profiles before performing campus level merges
profile = profile[(profile['unit_code'] != 'NC-SEA') & (profile['unit_code'].str.contains("LEA") == False)]

print('*********************************Start: Profile Data*********************************')
profile.info(verbose=False)

#Merge profile and profileMetric data
PublicSchools = profile.merge(profileMetric,how='left',on='unit_code', suffixes=('', '_Drop'))

print('*********************************After: Profile Metric Data**************************')
PublicSchools.info(verbose=False)

#Merge funding data
PublicSchools = PublicSchools.merge(funding,how='left',on='unit_code', suffixes=('', '_Drop'))

print('*********************************After: Funding Data*********************************')
PublicSchools.info(verbose=False)

#Merge SPG data
PublicSchools = PublicSchools.merge(spg,how='left',on='unit_code', suffixes=('', '_Drop'))

print('*********************************After: SPG Data*************************************')
PublicSchools.info(verbose=False)

#Merge accDrillDownAll data
PublicSchools = PublicSchools.merge(accDrillDownAll,how='left',on='unit_code', suffixes=('', '_Drop'))

print('*********************************After: accDrillDownAll Data*************************')
PublicSchools.info(verbose=False)

#Merge accDrillDownFemale data
PublicSchools = PublicSchools.merge(accDrillDownFemale,how='left',on='unit_code', suffixes=('', '_Drop'))

print('*********************************After: accDrillDownFemale Data**********************')
PublicSchools.info(verbose=False)

#Merge accDrillDownMale data
PublicSchools = PublicSchools.merge(accDrillDownMale,how='left',on='unit_code', suffixes=('', '_Drop'))

print('*********************************After: accDrillDownMale Data************************')
PublicSchools.info(verbose=False)

#Merge accDrillDownAmericanIndian data
PublicSchools = PublicSchools.merge(accDrillDownAmericanIndian,how='left',on='unit_code', suffixes=('', '_Drop'))

print('*********************************After: accDrillDownAmericanIndian Data**************')
PublicSchools.info(verbose=False)

#Merge accDrillDownAsian data
PublicSchools = PublicSchools.merge(accDrillDownAsian,how='left',on='unit_code', suffixes=('', '_Drop'))

print('*********************************After: accDrillDownAsian Data***********************')
PublicSchools.info(verbose=False)

#Merge accDrillDownBlack data
PublicSchools = PublicSchools.merge(accDrillDownBlack,how='left',on='unit_code', suffixes=('', '_Drop'))

print('*********************************After: accDrillDownBlack Data***********************')
PublicSchools.info(verbose=False)

#Merge accDrillDownHispanic data
PublicSchools = PublicSchools.merge(accDrillDownHispanic,how='left',on='unit_code', suffixes=('', '_Drop'))

print('*********************************After: accDrillDownHispanic Data********************')
PublicSchools.info(verbose=False)

#Merge accDrillDownTwoorMoreRaces data
PublicSchools = PublicSchools.merge(accDrillDownTwoorMoreRaces,how='left',on='unit_code', suffixes=('', '_Drop'))

print('*********************************After: accDrillDownTwoorMoreRaces Data**************')
PublicSchools.info(verbose=False)

#Merge accDrillDownWhite data
PublicSchools = PublicSchools.merge(accDrillDownWhite,how='left',on='unit_code', suffixes=('', '_Drop'))

print('*********************************After: accDrillDownWhite Data***********************')
PublicSchools.info(verbose=False)

#Merge accDrillDownEDS data
PublicSchools = PublicSchools.merge(accDrillDownEDS,how='left',on='unit_code', suffixes=('', '_Drop'))

print('*********************************After: accDrillDownEDS Data*************************')
PublicSchools.info(verbose=False)

#Merge accDrillDownLEP data
PublicSchools = PublicSchools.merge(accDrillDownLEP,how='left',on='unit_code', suffixes=('', '_Drop'))

print('*********************************After: accDrillDownLEP Data*************************')
PublicSchools.info(verbose=False)

#Merge accDrillDownSWD data
PublicSchools = PublicSchools.merge(accDrillDownSWD,how='left',on='unit_code', suffixes=('', '_Drop'))

print('*********************************After: accDrillDownSWD Data*************************')
PublicSchools.info(verbose=False)

#Merge accDrillDownAIG data
PublicSchools = PublicSchools.merge(accDrillDownAIG,how='left',on='unit_code', suffixes=('', '_Drop'))

print('*********************************After: accDrillDownAIG Data*************************')
PublicSchools.info(verbose=False)

#Merge RTA data
PublicSchools = PublicSchools.merge(rta,how='left',on='unit_code', suffixes=('', '_Drop'))

print('*********************************After: RTA Data*************************************')
PublicSchools.info(verbose=False)

#Merge Participation Targets data
PublicSchools = PublicSchools.merge(pTargets,how='left',on='unit_code', suffixes=('', '_Drop'))

print('*********************************After: Participation Targets Data*******************')
PublicSchools.info(verbose=False)

#Merge School Indicators data
PublicSchools = PublicSchools.merge(schoolInds,how='left',on='unit_code', suffixes=('', '_Drop'))

print('*********************************After: School Indicators Data***********************')
PublicSchools.info(verbose=False)

#Merge the remaining flattened datasets into PublicSchools, one at a time, in the order listed.
#Each entry pairs a flattened dataset with the banner that is printed once it has been merged.
#NOTE(review): the banner for ec_pupils_pct reads "Racial Composition Data" - confirm that the
#              label matches the contents of that dataset.
_mergeSteps = [
    (sce,            '*********************************After: Specialized Course Enrollment****************'),
    (collegeEnroll,  '*********************************After: College Enrollment***************************'),
    (environment,    '*********************************After: Environment Data*****************************'),
    (personnel,      '*********************************After: Personnel Data*******************************'),
    (yoeTch,         '*********************************After: Years of Experience Teachers Data************'),
    (yoePrin,        '*********************************After: Years of Experience Principals Data**********'),
    (effectiveness,  '*********************************After: Educator Effectiveness Data******************'),
    (ec_pupils_pct,  '*********************************After: Racial Composition Data**********************'),
    (readiness,      '*********************************After: Student Readiness Data***********************'),
    (edRates,        '*********************************After: Economically Disadvantaged Data**************'),
    (concentrations, '*********************************After: Technical Education Concentrations Data******'),
    (credentials,    '*********************************After: Technical Education Credentials Data*********'),
]

for _flatData, _banner in _mergeSteps:
    #Left join on unit_code keeps every PublicSchools row; columns of the merged dataset whose
    #names already exist in PublicSchools are tagged with the '_Drop' suffix.
    PublicSchools = PublicSchools.merge(_flatData, how='left', on='unit_code', suffixes=('', '_Drop'))

    #Report the dataset's size after this merge.
    print(_banner)
    PublicSchools.info(verbose=False)

#Remove the duplicate / overlapping columns that the merges above produced.
#i.e. When a merged table shares a column name with PublicSchools, the merged table's copy
#     was given the '_Drop' suffix during merge(), and every such column is deleted here.
dropCols = [colName for colName in PublicSchools.columns if colName.endswith('_Drop')]
PublicSchools = PublicSchools.drop(columns=dropCols)

#Report the dataset's size once the suffixed columns are gone.
print('*********************************After: Deleting Duplicated Columns*********')
PublicSchools.info(verbose=False)



*********************************Start: Profile Data*********************************
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2617 entries, 0 to 3083
Columns: 35 entries, vphone_ad to url
dtypes: float64(7), int64(1), object(27)
memory usage: 736.0+ KB
*********************************After: Profile Metric Data**************************
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2617 entries, 0 to 2616
Columns: 52 entries, vphone_ad to Math I_Size
dtypes: float64(24), int64(1), object(27)
memory usage: 1.1+ MB
*********************************After: Funding Data*********************************
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2617 entries, 0 to 2616
Columns: 90 entries, vphone_ad to st_building_expense_pct
dtypes: float64(57), int64(2), object(31)
memory usage: 1.8+ MB
*********************************After: SPG Data*************************************
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2617 entries, 0 to 2616
Columns: 113 entries, vph

***The report above shows how the final dataset's row and column counts change as each flattened raw dataset is merged into the final Public Schools dataset.***

## Create and Save the Final Public School Dataset Files

In [28]:
def _saveRegularPublicSchools(schools, categoryCodes, fileStem, banner):
    """Filter regular public schools by category code, save them to csv, and print a summary.

    A school is kept when its category_cd is one of categoryCodes, its student_num is
    greater than zero, its type_cd is 'P', and its school_type_txt is 'Regular School'.

    Parameters
    ----------
    schools : DataFrame with category_cd, student_num, type_cd and school_type_txt columns.
    categoryCodes : list of category_cd values to keep.
    fileStem : file name prefix; the csv is written to
               outputDir + fileStem + str(schoolYear) + '.csv'
               (outputDir and schoolYear are the notebook-level settings).
    banner : heading printed above the info() summary.

    Returns
    -------
    DataFrame containing the rows of schools that passed the filter (index labels unchanged).
    """
    keepRows = (schools.category_cd.isin(categoryCodes) &
                (schools.student_num > 0) &
                (schools.type_cd == 'P') &
                (schools.school_type_txt == 'Regular School'))
    subset = schools[keepRows]

    #Save the file to disk
    subset.to_csv(outputDir + fileStem + str(schoolYear) + '.csv', sep=',', index=False)

    print(banner)
    subset.info(verbose=False)
    return subset


#Save the master file to disk
PublicSchools.to_csv(outputDir + 'PublicSchools' + str(schoolYear) + '.csv', sep=',', index=False)

print('*********************************All Public Schools****************************')
PublicSchools.info(verbose=False)

#Filter, save and report regular public high schools (category_cd H, T or A)
HighSchools = _saveRegularPublicSchools(
    PublicSchools, ['H', 'T', 'A'], 'PublicHighSchools',
    '*********************************Regular Public High Schools*******************')

#Filter, save and report regular public middle schools (category_cd M, T, A or I)
MiddleSchools = _saveRegularPublicSchools(
    PublicSchools, ['M', 'T', 'A', 'I'], 'PublicMiddleSchools',
    '*********************************Regular Public Middle Schools******************')

#Filter, save and report regular public elementary schools (category_cd E, I or A)
ElementarySchools = _saveRegularPublicSchools(
    PublicSchools, ['E', 'I', 'A'], 'PublicElementarySchools',
    '*********************************Regular Public Elementary Schools**************')

*********************************All Public Schools****************************
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2617 entries, 0 to 2616
Columns: 1279 entries, vphone_ad to Number_Industry_Recognized_Crede
dtypes: float64(1234), int64(3), object(42)
memory usage: 25.6+ MB
*********************************Regular Public High Schools*******************
<class 'pandas.core.frame.DataFrame'>
Int64Index: 470 entries, 2 to 2615
Columns: 1279 entries, vphone_ad to Number_Industry_Recognized_Crede
dtypes: float64(1234), int64(3), object(42)
memory usage: 4.6+ MB
*********************************Regular Public Middle Schools******************
<class 'pandas.core.frame.DataFrame'>
Int64Index: 526 entries, 5 to 2615
Columns: 1279 entries, vphone_ad to Number_Industry_Recognized_Crede
dtypes: float64(1234), int64(3), object(42)
memory usage: 5.1+ MB
*********************************Regular Public Elementary Schools**************
<class 'pandas.core.frame.DataFrame'>
Int64Index: 12