In [1]:
# Import Dependencies
import pandas as pd

In [2]:
# Load the master success dataset and index it by state name.
file_path = "../datasets/master_success.csv"

# The CSV's first column has no header, so pandas labels it 'Unnamed: 0';
# give it the name STNAME before promoting it to the index.
raw_master = pd.read_csv(file_path, encoding='utf-8')
state_labelled = raw_master.rename(columns={'Unnamed: 0': 'STNAME'})
success_master = state_labelled.set_index(['STNAME'])
success_master

Unnamed: 0_level_0,STABR,2006_AFGR,2006_AMAFGR,2006_ASAFGR,2006_HIAFGR,2006_BLAFGR,2006_WHAFGR,2006_REGDIP,2006_AMREGDIP,2006_ASREGDIP,...,2010_WHOHCM,2010_WHOHCF,2010_WHOHC,2010_HPOHCM,2010_HPOHCF,2010_HPOHC,2010_TROHCM,2010_TROHCF,2010_TROHC,2010_RACECAT
STNAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
alabama,AL,66.2,74.4,84.3,60.1,57.6,71.2,37918,343,391,...,470.0,329.0,799.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,5.0
alaska,AK,66.5,51.0,76.2,68.7,57.6,72.6,7361,1442,528,...,53.0,52.0,105.0,1.0,11.0,12.0,2.0,3.0,5.0,7.0
arizona,AZ,70.5,45.9,100.0,64.4,77.3,76.1,54091,2779,1689,...,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,5.0
arkansas,AR,80.4,93.0,100.0,83.7,72.9,81.9,28790,172,467,...,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,7.0
california,CA,69.2,62.0,89.6,59.1,59.1,78.9,343515,2833,52334,...,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,7.0
colorado,CO,75.5,59.8,90.3,58.0,62.9,82.2,44424,398,1617,...,76.0,63.0,139.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,5.0
connecticut,CT,80.9,77.0,100.0,60.3,69.4,86.4,36222,117,1251,...,103.0,16.0,119.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,5.0
delaware,DE,76.3,74.1,100.0,65.3,70.0,79.2,7275,20,246,...,51.0,38.0,89.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,5.0
district of columbia,DC,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1,-1,...,0.0,0.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,5.0
florida,FL,63.6,71.3,93.1,61.1,51.1,69.2,134686,434,4018,...,513.0,545.0,1058.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,5.0


In [3]:
# Read the variable catalogue from the 2006 metadata file. The 2006 file is
# always used so that variables introduced in later years are not picked up.
meta_path = "../resources/success_data/2006_meta.txt"
meta_columns = ['Variable Name', 'Start Position', 'End Position',
                'Field Length', 'Data Type', 'Description']

# Fields are split on runs of two or more whitespace characters.
# NOTE(review): the rendered rows show 'Field Length', 'Data Type' and
# 'Description' fused together with '\t', so single tabs are not treated as
# separators here; only 'Variable Name' is consumed further down.
# NOTE(review): descriptions render with mojibake (e.g. 'â€“'); confirm the
# file's encoding before relying on the 'Description' text.
meta_raw = pd.read_csv(
    meta_path,
    sep=r'\s{2,}',
    engine='python',
    header=None,
    names=meta_columns,
)

# Discard the first two parsed rows, keeping the original row labels.
meta = meta_raw.drop(meta_raw.index[0:2])

# Preview the ten rows (positions 92-101) holding the dropout-rate variables.
meta.iloc[92:102, :]

Unnamed: 0,Variable Name,Start Position,End Position,Field Length,Data Type,Description
94,DRP912,837,844,8\tN\tDropout Rate (Grades 9â€“12).,,
95,DRP9,845,860,8\tN\tDropout Rate (Grade 9).,,
96,DRP10,853,860,8\tN,Dropout Rate (Grade 10).,
97,DRP11,861,868,8\tN\tDropout Rate (Grade 11).,,
98,DRP12,869,876,8\tN\tDropout Rate (Grade 12).,,
99,DRPAM,877,884,8\tN\tDropout Rate (American Indian/Alaska Nat...,,
100,DRPAS,885,892,"8\tN\tDropout Rate (Asian/Pacific Islander, Gr...",,
101,DRPHI,893,900,"8\tN\tDropout Rate (Hispanic, Grades 9â€“12).",,
102,DRPBL,901,908,"8\tN\tDropout Rate (Black, non-Hispanic, Grade...",,
103,DRPWH,909,916,"8\tN\tDropout Rate (White, non-Hispanic, Grade...",,


In [4]:
# Create the dropout-rate subset of the master success data.

# Years will always be the same for all subsets
years = ['2006', '2007', '2008', '2009', '2010']

# Variable codes to extract: the ten rows of the metadata table at positions
# 92-101 (DRP912 ... DRPWH in the preview above). `.iloc` makes the positional
# intent explicit.
dropout_variables = meta['Variable Name'].iloc[92:102]

# Master columns are named '<year>_<code>'; build the list year-major so the
# resulting frame keeps all 2006 columns first, then 2007, and so on.
dropout_columns = [
    year + '_' + variable
    for year in years
    for variable in dropout_variables
]

# Keep strictly positive values only; zero and negative entries (0, -1, -2 are
# N/A codes) become missing. A single vectorised mask replaces the previous
# per-cell loop, whose chained assignment (`df[col][i] = ...`) triggers
# SettingWithCopyWarning and does not write through under pandas Copy-on-Write.
# The result keeps the STNAME index of `success_master` and numeric dtypes.
selected = success_master[dropout_columns]
success_df = selected.where(selected > 0)
success_df

Unnamed: 0_level_0,2006_DRP912,2006_DRP9,2006_DRP10,2006_DRP11,2006_DRP12,2006_DRPAM,2006_DRPAS,2006_DRPHI,2006_DRPBL,2006_DRPWH,...,2010_DRP912,2010_DRP9,2010_DRP10,2010_DRP11,2010_DRP12,2010_DRPAM,2010_DRPAS,2010_DRPHI,2010_DRPBL,2010_DRPWH
STNAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
alabama,2.5,1.9,2.6,2.8,2.9,1.3,1.2,2.9,2.9,2.3,...,1.8,1.4,2.0,2.1,1.8,1.3,1.4,0.9,2.0,1.6
alaska,8.0,4.8,7.5,8.9,11.3,11.7,6.9,9.7,10.0,6.2,...,6.9,4.0,5.5,9.3,8.7,11.6,3.5,6.1,6.4,5.1
arizona,7.6,4.9,6.0,8.0,13.0,,4.4,9.3,8.1,,...,7.8,5.1,5.7,7.0,13.6,14.6,4.9,8.1,8.8,6.8
arkansas,3.1,1.8,3.0,4.1,3.9,2.5,2.4,3.8,4.2,2.7,...,3.6,1.9,3.2,4.4,5.3,4.9,1.4,4.1,5.0,3.1
california,3.7,2.3,2.3,3.0,8.1,4.4,1.8,4.8,6.5,2.3,...,4.6,2.6,3.1,4.2,8.89999,6.5,1.8,5.8,8.39999,2.8
colorado,7.8,5.7,7.4,8.5,10.1,13.5,4.8,15.8,12.5,4.7,...,5.3,3.1,3.7,5.3,9.7,10.1,2.4,9.89999,8.6,3.2
connecticut,2.0,1.9,1.9,2.3,1.9,2.3,0.7,4.6,2.8,1.3,...,3.0,2.8,2.6,3.4,3.2,3.0,1.1,6.9,6.8,1.4
delaware,5.5,6.5,5.7,5.3,4.0,6.8,2.9,9.8,7.3,4.2,...,3.9,4.7,3.7,3.4,3.5,10.3,3.2,4.7,4.9,3.1
district of columbia,,,,,,,7.4,,,4.0,...,7.0,8.1,5.9,4.2,4.0,,5.4,8.3,6.9,4.9
florida,4.1,3.7,3.7,4.2,4.9,3.7,1.7,5.0,5.8,2.9,...,2.3,1.9,2.2,2.4,2.8,2.7,0.8,2.8,3.5,1.6


In [5]:
# Break the subset down by year.
# Columns are named '<year>_<variable>', so each year's frame is selected by
# its column-name prefix rather than by hard-coded positions; the split stays
# correct even if the number of variables per year changes.
success_df_2006 = success_df.filter(regex=r'^2006_')
success_df_2007 = success_df.filter(regex=r'^2007_')
success_df_2008 = success_df.filter(regex=r'^2008_')
success_df_2009 = success_df.filter(regex=r'^2009_')
success_df_2010 = success_df.filter(regex=r'^2010_')
success_df_2010

Unnamed: 0_level_0,2010_DRP912,2010_DRP9,2010_DRP10,2010_DRP11,2010_DRP12,2010_DRPAM,2010_DRPAS,2010_DRPHI,2010_DRPBL,2010_DRPWH
STNAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
alabama,1.8,1.4,2.0,2.1,1.8,1.3,1.4,0.9,2.0,1.6
alaska,6.9,4.0,5.5,9.3,8.7,11.6,3.5,6.1,6.4,5.1
arizona,7.8,5.1,5.7,7.0,13.6,14.6,4.9,8.1,8.8,6.8
arkansas,3.6,1.9,3.2,4.4,5.3,4.9,1.4,4.1,5.0,3.1
california,4.6,2.6,3.1,4.2,8.89999,6.5,1.8,5.8,8.39999,2.8
colorado,5.3,3.1,3.7,5.3,9.7,10.1,2.4,9.89999,8.6,3.2
connecticut,3.0,2.8,2.6,3.4,3.2,3.0,1.1,6.9,6.8,1.4
delaware,3.9,4.7,3.7,3.4,3.5,10.3,3.2,4.7,4.9,3.1
district of columbia,7.0,8.1,5.9,4.2,4.0,,5.4,8.3,6.9,4.9
florida,2.3,1.9,2.2,2.4,2.8,2.7,0.8,2.8,3.5,1.6


In [6]:
# Write each yearly subset to its own CSV file.
success_out_2006 = '../datasets/subsets/success/success_df_2006.csv'
success_out_2007 = '../datasets/subsets/success/success_df_2007.csv'
success_out_2008 = '../datasets/subsets/success/success_df_2008.csv'
success_out_2009 = '../datasets/subsets/success/success_df_2009.csv'
success_out_2010 = '../datasets/subsets/success/success_df_2010.csv'

# Pair every yearly frame with its destination and export in year order.
yearly_exports = [
    (success_df_2006, success_out_2006),
    (success_df_2007, success_out_2007),
    (success_df_2008, success_out_2008),
    (success_df_2009, success_out_2009),
    (success_df_2010, success_out_2010),
]
for yearly_frame, out_path in yearly_exports:
    yearly_frame.to_csv(out_path)