In [1]:
# Import Dependencies
import pandas as pd

In [2]:
# Read the master funding table. pandas labels the CSV's first, header-less
# column 'Unnamed: 0'; rename it to STNAME and use it as the row index.
file_path = "../datasets/master_funding.csv"

funding_master = (
    pd.read_csv(file_path, encoding='utf-8')
    .rename(columns={'Unnamed: 0': 'STNAME'})
    .set_index(['STNAME'])
)
funding_master

Unnamed: 0_level_0,STABR,2006_R1A,2006_R1B,2006_R1C,2006_R1D,2006_R1E,2006_R1F,2006_R1G,2006_R1H,2006_R1I,...,2010_A14B,2010_PPE15,2010_MEMBR09,2010_ARRASTE1,2010_ARRATE5,2010_ARRAE81Z,2010_ARRATE10,2010_ARRASTE6,2010_ARRATLEIZ,2010_ARRASTE4
STNAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
alabama,AL,-2,-2,837722644,507421446,3563199,902980,0,86597,80153773,...,698208,8651,748889,369660033,381295043,138409,6187052,2689876,99711346,0
alaska,AK,-2,-2,169308873,180834960,0,0,0,0,11988478,...,-2,16668,131661,23470386,49963570,0,3835787,1228974,13751946,0
arizona,AZ,2610582721,-2,-2,40960642,580878,21656088,129949,403161,59366011,...,-2,8255,1077831,539065103,550231077,548353,29137598,232566,253804938,-2
arkansas,AR,1094994437,1245972,1520969,792048,4678927,2982173,60826,135367,45949378,...,-2,9481,480559,76412045,134544527,427520,61383167,76377545,37711346,-2
california,CA,13875619650,450091925,168400,-2,199551,216962720,23243683,36124922,1056239256,...,-2,9229,6263438,2332892784,3209610616,69829727,20955494,8622373,452377212,-1
colorado,CO,2828139818,71445787,-2,-2,47598986,5266907,5184240,111686,106929973,...,-2,9024,832368,69641156,111859035,6107904,10567842,72984,47800312,0
connecticut,CT,-2,-2,4771725433,-2,3053979,160635678,0,9311737,1146385,...,548787,15619,563968,368176090,368176090,1085172,10041669,-1,39247400,1753315
delaware,DE,355448768,-2,-2,-2,0,47385484,0,0,9687604,...,119879,12415,126801,47318827,60815739,53106,1762749,0,11821846,0
district of columbia,DC,-2,-2,228167996,815773094,557725,0,233388,0,45624,...,-2,20460,69433,94922392,100828760,796006,1268635,0,22900442,0
florida,FL,9914711981,-2,-2,-2,338296,0,10111304,4953368,403759340,...,-2,8597,2634522,850169705,1440267844,17253688,36363687,95990170,177551767,11026737


In [3]:
# Load the variable catalogue from the meta data file. Always use the 2006 meta
# file so that variables introduced in later years (and missing from the early
# years) are never picked up.
meta_path = "../resources/funding_data/2006_meta.txt"
meta_columns = ['Variable', 'Data Type', 'Data Element Position', 'Description']

# Fields in the file are separated by runs of two or more whitespace characters,
# which needs the python parsing engine for the regex separator.
meta_raw = pd.read_csv(
    meta_path,
    sep=r'\s{2,}',
    engine='python',
    header=None,
    names=meta_columns,
)

# Discard the first two parsed rows; the remaining rows keep their original labels.
meta = meta_raw.drop(index=meta_raw.index[0:2])

# Preview the rows (positions 3 through 26) used for the subset built below
meta.iloc[3:27]

Unnamed: 0,Variable,Data Type,Data Element Position,Description
5,R1A,N,5,LOCAL REVENUES PROPERTY TAX
6,R1B,N,6,LOCAL REVENUES NONPROPERTY TAX
7,R1C,N,7,LOCAL REVENUES LOCAL GOVERNMENT PROPERTY TAX
8,R1D,N,8,LOCAL REVENUES LOCAL GOVERNMENT NONPROPERTY TAX
9,R1E,N,9,LOCAL REVENUES INDIVIDUAL TUITION
10,R1F,N,10,LOCAL REVENUES TUITION FROM LEAS
11,R1G,N,11,LOCAL REVENUES TRANSPORTATION FEES FROM INDIVI...
12,R1H,N,12,LOCAL REVENUES TRANSPORTATION FEES FROM LEAS
13,R1I,N,13,LOCAL REVENUES EARNINGS ON INVESTMENTS
14,R1J,N,14,LOCAL REVENUES FOOD SERVICE


In [4]:
# Create Revenue Subset

# Years will always be the same for all subsets
years = ['2006', '2007', '2008', '2009', '2010']

# Desired variable codes are the meta rows at positions 3 through 26 (shown above).
# Subtotal and total variables are left out: we will be creating our own totals
# and custom subtotals to avoid any double counting.
excluded_codes = ['STR1', 'STR4', 'TR']
revenue_codes = [
    variable
    for variable in meta['Variable'].iloc[3:27]
    if variable not in excluded_codes
]

# Master columns are named '<year>_<code>'; keep them grouped by year, in meta order
revenue_columns = [year + '_' + variable for year in years for variable in revenue_codes]

# Values of 0, -1 and -2 are N/As in the master set, so every non-positive entry
# becomes missing. `where` is applied column by column so that a column without
# any N/A keeps its original dtype. The result inherits the STNAME index of
# funding_master, so no separate starter frame or set_index step is needed.
revenue_df = funding_master[revenue_columns].apply(
    lambda column: column.where(column > 0)
)
revenue_df

Unnamed: 0_level_0,2006_R1A,2006_R1B,2006_R1C,2006_R1D,2006_R1E,2006_R1F,2006_R1G,2006_R1H,2006_R1I,2006_R1J,...,2010_R1L,2010_R1M,2010_R1N,2010_R2,2010_R3,2010_R4A,2010_R4B,2010_R4C,2010_R4D,2010_R5
STNAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
alabama,,,837722644.0,507421446.0,3563199.0,902980.0,,86597.0,80153773.0,129652208,...,327971818,1211133.0,,26720185.0,3800153264.0,19420093,1124843535,332455.0,23419758.0,479515327.0
alaska,,,169308873.0,180834960.0,,,,,11988478.0,12186450,...,34323746,,,,1461906360.0,39328351,200116975,3880492.0,126403204.0,
arizona,2610582721.0,,,40960642.0,580878.0,21656088.0,129949.0,403161.0,59366011.0,117771588,...,505073847,713490.0,7425288.0,244178881.0,3896117202.0,24003209,1692185717,1282525.0,175826078.0,517278514.0
arkansas,1094994437.0,1245972.0,1520969.0,792048.0,4678927.0,2982173.0,60826.0,135367.0,45949378.0,54430437,...,76847733,13378.0,249293.0,2442736.0,2686230835.0,27293872,777093454,,15071643.0,258994335.0
california,13875619650.0,450091925.0,168400.0,,199551.0,216962720.0,23243683.0,36124922.0,1056239256.0,573634361,...,2559558982,,,,34743248852.0,449367225,8146441881,26262699.0,233174056.0,11503454360.0
colorado,2828139818.0,71445787.0,,,47598986.0,5266907.0,5184240.0,111686.0,106929973.0,94630866,...,388426476,3143325.0,2992850.0,15970646.0,3860026474.0,71842282,648824156,,9696761.0,600529742.0
connecticut,,,4771725433.0,,3053979.0,160635678.0,,9311737.0,1146385.0,126561878,...,46951515,,672931.0,,3463789585.0,38476754,802970022,,13198617.0,606773670.0
delaware,355448768.0,,,,,47385484.0,,,9687604.0,16674931,...,57797489,,,,1046317231.0,7640037,208423150,,2141189.0,179274061.0
district of columbia,,,228167996.0,815773094.0,557725.0,,233388.0,,45624.0,988870,...,52331835,406.0,1615963.0,,,2842846,152663303,,696218.0,3667352.0
florida,9914711981.0,,,,338296.0,,10111304.0,4953368.0,403759340.0,360923200,...,942541785,1775221.0,,,8216579182.0,213137678,3937297231,,49666007.0,773060931.0


In [5]:
# Break Revenue Subset down by Year, Subsets start at R1A and end at R5
# Columns are selected by their '<year>_' name prefix rather than by fixed
# positions, so each yearly frame stays correct even if the number of
# variables per year changes.
revenue_df_2006 = revenue_df.filter(regex=r'^2006_')
revenue_df_2007 = revenue_df.filter(regex=r'^2007_')
revenue_df_2008 = revenue_df.filter(regex=r'^2008_')
revenue_df_2009 = revenue_df.filter(regex=r'^2009_')
revenue_df_2010 = revenue_df.filter(regex=r'^2010_')
revenue_df_2010

Unnamed: 0_level_0,2010_R1A,2010_R1B,2010_R1C,2010_R1D,2010_R1E,2010_R1F,2010_R1G,2010_R1H,2010_R1I,2010_R1J,...,2010_R1L,2010_R1M,2010_R1N,2010_R2,2010_R3,2010_R4A,2010_R4B,2010_R4C,2010_R4D,2010_R5
STNAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
alabama,,,1066021960.0,510878707.0,4089031.0,447497.0,,68041.0,24867476.0,131231168,...,327971818,1211133.0,,26720185.0,3800153264.0,19420093,1124843535,332455.0,23419758.0,479515327.0
alaska,,,264898857.0,176461847.0,128736.0,197429.0,,,8362269.0,12385838,...,34323746,,,,1461906360.0,39328351,200116975,3880492.0,126403204.0,
arizona,3236621487.0,,,39250995.0,623204.0,23264158.0,231030.0,559826.0,27181631.0,112334694,...,505073847,713490.0,7425288.0,244178881.0,3896117202.0,24003209,1692185717,1282525.0,175826078.0,517278514.0
arkansas,1404473211.0,2326573.0,,3566662.0,8232474.0,4510065.0,72620.0,548162.0,23411699.0,55612046,...,76847733,13378.0,249293.0,2442736.0,2686230835.0,27293872,777093454,,15071643.0,258994335.0
california,16379820724.0,692642827.0,168400.0,,347461.0,289113426.0,19114777.0,38973491.0,422961228.0,457132827,...,2559558982,,,,34743248852.0,449367225,8146441881,26262699.0,233174056.0,11503454360.0
colorado,3453889419.0,54383205.0,,,61940236.0,5344187.0,5770364.0,152484.0,29338736.0,97073695,...,388426476,3143325.0,2992850.0,15970646.0,3860026474.0,71842282,648824156,,9696761.0,600529742.0
connecticut,,,5410540722.0,,4408494.0,191783486.0,,11734362.0,122505.0,114355944,...,46951515,,672931.0,,3463789585.0,38476754,802970022,,13198617.0,606773670.0
delaware,439792794.0,,,,,56601963.0,,,4877064.0,16385131,...,57797489,,,,1046317231.0,7640037,208423150,,2141189.0,179274061.0
district of columbia,,,600285240.0,900427861.0,819877.0,,117985.0,,132472.0,1815826,...,52331835,406.0,1615963.0,,,2842846,152663303,,696218.0,3667352.0
florida,11545455556.0,,,,631567.0,,11116922.0,7262782.0,106235947.0,322163722,...,942541785,1775221.0,,,8216579182.0,213137678,3937297231,,49666007.0,773060931.0


In [6]:
# Write each yearly revenue subset to its own CSV file
revenue_out_2006 = '../datasets/subsets/revenue/rev_2006.csv'
revenue_out_2007 = '../datasets/subsets/revenue/rev_2007.csv'
revenue_out_2008 = '../datasets/subsets/revenue/rev_2008.csv'
revenue_out_2009 = '../datasets/subsets/revenue/rev_2009.csv'
revenue_out_2010 = '../datasets/subsets/revenue/rev_2010.csv'

# Pair every yearly frame with its destination and export them in year order
for yearly_frame, destination in (
    (revenue_df_2006, revenue_out_2006),
    (revenue_df_2007, revenue_out_2007),
    (revenue_df_2008, revenue_out_2008),
    (revenue_df_2009, revenue_out_2009),
    (revenue_df_2010, revenue_out_2010),
):
    yearly_frame.to_csv(destination)