In [14]:
import pandas as pd

df = pd.read_csv("2017_us_state_finances.csv")
df['Description'] = df['Description'].str.strip()

# Retains only columns that combines state and local government finances
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# Drops rows relating to revenue sources
df.drop(df.index[0:66], inplace=True)

social = ["Public welfare", "Hospitals", "Health", "Employment security administration", "Veterans' services"]
educ = ["Education", "Libraries"]
govt = ["Financial administration", "Judicial and legal", "General public buildings", "Other governmental administration"]
transport = ["Highways", "Air transportation (airports)", "Parking facilities", "Sea and inland port facilities"]
others = ["Utility expenditure", "Expenditure1"]

# Retains only required rows
df = df[df["Description"].isin(social + educ + govt + transport + others)]

# Education (Education + Libraries) - 61
# Health & Social Services (Public welfare + Hospitals + Health + Security + Employment security administration +  Veterans' services) - 62
# Government Administration (Financial administration + Judicial and legal + General public buildings + Other governmental administration) - 92
# Utilities (Utility expenditure) - 22
# Transportation (Highways, Air transportation (airports), Parking facilities, Sea and inland port facilities) - 48/49

# Transposes dataframe and rename columns
df = df.transpose()
df.columns = df.iloc[0]
df = df[1:]
df.reset_index(inplace=True)
df.rename(columns = {'Expenditure1':'State Expenditure', 
"index" : "State", "Utility expenditure" : "Utilities"}, inplace = True)

for col in [col for col in df.columns]:
    if col != "State":
        df[col] = df[col].str.replace(',','')
        df[col] = df[col].astype(int)

df["Health and Social Services"] = df[social].sum(axis=1)
df["Education"] = df[educ].sum(axis=1)
df["Public Administration"] = df[educ].sum(axis=1)
df["Transportation"] = df[transport].sum(axis=1)
df.drop(columns = social + educ + govt + transport, inplace=True)

required_col_names = [col for col in df.columns[1:]]

for col in [col for col in required_col_names]:
    if col == "State Expenditure":
        col_name = col + " as % of US Expenditure"
    else:
        col_name = col + " (State Expenditure as % of Total US)"

    df[col + " (State Expenditure as % of Total US)"] = \
        df.apply(lambda x : (x[col] / int(df.loc[df["State"] ==
            "United States Total", col])) * 100, axis = 1)
    df[col + " (as % of Total State Expenditure)"] = (df[col] /
    df["State Expenditure"] * 100)

df.set_index(["State"], inplace = True)

cleaned_data = df.to_csv("cleaned_data")

df

Description,State Expenditure,Utilities,Health and Social Services,Public Administration,Transportation,State Expenditure (State Expenditure as % of Total US),State Expenditure (as % of Total State Expenditure),Utilities (State Expenditure as % of Total US),Utilities (as % of Total State Expenditure),Health and Social Services (State Expenditure as % of Total US),Health and Social Services (as % of Total State Expenditure),Public Administration (State Expenditure as % of Total US),Public Administration (as % of Total State Expenditure),Transportation (State Expenditure as % of Total US),Transportation (as % of Total State Expenditure)
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
United States Total,3669857565,232443129,977846488,1038976176,215684582,100.0,100.0,100.0,6.333846,100.0,26.645353,100.0,28.311076,100.0,5.877192
Alabama,46961537,3174122,14205075,14200246,2689922,1.279656,100.0,1.365548,6.758982,1.45269,30.248318,1.366754,30.238035,1.247155,5.727926
Alaska,14939769,799060,3298694,3354861,1950228,0.407094,100.0,0.343766,5.348543,0.337343,22.079953,0.322901,22.455909,0.904204,13.053937
Arizona,60084606,5481940,16158938,16974322,3099014,1.637246,100.0,2.358401,9.123701,1.652503,26.893641,1.633755,28.2507,1.436827,5.15775
Arkansas,27688857,987682,9077837,8516519,2151829,0.754494,100.0,0.424913,3.567074,0.92835,32.785163,0.819703,30.757929,0.997674,7.771462
California,569430129,43593282,165956421,137009092,23091879,15.51641,100.0,18.754386,7.655598,16.971623,29.144299,13.186933,24.060738,10.706319,4.055261
Colorado,60919223,4513782,12181924,18042684,4139615,1.659989,100.0,1.941887,7.409454,1.245791,19.996847,1.736583,29.61739,1.919291,6.795252
Connecticut,41048798,1423979,6380256,13813353,2317356,1.118539,100.0,0.612614,3.468991,0.65248,15.543101,1.329516,33.651054,1.074419,5.645369
Delaware,12100844,540908,3124465,4131792,915689,0.329736,100.0,0.232706,4.470002,0.319525,25.820224,0.397679,34.14466,0.42455,7.56715
District of Columbia,18803720,4702459,4724139,3092961,485170,0.512383,100.0,2.023058,25.008131,0.483117,25.123428,0.297693,16.448665,0.224944,2.580181


In [159]:
main_df = pd.read_csv("cleaned_data", index_col = "State")

other_df = pd.read_csv("USAspending_data.csv")
other_df = other_df.rename(columns={"name": "State"})
other_df.set_index("State", inplace=True)
other_df = other_df[other_df["type"] == "state"]

other_df_2 = pd.read_csv("USAspending_data.csv")
other_df_2 = other_df_2.rename(columns={"name": "State"})
other_df_2.set_index("State", inplace=True)
other_df_2 = other_df[other_df["type"] == "state"]

other_df

Unnamed: 0_level_0,fips,code,type,amount,count
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alaska,2,AK,state,14722772739.59,23678
Alabama,1,AL,state,59221536064.44,147991
Arkansas,5,AR,state,31344500215.28,140422
Arizona,4,AZ,state,102338176169.86,83603
California,6,CA,state,402969458929.12,675827
Colorado,8,CO,state,56584064426.02,134710
Connecticut,9,CT,state,104469014558.66,80887
Delaware,10,DE,state,10861482749.38,14103
Florida,12,FL,state,254080196656.68,409662
Georgia,13,GA,state,83751735310.76,322828


In [160]:
df_lst = [(other_df, ["amount"]), (other_df_2, ["code"])]

def combine_dataframes_by_state(main_df, df_lst):
    """
    Recursively concatenates multiple panda dataframes (with "State" 
    as the index) with only the required columns

    Inputs:
        df_lst (lst of tuples): (df, [cols to extract])
        ### If extracting all columns, [cols to extract] should be 
        an empty list ###
        
    Returns:
        final_df (pandas series): concatenated pandas dataframes
    """
    if len(df_lst) == 0:
        return main_df
    
    other_df, col_lst = df_lst.pop()
    if col_lst != []:
        new_df = main_df.merge(other_df[col_lst], on="State")
    else:
        new_df = main_df.merge(other_df, on="State")
    
    return combine_dataframes_by_state(new_df, df_lst)

combine_dataframes_by_state(main_df, df_lst)

Unnamed: 0_level_0,State Expenditure,Utility,Health and Social Services,Public Administration,Transportation,State Expenditure as % of US Expenditure,Utility (State Expenditure as % of Total US),Utility (as % of Total State Expenditure),Health and Social Services (State Expenditure as % of Total US),Health and Social Services (as % of Total State Expenditure),Public Administration (State Expenditure as % of Total US),Public Administration (as % of Total State Expenditure),Transportation (State Expenditure as % of Total US),Transportation (as % of Total State Expenditure),code,amount
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Alabama,48238993,3068894,14362206,14836381,3070165,1.263,1.276,6.362,1.41,29.773,1.383,30.756,1.324,6.364,AL,59221536064.44
Alaska,16461035,650386,3463461,3266832,2119873,0.431,0.27,3.951,0.34,21.04,0.305,19.846,0.914,12.878,AK,14722772739.59
Arizona,62755470,6384877,16809832,17753353,3372703,1.643,2.654,10.174,1.651,26.786,1.655,28.29,1.455,5.374,AZ,102338176169.86
Arkansas,27659935,980609,8570120,8890455,2139965,0.724,0.408,3.545,0.842,30.984,0.829,32.142,0.923,7.737,AR,31344500215.28
California,605649479,45603480,179493892,143806337,25214834,15.86,18.955,7.53,17.625,29.637,13.406,23.744,10.878,4.163,CA,402969458929.12
Colorado,64659578,4336042,13464559,19352174,4297083,1.693,1.802,6.706,1.322,20.824,1.804,29.929,1.854,6.646,CO,56584064426.02
Connecticut,41642052,1457829,6412577,13862651,2203005,1.091,0.606,3.501,0.63,15.399,1.292,33.29,0.95,5.29,CT,104469014558.66
Delaware,12200640,553703,3332623,4052720,837401,0.32,0.23,4.538,0.327,27.315,0.378,33.217,0.361,6.864,DE,10861482749.38
Florida,187612475,11919691,45302027,45933177,14890598,4.913,4.954,6.353,4.448,24.147,4.282,24.483,6.424,7.937,FL,254080196656.68
Georgia,89012045,6474160,20356320,29830150,5574860,2.331,2.691,7.273,1.999,22.869,2.781,33.512,2.405,6.263,GA,83751735310.76
