In [91]:
STATE_NAMES = ["Alaska", "Alabama", "Arkansas", "Arizona", "California",
"Colorado", "Connecticut", "District of Columbia", "Delaware", "Florida", "Georgia",
"Hawaii", "Iowa", "Idaho", "Illinois", "Indiana", "Kansas", "Kentucky", "Louisiana",
"Massachusetts", "Maryland", "Maine", "Michigan", "Minnesota", "Missouri", "Mississippi",
"Montana", "North Carolina", "North Dakota", "Nebraska", "New Hampshire", "New Jersey",
"New Mexico", "Nevada", "New York", "Ohio", "Oklahoma", "Oregon", "Pennsylvania",
"Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Virginia",
"Vermont", "Washington", "Wisconsin", "West Virginia", "Wyoming"]

STATE_NAMES_WITH_UNITED_STATES = STATE_NAMES[:]
STATE_NAMES_WITH_UNITED_STATES.append("United States")


In [106]:
import pandas as pd

df = pd.read_csv("2017_us_state_finances.csv")
df['Description'] = df['Description'].str.strip()

# Retains only columns that combines state and local government finances
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# Drops rows relating to revenue sources
df.drop(df.index[0:66], inplace=True)

social = ["Public welfare", "Hospitals", "Health", "Employment security administration", "Veterans' services"]
educ = ["Education", "Libraries"]
govt = ["Financial administration", "Judicial and legal", "General public buildings", "Other governmental administration"]
transport = ["Highways", "Air transportation (airports)", "Parking facilities", "Sea and inland port facilities"]
others = ["Utility expenditure", "Expenditure1"]

# Retains only required rows
df = df[df["Description"].isin(social + educ + govt + transport + others)]

# Education (Education + Libraries) - 61
# Health & Social Services (Public welfare + Hospitals + Health + Security + Employment security administration +  Veterans' services) - 62
# Government Administration (Financial administration + Judicial and legal + General public buildings + Other governmental administration) - 92
# Utilities (Utility expenditure) - 22
# Transportation (Highways, Air transportation (airports), Parking facilities, Sea and inland port facilities) - 48/49

# Transposes dataframe and rename columns
df = df.transpose()
df.columns = df.iloc[0]
df = df[1:]
df.reset_index(inplace=True)
df.rename(columns = {'Expenditure1':'State Expenditure', 
"index" : "State", "Utility expenditure" : "Utilities"}, inplace = True)

for col in [col for col in df.columns]:
    if col != "State":
        df[col] = df[col].str.replace(',','')
        df[col] = df[col].astype(int)

df["Health and Social Services"] = df[social].sum(axis=1)
df["Education"] = df[educ].sum(axis=1)
df["Public Administration"] = df[educ].sum(axis=1)
df["Transportation"] = df[transport].sum(axis=1)
df.drop(columns = social + educ + govt + transport, inplace=True)
df['State'] = df['State'].replace('United States Total','United States')

required_col_names = [col for col in df.columns[1:]]

for col in [col for col in required_col_names]:
    if col == "State Expenditure":
        col_name = col + " as % of US Expenditure"
    else:
        col_name = col + " (State Expenditure as % of Total US)"

    df[col + " (State Expenditure as % of Total US)"] = \
        df.apply(lambda x : (x[col] / int(df.loc[df["State"] ==
            "United States", col])) * 100, axis = 1)
    df[col + " (as % of Total State Expenditure)"] = (df[col] /
    df["State Expenditure"] * 100)

df.set_index(["State"], inplace = True)

df

Description,State Expenditure,Utilities,Health and Social Services,Public Administration,Transportation,State Expenditure (State Expenditure as % of Total US),State Expenditure (as % of Total State Expenditure),Utilities (State Expenditure as % of Total US),Utilities (as % of Total State Expenditure),Health and Social Services (State Expenditure as % of Total US),Health and Social Services (as % of Total State Expenditure),Public Administration (State Expenditure as % of Total US),Public Administration (as % of Total State Expenditure),Transportation (State Expenditure as % of Total US),Transportation (as % of Total State Expenditure)
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
United States,3669857565,232443129,977846488,1038976176,215684582,100.0,100.0,100.0,6.333846,100.0,26.645353,100.0,28.311076,100.0,5.877192
Alabama,46961537,3174122,14205075,14200246,2689922,1.279656,100.0,1.365548,6.758982,1.45269,30.248318,1.366754,30.238035,1.247155,5.727926
Alaska,14939769,799060,3298694,3354861,1950228,0.407094,100.0,0.343766,5.348543,0.337343,22.079953,0.322901,22.455909,0.904204,13.053937
Arizona,60084606,5481940,16158938,16974322,3099014,1.637246,100.0,2.358401,9.123701,1.652503,26.893641,1.633755,28.2507,1.436827,5.15775
Arkansas,27688857,987682,9077837,8516519,2151829,0.754494,100.0,0.424913,3.567074,0.92835,32.785163,0.819703,30.757929,0.997674,7.771462
California,569430129,43593282,165956421,137009092,23091879,15.51641,100.0,18.754386,7.655598,16.971623,29.144299,13.186933,24.060738,10.706319,4.055261
Colorado,60919223,4513782,12181924,18042684,4139615,1.659989,100.0,1.941887,7.409454,1.245791,19.996847,1.736583,29.61739,1.919291,6.795252
Connecticut,41048798,1423979,6380256,13813353,2317356,1.118539,100.0,0.612614,3.468991,0.65248,15.543101,1.329516,33.651054,1.074419,5.645369
Delaware,12100844,540908,3124465,4131792,915689,0.329736,100.0,0.232706,4.470002,0.319525,25.820224,0.397679,34.14466,0.42455,7.56715
District of Columbia,18803720,4702459,4724139,3092961,485170,0.512383,100.0,2.023058,25.008131,0.483117,25.123428,0.297693,16.448665,0.224944,2.580181


In [93]:
pop_df = pd.read_csv("us_census_population.csv")
pop_df = pop_df.iloc[:,0:2]
pop_df.columns = ["State", "2020 Census Population"]
pop_df['State'] = pop_df['State'].str.strip()
pop_df = pop_df[pop_df["State"].isin(STATE_NAMES_WITH_UNITED_STATES)]
pop_df.set_index("State", inplace=True)

pop_df


Unnamed: 0_level_0,2020 Census Population
State,Unnamed: 1_level_1
United States,331449520
Alabama,5024356
Alaska,733378
Arizona,7151507
Arkansas,3011555
California,39538245
Colorado,5773733
Connecticut,3605942
Delaware,989957
District of Columbia,689546


In [94]:
poverty_df = pd.read_csv("us_poverty_by_state.csv")
poverty_df = poverty_df.iloc[:,0:2]
poverty_df.columns = ["State", "3-Year Average Poverty Rate (2018-2020)"]
poverty_df['State'] = poverty_df['State'].str.strip()
poverty_df = poverty_df[poverty_df["State"].isin(STATE_NAMES_WITH_UNITED_STATES)]
poverty_df.set_index("State", inplace=True)

poverty_df

Unnamed: 0_level_0,3-Year Average Poverty Rate (2018-2020)
State,Unnamed: 1_level_1
United States,11.2
Alabama,14.6
Alaska,12.2
Arizona,11.2
Arkansas,14.7
California,11.0
Colorado,9.3
Connecticut,9.9
Delaware,8.1
District of Columbia,14.6


In [107]:
df_lst = [(poverty_df, []), (pop_df, [])]

def combine_dataframes_by_state(main_df, df_lst):
    """
    Recursively concatenates multiple panda dataframes (with "State" 
    as the index) with only the required columns

    Inputs:
        df_lst (lst of tuples): (df, [cols to extract])
        ### If extracting all columns, [cols to extract] should be 
        an empty list ###
        
    Returns:
        final_df (pandas series): concatenated pandas dataframes
    """
    if len(df_lst) == 0:
        print("BASE")
        return main_df
    
    print(len(df_lst))
    other_df, col_lst = df_lst.pop()
    if col_lst != []:
        new_df = main_df.merge(other_df[col_lst], on="State")
    else:
        new_df = main_df.merge(other_df, on="State")
    
    return combine_dataframes_by_state(new_df, df_lst)

combine_dataframes_by_state(df, df_lst)

2
1
BASE


Unnamed: 0_level_0,State Expenditure,Utilities,Health and Social Services,Public Administration,Transportation,State Expenditure (State Expenditure as % of Total US),State Expenditure (as % of Total State Expenditure),Utilities (State Expenditure as % of Total US),Utilities (as % of Total State Expenditure),Health and Social Services (State Expenditure as % of Total US),Health and Social Services (as % of Total State Expenditure),Public Administration (State Expenditure as % of Total US),Public Administration (as % of Total State Expenditure),Transportation (State Expenditure as % of Total US),Transportation (as % of Total State Expenditure),2020 Census Population,3-Year Average Poverty Rate (2018-2020)
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
United States,3669857565,232443129,977846488,1038976176,215684582,100.0,100.0,100.0,6.333846,100.0,26.645353,100.0,28.311076,100.0,5.877192,331449520,11.2
Alabama,46961537,3174122,14205075,14200246,2689922,1.279656,100.0,1.365548,6.758982,1.45269,30.248318,1.366754,30.238035,1.247155,5.727926,5024356,14.6
Alaska,14939769,799060,3298694,3354861,1950228,0.407094,100.0,0.343766,5.348543,0.337343,22.079953,0.322901,22.455909,0.904204,13.053937,733378,12.2
Arizona,60084606,5481940,16158938,16974322,3099014,1.637246,100.0,2.358401,9.123701,1.652503,26.893641,1.633755,28.2507,1.436827,5.15775,7151507,11.2
Arkansas,27688857,987682,9077837,8516519,2151829,0.754494,100.0,0.424913,3.567074,0.92835,32.785163,0.819703,30.757929,0.997674,7.771462,3011555,14.7
California,569430129,43593282,165956421,137009092,23091879,15.51641,100.0,18.754386,7.655598,16.971623,29.144299,13.186933,24.060738,10.706319,4.055261,39538245,11.0
Colorado,60919223,4513782,12181924,18042684,4139615,1.659989,100.0,1.941887,7.409454,1.245791,19.996847,1.736583,29.61739,1.919291,6.795252,5773733,9.3
Connecticut,41048798,1423979,6380256,13813353,2317356,1.118539,100.0,0.612614,3.468991,0.65248,15.543101,1.329516,33.651054,1.074419,5.645369,3605942,9.9
Delaware,12100844,540908,3124465,4131792,915689,0.329736,100.0,0.232706,4.470002,0.319525,25.820224,0.397679,34.14466,0.42455,7.56715,989957,8.1
District of Columbia,18803720,4702459,4724139,3092961,485170,0.512383,100.0,2.023058,25.008131,0.483117,25.123428,0.297693,16.448665,0.224944,2.580181,689546,14.6
