In [239]:
import pandas as pd

STATE_NAMES = ["Alaska", "Alabama", "Arkansas", "Arizona", "California",
"Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
"Hawaii", "Iowa", "Idaho", "Illinois", "Indiana", "Kansas", "Kentucky", "Louisiana",
"Massachusetts", "Maryland", "Maine", "Michigan", "Minnesota", "Missouri", "Mississippi",
"Montana", "North Carolina", "North Dakota", "Nebraska", "New Hampshire", "New Jersey",
"New Mexico", "Nevada", "New York", "Ohio", "Oklahoma", "Oregon", "Pennsylvania",
"Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Virginia",
"Vermont", "Washington", "Wisconsin", "West Virginia", "Wyoming"]

STATE_NAMES_AND_UNITED_STATES = STATE_NAMES[:]
STATE_NAMES_AND_UNITED_STATES.append("United States")

us_state_abbreviations = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
}

US_STATE_CODES = dict(map(reversed, us_state_abbreviations.items()))

NAICS_SECTOR_CODES = {
    "Agriculture, Forestry, Fishing and Hunting" : ("11",),
    "Mining, Quarrying, and Oil and Gas Extraction" : ("21",),
    "Utilities" : ("22",),
    "Construction" : ("23",),
    "Manufacturing" : ("31", "32", "33",),
    "Wholesale Trade" : ("42",),
    "Retail Trade" : ("44", "45",),
    "Transportation and Warehousing" : ("48", "49",),
    "Information" : ("51",),
    "Finance and Insurance" : ("52",),
    "Real Estate and Rental and Leasing" : ("53",),
    "Professional, Scientific, and Technical Services" : ("54",),
    "Administrative and Support and Waste Management and Remediation Services" : ("56",),
    "Educational Services" : ("61",),
    "Health Care and Social Assistance" : ("62",),
    "Arts, Entertainment, and Recreation" : ("71",),
    "Accommodation and Food Services" : ("72",),
    "Other Services (except Public Administration)" : ("81",),
    "Public Administration (not covered in economic census)" : ("92",)
}

NAICS_SECTOR_LST = [k for k in NAICS_SECTOR_CODES]

def clean_funding(raw_funding_df, year):
    """
    ###
    """
    raw_funding_df["code"] = raw_funding_df["code"].astype(str)

    # Creates structure for funding dataframe
    naics_sector_lst = [k for k in NAICS_SECTOR_CODES.keys()]
    funding_df = pd.DataFrame(STATE_NAMES_AND_UNITED_STATES, columns=["State"])
    funding_df = pd.concat([funding_df,pd.DataFrame(columns = naics_sector_lst)])
    
    funding_df_within_state = pd.DataFrame(STATE_NAMES_AND_UNITED_STATES, columns=["State"]) # Within State
    funding_df_by_state = pd.DataFrame(STATE_NAMES_AND_UNITED_STATES, columns=["State"]) # By State

    funding_df_lst = [funding_df, funding_df_within_state, funding_df_by_state]

    # Calculates funding for each category in each state and inputs values into funding_df
    for state_code, state in US_STATE_CODES.items():
        for sector, naics_code_tuple in NAICS_SECTOR_CODES.items():
            if len(naics_code_tuple) == 1:
                subset_df = raw_funding_df[(raw_funding_df["code"].apply(lambda x : x.startswith(naics_code_tuple[0]))) & (raw_funding_df["State"] == state_code) & (raw_funding_df["amount"] >= 0)]
                sum_val = subset_df["amount"].sum() / 1000
            else:
                sum_val = 0
                for naics_code in naics_code_tuple:
                    subset_df = raw_funding_df[(raw_funding_df["code"].apply(lambda x : x.startswith(naics_code))) & (raw_funding_df["State"] == state_code) & (raw_funding_df["amount"] >= 0)]
                    sum_val += subset_df["amount"].sum() / 1000
            
            funding_df.loc[funding_df["State"] == state, sector] = int(sum_val)

    required_col_names = [col for col in funding_df.columns[1:]]

    # Calculates total funding across categories for each state
    for df in funding_df_lst:
        df["Total Funding Received"] = funding_df[required_col_names].sum(axis=1)

    for col in required_col_names:
        #Calculates total funding for each category at the national level
        funding_df.loc[funding_df["State"] == "United States", col] = funding_df[col].sum()

        # Calculates funding for each state as a share of US by category
        funding_df_by_state[col] = funding_df.apply(lambda x : (x[col] / funding_df.loc[funding_df["State"] == "United States", col]) * 100, axis = 1)

        # Calculates funding for each category as a share of the total funding received by a state
        funding_df_without_us = funding_df.iloc[0:len(funding_df)-1]
        funding_df_within_state[col] = (funding_df_without_us[col] / funding_df_without_us["Total Funding Received"]) * 100

    funding_df["Year"] = year
    funding_df_cols = list(funding_df.columns)
    funding_df = funding_df[[funding_df_cols[-1]] + funding_df_cols[:-1]]

    funding_df_within_state["Year"] = year
    funding_df_2_cols = list(funding_df_within_state.columns)
    funding_df_within_state = funding_df_within_state[[funding_df_2_cols[-1]] + funding_df_2_cols[:-1]]

    funding_df_by_state["Year"] = year
    funding_df_3_cols = list(funding_df_by_state.columns)
    funding_df_by_state = funding_df_by_state[[funding_df_3_cols[-1]] + funding_df_3_cols[:-1]]

    funding_df.set_index("State", inplace=True)
    funding_df_within_state.set_index("State", inplace=True)
    funding_df_by_state.set_index("State", inplace=True)


    return funding_df, funding_df_by_state, funding_df_within_state


In [7]:
a, b, c = clean_funding(pd.read_csv("2016_us_funding.csv"))

In [223]:
def clean_census_expenditure(df):
    """
    ###
    """

    df['Description'] = df['Description'].str.strip()

    df.rename(columns = {'United States Total':'United States'}, inplace = True)

    # Retains only columns with combined state and local government expenditure
    df = df[["Description"] + STATE_NAMES_AND_UNITED_STATES]

    # Drops rows relating to state revenue sources
    df.drop(df.index[0:66], inplace=True)

    social = ["Public welfare", "Hospitals", "Health", "Employment security administration", "Veterans' services"]
    educ = ["Education", "Libraries"]
    govt = ["Financial administration", "Judicial and legal", "General public buildings", "Other governmental administration"]
    transport = ["Highways", "Air transportation (airports)", "Parking facilities", "Sea and inland port facilities"]
    others = ["Utility expenditure", "Expenditure1"]

    # Retains only required rows
    df = df[df["Description"].isin(social + educ + govt + transport + others)]

    # Transposes dataframe and sets new column names
    df = df.transpose()
    df.columns = df.iloc[0]
    df = df[1:]
    df.reset_index(inplace=True)
    df.rename(columns = {'Expenditure1':'State Expenditure (in thousands)', 
    "index" : "State", "Utility expenditure" : "Utilities"}, inplace = True)

    # Converts specific columns in to integers
    for col in [col for col in df.columns]:
        if col != "State":
            df[col] = df[col].str.replace(',','')
            df[col] = df[col].astype(int)
    
    # Merges sub-categories into single categories
    df["Health and Social Services Expenditure"] = df[social].sum(axis=1)
    df["Education Related Expenditure"] = df[educ].sum(axis=1)
    df["Public Administration Expenditure"] = df[educ].sum(axis=1)
    df["Transportation Expenditure"] = df[transport].sum(axis=1)
    df.drop(columns = social + educ + govt + transport, inplace=True)

    required_col_names = [col for col in df.columns[1:]]

    for col in required_col_names:

        # Calculates state expenditure in one cateogry as a proportion
        # of total US expenditure for this category
        if col == "State Expenditure (in thousands)":
            df["State Total as % of US Total"] = \
                df.apply(lambda x : (x[col] / int(df.loc[df["State"] ==
                    "United States", col])) * 100, axis = 1)

        # Calculates the state expenditure in one category as a proportion
        # of total state expenditure across all cateogries
        else:
            df[col + " (State as % of US)"] = \
                df.apply(lambda x : (x[col] / int(df.loc[df["State"] ==
                    "United States", col])) * 100, axis = 1)

            df[col + " (% of Total Expenditure)"] = (df[col] /
            df["State Expenditure (in thousands)"] * 100)

    # Sets "State" column as index of dataframe
    df.set_index(["State"], inplace = True)

    return df


def clean_census_population(pop_df):
    """
    ###
    """
    pop_df = pop_df.iloc[:,0:2]
    pop_df.columns = ["State", "Population"]
    pop_df['State'] = pop_df['State'].str.strip()
    pop_df = pop_df[pop_df["State"].isin(STATE_NAMES_AND_UNITED_STATES)]
    pop_df["Population"] = pop_df["Population"].str.replace(',','')
    pop_df["Population"] = pop_df["Population"].astype(int)
    pop_df.set_index("State", inplace=True)

    return pop_df


def clean_census_poverty(poverty_df):
    """
    ###
    """
    poverty_df = poverty_df.iloc[:,0:2]
    poverty_df.columns = ["State", "3-Year Average Poverty Rate (2018-2020)"]
    poverty_df['State'] = poverty_df['State'].str.strip()
    poverty_df = poverty_df[poverty_df["State"].isin(STATE_NAMES_AND_UNITED_STATES)]
    poverty_df.set_index("State", inplace=True)

    return poverty_df

In [249]:
def analyze_expenditure_and_funding(years):
    """
    ###

    Inputs:
        years (lst of str)

    Returns:
        cleaned_and_combined (dct)
    """
    # Creates and outputs population and poverty data
    poverty_df = clean_census_poverty(pd.read_csv("us_poverty_by_state.csv"))
    population_df = clean_census_population(pd.read_csv("us_census_population.csv")) 

    # poverty_df.to_csv("us_poverty_cleaned")
    # population_df.to_csv("us_population_cleaned")

    # Clean and combines census data and funding data from each year from 2016 to 2020
    expenditure_file_name = "_us_state_finances.csv"
    funding_file_name = "_us_funding.csv"

    cleaned_df_dct = {}

    for year in years:
        expenditure_csv = year + expenditure_file_name # "2016_us_state_finances.csv"
        funding_csv = year + funding_file_name # "2016_us_funding.csv"

        expenditure_df = clean_census_expenditure(pd.read_csv(expenditure_csv))
        funding_df, funding_df_by_state, funding_df_within_state  = clean_funding(pd.read_csv(funding_csv), year)
    
        per_capita_df = pd.DataFrame(columns=["Expenditure per Capita (in thousands)", "Funding received per Capita (in thousands)"])
        per_capita_df["Expenditure per Capita (in thousands)"] = expenditure_df["State Expenditure (in thousands)"] / population_df["Population"]
        per_capita_df["Funding received per Capita (in thousands)"] = funding_df_by_state["Total Funding Received"] / population_df["Population"]

        cleaned_df_dct[year] = (expenditure_df, funding_df_by_state, per_capita_df, funding_df, funding_df_within_state)

        # Outputs files into directory
        # funding_df_by_state.to_csv(year + "_cleaned_funding_by_state.csv")
        # funding_df_within_state.to_csv(year + "_cleaned_funding_within_state.csv")
        # funding_df.to_csv(year + "_cleaned_funding_full.csv")
        # expenditure_df.to_csv(year + "_cleaned_expenditure.csv")

    return cleaned_df_dct

In [19]:
FINAL_YEARS = ["2016", "2017", "2018", "2019", "2020"]

In [250]:
FINAL_YEARS = ["2016", "2017", "2018", "2019", "2020"]
this_dct = analyze_expenditure_and_funding(FINAL_YEARS)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  poverty_df['State'] = poverty_df['State'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pop_df['State'] = pop_df['State'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df.index[0:66], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pyda

In [251]:
FINAL_YEARS = ["2016", "2017", "2018", "2019", "2020"]

by_state_2016 = this_dct["2016"][1]
# by_state_2017 = this_dct["2017"][1]
# by_state_2018 = this_dct["2018"][1]

funding_df_lst = []
for year in FINAL_YEARS:
    funding_df_lst.append(this_dct.get(year)[1])

new = pd.concat(funding_df_lst)
new = new[new.index != "United States"]
new = new.sort_values(["State", "Year"])
# new = new.sort_values("Year")

# new = pd.concat([by_state_2016, by_state_2017, by_state_2018])

# new = new[new.index != "United States"]

# # new

# new = new.sort_values("State")

new


# def combine_multiple_years(year_lst):
    
#     funding_df_lst = []
#     for year in FINAL_YEARS:
#         funding_df_lst.append(this_dct.get(year)[1])

#     combined_df = pd.concat(funding_df_lst)
#     combined_df = new[new.index != "United States"]
#     combined_df = new.sort_values("State")

#     return combined_df

Unnamed: 0_level_0,Year,Total Funding Received,"Agriculture, Forestry, Fishing and Hunting","Mining, Quarrying, and Oil and Gas Extraction",Utilities,Construction,Manufacturing,Wholesale Trade,Retail Trade,Transportation and Warehousing,...,Finance and Insurance,Real Estate and Rental and Leasing,"Professional, Scientific, and Technical Services",Administrative and Support and Waste Management and Remediation Services,Educational Services,Health Care and Social Assistance,"Arts, Entertainment, and Recreation",Accommodation and Food Services,Other Services (except Public Administration),Public Administration (not covered in economic census)
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,2016,11684439.0,0.347488,0.372776,0.920666,7.943458,1.863820,0.394121,0.160624,4.885675,...,0.366127,0.124675,3.377963,0.654152,1.394501,1.527035,0.211649,1.869921,1.408550,0.716867
Alabama,2017,12104077.0,0.568783,0.663459,1.371500,8.833260,1.787462,0.272654,0.094219,2.818814,...,0.490258,0.089001,3.437711,0.682898,1.370940,1.146102,0.174499,1.566233,1.347048,0.452445
Alabama,2018,15665904.0,0.429977,0.337180,2.518674,7.446313,2.348701,0.137982,0.131973,1.258020,...,0.146203,0.348533,4.082669,0.590029,1.431522,1.243038,0.101393,1.588319,1.306327,0.491612
Alabama,2019,12503066.0,0.546280,0.826942,1.225837,5.512885,1.525987,0.213903,0.281522,0.710724,...,0.229673,0.278895,3.550843,0.697449,1.858430,1.377096,0.161808,2.470479,2.947308,0.520705
Alabama,2020,12180702.0,0.236026,0.237738,1.218848,4.062597,1.424163,0.069485,0.166701,0.385128,...,0.091982,3.418748,3.137177,0.586778,2.274710,1.742754,0.186051,1.017485,2.712570,0.737441
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wyoming,2016,149557.0,0.734603,0.246796,0.167450,0.122540,0.007308,0.133128,0.021710,0.091500,...,0.000000,0.302536,0.028816,0.033175,0.009282,0.055865,0.033900,0.200232,0.035254,0.058860
Wyoming,2017,133557.0,0.382007,0.360469,0.115515,0.094420,0.005616,0.001671,0.006888,0.079124,...,0.000000,0.166972,0.033346,0.034263,0.008228,0.029623,0.037667,0.133567,0.030751,0.027137
Wyoming,2018,156715.0,0.917983,0.294246,0.105284,0.121376,0.007479,0.000821,0.005218,0.056927,...,0.000000,0.034775,0.027632,0.030043,0.012779,0.052106,0.034736,0.240957,0.031633,0.036621
Wyoming,2019,171689.0,0.748713,0.067439,0.168411,0.197801,0.008192,0.000111,0.006780,0.067186,...,0.000000,0.017389,0.012059,0.051632,0.016255,0.054683,0.404749,0.310705,0.028292,0.035511
