In [239]:
import pandas as pd

STATE_NAMES = ["Alaska", "Alabama", "Arkansas", "Arizona", "California",
"Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
"Hawaii", "Iowa", "Idaho", "Illinois", "Indiana", "Kansas", "Kentucky", "Louisiana",
"Massachusetts", "Maryland", "Maine", "Michigan", "Minnesota", "Missouri", "Mississippi",
"Montana", "North Carolina", "North Dakota", "Nebraska", "New Hampshire", "New Jersey",
"New Mexico", "Nevada", "New York", "Ohio", "Oklahoma", "Oregon", "Pennsylvania",
"Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Virginia",
"Vermont", "Washington", "Wisconsin", "West Virginia", "Wyoming"]

STATE_NAMES_AND_UNITED_STATES = STATE_NAMES[:]
STATE_NAMES_AND_UNITED_STATES.append("United States")

us_state_abbreviations = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
}

US_STATE_CODES = dict(map(reversed, us_state_abbreviations.items()))

NAICS_SECTOR_CODES = {
    "Agriculture, Forestry, Fishing and Hunting" : ("11",),
    "Mining, Quarrying, and Oil and Gas Extraction" : ("21",),
    "Utilities" : ("22",),
    "Construction" : ("23",),
    "Manufacturing" : ("31", "32", "33",),
    "Wholesale Trade" : ("42",),
    "Retail Trade" : ("44", "45",),
    "Transportation and Warehousing" : ("48", "49",),
    "Information" : ("51",),
    "Finance and Insurance" : ("52",),
    "Real Estate and Rental and Leasing" : ("53",),
    "Professional, Scientific, and Technical Services" : ("54",),
    "Administrative and Support and Waste Management and Remediation Services" : ("56",),
    "Educational Services" : ("61",),
    "Health Care and Social Assistance" : ("62",),
    "Arts, Entertainment, and Recreation" : ("71",),
    "Accommodation and Food Services" : ("72",),
    "Other Services (except Public Administration)" : ("81",),
    "Public Administration (not covered in economic census)" : ("92",)
}

NAICS_SECTOR_LST = [k for k in NAICS_SECTOR_CODES]

def clean_funding(raw_funding_df, year):
    """
    ###
    """
    raw_funding_df["code"] = raw_funding_df["code"].astype(str)

    # Creates structure for funding dataframe
    naics_sector_lst = [k for k in NAICS_SECTOR_CODES.keys()]
    funding_df = pd.DataFrame(STATE_NAMES_AND_UNITED_STATES, columns=["State"])
    funding_df = pd.concat([funding_df,pd.DataFrame(columns = naics_sector_lst)])
    
    funding_df_within_state = pd.DataFrame(STATE_NAMES_AND_UNITED_STATES, columns=["State"]) # Within State
    funding_df_by_state = pd.DataFrame(STATE_NAMES_AND_UNITED_STATES, columns=["State"]) # By State

    funding_df_lst = [funding_df, funding_df_within_state, funding_df_by_state]

    # Calculates funding for each category in each state and inputs values into funding_df
    for state_code, state in US_STATE_CODES.items():
        for sector, naics_code_tuple in NAICS_SECTOR_CODES.items():
            if len(naics_code_tuple) == 1:
                subset_df = raw_funding_df[(raw_funding_df["code"].apply(lambda x : x.startswith(naics_code_tuple[0]))) & (raw_funding_df["State"] == state_code) & (raw_funding_df["amount"] >= 0)]
                sum_val = subset_df["amount"].sum() / 1000
            else:
                sum_val = 0
                for naics_code in naics_code_tuple:
                    subset_df = raw_funding_df[(raw_funding_df["code"].apply(lambda x : x.startswith(naics_code))) & (raw_funding_df["State"] == state_code) & (raw_funding_df["amount"] >= 0)]
                    sum_val += subset_df["amount"].sum() / 1000
            
            funding_df.loc[funding_df["State"] == state, sector] = int(sum_val)

    required_col_names = [col for col in funding_df.columns[1:]]

    # Calculates total funding across categories for each state
    for df in funding_df_lst:
        df["Total Funding Received"] = funding_df[required_col_names].sum(axis=1)

    for col in required_col_names:
        #Calculates total funding for each category at the national level
        funding_df.loc[funding_df["State"] == "United States", col] = funding_df[col].sum()

        # Calculates funding for each state as a share of US by category
        funding_df_by_state[col] = funding_df.apply(lambda x : (x[col] / funding_df.loc[funding_df["State"] == "United States", col]) * 100, axis = 1)

        # Calculates funding for each category as a share of the total funding received by a state
        funding_df_without_us = funding_df.iloc[0:len(funding_df)-1]
        funding_df_within_state[col] = (funding_df_without_us[col] / funding_df_without_us["Total Funding Received"]) * 100

    funding_df["Year"] = year
    funding_df_cols = list(funding_df.columns)
    funding_df = funding_df[[funding_df_cols[-1]] + funding_df_cols[:-1]]

    funding_df_within_state["Year"] = year
    funding_df_2_cols = list(funding_df_within_state.columns)
    funding_df_within_state = funding_df_within_state[[funding_df_2_cols[-1]] + funding_df_2_cols[:-1]]

    funding_df_by_state["Year"] = year
    funding_df_3_cols = list(funding_df_by_state.columns)
    funding_df_by_state = funding_df_by_state[[funding_df_3_cols[-1]] + funding_df_3_cols[:-1]]

    funding_df.set_index("State", inplace=True)
    funding_df_within_state.set_index("State", inplace=True)
    funding_df_by_state.set_index("State", inplace=True)


    return funding_df, funding_df_by_state, funding_df_within_state


In [7]:
a, b, c = clean_funding(pd.read_csv("2016_us_funding.csv"))

In [223]:
def clean_census_expenditure(df):
    """
    ###
    """

    df['Description'] = df['Description'].str.strip()

    df.rename(columns = {'United States Total':'United States'}, inplace = True)

    # Retains only columns with combined state and local government expenditure
    df = df[["Description"] + STATE_NAMES_AND_UNITED_STATES]

    # Drops rows relating to state revenue sources
    df.drop(df.index[0:66], inplace=True)

    social = ["Public welfare", "Hospitals", "Health", "Employment security administration", "Veterans' services"]
    educ = ["Education", "Libraries"]
    govt = ["Financial administration", "Judicial and legal", "General public buildings", "Other governmental administration"]
    transport = ["Highways", "Air transportation (airports)", "Parking facilities", "Sea and inland port facilities"]
    others = ["Utility expenditure", "Expenditure1"]

    # Retains only required rows
    df = df[df["Description"].isin(social + educ + govt + transport + others)]

    # Transposes dataframe and sets new column names
    df = df.transpose()
    df.columns = df.iloc[0]
    df = df[1:]
    df.reset_index(inplace=True)
    df.rename(columns = {'Expenditure1':'State Expenditure (in thousands)', 
    "index" : "State", "Utility expenditure" : "Utilities"}, inplace = True)

    # Converts specific columns in to integers
    for col in [col for col in df.columns]:
        if col != "State":
            df[col] = df[col].str.replace(',','')
            df[col] = df[col].astype(int)
    
    # Merges sub-categories into single categories
    df["Health and Social Services Expenditure"] = df[social].sum(axis=1)
    df["Education Related Expenditure"] = df[educ].sum(axis=1)
    df["Public Administration Expenditure"] = df[educ].sum(axis=1)
    df["Transportation Expenditure"] = df[transport].sum(axis=1)
    df.drop(columns = social + educ + govt + transport, inplace=True)

    required_col_names = [col for col in df.columns[1:]]

    for col in required_col_names:

        # Calculates state expenditure in one cateogry as a proportion
        # of total US expenditure for this category
        if col == "State Expenditure (in thousands)":
            df["State Total as % of US Total"] = \
                df.apply(lambda x : (x[col] / int(df.loc[df["State"] ==
                    "United States", col])) * 100, axis = 1)

        # Calculates the state expenditure in one category as a proportion
        # of total state expenditure across all cateogries
        else:
            df[col + " (State as % of US)"] = \
                df.apply(lambda x : (x[col] / int(df.loc[df["State"] ==
                    "United States", col])) * 100, axis = 1)

            df[col + " (% of Total Expenditure)"] = (df[col] /
            df["State Expenditure (in thousands)"] * 100)

    # Sets "State" column as index of dataframe
    df.set_index(["State"], inplace = True)

    return df


def clean_census_population(pop_df):
    """
    ###
    """
    pop_df = pop_df.iloc[:,0:2]
    pop_df.columns = ["State", "Population"]
    pop_df['State'] = pop_df['State'].str.strip()
    pop_df = pop_df[pop_df["State"].isin(STATE_NAMES_AND_UNITED_STATES)]
    pop_df["Population"] = pop_df["Population"].str.replace(',','')
    pop_df["Population"] = pop_df["Population"].astype(int)
    pop_df.set_index("State", inplace=True)

    return pop_df


def clean_census_poverty(poverty_df):
    """
    ###
    """
    poverty_df = poverty_df.iloc[:,0:2]
    poverty_df.columns = ["State", "3-Year Average Poverty Rate (2018-2020)"]
    poverty_df['State'] = poverty_df['State'].str.strip()
    poverty_df = poverty_df[poverty_df["State"].isin(STATE_NAMES_AND_UNITED_STATES)]
    poverty_df.set_index("State", inplace=True)

    return poverty_df

In [249]:
def analyze_expenditure_and_funding(years):
    """
    ###

    Inputs:
        years (lst of str)

    Returns:
        cleaned_and_combined (dct)
    """
    # Creates and outputs population and poverty data
    poverty_df = clean_census_poverty(pd.read_csv("us_poverty_by_state.csv"))
    population_df = clean_census_population(pd.read_csv("us_census_population.csv")) 

    # poverty_df.to_csv("us_poverty_cleaned")
    # population_df.to_csv("us_population_cleaned")

    # Clean and combines census data and funding data from each year from 2016 to 2020
    expenditure_file_name = "_us_state_finances.csv"
    funding_file_name = "_us_funding.csv"

    cleaned_df_dct = {}

    for year in years:
        expenditure_csv = year + expenditure_file_name # "2016_us_state_finances.csv"
        funding_csv = year + funding_file_name # "2016_us_funding.csv"

        expenditure_df = clean_census_expenditure(pd.read_csv(expenditure_csv))
        funding_df, funding_df_by_state, funding_df_within_state  = clean_funding(pd.read_csv(funding_csv), year)
    
        per_capita_df = pd.DataFrame(columns=["Expenditure per Capita (in thousands)", "Funding received per Capita (in thousands)"])
        per_capita_df["Expenditure per Capita (in thousands)"] = expenditure_df["State Expenditure (in thousands)"] / population_df["Population"]
        per_capita_df["Funding received per Capita (in thousands)"] = funding_df_by_state["Total Funding Received"] / population_df["Population"]

        cleaned_df_dct[year] = (expenditure_df, funding_df_by_state, per_capita_df, funding_df, funding_df_within_state)

        # Outputs files into directory
        funding_df_by_state.to_csv("../clean_data/" + year + "_cleaned_funding_by_state.csv")
        funding_df_within_state.to_csv("../clean_data/" + year + "_cleaned_funding_within_state.csv")
        funding_df.to_csv("../clean_data/" + year + "_cleaned_funding_full.csv")
        # expenditure_df.to_csv(year + "_cleaned_expenditure.csv")

    return cleaned_df_dct

In [19]:
FINAL_YEARS = ["2016", "2017", "2018", "2019", "2020"]

In [250]:
FINAL_YEARS = ["2016", "2017", "2018", "2019", "2020"]
this_dct = analyze_expenditure_and_funding(FINAL_YEARS)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  poverty_df['State'] = poverty_df['State'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pop_df['State'] = pop_df['State'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df.index[0:66], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pyda

In [252]:
FINAL_YEARS = ["2016", "2017", "2018", "2019", "2020"]

by_state_2016 = this_dct["2016"][1]
# by_state_2017 = this_dct["2017"][1]
# by_state_2018 = this_dct["2018"][1]

funding_df_lst = []
for year in FINAL_YEARS:
    funding_df_lst.append(this_dct.get(year)[1])

new = pd.concat(funding_df_lst)
new = new[new.index != "United States"]
new = new.sort_values(["State", "Year"])
new.to_csv("all_years_funding_by_state")
# new = new.sort_values("Year")

# new = pd.concat([by_state_2016, by_state_2017, by_state_2018])

# new = new[new.index != "United States"]

# # new

# new = new.sort_values("State")

# def combine_multiple_years(year_lst):
    
#     funding_df_lst = []
#     for year in FINAL_YEARS:
#         funding_df_lst.append(this_dct.get(year)[1])

#     combined_df = pd.concat(funding_df_lst)
#     combined_df = new[new.index != "United States"]
#     combined_df = new.sort_values("State")

#     return combined_df

In [253]:
def create_expenditure_time_series_df(year_lst, clean_df_dct):
    """
    ###
    """
    expenditure_time_series = pd.DataFrame(columns = year_lst)
    for year in year_lst:
        expenditure_df = clean_df_dct.get(year)[0]
        us_row_only = expenditure_df.loc[expenditure_df.index == "United States"]
        sum = int(us_row_only.loc[us_row_only.index == "United States", "State Expenditure (in thousands)"])
        us_row_only = us_row_only[["Utilities", "Health and Social Services Expenditure", "Education Related Expenditure", "Public Administration Expenditure", "Transportation Expenditure"]]
        us_row_only = us_row_only.transpose()
        us_row_only.rename(columns = {'United States':'Amount'}, inplace = True)
        us_row_only["Sum"] = sum
        expenditure_time_series[year] = (us_row_only["Amount"] / us_row_only["Sum"]) * 100
    
    expenditure_time_series.index.names = ["Category"]

    # Outputs file into directory
    expenditure_time_series.to_csv("us_expenditure_time_series.csv")
            
    return expenditure_time_series

In [254]:
FINAL_YEARS = ["2016", "2017", "2018", "2019", "2020"]

create_expenditure_time_series_df(FINAL_YEARS, this_dct)

Unnamed: 0_level_0,2016,2017,2018,2019,2020
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Utilities,6.327077,6.333846,6.300536,6.240368,6.030829
Health and Social Services Expenditure,26.475273,26.645353,26.669575,26.999864,26.9826
Education Related Expenditure,27.882814,27.971945,27.751767,27.757852,27.062293
Public Administration Expenditure,27.882814,27.971945,27.751767,27.757852,27.062293
Transportation Expenditure,5.948208,5.877192,6.070463,6.105645,5.83227


In [260]:
from collections import namedtuple
CleanedData = namedtuple("CleanedData", ["expenditure_df", "per_capita_df", "funding_df", "funding_df_by_state", "funding_df_within_state"])

def analyze_expenditure_and_funding(years):
    """
    ###

    Inputs:
        years (lst of str)

    Returns:
        cleaned_and_combined (dct)
    """
    # Creates and outputs population and poverty data
    poverty_df = clean_census_poverty(pd.read_csv("us_poverty_by_state.csv"))
    population_df = clean_census_population(pd.read_csv("us_census_population.csv")) 

    # poverty_df.to_csv("us_poverty_cleaned")
    # population_df.to_csv("us_population_cleaned")

    # Clean and combines census data and funding data from each year from 2016 to 2020
    expenditure_file_name = "_us_state_finances.csv"
    funding_file_name = "_us_funding.csv"

    cleaned_df_dct = {}

    for year in years:
        expenditure_csv = year + expenditure_file_name # "2016_us_state_finances.csv"
        funding_csv = year + funding_file_name # "2016_us_funding.csv"

        expenditure_df = clean_census_expenditure(pd.read_csv(expenditure_csv))
        funding_df, funding_df_by_state, funding_df_within_state  = clean_funding(pd.read_csv(funding_csv), year)
    
        per_capita_df = pd.DataFrame(columns=["Expenditure per Capita (in thousands)", "Funding received per Capita (in thousands)"])
        per_capita_df["Expenditure per Capita (in thousands)"] = expenditure_df["State Expenditure (in thousands)"] / population_df["Population"]
        per_capita_df["Funding received per Capita (in thousands)"] = funding_df_by_state["Total Funding Received"] / population_df["Population"]

        cleaned_df_dct[year] = CleanedData(expenditure_df, per_capita_df, funding_df, funding_df_by_state, funding_df_within_state)

        # Outputs files into directory
        # funding_df_by_state.to_csv(year + "_cleaned_funding_by_state.csv")
        # funding_df_within_state.to_csv(year + "_cleaned_funding_within_state.csv")
        # funding_df.to_csv(year + "_cleaned_funding_full.csv")
        # expenditure_df.to_csv(year + "_cleaned_expenditure.csv")

    return cleaned_df_dct


def create_funding_time_series_df(year_lst, clean_df_dct):
    """
    ###
    """
    funding_time_series = pd.DataFrame(columns = year_lst)
    for year in year_lst:
        funding_df = clean_df_dct.get(year).funding_df
        us_row_only = funding_df.loc[funding_df.index == "United States"]
        us_row_only = us_row_only[NAICS_SECTOR_LST]
        us_row_only = us_row_only.transpose()
        us_row_only.rename(columns = {'United States':'Amount'}, inplace = True)
        funding_time_series[year] = (us_row_only["Amount"] / us_row_only["Amount"].sum()) * 100

    funding_time_series.index.names = ["NAICS Category"]

    # Outputs file into directory
    # funding_time_series.to_csv("us_funding_time_series.csv")

    return funding_time_series


def create_expenditure_time_series_df(year_lst, clean_df_dct):
    """
    ###
    """
    expenditure_time_series = pd.DataFrame(columns = year_lst)
    for year in year_lst:
        expenditure_df = clean_df_dct.get(year).expenditure_df
        us_row_only = expenditure_df.loc[expenditure_df.index == "United States"]
        sum = int(us_row_only.loc[us_row_only.index == "United States", "State Expenditure (in thousands)"])
        us_row_only = us_row_only[["Utilities", "Health and Social Services Expenditure", "Education Related Expenditure", "Public Administration Expenditure", "Transportation Expenditure"]]
        us_row_only = us_row_only.transpose()
        us_row_only.rename(columns = {'United States':'Amount'}, inplace = True)
        us_row_only["Sum"] = sum
        expenditure_time_series[year] = (us_row_only["Amount"] / us_row_only["Sum"]) * 100
    
    expenditure_time_series.index.names = ["Category"]

    # Outputs file into directory
    # expenditure_time_series.to_csv("us_expenditure_time_series.csv")
            
    return expenditure_time_series


# def combine_multiple_years(year_lst, clean_df_dct):
#     """
#     ###
#     """
#     funding_df_lst = []
#     for year in year_lst:
#         funding_df_lst.append(clean_df_dct.get(year).funding_df_by_state)

#     combined_df = pd.concat(funding_df_lst)
#     combined_df = combined_df[combined_df.index != "United States"]
#     combined_df = combined_df.sort_values()

#     return combined_df.to_csv("all_years_funding_by_state.csv")

In [263]:
FINAL_YEARS = ["2016"]

new_dct = analyze_expenditure_and_funding(FINAL_YEARS)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  poverty_df['State'] = poverty_df['State'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pop_df['State'] = pop_df['State'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df.index[0:66], inplace=True)


In [268]:
new_dct["2016"].funding_df_within_state

Unnamed: 0_level_0,Year,Total Funding Received,"Agriculture, Forestry, Fishing and Hunting","Mining, Quarrying, and Oil and Gas Extraction",Utilities,Construction,Manufacturing,Wholesale Trade,Retail Trade,Transportation and Warehousing,...,Finance and Insurance,Real Estate and Rental and Leasing,"Professional, Scientific, and Technical Services",Administrative and Support and Waste Management and Remediation Services,Educational Services,Health Care and Social Assistance,"Arts, Entertainment, and Recreation",Accommodation and Food Services,Other Services (except Public Administration),Public Administration (not covered in economic census)
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alaska,2016,3928810.0,0.048437,0.028329,3.332332,20.234142,8.388392,0.938452,0.037187,5.000165,...,0.07445,0.098121,17.573209,39.416846,2.424627,0.124236,0.021253,0.392485,0.295942,0.459859
Alabama,2016,11684439.0,0.013659,0.006179,0.126861,17.932808,28.63974,0.222381,0.02001,6.681057,...,0.522926,0.011716,40.367894,2.626724,0.51174,1.187468,0.001977,0.195816,0.357672,0.051179
Arkansas,2016,491985.0,1.418946,0.203258,1.258372,18.857485,40.144923,1.754932,0.192486,2.447026,...,0.0,0.056709,10.991595,15.465106,0.489446,4.075734,0.007521,0.400419,0.403061,0.315254
Arizona,2016,12864054.0,0.064272,0.024759,0.357477,2.989952,60.772156,0.156405,0.034958,1.584982,...,0.16473,0.150264,14.536382,1.201985,0.966095,13.722323,0.011816,0.156879,0.427439,0.046074
California,2016,60323115.0,0.146919,0.065859,0.241032,5.49457,45.464237,0.341846,0.112403,2.569348,...,5.91052,0.120063,28.525598,5.43265,0.7973,2.56287,0.007947,0.111569,0.571098,0.310672
Colorado,2016,10963096.0,0.138674,0.011776,0.270717,6.098077,15.746738,0.181026,0.017942,13.800919,...,0.020596,0.04257,38.674887,14.457175,0.983135,2.883638,0.013646,0.161697,1.696464,0.045188
Connecticut,2016,13640654.0,0.003402,0.000352,0.078083,0.915506,80.0343,0.086169,0.011436,4.280872,...,0.03807,0.156752,13.351237,0.462155,0.054975,0.221404,0.005403,0.059367,0.160615,0.018921
Delaware,2016,394672.0,0.126687,0.055236,0.619248,4.671474,22.050716,0.393998,0.047128,10.082296,...,37.393329,0.152785,8.308925,10.279169,2.456724,0.697034,0.05929,0.845259,1.262061,0.059796
Florida,2016,17083445.0,0.096573,0.002423,0.531831,10.073261,42.879021,0.625933,0.050581,4.518427,...,0.555626,0.430118,24.777052,7.847703,1.599426,2.944664,0.070384,0.311143,1.345092,0.611785
Georgia,2016,9908169.0,0.032044,0.370724,0.672082,8.009159,54.932238,0.445794,0.237006,4.963228,...,0.348198,0.773301,20.121478,4.517757,0.579996,1.190462,0.132477,0.798402,0.29948,0.174825


In [16]:
import pandas as pd

pd.read_csv("../clean_data/2018_cleaned_expenditure.csv")



Unnamed: 0,State,State Expenditure (in thousands),Utilities,Health and Social Services Expenditure,Education Related Expenditure,Public Administration Expenditure,Transportation Expenditure,State Total as % of US Total,Utilities (State as % of US),Utilities (% of Total Expenditure),Health and Social Services Expenditure (State as % of US),Health and Social Services Expenditure (% of Total Expenditure),Education Related Expenditure (State as % of US),Education Related Expenditure (% of Total Expenditure),Public Administration Expenditure (State as % of US),Public Administration Expenditure (% of Total Expenditure),Transportation Expenditure (State as % of US),Transportation Expenditure (% of Total Expenditure)
0,Alaska,16461035,650386,3463461,3222987,3222987,2119873,0.431075,0.270327,3.951064,0.340087,21.04036,0.304133,19.579492,0.304133,19.579492,0.914499,12.878127
1,Alabama,48238993,3068894,14362206,14742285,14742285,3070165,1.263262,1.275557,6.361853,1.410264,29.773022,1.391136,30.56093,1.391136,30.56093,1.324449,6.364488
2,Arkansas,27659935,980609,8570120,8784757,8784757,2139965,0.724347,0.407581,3.545232,0.841523,30.983876,0.828962,31.759861,0.828962,31.759861,0.923167,7.736696
3,Arizona,62755470,6384877,16809832,17554634,17554634,3372703,1.643414,2.653813,10.174216,1.650603,26.786242,1.656519,27.973074,1.656519,27.973074,1.454962,5.374357
4,California,605649479,45603480,179493892,142038480,142038480,25214834,15.860494,18.954653,7.529682,17.624992,29.636596,13.403269,23.452258,13.403269,23.452258,10.877513,4.163272
5,Colorado,64659578,4336042,13464559,19029373,19029373,4297083,1.693278,1.802235,6.705955,1.322122,20.823766,1.795681,29.430092,1.795681,29.430092,1.853733,6.645702
6,Connecticut,41642052,1457829,6412577,13686484,13686484,2203005,1.090505,0.605933,3.500858,0.629668,15.399282,1.291507,32.866978,1.291507,32.866978,0.950362,5.290337
7,Delaware,12200640,553703,3332623,4006572,4006572,837401,0.319505,0.230141,4.538311,0.327239,27.315149,0.378075,32.839031,0.378075,32.839031,0.361249,6.863583
8,Florida,187612475,11919691,45302027,45363601,45363601,14890598,4.913117,4.954306,6.353357,4.448329,24.146596,4.280675,24.179416,4.280675,24.179416,6.423706,7.936891
9,Georgia,89012045,6474160,20356320,29659996,29659996,5574860,2.33101,2.690923,7.273353,1.998842,22.869175,2.798825,33.321329,2.798825,33.321329,2.404958,6.26304


In [6]:

import sys
print(sys.path)

['/home/foosuonchuang/capp30122/30122-project-cappy-funding/data/raw_data', '/usr/lib/python38.zip', '/usr/lib/python3.8', '/usr/lib/python3.8/lib-dynload', '', '/home/foosuonchuang/.local/lib/python3.8/site-packages', '/usr/local/lib/python3.8/dist-packages', '/usr/lib/python3/dist-packages']
