In [8]:
import pandas as pd
from clean_census import clean_census_expenditure, clean_census_population, clean_census_poverty
from clean_funding import clean_funding
from utils_clean_and_analyze import NAICS_SECTOR_LST, CleanedData

# Years covered by the analysis, kept as strings because they are
# concatenated directly into raw/clean CSV file names.
YEARS = [str(year) for year in range(2016, 2021)]

# Relative locations of the raw input CSVs and of the cleaned CSVs written out.
RAW_DATA_DIR = "../data/raw_data/"
CLEAN_DATA_DIR = "../data/clean_data/"

def analyze_expenditure_and_funding(years):
    """
    Clean the raw census and funding CSVs for every requested year, derive
    per-capita figures, and write the cleaned yearly tables to CLEAN_DATA_DIR.

    Inputs:
        years (lst of str): years whose "<year>_us_state_finances.csv" and
            "<year>_us_funding.csv" files are read from RAW_DATA_DIR

    Returns:
        cleaned_and_combined (dct): maps each year to a CleanedData object
            built from that year's expenditure, per-capita and funding tables
    """
    expenditure_per_capita_col = "Expenditure per Capita (in thousands)"
    funding_per_capita_col = "Funding received per Capita (in thousands)"

    # Poverty and population tables are read once, outside the per-year loop.
    # NOTE(review): poverty_df is cleaned here but never used below.
    poverty_raw = pd.read_csv(RAW_DATA_DIR + "us_poverty_by_state.csv")
    poverty_df = clean_census_poverty(poverty_raw)
    population_raw = pd.read_csv(RAW_DATA_DIR + "us_census_population.csv")
    population_df = clean_census_population(population_raw)

    # Writing these two tables out is currently disabled:
    # poverty_df.to_csv(CLEAN_DATA_DIR + "us_poverty_cleaned.csv")
    # population_df.to_csv(CLEAN_DATA_DIR + "us_population_cleaned.csv")

    cleaned_by_year = {}

    for year in years:
        raw_prefix = RAW_DATA_DIR + year  # e.g. "../data/raw_data/2016"

        expenditure_df = clean_census_expenditure(
            pd.read_csv(raw_prefix + "_us_state_finances.csv")
        )
        funding_tables = clean_funding(pd.read_csv(raw_prefix + "_us_funding.csv"), year)
        funding_df, funding_df_by_state, funding_df_within_state = funding_tables

        # Both ratios divide by the same population column; pandas aligns the
        # operands on their index labels.
        expenditure_per_capita = (
            expenditure_df["State Expenditure (in thousands)"] / population_df["Population"]
        )
        funding_per_capita = (
            funding_df_by_state["Total Funding Received"] / population_df["Population"]
        )

        per_capita_df = pd.DataFrame(
            columns=[expenditure_per_capita_col, funding_per_capita_col]
        )
        per_capita_df[expenditure_per_capita_col] = expenditure_per_capita
        per_capita_df[funding_per_capita_col] = funding_per_capita

        cleaned_by_year[year] = CleanedData(
            expenditure_df,
            per_capita_df,
            funding_df,
            funding_df_by_state,
            funding_df_within_state,
        )

        # Persist this year's cleaned tables next to each other in CLEAN_DATA_DIR
        yearly_outputs = (
            (funding_df_by_state, "_cleaned_funding_by_state.csv"),
            (funding_df_within_state, "_cleaned_funding_within_state.csv"),
            (funding_df, "_cleaned_funding_full.csv"),
            (expenditure_df, "_cleaned_expenditure.csv"),
        )
        for table, suffix in yearly_outputs:
            table.to_csv(CLEAN_DATA_DIR + year + suffix)

    return cleaned_by_year


def create_funding_time_series_df(year_lst, clean_df_dct):
    """
    Build a table with one column per year holding each NAICS sector's share
    (in percent) of the "United States" funding row, and write it to
    CLEAN_DATA_DIR as "us_funding_time_series.csv".

    Inputs:
        year_lst (lst of str): years to include; used as the column labels
        clean_df_dct (dct): maps each year to an object exposing funding_df

    Returns:
        funding_time_series (DataFrame): sectors as rows (index named
            "NAICS Category"), years as columns, values in percent
    """
    sector_shares = pd.DataFrame(columns=year_lst)

    for year in year_lst:
        yearly_funding = clean_df_dct.get(year).funding_df

        # Keep only the national row, restricted to the NAICS sector columns,
        # then flip it so that sectors become the row labels.
        is_us_row = yearly_funding.index == "United States"
        us_by_sector = yearly_funding.loc[is_us_row][NAICS_SECTOR_LST].transpose()
        us_by_sector = us_by_sector.rename(columns={"United States": "Amount"})

        amounts = us_by_sector["Amount"]
        sector_shares[year] = (amounts / amounts.sum()) * 100

    sector_shares.index.names = ["NAICS Category"]

    # Outputs file into directory
    sector_shares.to_csv(CLEAN_DATA_DIR + "us_funding_time_series.csv")

    return sector_shares


def create_expenditure_time_series_df(year_lst, clean_df_dct):
    """
    Build a table with one column per year holding each expenditure category's
    share (in percent) of the "United States" total state expenditure, and
    write it to CLEAN_DATA_DIR as "us_expenditure_time_series.csv".

    Inputs:
        year_lst (lst of str): years to include; used as the column labels
        clean_df_dct (dct): maps each year to an object exposing expenditure_df

    Returns:
        expenditure_time_series (DataFrame): categories as rows (index named
            "Category"), years as columns, values in percent

    Raises:
        ValueError: if a year's expenditure table does not contain exactly one
            "United States" row
    """
    total_col = "State Expenditure (in thousands)"
    category_cols = [
        "Utilities",
        "Health and Social Services Expenditure",
        "Education Related Expenditure",
        "Public Administration Expenditure",
        "Transportation Expenditure",
    ]

    expenditure_time_series = pd.DataFrame(columns=year_lst)
    for year in year_lst:
        expenditure_df = clean_df_dct.get(year).expenditure_df
        us_row_only = expenditure_df.loc[expenditure_df.index == "United States"]
        if len(us_row_only) != 1:
            raise ValueError(
                "Expected exactly one 'United States' row in the " + str(year)
                + " expenditure table, found " + str(len(us_row_only))
            )

        # Pull the scalar out with .iloc[0]: calling int() on a one-element
        # Series is deprecated in pandas. The local is also no longer called
        # `sum`, which shadowed the builtin.
        us_total = int(us_row_only[total_col].iloc[0])

        # Single national row -> Series indexed by category name
        category_amounts = us_row_only[category_cols].iloc[0]
        expenditure_time_series[year] = (category_amounts / us_total) * 100

    expenditure_time_series.index.names = ["Category"]

    # Outputs file into directory
    expenditure_time_series.to_csv(CLEAN_DATA_DIR + "us_expenditure_time_series.csv")

    return expenditure_time_series


def combine_multiple_years(year_lst, clean_df_dct):
    """
    Stack the per-state funding tables of several years into one table, drop
    the "United States" rows, sort the result, and write it to CLEAN_DATA_DIR
    as "all_years_funding_by_state.csv".

    Inputs:
        year_lst (lst of str): years to combine, in the order they are stacked
        clean_df_dct (dct): maps each year to an object exposing
            funding_df_by_state

    Returns:
        combined_df (DataFrame or Series): the stacked, filtered and sorted
            per-state funding data
    """
    yearly_tables = [clean_df_dct.get(year).funding_df_by_state for year in year_lst]

    combined_df = pd.concat(yearly_tables)
    combined_df = combined_df[combined_df.index != "United States"]

    if isinstance(combined_df, pd.Series):
        # A Series can be ordered by its values without further arguments.
        combined_df = combined_df.sort_values()
    else:
        # DataFrame.sort_values() requires `by`, so the original bare call
        # raised TypeError. Order rows by their index label instead; mergesort
        # is stable, so each label keeps its rows in year_lst order.
        # NOTE(review): assumes grouping by state was the intent — confirm.
        combined_df = combined_df.sort_index(kind="mergesort")

    # Outputs file into directory
    combined_df.to_csv(CLEAN_DATA_DIR + "all_years_funding_by_state.csv")

    return combined_df

In [9]:
# Run the cleaning pipeline on 2016-2019 only (2020 is left out of this run).
TEST_YEARS = [str(year) for year in range(2016, 2020)]

test_dct = analyze_expenditure_and_funding(TEST_YEARS)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  poverty_df['State'] = poverty_df['State'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pop_df['State'] = pop_df['State'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df.index[0:66], inplace=True)


In [7]:
# Show the cleaned 2016 expenditure table as this cell's output.
# NOTE(review): this cell is numbered In [7] while the cells that define the
# functions and test_dct are In [8] and In [9], so it appears to have been run
# out of order — restart the kernel and run all cells to confirm the notebook
# executes top to bottom.
test_dct["2016"].expenditure_df

Description,State Expenditure (in thousands),Utilities,Health and Social Services Expenditure,Education Related Expenditure,Public Administration Expenditure,Transportation Expenditure,State Total as % of US Total,Utilities (State as % of US),Utilities (% of Total Expenditure),Health and Social Services Expenditure (State as % of US),Health and Social Services Expenditure (% of Total Expenditure),Education Related Expenditure (State as % of US),Education Related Expenditure (% of Total Expenditure),Public Administration Expenditure (State as % of US),Public Administration Expenditure (% of Total Expenditure),Transportation Expenditure (State as % of US),Transportation Expenditure (% of Total Expenditure)
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Alaska,15839199,830825,2983373,3588591,3588591,2059193,0.448344,0.371693,5.245373,0.318966,18.835378,0.364305,22.656392,0.364305,22.656392,0.979916,13.000613
Alabama,47285873,3002048,15285779,13697154,13697154,2658932,1.338472,1.343051,6.348721,1.634275,32.326312,1.390502,28.966694,1.390502,28.966694,1.265316,5.6231
Arkansas,26782688,987246,8445575,8456323,8456323,1876958,0.75811,0.441672,3.686135,0.902956,31.533709,0.858466,31.57384,0.858466,31.57384,0.893195,7.008102
Arizona,56101988,5537984,14832578,14711408,14711408,2973625,1.588021,2.477573,9.871279,1.585821,26.438596,1.493467,26.222614,1.493467,26.222614,1.41507,5.300391
California,550055846,40883140,161625554,126135404,126135404,22916107,15.569859,18.290224,7.432543,17.280148,29.383481,12.804962,22.931381,12.804962,22.931381,10.905173,4.166142
Colorado,57851779,4549461,11803799,16620488,16620488,3792070,1.63755,2.035329,7.863995,1.262,20.403519,1.687272,28.729433,1.687272,28.729433,1.804546,6.554803
Connecticut,41772915,1501501,6262342,13812769,13812769,2663902,1.182422,0.671739,3.594437,0.669536,14.991393,1.402239,33.066328,1.402239,33.066328,1.267681,6.377103
Delaware,11268907,547992,2999430,3729041,3729041,662993,0.318977,0.24516,4.862867,0.320683,26.616867,0.378563,33.091417,0.378563,33.091417,0.315501,5.883383
Florida,167777134,10945998,40697684,42068401,42068401,13752966,4.749093,4.897,6.52413,4.351181,24.256991,4.270683,25.073978,4.270683,25.073978,6.544675,8.197164
Georgia,82044149,5908847,18829369,26942827,26942827,5098379,2.322338,2.643489,7.202033,2.013136,22.950289,2.735171,32.839425,2.735171,32.839425,2.426185,6.21419
