In [17]:
import pandas as pd
from clean_census import clean_census_expenditure, clean_census_population, clean_census_poverty
from clean_funding import clean_funding
from utils_clean_and_analyze import NAICS_SECTOR_LST, CleanedData

# Years handled by this notebook, as strings: "2016" through "2020"
YEARS = [str(year) for year in range(2016, 2021)]

# Relative directory prefixes; file names are appended by plain string
# concatenation, so the trailing slash matters.
RAW_DATA_DIR = "../data/raw_data/"      # raw input CSVs
CLEAN_DATA_DIR = "../data/clean_data/"  # target of the (commented-out) to_csv exports

def analyze_expenditure_and_funding(years):
    """
    Clean the raw census and funding files and bundle the results per year.

    The poverty and population CSVs are read and cleaned once. Then, for each
    year, that year's state-finance and funding CSVs are read and cleaned, a
    per-capita frame is derived by dividing state expenditure and total
    funding received by population, and all frames are wrapped in a
    CleanedData object.

    Inputs:
        years (lst of str): year prefixes of the raw CSV files, e.g. "2016"

    Returns:
        cleaned_df_dct (dct): maps each year to its CleanedData object
    """
    # Year-independent inputs are loaded a single time. poverty_df is not used
    # again in this function apart from the disabled export just below.
    poverty_df = clean_census_poverty(
        pd.read_csv(RAW_DATA_DIR + "us_poverty_by_state.csv")
    )
    population_df = clean_census_population(
        pd.read_csv(RAW_DATA_DIR + "us_census_population.csv")
    )

    # Optional export of the year-independent frames (disabled)
    # poverty_df.to_csv(CLEAN_DATA_DIR + "us_poverty_cleaned.csv")
    # population_df.to_csv(CLEAN_DATA_DIR + "us_population_cleaned.csv")

    expenditure_col = "Expenditure per Capita (in thousands)"
    funding_col = "Funding received per Capita (in thousands)"

    cleaned_df_dct = {}
    for year in years:
        # Raw files are named "<year>_us_state_finances.csv" and
        # "<year>_us_funding.csv", e.g. "2016_us_state_finances.csv"
        raw_prefix = RAW_DATA_DIR + year

        expenditure_df = clean_census_expenditure(
            pd.read_csv(raw_prefix + "_us_state_finances.csv")
        )
        funding_frames = clean_funding(pd.read_csv(raw_prefix + "_us_funding.csv"), year)
        funding_df, funding_df_by_state, funding_df_within_state = funding_frames

        # The divisions align on the frames' indexes, so rows are matched by
        # index label rather than by position.
        state_spending = expenditure_df["State Expenditure (in thousands)"]
        population = population_df["Population"]
        funding_received = funding_df_by_state["Total Funding Received"]

        per_capita_df = pd.DataFrame(columns=[expenditure_col, funding_col])
        per_capita_df[expenditure_col] = state_spending / population
        per_capita_df[funding_col] = funding_received / population

        cleaned_df_dct[year] = CleanedData(
            expenditure_df,
            per_capita_df,
            funding_df,
            funding_df_by_state,
            funding_df_within_state,
        )

        # Optional per-year export (disabled)
        # funding_df_by_state.to_csv(CLEAN_DATA_DIR + year + "_cleaned_funding_by_state.csv")
        # funding_df_within_state.to_csv(CLEAN_DATA_DIR + year + "_cleaned_funding_within_state.csv")
        # funding_df.to_csv(CLEAN_DATA_DIR + year + "_cleaned_funding_full.csv")
        # expenditure_df.to_csv(CLEAN_DATA_DIR + year + "_cleaned_expenditure.csv")
        # per_capita_df.to_csv(CLEAN_DATA_DIR + year + "_per_capita_analysis.csv")

    return cleaned_df_dct


def create_funding_time_series_df(year_lst, clean_df_dct):
    """
    Build a table of national funding shares by NAICS sector and year.

    For every year, the "United States" row of that year's funding_df is
    restricted to the NAICS_SECTOR_LST columns and each amount is expressed
    as a percentage of the sum of those sector amounts.

    Inputs:
        year_lst (lst of str): years to include; they become the columns
        clean_df_dct (dct): maps each year to an object exposing a
            funding_df attribute

    Returns:
        funding_time_series (DataFrame): one row per NAICS sector (index
            named "NAICS Category"), one column per year, values in percent
    """
    funding_time_series = pd.DataFrame(columns=year_lst)

    for yr in year_lst:
        yearly_funding = clean_df_dct.get(yr).funding_df
        is_national = yearly_funding.index == "United States"
        national_sectors = yearly_funding.loc[is_national, NAICS_SECTOR_LST]

        # Flip the single national row into a column of sector amounts
        sector_amounts = (
            national_sectors
            .transpose()
            .rename(columns={"United States": "Amount"})["Amount"]
        )
        funding_time_series[yr] = (sector_amounts / sector_amounts.sum()) * 100

    funding_time_series.index.names = ["NAICS Category"]

    # Optional export (disabled)
    # funding_time_series.to_csv(CLEAN_DATA_DIR + "us_funding_time_series.csv")

    return funding_time_series


def create_expenditure_time_series_df(year_lst, clean_df_dct):
    """
    Build a table of national expenditure shares by category and year.

    For every year, the "United States" row of that year's expenditure_df is
    looked up and each category amount is expressed as a percentage of that
    row's "State Expenditure (in thousands)" value.

    Inputs:
        year_lst (lst of str): years to include; they become the columns
        clean_df_dct (dct): maps each year to an object exposing an
            expenditure_df attribute

    Returns:
        expenditure_time_series (DataFrame): one row per expenditure category
            (index named "Category"), one column per year, values in percent

    Raises:
        ValueError: if a year's expenditure_df does not contain exactly one
            "United States" row
    """
    total_col = "State Expenditure (in thousands)"
    categories = [
        "Utilities",
        "Health and Social Services Expenditure",
        "Education Related Expenditure",
        "Public Administration Expenditure",
        "Transportation Expenditure",
    ]

    shares_by_year = {}
    for year in year_lst:
        expenditure_df = clean_df_dct.get(year).expenditure_df
        us_rows = expenditure_df.loc[expenditure_df.index == "United States"]
        if len(us_rows) != 1:
            raise ValueError(
                "expected exactly one 'United States' row in the expenditure "
                "data for " + str(year) + ", found " + str(len(us_rows))
            )

        # Pull the scalar out with .iloc[0]: calling int() on a one-element
        # Series is deprecated in recent pandas. The int() truncation of the
        # total is kept so results match the previous implementation.
        total = int(us_rows[total_col].iloc[0])

        # Select the category columns before taking the row so the amounts
        # keep their numeric dtype even if the frame has other column types.
        amounts = us_rows[categories].iloc[0]
        shares_by_year[year] = (amounts / total) * 100

    expenditure_time_series = pd.DataFrame(shares_by_year, columns=year_lst)
    expenditure_time_series.index.names = ["Category"]

    # Optional export (disabled)
    # expenditure_time_series.to_csv(CLEAN_DATA_DIR + "us_expenditure_time_series.csv")

    return expenditure_time_series


def combine_multiple_years(year_lst, clean_df_dct):
    """
    Stack the by-state funding frames of several years into one frame.

    The funding_df_by_state frame of every requested year is concatenated,
    rows whose index label is "United States" are dropped, and the result is
    ordered by "State" and then "Year".

    Inputs:
        year_lst (lst of str): years to include
        clean_df_dct (dct): maps each year to an object exposing a
            funding_df_by_state attribute

    Returns:
        combined_df (DataFrame): the stacked, filtered and sorted frame
    """
    yearly_frames = [clean_df_dct.get(yr).funding_df_by_state for yr in year_lst]

    stacked = pd.concat(yearly_frames)
    states_only = stacked.loc[stacked.index != "United States"]
    combined_df = states_only.sort_values(["State", "Year"])

    # Optional export (disabled)
    # combined_df.to_csv(CLEAN_DATA_DIR + "all_years_funding_by_state.csv")

    return combined_df

In [18]:
# Driver cell: run the full cleaning pipeline for the test years, then show
# the combined by-state funding table.
TEST_YEARS = ["2016", "2017", "2018", "2019", "2020"]

# Reads and cleans the raw CSVs for every year in TEST_YEARS
test_dct = analyze_expenditure_and_funding(TEST_YEARS)
# The two time-series builders are currently disabled; uncomment to run them
# create_funding_time_series_df(TEST_YEARS, test_dct)
# create_expenditure_time_series_df(TEST_YEARS, test_dct)
# Last expression of the cell, so its return value is what the notebook displays
combine_multiple_years(TEST_YEARS, test_dct)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  poverty_df['State'] = poverty_df['State'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pop_df['State'] = pop_df['State'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df.index[0:66], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pyda

Unnamed: 0_level_0,Year,Total Funding Received,"Agriculture, Forestry, Fishing and Hunting","Mining, Quarrying, and Oil and Gas Extraction",Utilities,Construction,Manufacturing,Wholesale Trade,Retail Trade,Transportation and Warehousing,...,Finance and Insurance,Real Estate and Rental and Leasing,"Professional, Scientific, and Technical Services",Administrative and Support and Waste Management and Remediation Services,Educational Services,Health Care and Social Assistance,"Arts, Entertainment, and Recreation",Accommodation and Food Services,Other Services (except Public Administration),Public Administration (not covered in economic census)
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,2016,11684439.0,0.347488,0.372776,0.920666,7.943458,1.863820,0.394121,0.160624,4.885675,...,0.366127,0.124675,3.377963,0.654152,1.394501,1.527035,0.211649,1.869921,1.408550,0.716867
Alabama,2017,12104077.0,0.568783,0.663459,1.371500,8.833260,1.787462,0.272654,0.094219,2.818814,...,0.490258,0.089001,3.437711,0.682898,1.370940,1.146102,0.174499,1.566233,1.347048,0.452445
Alabama,2018,15665904.0,0.429977,0.337180,2.518674,7.446313,2.348701,0.137982,0.131973,1.258020,...,0.146203,0.348533,4.082669,0.590029,1.431522,1.243038,0.101393,1.588319,1.306327,0.491612
Alabama,2019,12503066.0,0.546280,0.826942,1.225837,5.512885,1.525987,0.213903,0.281522,0.710724,...,0.229673,0.278895,3.550843,0.697449,1.858430,1.377096,0.161808,2.470479,2.947308,0.520705
Alabama,2020,12180702.0,0.236026,0.237738,1.218848,4.062597,1.424163,0.069485,0.166701,0.385128,...,0.091982,3.418748,3.137177,0.586778,2.274710,1.742754,0.186051,1.017485,2.712570,0.737441
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wyoming,2016,149557.0,0.734603,0.246796,0.167450,0.122540,0.007308,0.133128,0.021710,0.091500,...,0.000000,0.302536,0.028816,0.033175,0.009282,0.055865,0.033900,0.200232,0.035254,0.058860
Wyoming,2017,133557.0,0.382007,0.360469,0.115515,0.094420,0.005616,0.001671,0.006888,0.079124,...,0.000000,0.166972,0.033346,0.034263,0.008228,0.029623,0.037667,0.133567,0.030751,0.027137
Wyoming,2018,156715.0,0.917983,0.294246,0.105284,0.121376,0.007479,0.000821,0.005218,0.056927,...,0.000000,0.034775,0.027632,0.030043,0.012779,0.052106,0.034736,0.240957,0.031633,0.036621
Wyoming,2019,171689.0,0.748713,0.067439,0.168411,0.197801,0.008192,0.000111,0.006780,0.067186,...,0.000000,0.017389,0.012059,0.051632,0.016255,0.054683,0.404749,0.310705,0.028292,0.035511
