In [43]:
import pandas as pd
from collections import namedtuple
from clean_census import clean_census_expenditure, clean_census_population, clean_census_poverty
from clean_funding import clean_funding
from utils_clean_and_analyze import NAICS_SECTOR_LST

# Container for the five DataFrames produced for a single year by
# analyze_expenditure_and_funding.
CleanedData = namedtuple(
    "CleanedData",
    [
        "expenditure_df",
        "per_capita_df",
        "funding_df_absolute",
        "funding_df_by_state",
        "funding_df_within_state",
    ],
)

# Years, as strings, from 2016 through 2020 inclusive.
YEARS = [str(year) for year in range(2016, 2021)]

# Directories (relative to the working directory) for cleaned output and
# raw input files.
CLEAN_DATA_DIR = "../data/clean_data/"
RAW_DATA_DIR = "../data/raw_data/"

def clean_and_analyze_all(year_lst):
    """
    Run the full cleaning and analysis pipeline for the given years.

    Cleans the per-year data, then builds the funding time series, the
    expenditure time series and the combined multi-year funding table.
    The results of those three steps are discarded here and nothing is
    returned.

    Inputs:
        year_lst (lst of str): years to process
    """
    per_year_data = analyze_expenditure_and_funding(year_lst)

    # Each summary step takes the same (years, per-year data) arguments.
    summary_steps = (
        create_funding_time_series_df,
        create_expenditure_time_series_df,
        combine_multiple_years,
    )
    for build_summary in summary_steps:
        build_summary(year_lst, per_year_data)


def analyze_expenditure_and_funding(years):
    """
    Clean the census and funding data for every requested year.

    The poverty and population files are read and cleaned once. Then, for
    each year, the state-finance and funding files are read and cleaned and
    a per-capita table is derived by dividing state expenditure and total
    funding received by population.

    Inputs:
        years (lst of str): years to process; each year is concatenated
            into the raw file names (e.g. "2016_us_state_finances.csv")

    Returns:
        cleaned_df_dct (dct): maps each year to a CleanedData namedtuple
    """
    # NOTE(review): poverty_df is only consumed by the commented-out export
    # below; the cleaning call is kept so the function's work is unchanged.
    poverty_df = clean_census_poverty(
        pd.read_csv(RAW_DATA_DIR + "us_poverty_by_state.csv")
    )
    population_df = clean_census_population(
        pd.read_csv(RAW_DATA_DIR + "us_census_population.csv")
    )

    # poverty_df.to_csv(CLEAN_DATA_DIR + "us_poverty_cleaned.csv")
    # population_df.to_csv(CLEAN_DATA_DIR + "us_population_cleaned.csv")

    expenditure_col = "Expenditure per Capita (in thousands)"
    funding_col = "Funding received per Capita (in thousands)"

    cleaned_df_dct = {}

    for year in years:
        # Raw files are named "<year>_us_state_finances.csv" and
        # "<year>_us_funding.csv".
        year_prefix = RAW_DATA_DIR + year
        expenditure_csv = year_prefix + "_us_state_finances.csv"
        funding_csv = year_prefix + "_us_funding.csv"

        expenditure_df = clean_census_expenditure(pd.read_csv(expenditure_csv))
        funding_frames = clean_funding(pd.read_csv(funding_csv), year)
        funding_df_absolute, funding_df_by_state, funding_df_within_state = funding_frames

        # The divisions align on the frames' indexes; presumably every frame
        # is indexed by state -- confirm against the cleaning helpers.
        per_capita_df = pd.DataFrame(columns=[expenditure_col, funding_col])
        per_capita_df[expenditure_col] = (
            expenditure_df["State Expenditure (in thousands)"]
            / population_df["Population"]
        )
        per_capita_df[funding_col] = (
            funding_df_by_state["Total Funding Received"]
            / population_df["Population"]
        )

        cleaned_df_dct[year] = CleanedData(
            expenditure_df,
            per_capita_df,
            funding_df_absolute,
            funding_df_by_state,
            funding_df_within_state,
        )

        # Outputs files into directory
        # funding_df_by_state.to_csv(CLEAN_DATA_DIR + year + "_cleaned_funding_by_state.csv")
        # funding_df_within_state.to_csv(CLEAN_DATA_DIR + year + "_cleaned_funding_within_state.csv")
        # funding_df_absolute.to_csv(CLEAN_DATA_DIR + year + "_cleaned_funding_absolute.csv")
        # expenditure_df.to_csv(CLEAN_DATA_DIR + year + "_cleaned_expenditure.csv")
        # per_capita_df.to_csv(CLEAN_DATA_DIR + year + "_per_capita_analysis.csv")

    return cleaned_df_dct


def create_funding_time_series_df(year_lst, clean_df_dct):
    """
    Build a table of each NAICS sector's share of United States funding.

    For every year, the "United States" row of that year's absolute funding
    table is restricted to the columns in NAICS_SECTOR_LST and each sector
    is expressed as a percentage of the sum over those sectors.

    Inputs:
        year_lst (lst of str): years to include, one output column each
        clean_df_dct (dct): maps year to a CleanedData namedtuple

    Returns:
        funding_time_series (DataFrame): one column per year, indexed by
            "NAICS Category", values in percent
    """
    funding_time_series = pd.DataFrame(columns=year_lst)

    for year in year_lst:
        absolute_df = clean_df_dct.get(year).funding_df_absolute

        # Keep only the national row, then flip it so sectors become rows.
        is_us_row = absolute_df.index == "United States"
        sector_amounts = absolute_df.loc[is_us_row][NAICS_SECTOR_LST].transpose()
        sector_amounts = sector_amounts.rename(columns={'United States': 'Amount'})

        amounts = sector_amounts["Amount"]
        funding_time_series[year] = (amounts / amounts.sum()) * 100

    funding_time_series.index.names = ["NAICS Category"]

    # Outputs file into directory
    # funding_time_series.to_csv(CLEAN_DATA_DIR + "us_funding_time_series.csv")

    return funding_time_series


def create_expenditure_time_series_df(year_lst, clean_df_dct):
    """
    Build a table of each expenditure category's share of total United
    States state expenditure.

    For every year, the "United States" row of that year's expenditure
    table is used: each category amount is divided by the value in the
    "State Expenditure (in thousands)" column and multiplied by 100.

    Inputs:
        year_lst (lst of str): years to include, one output column each
        clean_df_dct (dct): maps year to an object exposing an
            expenditure_df attribute (a CleanedData namedtuple)

    Returns:
        expenditure_time_series (DataFrame): one column per year, indexed
            by "Category", values in percent

    Raises:
        KeyError: if a year in year_lst is missing from clean_df_dct
        ValueError: if a year's expenditure table has no "United States" row
    """
    total_col = "State Expenditure (in thousands)"
    category_cols = [
        "Utilities",
        "Health and Social Services Expenditure",
        "Education Related Expenditure",
        "Public Administration Expenditure",
        "Transportation Expenditure",
    ]

    expenditure_time_series = pd.DataFrame(columns=year_lst)

    for year in year_lst:
        # Index directly so a missing year fails with a KeyError naming it
        # rather than an AttributeError on None.
        expenditure_df = clean_df_dct[year].expenditure_df

        us_rows = expenditure_df.loc[expenditure_df.index == "United States"]
        if us_rows.empty:
            raise ValueError(
                f'no "United States" row in the {year} expenditure data'
            )

        # Extract the scalar explicitly: calling int() on a one-element
        # Series is deprecated in recent pandas. The int() truncation of
        # the original code is kept.
        total = int(us_rows[total_col].iloc[0])

        # Category amounts for the national row, indexed by category name.
        amounts = us_rows[category_cols].iloc[0]
        expenditure_time_series[year] = (amounts / total) * 100

    expenditure_time_series.index.names = ["Category"]

    # Outputs file into directory
    # expenditure_time_series.to_csv(CLEAN_DATA_DIR + "us_expenditure_time_series.csv")

    return expenditure_time_series


def combine_multiple_years(year_lst, clean_df_dct):
    """
    Stack the by-state funding tables of several years into one table.

    The yearly tables are concatenated, rows whose index label is
    "United States" are dropped, and the result is sorted by "State" and
    then "Year".

    Inputs:
        year_lst (lst of str): years to combine
        clean_df_dct (dct): maps year to a CleanedData namedtuple

    Returns:
        combined_df (DataFrame): all state rows for all requested years
    """
    yearly_frames = [
        clean_df_dct.get(year).funding_df_by_state for year in year_lst
    ]

    stacked = pd.concat(yearly_frames)

    # Drop the national aggregate so only individual states remain.
    states_only = stacked.loc[stacked.index != "United States"]
    combined_df = states_only.sort_values(by=["State", "Year"])

    # Outputs file into directory
    # combined_df.to_csv(CLEAN_DATA_DIR + "all_years_funding_by_state.csv")

    return combined_df

In [44]:
# ORIGINAL CODE TO RUN ALL FILES
def clean_and_analyze_all(year_lst):
    """
    Run the full cleaning and analysis pipeline without returning anything.

    NOTE: this definition is immediately replaced by the definition of the
    same name that follows it in this cell, so it is never the one called.

    Inputs:
        year_lst (lst of str): years to process
    """
    per_year_data = analyze_expenditure_and_funding(year_lst)

    # The three summary tables are built and then discarded.
    create_funding_time_series_df(year_lst, per_year_data)
    create_expenditure_time_series_df(year_lst, per_year_data)
    combine_multiple_years(year_lst, per_year_data)

# FOR TESTING
def clean_and_analyze_all(year_lst):
    """
    Run the full cleaning and analysis pipeline and return every result.

    Inputs:
        year_lst (lst of str): years to process

    Returns:
        tuple of (cleaned_df_dct, funding_time_series,
        expenditure_time_series, all_years_funding_by_state)
    """
    per_year_data = analyze_expenditure_and_funding(year_lst)

    # FOR TESTING: hand back every intermediate table, in the order built.
    return (
        per_year_data,
        create_funding_time_series_df(year_lst, per_year_data),
        create_expenditure_time_series_df(year_lst, per_year_data),
        combine_multiple_years(year_lst, per_year_data),
    )

In [45]:
# Years to run the pipeline on for this test run.
TEST_YEARS = ["2016", "2017", "2018", "2019", "2020"]

# Run the whole pipeline and keep every result for inspection in later cells.
cleaned_df_dct, funding_time_series, expenditure_time_series, all_years_funding_by_state = clean_and_analyze_all(TEST_YEARS)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  poverty_df['State'] = poverty_df['State'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pop_df['State'] = pop_df['State'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df.index[0:66], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pyda

In [48]:
# Display the absolute funding table stored for 2020.
cleaned_df_dct["2020"].funding_df_absolute

Unnamed: 0_level_0,Year,"Agriculture, Forestry, Fishing and Hunting","Mining, Quarrying, and Oil and Gas Extraction",Utilities,Construction,Manufacturing,Wholesale Trade,Retail Trade,Transportation and Warehousing,Information,...,Real Estate and Rental and Leasing,"Professional, Scientific, and Technical Services",Administrative and Support and Waste Management and Remediation Services,Educational Services,Health Care and Social Assistance,"Arts, Entertainment, and Recreation",Accommodation and Food Services,Other Services (except Public Administration),Public Administration (not covered in economic census),Total Funding Received
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alaska,2020,8520,40406,99600,1556319,366765,27762,967,185398,59886,...,10593,1208638,1613136,159612,32632,687,10948,36063,16264,5434282.0
Alabama,2020,2323,575,24123,1761312,3475478,9580,1893,74815,41465,...,32921,5891315,310338,105204,307376,348,15148,102419,4306,12180702.0
Arkansas,2020,12826,95,3638,116397,316915,1066,220,9608,2498,...,414,80799,156894,2369,22962,31,1613,2619,2028,732996.0
Arizona,2020,30233,3872,32132,611755,8186219,39446,4222,236965,233222,...,18910,2440171,266076,235192,7505325,1028,43855,48429,1408,23575622.0
California,2020,120684,45745,195621,6078632,21782637,287868,156451,1685017,1080202,...,73511,19400196,1320188,357895,1224832,40108,111789,442964,18461,57859429.0
Colorado,2020,32971,973,96474,622345,2117555,14915,747,942876,684756,...,4327,6194044,2217639,114022,778449,4030,37271,199403,9608,14074552.0
Connecticut,2020,188,87,2801,71185,16176330,11905,1525,566150,31165,...,13417,2677887,73651,16574,52215,531,6874,17277,2988,19727600.0
Delaware,2020,213,696,1390,23411,671753,612,14,78526,3077,...,817,46600,23383,10235,2320,100,3212,7794,44,1035408.0
Florida,2020,26921,4758,100311,1674501,11992314,495923,16388,1702978,105002,...,67330,4802636,1506876,439868,450956,28584,85606,212324,102418,23899859.0
Georgia,2020,6363,19456,62133,396935,4173862,64754,50568,599048,164537,...,14086,2323081,499184,116157,66854,11556,109600,139154,4196,8845570.0
