# Initial Preprocessing of input data

Part of the dataset was only available as individual .csv files, one per state. Here we concatenate those files and save a single combined file that can be uploaded to GitHub.

In [1]:
# Import dependencies
import pandas as pd
import numpy as np

## Load a file and see how it looks

In [2]:
# Exploratory read of a single state file, just to inspect its raw layout
path = "Resources/Education_Welfare_TotalByState/"
filename = "New York.csv"
path_filename = f"{path}{filename}"

# header=1: take the column names from the second row of the file
state_df = pd.read_csv(
    path_filename,
    sep=",",
    header=1,
)
state_df

Unnamed: 0,Year,GDP-NY $ million nominal,Population-NY million,Remaining Spending - Total $ million nominal,Unnamed: 4,Education - Total $ million nominal,Unnamed: 6,Welfare - Total $ million nominal,Unnamed: 8,Unemployment - Total $ million nominal,Unnamed: 10
0,1970,112349,18.241,10238.5,a,5576.4,a,2803.9,i,339.6,a
1,1971,119145,18.172,11887.3,a,6543.1,a,3462.4,i,582.4,a
2,1972,127221,18.103,12965.1,a,7686.2,a,4743.4,i,738.1,a
3,1973,136396,18.034,14957,a,7702.1,a,4389.6,i,618.9,a
4,1974,145419,17.965,16891.9,a,8102.2,a,4546.4,i,593.9,a
...,...,...,...,...,...,...,...,...,...,...,...
67,Data Sources for 2026:,,,,,,,,,,
68,,https://usgovernmentspending.blogspot.com/sear...,State GDP Information,,,,,,,,
69,Federal:,https://www.govinfo.gov/app/collection/budget/...,"Fed. Budget: Hist. Tables 3.2, 5.1, 7.1",,,,,,,,
70,State and Local:,https://www.census.gov/programs-surveys/gov-fi...,State and Local Gov. Finances,,,,,,,,


In [3]:
# The column names (and their positions) guide the column filter used when loading below
list(state_df.columns)

['Year',
 'GDP-NY $ million nominal',
 'Population-NY million',
 'Remaining Spending - Total $ million nominal',
 'Unnamed: 4',
 'Education - Total $ million nominal',
 'Unnamed: 6',
 'Welfare - Total $ million nominal',
 'Unnamed: 8',
 'Unemployment - Total $ million nominal',
 'Unnamed: 10']

## Now get 1 file loaded and in the shape needed

In [4]:
# Prototype: load one state file and bring it into the final five-column shape
path = "Resources/Education_Welfare_TotalByState/"
filename = "New York.csv"
path_filename = f"{path}{filename}"

# Header text differs from file to file (e.g. 'Population-NY million' here),
# so the columns are selected by position rather than by name.
columns_to_load = [0, 2, 5, 7]

rows_to_skip_before_header = 1   # row index of the header line; 0 when nothing sits above it
rows_to_load = 57                # fixed number of data rows, so the trailing "Data Sources" notes are not read

state_df = pd.read_csv(
    path_filename,
    sep=",",
    header=rows_to_skip_before_header,
    usecols=columns_to_load,
    nrows=rows_to_load,
)
state_df.columns = ['year', 'population_million', 'education_million', 'welfare_million']

# The state label is whatever precedes the first "." in the file name ("statename.csv")
state_name = filename.partition(".")[0]

# Attach the label and put the columns in their final order in one step
state_df = state_df.assign(state=state_name)[
    ['year', 'state', 'population_million', 'education_million', 'welfare_million']
]
state_df

Unnamed: 0,year,state,population_million,education_million,welfare_million
0,1970,New York,18.241,5576.4,2803.9
1,1971,New York,18.172,6543.1,3462.4
2,1972,New York,18.103,7686.2,4743.4
3,1973,New York,18.034,7702.1,4389.6
4,1974,New York,17.965,8102.2,4546.4
5,1975,New York,17.897,8962.7,5231.2
6,1976,New York,17.828,9339.9,7643.9
7,1977,New York,17.76,9866.4,8169.2
8,1978,New York,17.693,10269.8,7808.9
9,1979,New York,17.625,10734.4,7429.7


## If happy, turn the above code into a function

In [5]:
# Take a path and filename, format to desired and return a DataFrame
def process_state(path, filename, empty_before_header=0, rows_to_load=57,
                  columns_to_load=(0, 2, 5, 7)):
    """Load one spending CSV and return it in the tidy five-column layout.

    Parameters
    ----------
    path : str
        Directory that contains the file. A trailing separator is optional.
    filename : str
        Name of the CSV file. The text before the first "." is used as the
        value of the ``state`` column (assumes "statename.csv"-style names).
    empty_before_header : int, default 0
        Passed to ``pandas.read_csv`` as ``header``: the zero-based row number
        that holds the column names (0 when nothing sits above the header).
    rows_to_load : int or None, default 57
        Number of data rows to read after the header, so that trailing notes
        at the bottom of a file are excluded. ``None`` reads to the end.
    columns_to_load : sequence of int, default (0, 2, 5, 7)
        Positions of the year, population, education and welfare columns.
        Positions are used because the header text changes from file to file.
        Exactly four positions are expected; they are read in file order.

    Returns
    -------
    pandas.DataFrame
        Columns ``year``, ``state``, ``population_million``,
        ``education_million`` and ``welfare_million``, in that order.
    """
    # Local import: keeps the function usable even if the cell that imports
    # os at notebook level has not been run yet.
    import os

    # os.path.join copes with `path` given with or without a trailing separator
    path_filename = os.path.join(path, filename)

    # read file
    state_df = pd.read_csv(path_filename, delimiter=",", header=empty_before_header,
                           usecols=list(columns_to_load), nrows=rows_to_load)
    state_df.columns = ['year', 'population_million', 'education_million', 'welfare_million']

    # Add state name
    state_name = filename.split(".")[0]   # Assume filename is "statename.csv"
    state_df['state'] = state_name

    # Reorder columns
    return state_df[['year', 'state', 'population_million', 'education_million', 'welfare_million']]

## Iterate through all files in a directory and concatenate

In [6]:
import os
import sys

# The national totals file sits directly in Resources/ and is read with header row 0
frames = [process_state("Resources/","National.education_wellfare_total_1970_2026.csv",0)]

path = "Resources/Education_Welfare_TotalByState/"
empty_rows_above_header = 1

# os.listdir returns entries in arbitrary order and also lists anything else in
# the folder (e.g. .ipynb_checkpoints, .DS_Store). Keep only the .csv files and
# sort them so the combined frame is the same on every run and every machine.
state_files = sorted(name for name in os.listdir(path) if name.lower().endswith(".csv"))

for filename in state_files:
    # print('Process state: '+ filename)
    state_df = process_state(path, filename, empty_rows_above_header)
    frames.append(state_df)

# Concatenate once at the end instead of re-copying the growing frame on every iteration
welfare_education_df = pd.concat(frames, axis=0, ignore_index=True)

welfare_education_df.tail(20)

Unnamed: 0,year,state,population_million,education_million,welfare_million
2944,2007,Wyoming,0.523,2060.3,239.4
2945,2008,Wyoming,0.533,2286.7,259.5
2946,2009,Wyoming,0.544,2503.6,342.2
2947,2010,Wyoming,0.565,2505.6,484.0
2948,2011,Wyoming,0.567,2530.4,427.4
2949,2012,Wyoming,0.577,2672.5,381.5
2950,2013,Wyoming,0.583,2647.7,342.6
2951,2014,Wyoming,0.583,2783.0,338.3
2952,2015,Wyoming,0.586,2990.3,319.0
2953,2016,Wyoming,0.585,3181.0,358.8


## Check some statistics and details

In [7]:
# Sanity report on the combined frame: summary stats, missing values, dtypes, states present
state_counts = welfare_education_df['state'].value_counts()

report_sections = [
    ('---  STATISTICS  ---', [welfare_education_df.describe()]),
    ('---  TOTAL NAs  ---', [welfare_education_df.isna().sum()]),
    ('---  DTYPES  ---', [welfare_education_df.dtypes]),
    # number of distinct states first, then the row count for each one
    ('---  LIST OF STATES  ---', [state_counts.count(), state_counts]),
]

for banner, payloads in report_sections:
    print(banner)
    for payload in payloads:
        print(payload)

---  STATISTICS  ---
              year  population_million  education_million  welfare_million
count  2964.000000         2964.000000       2.964000e+03     2.964000e+03
mean   1998.000000           10.594938       2.225726e+04     9.389528e+03
std      16.454726           37.865413       1.032112e+05     6.274371e+04
min    1970.000000            0.303000       1.199000e+02     1.280000e+01
25%    1984.000000            1.570750       1.931550e+03     5.900000e+02
50%    1998.000000            3.821500       4.919800e+03     1.313950e+03
75%    2012.000000            6.632750       1.372025e+04     3.561275e+03
max    2026.000000          343.192000       1.533802e+06     1.803767e+06
---  TOTAL NAs  ---
year                  0
state                 0
population_million    0
education_million     0
welfare_million       0
dtype: int64
---  DTYPES  ---
year                    int64
state                  object
population_million    float64
education_million     float64
welfare_millio

## Store into a CSV file, dropping the index when saving

In [8]:
# Write the combined table to disk without the DataFrame index
welfare_education_df.to_csv(
    'Resources/welfare_education_1970-2026.csv',
    index=False,
    index_label=False,
)