In [1]:
#import dependencies
import pandas as pd
from pathlib import Path

In [2]:
#load the cleaned homeless data and preview the first rows
homeless_csv = Path('Resources/clean_homeless.csv')
home_df = pd.read_csv(homeless_csv)
home_df.head()

Unnamed: 0,State_Year,Year,State,Measures,Count
0,2007_AK,2007,AK,Chronically Homeless Individuals,224
1,2007_AK,2007,AK,Homeless Individuals,696
2,2007_AK,2007,AK,Homeless People in Families,278
3,2007_AK,2007,AK,Sheltered Chronically Homeless Individuals,187
4,2007_AK,2007,AK,Sheltered Homeless,842


In [3]:
#check data types
#shows the dtype pandas inferred for each column of home_df when reading the csv
home_df.dtypes

State_Year    object
Year           int64
State         object
Measures      object
Count          int64
dtype: object

In [4]:
#make sure Count is numeric (pd.to_numeric raises by default on values it cannot parse)
#the dtypes check above already reports Count as int64, so the column comes out unchanged
count_numeric = pd.to_numeric(home_df['Count'])
home_df['Count'] = count_numeric
home_df.dtypes

State_Year    object
Year           int64
State         object
Measures      object
Count          int64
dtype: object

In [6]:
#remove the Measures label column, leaving State_Year, Year, State and Count
home_df = home_df.drop(['Measures'], axis=1)
home_df.head()

Unnamed: 0,State_Year,Year,State,Count
0,2007_AK,2007,AK,224
1,2007_AK,2007,AK,696
2,2007_AK,2007,AK,278
3,2007_AK,2007,AK,187
4,2007_AK,2007,AK,842


In [18]:
#give the Count column a descriptive name
count_rename = {"Count": "Homeless_Count"}
home_df = home_df.rename(columns=count_rename)
home_df.head()

Unnamed: 0,State_Year,Year,State,Homeless_Count
0,2007_AK,2007,AK,224
1,2007_AK,2007,AK,696
2,2007_AK,2007,AK,278
3,2007_AK,2007,AK,187
4,2007_AK,2007,AK,842


In [19]:
#total Homeless_Count for each State_Year/Year/State combination
#NOTE(review): every measure row is added together, and the preview of clean_homeless.csv shows
#measures that look like they overlap (e.g. "Homeless Individuals" and "Chronically Homeless Individuals"),
#so the total may count the same people more than once - confirm this is intended
group_keys = ['State_Year', 'Year', 'State']
state_year_groups = home_df.groupby(group_keys, as_index=False)
final_home_df = state_year_groups.sum()
final_home_df.head()

Unnamed: 0,State_Year,Year,State,Homeless_Count
0,2007_AK,2007,AK,7124
1,2007_AL,2007,AL,23794
2,2007_AR,2007,AR,17048
3,2007_AZ,2007,AZ,64192
4,2007_CA,2007,CA,636626


In [20]:
#write the aggregated homeless counts to csv, leaving out the row index
processed_homeless_csv = 'Resources/processed_homeless.csv'
final_home_df.to_csv(processed_homeless_csv, index=False)

In [21]:
#load the cleaned education data and preview the first rows
education_csv = Path('Resources/clean_education.csv')
edu_df = pd.read_csv(education_csv)
edu_df.head()

Unnamed: 0,State_Year,STATE,YEAR,TOTAL_REVENUE,TOTAL_EXPENDITURE,GRADES_PK_G,GRADES_KG_G,GRADES_4_G,GRADES_8_G,GRADES_12_G,GRADES_1_8_G,GRADES_9_12_G,GRADES_ALL_G
0,1992_AL,AL,1992,2678885.0,2653798.0,8224.0,55460.0,57948.0,58025.0,41167.0,0.0,0.0,731634.0
1,1992_AK,AK,1992,1049591.0,972488.0,2371.0,10152.0,9748.0,8789.0,6714.0,0.0,0.0,122487.0
2,1992_AZ,AZ,1992,3258079.0,3401580.0,2544.0,53497.0,55433.0,49081.0,37410.0,0.0,0.0,673477.0
3,1992_AR,AR,1992,1711959.0,1743022.0,808.0,33511.0,34632.0,36011.0,27651.0,0.0,0.0,441490.0
4,1992_CA,CA,1992,26260025.0,27138832.0,59067.0,431763.0,418418.0,363296.0,270675.0,0.0,0.0,5254844.0


In [22]:
#check data types
#shows the dtype pandas inferred for each column of edu_df when reading the csv
edu_df.dtypes

State_Year            object
STATE                 object
YEAR                   int64
TOTAL_REVENUE        float64
TOTAL_EXPENDITURE    float64
GRADES_PK_G          float64
GRADES_KG_G          float64
GRADES_4_G           float64
GRADES_8_G           float64
GRADES_12_G          float64
GRADES_1_8_G         float64
GRADES_9_12_G        float64
GRADES_ALL_G         float64
dtype: object

In [23]:
#convert the float columns to int
#one astype call replaces ten copy-pasted assignments; 'int64' is spelled out so the
#resulting dtype does not depend on the platform's default integer width
float_cols = [
    'TOTAL_REVENUE', 'TOTAL_EXPENDITURE',
    'GRADES_PK_G', 'GRADES_KG_G', 'GRADES_4_G', 'GRADES_8_G', 'GRADES_12_G',
    'GRADES_1_8_G', 'GRADES_9_12_G', 'GRADES_ALL_G',
]

#missing values cannot be stored in an int64 column; name the offending columns up front
cols_with_nan = [col for col in float_cols if edu_df[col].isna().any()]
if cols_with_nan:
    raise ValueError(f'cannot convert to int, missing values in: {cols_with_nan}')

#the float -> int cast truncates toward zero, so any fractional part is dropped
edu_df = edu_df.astype({col: 'int64' for col in float_cols})
edu_df.head()

Unnamed: 0,State_Year,STATE,YEAR,TOTAL_REVENUE,TOTAL_EXPENDITURE,GRADES_PK_G,GRADES_KG_G,GRADES_4_G,GRADES_8_G,GRADES_12_G,GRADES_1_8_G,GRADES_9_12_G,GRADES_ALL_G
0,1992_AL,AL,1992,2678885,2653798,8224,55460,57948,58025,41167,0,0,731634
1,1992_AK,AK,1992,1049591,972488,2371,10152,9748,8789,6714,0,0,122487
2,1992_AZ,AZ,1992,3258079,3401580,2544,53497,55433,49081,37410,0,0,673477
3,1992_AR,AR,1992,1711959,1743022,808,33511,34632,36011,27651,0,0,441490
4,1992_CA,CA,1992,26260025,27138832,59067,431763,418418,363296,270675,0,0,5254844


In [24]:
#write the converted education data to csv, leaving out the row index
processed_education_csv = 'Resources/processed_education.csv'
edu_df.to_csv(processed_education_csv, index=False)

In [25]:
#inner-join the homeless totals with the education data on the shared State_Year key
#only State_Year values present in both frames are kept
merged_df = pd.merge(final_home_df, edu_df, how='inner', on='State_Year')
merged_df.head()

Unnamed: 0,State_Year,Year,State,Homeless_Count,STATE,YEAR,TOTAL_REVENUE,TOTAL_EXPENDITURE,GRADES_PK_G,GRADES_KG_G,GRADES_4_G,GRADES_8_G,GRADES_12_G,GRADES_1_8_G,GRADES_9_12_G,GRADES_ALL_G
0,2007_AK,2007,AK,7124,AK,2007,1800616,1938755,1679,9625,9420,9953,10092,77676,42049,131029
1,2007_AL,2007,AL,23794,AL,2007,7069040,7196459,3592,55972,57795,58792,46200,466414,216941,742919
2,2007_AR,2007,AR,17048,AR,2007,4415981,4779308,12795,38192,35710,35944,30708,288518,138921,479016
3,2007_AZ,2007,AZ,64192,AZ,2007,8724434,8709531,15854,86742,82876,81196,76275,668290,316376,1087447
4,2007_CA,2007,CA,636626,CA,2007,72516936,73225422,68002,454743,467305,490054,468281,3801685,2011865,6343471


In [26]:
#drop the join key and the education-side STATE/YEAR columns
#(in the preview above they repeat the values of State/Year)
redundant_cols = ['STATE', 'YEAR', 'State_Year']
merged_df = merged_df.drop(columns=redundant_cols)
merged_df.head()

Unnamed: 0,Year,State,Homeless_Count,TOTAL_REVENUE,TOTAL_EXPENDITURE,GRADES_PK_G,GRADES_KG_G,GRADES_4_G,GRADES_8_G,GRADES_12_G,GRADES_1_8_G,GRADES_9_12_G,GRADES_ALL_G
0,2007,AK,7124,1800616,1938755,1679,9625,9420,9953,10092,77676,42049,131029
1,2007,AL,23794,7069040,7196459,3592,55972,57795,58792,46200,466414,216941,742919
2,2007,AR,17048,4415981,4779308,12795,38192,35710,35944,30708,288518,138921,479016
3,2007,AZ,64192,8724434,8709531,15854,86742,82876,81196,76275,668290,316376,1087447
4,2007,CA,636626,72516936,73225422,68002,454743,467305,490054,468281,3801685,2011865,6343471


In [27]:
#reorder the columns so Homeless_Count comes last
column_order = [
    'Year', 'State',
    'TOTAL_REVENUE', 'TOTAL_EXPENDITURE',
    'GRADES_PK_G', 'GRADES_KG_G', 'GRADES_4_G', 'GRADES_8_G', 'GRADES_12_G',
    'GRADES_1_8_G', 'GRADES_9_12_G', 'GRADES_ALL_G',
    'Homeless_Count',
]
merged_df = merged_df[column_order]
merged_df.head()

Unnamed: 0,Year,State,TOTAL_REVENUE,TOTAL_EXPENDITURE,GRADES_PK_G,GRADES_KG_G,GRADES_4_G,GRADES_8_G,GRADES_12_G,GRADES_1_8_G,GRADES_9_12_G,GRADES_ALL_G,Homeless_Count
0,2007,AK,1800616,1938755,1679,9625,9420,9953,10092,77676,42049,131029,7124
1,2007,AL,7069040,7196459,3592,55972,57795,58792,46200,466414,216941,742919,23794
2,2007,AR,4415981,4779308,12795,38192,35710,35944,30708,288518,138921,479016,17048
3,2007,AZ,8724434,8709531,15854,86742,82876,81196,76275,668290,316376,1087447,64192
4,2007,CA,72516936,73225422,68002,454743,467305,490054,468281,3801685,2011865,6343471,636626


In [16]:
#encode State
#merged_encoded_df = pd.get_dummies(merged_df, columns=['State'])
#merged_encoded_df

In [26]:
#scale columns by dividing by 100 (disabled; note this is a fixed rescale, not sklearn's StandardScaler)
#def scale(val):
#    return val/100

#merged_encoded_df['TOTAL_REVENUE'] = merged_encoded_df['TOTAL_REVENUE'].apply(scale)
#merged_encoded_df['TOTAL_EXPENDITURE'] = merged_encoded_df['TOTAL_EXPENDITURE'].apply(scale)
#merged_encoded_df['GRADES_9_12_G'] = merged_encoded_df['GRADES_9_12_G'].apply(scale)
#merged_encoded_df['GRADES_ALL_G'] = merged_encoded_df['GRADES_ALL_G'].apply(scale)
#merged_encoded_df

In [28]:
#write the merged homeless/education data to csv, leaving out the row index
homeless_edu_csv = 'Resources/homeless_edu.csv'
merged_df.to_csv(homeless_edu_csv, index=False)