In [1]:
from urllib.parse import quote_plus

import pandas as pd
from census import Census
from sqlalchemy import create_engine, inspect

from config import api_key, db_username, db_password
# Survey year handed to the Census API client.
ACS_YEAR = 2017
c = Census(api_key, year=ACS_YEAR)

In [2]:
# ACS 5-year variables to request, in request order.
acs_fields = ("NAME", "B19013_001E", "B01003_001E", "B19301_001E",
              "B17001_002E", "B23025_005E", "B25077_001E")

# One record per ZIP code tabulation area.
census_data = c.acs5.get(acs_fields, {'for': 'zip code tabulation area:*'})

# Load the API response into a DataFrame
census_pd = pd.DataFrame(census_data)

# Replace the ACS variable codes with descriptive column names
# (this renames the columns; it does not change their order).
readable_names = {
    "B01003_001E": "population",
    "B19013_001E": "household_income",
    "B19301_001E": "per_capita_income",
    "B17001_002E": "poverty_count",
    "B23025_005E": "unemployment_count",
    "B25077_001E": "median_home_value",
    "NAME": "Name",
    "zip code tabulation area": "zip_code",
}
census_pd = census_pd.rename(columns=readable_names)

In [3]:
# Columns kept for the census table, in the order they are stored.
census_columns = ["zip_code", "population",
                  "median_home_value",
                  "per_capita_income",
                  "household_income", "poverty_count", "unemployment_count"]
# Every kept column except the ZIP code key holds a numeric estimate.
estimate_columns = census_columns[1:]

# Build the cleaned frame in one chain from census_pd (no inplace mutation),
# so re-running this cell always produces the same result.
census_cleaned = (
    census_pd[census_columns]
    .dropna()
    .drop_duplicates("zip_code")
    .astype({column: int for column in census_columns})
)

# Restrict to ZIP codes 32004-34997 (presumably the Florida range -- confirm).
in_zip_range = census_cleaned["zip_code"].between(32004, 34997)
# The Census API reports unavailable estimates as large negative sentinel
# values (for example -666666666). Reject them in every estimate column,
# not only in median_home_value, so they never reach the database.
no_sentinels = census_cleaned[estimate_columns].ge(0).all(axis=1)
has_home_value = census_cleaned["median_home_value"].gt(0)
# population is the denominator of poverty_rate, so it must be non-zero.
has_population = census_cleaned["population"].gt(0)

# .copy() makes the filtered result an independent frame, so the column
# assignment below does not raise SettingWithCopyWarning.
census_cleaned = census_cleaned[
    in_zip_range & no_sentinels & has_home_value & has_population
].copy()

# Poverty rate as a percentage with one decimal place. Rounding after the
# multiplication avoids the floating-point noise of round(x, 3) * 100.
census_cleaned["poverty_rate"] = (
    census_cleaned["poverty_count"] / census_cleaned["population"] * 100
).round(1)

census_cleaned = census_cleaned.set_index("zip_code")
census_cleaned.head(10)

Unnamed: 0_level_0,population,median_home_value,per_capita_income,household_income,poverty_count,unemployment_count,poverty_rate
zip_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
32008,4808,74900,21277,42235,766,51,15.9
32009,3647,154900,25970,65469,313,40,8.6
32011,14446,157900,27153,61176,1492,376,10.3
32024,19479,120200,25607,49825,2643,618,13.6
32025,22427,109700,20110,43891,2002,583,8.9
32033,4683,136300,24873,49107,1077,348,23.0
32034,33695,294400,41397,68533,4061,983,12.1
32038,9891,99700,22686,41325,1564,433,15.8
32040,7553,131600,22052,56886,1352,303,17.9
32043,24381,168000,28857,59960,2730,944,11.2


In [4]:
# Percent-encode the credentials before placing them in the URL: characters
# such as "@", ":" or "/" in a username or password would otherwise be parsed
# as URL delimiters and break (or silently misdirect) the connection.
connection_string = (
    f"{quote_plus(db_username)}:{quote_plus(db_password)}"
    "@localhost:5432/etl_project_db"
)
engine = create_engine(f'postgresql://{connection_string}')

In [5]:
# Write the cleaned census rows to the "census_data" table.
# index=True stores the zip_code index as a column.
# NOTE(review): if_exists='append' inserts the rows again each time this cell
# is re-run -- confirm the target table guards against duplicates.
census_cleaned.to_sql(
    name='census_data',
    con=engine,
    if_exists='append',
    index=True,
)

In [6]:
# Load the merged school / home-value CSV and discard its "Unnamed: 0" column
# (presumably an index that was saved when the file was written -- confirm).
csv_file = "Resources/merged_school_zhi_value.csv"
merged_data = pd.read_csv(csv_file).drop(columns="Unnamed: 0")
merged_data.head()

Unnamed: 0,Zip Code,School Name,English Language Arts Achievement,English Language Arts Learning Gains,English Language Arts Learning Gains of the Lowest 25%,Mathematics Achievement,Mathematics Learning Gains,Mathematics Learning Gains of the Lowest 25%,Science Achievement,Social Studies Achievement,...,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12,2020-01,2020-02
0,32603,J. J. FINLEY ELEMENTARY SCHOOL,55.0,57.0,45.0,56.0,59.0,45.0,60.0,,...,286035.0,287334.0,287821.0,289008.0,290524.0,291696.0,291216.0,291212.0,292060.0,292616.0
1,32609,STEPHEN FOSTER ELEMENTARY SCHOOL,63.0,61.0,38.0,69.0,66.0,40.0,58.0,,...,133012.0,133550.0,134383.0,135429.0,137031.0,138650.0,140052.0,141110.0,142404.0,144014.0
2,32641,LAKE FOREST ELEMENTARY SCHOOL,23.0,31.0,56.0,26.0,32.0,37.0,18.0,,...,102785.0,103191.0,103928.0,104809.0,106085.0,107431.0,108637.0,109839.0,110873.0,112122.0
3,32605,LITTLEWOOD ELEMENTARY SCHOOL,63.0,61.0,50.0,61.0,66.0,50.0,56.0,,...,207737.0,208196.0,208743.0,209731.0,211050.0,212435.0,213381.0,214302.0,215809.0,218061.0
4,32609,W. A. METCALFE ELEMENTARY SCHOOL,29.0,57.0,80.0,48.0,71.0,62.0,39.0,,...,133012.0,133550.0,134383.0,135429.0,137031.0,138650.0,140052.0,141110.0,142404.0,144014.0


In [11]:
# Source column -> database column name, in output order.
# NOTE(review): 'math_archievement' is misspelled, but it becomes a database
# column name; rename it only together with everything that reads the table.
school_columns = {
    'Zip Code': 'zip_code',
    'School Name': 'school_name',
    'English Language Arts Achievement': 'english_achievement',
    'Mathematics Achievement': 'math_archievement',
    'Science Achievement': 'science_achievement',
    'Social Studies Achievement': 'social_studies_achievement',
    'Grade 2017': 'grade_2017',
}

# Look only at the first 45 columns, keep the ones listed above under their
# new names, and drop every school that is missing any of those values.
school_grade_df = (
    merged_data.iloc[:, :45][list(school_columns)]
    .rename(columns=school_columns)
    .dropna()
)
school_grade_df.head(5)

Unnamed: 0,zip_code,school_name,english_achievement,math_archievement,science_achievement,social_studies_achievement,grade_2017
6,32641,ABRAHAM LINCOLN MIDDLE SCHOOL,62.0,61.0,65.0,68.0,B
7,32609,HOWARD W. BISHOP MIDDLE SCHOOL,59.0,60.0,55.0,69.0,B
8,32605,WESTWOOD MIDDLE SCHOOL,55.0,57.0,56.0,69.0,B
9,32609,GAINESVILLE HIGH SCHOOL,58.0,55.0,64.0,75.0,B
12,32640,HAWTHORNE MIDDLE/HIGH SCHOOL,41.0,42.0,39.0,64.0,D


In [12]:
# List the tables currently in the database. Engine.table_names() is
# deprecated in SQLAlchemy 1.4 and removed in 2.0; the inspector API returns
# the same list of names and works on both.
inspect(engine).get_table_names()

['census_data']

In [13]:
# Write the school grades to the "school_grades" table.
# index=False leaves the DataFrame index out of the table.
# NOTE(review): if_exists='append' inserts the rows again each time this cell
# is re-run -- confirm the target table guards against duplicates.
school_grade_df.to_sql(
    name='school_grades',
    con=engine,
    if_exists='append',
    index=False,
)

In [14]:
# Location of the single-family-residence home value CSV
single_family_housing = "data/Zip_Zhvi_SingleFamilyResidence.csv"

# Read the file into a DataFrame and preview the first five rows
sfh_df = pd.read_csv(single_family_housing)
sfh_df.head(5)

Unnamed: 0,RegionID,RegionName,City,State,Metro,CountyName,SizeRank,2009-01,2009-02,2009-03,...,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12,2020-01,2020-02
0,61639,10025,New York,NY,New York-Newark-Jersey City,New York County,1,1553689.0,1526876.0,1518443.0,...,1365272,1364535,1357186,1354644,1343961,1336894,1333066,1334392,1343701,1351552
1,84654,60657,Chicago,IL,Chicago-Naperville-Elgin,Cook County,2,808608.0,800968.0,791874.0,...,967517,966429,967322,967450,967003,966057,965021,963959,964246,966092
2,61637,10023,New York,NY,New York-Newark-Jersey City,New York County,3,,,,...,1506127,1494778,1481698,1470185,1461983,1457776,1454757,1454081,1460325,1465530
3,91982,77494,Katy,TX,Houston-The Woodlands-Sugar Land,Harris County,4,258171.0,258591.0,259182.0,...,335155,335051,335418,335480,335596,335547,335395,335309,334819,334762
4,84616,60614,Chicago,IL,Chicago-Naperville-Elgin,Cook County,5,1111184.0,1097752.0,1087805.0,...,1201189,1200418,1201438,1201133,1199177,1197266,1194983,1193391,1193661,1196524


In [15]:
# Rename "RegionName" to "zip_code" so the key matches the other tables.
# Rebinding instead of inplace=True avoids mutating the frame shown by the
# previous cell and keeps this cell safe to re-run.
sfh_df = sfh_df.rename(columns={"RegionName": "zip_code"})

# Monthly value columns are labelled "YYYY-MM"; all other columns describe
# the region. Selecting months by label rather than by position keeps the
# result correct if the source CSV gains or loses month columns.
month_columns = sfh_df.columns[sfh_df.columns.str.match(r"^\d{4}-\d{2}$")]
# Zero-padded "YYYY-MM" labels sort chronologically as plain strings.
months_before_2017 = [month for month in month_columns if month < "2017-01"]
months_after_2017 = [month for month in month_columns if month > "2017-12"]

# Drop the months outside 2017: first the earlier ones, then the later ones.
clean_df_1 = sfh_df.drop(columns=months_before_2017)
clean_df_2 = clean_df_1.drop(columns=months_after_2017)
clean_df_2.head()

Unnamed: 0,RegionID,zip_code,City,State,Metro,CountyName,SizeRank,2017-01,2017-02,2017-03,2017-04,2017-05,2017-06,2017-07,2017-08,2017-09,2017-10,2017-11,2017-12
0,61639,10025,New York,NY,New York-Newark-Jersey City,New York County,1,1476509.0,1476630.0,1479410.0,1478879.0,1481672.0,1476174.0,1482127.0,1483230.0,1494017.0,1486270.0,1479032.0,1461584.0
1,84654,60657,Chicago,IL,Chicago-Naperville-Elgin,Cook County,2,948850.0,943920.0,947429.0,950081.0,951676.0,950563.0,948773.0,950302.0,953132.0,952897.0,950087.0,950589.0
2,61637,10023,New York,NY,New York-Newark-Jersey City,New York County,3,1630033.0,1608263.0,1591918.0,1583357.0,1581908.0,1579167.0,1581337.0,1585707.0,1589950.0,1579912.0,1568794.0,1554248.0
3,91982,77494,Katy,TX,Houston-The Woodlands-Sugar Land,Harris County,4,329980.0,330687.0,331548.0,331712.0,330742.0,330434.0,330664.0,330748.0,330532.0,330509.0,330513.0,331409.0
4,84616,60614,Chicago,IL,Chicago-Naperville-Elgin,Cook County,5,1170197.0,1168909.0,1176444.0,1182024.0,1186085.0,1186644.0,1184674.0,1186064.0,1190520.0,1193080.0,1193256.0,1196558.0
