In [1]:
# dependencies
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, inspect

# config data saved to config.py - change values as instructed
# from config import postgresUname,postgresPword,postgresHost,postgresDb

ModuleNotFoundError: No module named 'config'

In [None]:
# name of the csv export that holds the house sales
house_prices_file = "all_perth_310121.csv"

# build the path to that file inside the resources folder
resources_dir = "resources"
house_prices_path = os.path.join(resources_dir, house_prices_file)

In [None]:
# read the raw csv into a dataframe
house_prices_df = pd.read_csv(filepath_or_buffer=house_prices_path)

# preview the first five rows
house_prices_df.head(5)

In [None]:
# inspect data types
# Shows the dtype pandas assigned to each column when the csv was read; the
# following cell converts DATE_SOLD with pd.to_datetime.
house_prices_df.dtypes

In [None]:
# parse the sale dates into pandas datetimes and store them back on the frame
sold_dates = pd.to_datetime(house_prices_df["DATE_SOLD"])
house_prices_df["DATE_SOLD"] = sold_dates

# express each sale date as whole seconds elapsed since the unix epoch
unix_epoch = pd.Timestamp("1970-01-01")
one_second = pd.Timedelta('1s')
house_prices_df["DATE_SOLD_UNIX"] = (sold_dates - unix_epoch) // one_second

# display the new column to check the conversion
house_prices_df["DATE_SOLD_UNIX"]

In [None]:
# count the rows whose ADDRESS value has already appeared earlier in the data
house_prices_df["ADDRESS"].duplicated().sum()

In [None]:
# build the sale info table: one row per source row, keyed by sale_id
# (sale_id is the source row label offset by 100001)
sale_info_df = (
    house_prices_df
    .loc[:, ['PRICE', 'DATE_SOLD_UNIX']]
    .rename(columns={'PRICE':'price','DATE_SOLD_UNIX':'date'})
    .assign(sale_id=lambda frame: frame.index + 100001)
    .loc[:, ['sale_id','price','date']]
)

sale_info_df.head()

In [None]:
# prepare location info dataframe
# A house is identified by its (address, suburb) pair, which is the same pair the
# house info merge joins on. De-duplicating on address alone would discard any
# house whose street address also occurs in a different suburb, and the sales of
# that house would then find no match in the merge and be lost from house_info.
# NOTE(review): if the location_info table enforces uniqueness on address only,
# that constraint needs to cover (address, suburb) as well - confirm against the schema.
location_info_df = (
    house_prices_df[['ADDRESS','SUBURB','POSTCODE','LATITUDE','LONGITUDE','CBD_DIST','NEAREST_STN','NEAREST_STN_DIST','NEAREST_SCH','NEAREST_SCH_DIST','NEAREST_SCH_RANK']]
    .rename(columns={'ADDRESS':'address','SUBURB':'suburb','POSTCODE':'postcode','LATITUDE':'latitude','LONGITUDE':'longitude','CBD_DIST':'cbd_dist', 'NEAREST_STN':'nearest_stn','NEAREST_STN_DIST':'nearest_stn_dist','NEAREST_SCH':'nearest_sch','NEAREST_SCH_DIST':'nearest_sch_dist','NEAREST_SCH_RANK':'nearest_sch_rank'})
    # keep the first occurrence of every distinct house
    .drop_duplicates(subset=['address', 'suburb'])
    # drop=True discards the old row labels instead of adding an 'index' column
    .reset_index(drop=True)
)

# house_id is a surrogate key: the fresh 0-based row position offset by 200001
location_info_df['house_id'] = location_info_df.index + 200001

# put the key first and fix the column order for loading
location_info_df = location_info_df[['house_id','address','suburb','postcode','latitude','longitude','cbd_dist','nearest_stn','nearest_stn_dist','nearest_sch','nearest_sch_dist','nearest_sch_rank']]

location_info_df.head()

In [None]:
# build the per-sale house attributes, tagged with the same sale_id numbering
# used for the sale info table (source row label offset by 100001)
house_df = (
    house_prices_df
    .loc[:, ['ADDRESS','SUBURB','BEDROOMS','BATHROOMS','GARAGE','LAND_AREA','FLOOR_AREA','BUILD_YEAR']]
    .rename(columns={'ADDRESS':'address','SUBURB':'suburb','BEDROOMS':'bedrooms','BATHROOMS':'bathrooms','GARAGE':'garage','LAND_AREA':'land_area','FLOOR_AREA':'floor_area','BUILD_YEAR':'build_year'})
    .assign(sale_id=lambda frame: frame.index + 100001)
    .loc[:, ['sale_id','address','suburb','bedrooms','bathrooms','garage','land_area','floor_area','build_year']]
)

# attach house_id by joining onto the location table on address and suburb,
# then keep only the keys and the house attributes
merged_df = house_df.merge(location_info_df, how="right", on=["address", "suburb"])
house_info_df = merged_df[['sale_id','house_id','bedrooms','bathrooms','garage','land_area','floor_area','build_year']]
house_info_df.head()

In [None]:
# connection string using config data
# The config import in the first cell is commented out, so the credentials are
# imported here, where they are first needed. Without this the names below are
# undefined on a fresh kernel and this cell raises NameError.
from config import postgresUname, postgresPword, postgresHost, postgresDb

# Percent-encode the username and password so characters such as "@", ":" or "/"
# are not misread as URL delimiters. str() keeps non-string config values working.
# The host is left untouched because it may carry a ":port" suffix.
safe_uname = quote_plus(str(postgresUname))
safe_pword = quote_plus(str(postgresPword))
connection = f'{safe_uname}:{safe_pword}@{postgresHost}/{postgresDb}'

# create_engine only configures the engine here; nothing is read or written yet
engine = create_engine(f'postgresql://{connection}')

In [None]:
# confirm tables
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API is the supported way to list the tables in the database.
inspect(engine).get_table_names()

In [None]:
# append the sale rows to the sale_info table (dataframe index is not written)
sale_info_df.to_sql(
    name='sale_info',
    con=engine,
    index=False,
    if_exists='append',
)

In [None]:
# append the location rows to the location_info table (dataframe index is not written)
location_info_df.to_sql(
    name='location_info',
    con=engine,
    index=False,
    if_exists='append',
)

In [None]:
# append the house rows to the house_info table (dataframe index is not written)
house_info_df.to_sql(
    name='house_info',
    con=engine,
    index=False,
    if_exists='append',
)