In [1]:
# import dependencies
import pandas as pd
import os
from sqlalchemy import create_engine

In [2]:
# CSV file holding the house price records
house_prices_file = "all_perth_310121.csv"

# relative location of that file inside the "resources" folder
house_prices_path = os.path.join("resources", house_prices_file)

In [3]:
# read the house price CSV into a DataFrame
house_prices_df = pd.read_csv(filepath_or_buffer=house_prices_path)

# preview the first five rows
house_prices_df.head(n=5)

Unnamed: 0,ADDRESS,SUBURB,PRICE,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN,NEAREST_STN_DIST,DATE_SOLD,POSTCODE,LATITUDE,LONGITUDE,NEAREST_SCH,NEAREST_SCH_DIST,NEAREST_SCH_RANK
0,1 Acorn Place,South Lake,565000,4,2,2.0,600,160,2003.0,18300,Cockburn Central Station,1800,09-2018\r,6164,-32.1159,115.84245,LAKELAND SENIOR HIGH SCHOOL,0.828339,
1,1 Addis Way,Wandi,365000,3,2,2.0,351,139,2013.0,26900,Kwinana Station,4900,02-2019\r,6167,-32.19347,115.859553,ATWELL COLLEGE,5.524324,129.0
2,1 Ainsley Court,Camillo,287000,3,1,1.0,719,86,1979.0,22600,Challis Station,1900,06-2015\r,6111,-32.120578,115.993579,KELMSCOTT SENIOR HIGH SCHOOL,1.649178,113.0
3,1 Albert Street,Bellevue,255000,2,1,2.0,651,59,1953.0,17900,Midland Station,3600,07-2018\r,6056,-31.900547,116.038009,SWAN VIEW SENIOR HIGH SCHOOL,1.571401,
4,1 Aman Place,Lockridge,325000,4,1,2.0,466,131,1998.0,11200,Bassendean Station,2000,11-2016\r,6054,-31.88579,115.94778,KIARA COLLEGE,1.514922,


In [4]:
# inspect data types: list the dtype pandas inferred for each column
# (the recorded output shows DATE_SOLD as object, i.e. not yet parsed as a date)
house_prices_df.dtypes

ADDRESS              object
SUBURB               object
PRICE                 int64
BEDROOMS              int64
BATHROOMS             int64
GARAGE              float64
LAND_AREA             int64
FLOOR_AREA            int64
BUILD_YEAR          float64
CBD_DIST              int64
NEAREST_STN          object
NEAREST_STN_DIST      int64
DATE_SOLD            object
POSTCODE              int64
LATITUDE            float64
LONGITUDE           float64
NEAREST_SCH          object
NEAREST_SCH_DIST    float64
NEAREST_SCH_RANK    float64
dtype: object

In [5]:
# transform datatype to date
# The raw DATE_SOLD values are "MM-YYYY" strings, and the previewed rows carry a
# trailing "\r". Strip the whitespace and parse with an explicit format so the
# result does not depend on pandas' format inference. The dtype guard keeps the
# cell safe to re-run once the column has already been converted.
if not pd.api.types.is_datetime64_any_dtype(house_prices_df["DATE_SOLD"]):
    house_prices_df["DATE_SOLD"] = pd.to_datetime(
        house_prices_df["DATE_SOLD"].str.strip(),  # drop stray whitespace such as "\r"
        format="%m-%Y",  # month-year only; the day defaults to the 1st
    )

# transform date to unix: whole seconds elapsed since 1970-01-01
house_prices_df["DATE_SOLD_UNIX"] = (
    house_prices_df["DATE_SOLD"] - pd.Timestamp("1970-01-01")
) // pd.Timedelta("1s")

# confirm dates are unix format
house_prices_df["DATE_SOLD_UNIX"]

0        1535760000
1        1548979200
2        1433116800
3        1530403200
4        1477958400
            ...    
33651    1456790400
33652    1485907200
33653    1485907200
33654    1475280000
33655    1462060800
Name: DATE_SOLD_UNIX, Length: 33656, dtype: int64

In [9]:
# count the rows whose ADDRESS repeats one seen in an earlier row
house_prices_df["ADDRESS"].duplicated().sum()

90

In [18]:
# build the sale info dataframe in one chain:
#   1. keep only the price and the unix sale date
#   2. rename both columns to lower-case names
#   3. derive sale_id from the row index, offset by 100000
sale_info_df = (
    house_prices_df
    .loc[:, ['PRICE', 'DATE_SOLD_UNIX']]
    .rename(columns={'PRICE': 'price_sold', 'DATE_SOLD_UNIX': 'date_sold'})
    .assign(sale_id=lambda frame: frame.index + 100000)
)

# preview the result
sale_info_df.head()

Unnamed: 0,price_sold,date_sold,sale_id
0,565000,1535760000,100000
1,365000,1548979200,100001
2,287000,1433116800,100002
3,255000,1530403200,100003
4,325000,1477958400,100004
