# Extract - Melbourne House Data 

In [1]:
# Imports
import pandas as pd
import numpy as np

In [2]:
# Reading in dataframe from CSV
df = pd.read_csv("Resources/Melbourne_housing_FULL.csv")
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


# EDA AND TRANSFORM - Melbourne House Data 

In [3]:
full_len = len(df)
full_len

34857

In [4]:
# Splitting the date
df[['Day', 'Month', 'Year']] = df["Date"].str.split("/", expand=True,)
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount,Day,Month,Year
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0,3,9,2016
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0,3,12,2016
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0,4,2,2016
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0,4,2,2016
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0,4,3,2017


In [5]:
# List of columns
df.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount', 'Day', 'Month', 'Year'],
      dtype='object')

In [6]:
# Reducing to columns of interest
# 'Landsize'
df = df.loc[:,['Suburb', 'Rooms', 'Type', 'Price', 'Bathroom', 'Car', 'Month', 'Year']]
df.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Bathroom,Car,Month,Year
0,Abbotsford,2,h,,1.0,1.0,9,2016
1,Abbotsford,2,h,1480000.0,1.0,1.0,12,2016
2,Abbotsford,2,h,1035000.0,1.0,0.0,2,2016
3,Abbotsford,3,u,,2.0,1.0,2,2016
4,Abbotsford,3,h,1465000.0,2.0,0.0,3,2017


In [7]:
# Changing column headers to lower case
df.columns = df.columns.str.lower()
df.head()

Unnamed: 0,suburb,rooms,type,price,bathroom,car,month,year
0,Abbotsford,2,h,,1.0,1.0,9,2016
1,Abbotsford,2,h,1480000.0,1.0,1.0,12,2016
2,Abbotsford,2,h,1035000.0,1.0,0.0,2,2016
3,Abbotsford,3,u,,2.0,1.0,2,2016
4,Abbotsford,3,h,1465000.0,2.0,0.0,3,2017


In [8]:
# Investigating columns with null values
df.isnull().sum()

suburb         0
rooms          0
type           0
price       7610
bathroom    8226
car         8728
month          0
year           0
dtype: int64

In [9]:
# Dropping these null values
house_df = df.dropna() 
house_df.head()

Unnamed: 0,suburb,rooms,type,price,bathroom,car,month,year
1,Abbotsford,2,h,1480000.0,1.0,1.0,12,2016
2,Abbotsford,2,h,1035000.0,1.0,0.0,2,2016
4,Abbotsford,3,h,1465000.0,2.0,0.0,3,2017
5,Abbotsford,3,h,850000.0,2.0,1.0,3,2017
6,Abbotsford,4,h,1600000.0,1.0,2.0,6,2016


In [10]:
# Checking the length and that removal of na has worked
final_len = len(house_df)
print(final_len)
print(house_df.isnull().sum())

20423
suburb      0
rooms       0
type        0
price       0
bathroom    0
car         0
month       0
year        0
dtype: int64


In [11]:
difference = full_len - final_len
percent = (difference/full_len)*100
(f"The dataframe has been reduced from {full_len} rows to {final_len} rows, by {round(percent,2)} percent")

'The dataframe has been reduced from 34857 rows to 20423 rows, by 41.41 percent'

In [12]:
# Investigating different types of houses
house_df['type'].unique()

array(['h', 'u', 't'], dtype=object)

In [24]:
df['year'].unique()

array(['2016', '2017', '2018'], dtype=object)

# Extract - VIC Crime Data

In [15]:
crime_vic_df = pd.read_csv("Resources/CrimeSuburbYear.csv")
crime_vic_df.head()

Unnamed: 0,postcode,suburb,lat,lon,Local Government Area,Region,Year,A20,A50,A70,...,F20,F30,F90,Total,A,B,C,D,E,F
0,3000,melbourne,-37.814563,144.970267,Melbourne,Northern Metropolitan,2011,1032,116,99,...,13,36,3,14175,1414,7331,404,3764,1210,52
1,3002,east melbourne,-37.81664,144.987811,Melbourne,Northern Metropolitan,2011,53,12,4,...,0,9,0,753,76,476,32,149,11,9
2,3003,west melbourne,-37.806255,144.941123,Melbourne,Northern Metropolitan,2011,54,9,3,...,2,1,2,633,80,403,32,107,6,5
3,3006,southbank,-37.823258,144.965926,Melbourne,Southern Metropolitan,2011,237,21,14,...,0,14,2,2059,310,1103,60,545,25,16
4,3008,docklands,-37.814719,144.948039,Melbourne,Southern Metropolitan,2011,113,7,8,...,4,6,3,1244,149,641,35,389,17,13


# EDA AND TRANSFORM - VIC Crime Data

In [16]:
# Investigating columns
crime_vic_df.columns

Index(['postcode', 'suburb', 'lat', 'lon', 'Local Government Area', 'Region',
       'Year', 'A20', 'A50', 'A70', 'A80', 'A90', 'B10', 'B20', 'B30', 'B40',
       'B50', 'B60', 'C10', 'C20', 'C30', 'C90', 'D10', 'D20', 'D30', 'D40',
       'E10', 'E20', 'F10', 'F20', 'F30', 'F90', 'Total', 'A', 'B', 'C', 'D',
       'E', 'F'],
      dtype='object')

In [17]:
crime_vic_df["Year"].unique()

array([2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020])

In [19]:
# Selecting columns of interest
crime_vic_df = crime_vic_df.loc[:,['suburb', 'Year', 'Total']]
crime_vic_df.head()

Unnamed: 0,suburb,Year,Total
0,melbourne,2011,14175
1,east melbourne,2011,753
2,west melbourne,2011,633
3,southbank,2011,2059
4,docklands,2011,1244


In [20]:
# Renaming columns
crime_vic_df.columns = 'suburb', 'year', 'total_incidents'
crime_vic_df.head()

Unnamed: 0,suburb,year,total_incidents
0,melbourne,2011,14175
1,east melbourne,2011,753
2,west melbourne,2011,633
3,southbank,2011,2059
4,docklands,2011,1244


2016, 2017, 2018 are the only years in the housing data

In [25]:
# Seperating 2016 incidents 
crime_2016_df = crime_vic_df.loc[(crime_vic_df["year"] == 2016), :]
crime_2016_df.head()

Unnamed: 0,suburb,year,total_incidents
14135,melbourne,2016,15485
14136,east melbourne,2016,818
14137,west melbourne,2016,705
14138,southbank,2016,2197
14139,docklands,2016,1578


In [26]:
crime_2016_df.dtypes

suburb             object
year                int64
total_incidents     int64
dtype: object

In [27]:
# Seperating 2017 incidents 
crime_2017_df = crime_vic_df.loc[(crime_vic_df["year"] == 2017), :]
crime_2017_df.head()

Unnamed: 0,suburb,year,total_incidents
16962,melbourne,2017,15162
16963,east melbourne,2017,666
16964,west melbourne,2017,743
16965,southbank,2017,2309
16966,docklands,2017,1777


In [28]:
# Seperating 2018 incidents 
crime_2018_df = crime_vic_df.loc[(crime_vic_df["year"] == 2018), :]
crime_2018_df.head()

Unnamed: 0,suburb,year,total_incidents
19789,melbourne,2018,14780
19790,east melbourne,2018,782
19791,west melbourne,2018,864
19792,southbank,2018,2476
19793,docklands,2018,2317


Merging 2016 with housing data

In [44]:
house_2018_df = pd.concat([house_df, crime_2018_df], axis=0, ignore_index=True)
house_2018_df

Unnamed: 0,suburb,rooms,type,price,bathroom,car,month,year,total_incidents
0,Abbotsford,2.0,h,1480000.0,1.0,1.0,12,2016,
1,Abbotsford,2.0,h,1035000.0,1.0,0.0,02,2016,
2,Abbotsford,3.0,h,1465000.0,2.0,0.0,03,2017,
3,Abbotsford,3.0,h,850000.0,2.0,1.0,03,2017,
4,Abbotsford,4.0,h,1600000.0,1.0,2.0,06,2016,
...,...,...,...,...,...,...,...,...,...
23245,wattle bank,,,,,,,2018,4.0
23246,wonthaggi,,,,,,,2018,498.0
23247,woolamai,,,,,,,2018,3.0
23248,inverloch,,,,,,,2018,177.0


In [45]:
house_2018_17_df = pd.concat([house_2018_df, crime_2017_df], axis=0, ignore_index=True)
house_2018_17_df

Unnamed: 0,suburb,rooms,type,price,bathroom,car,month,year,total_incidents
0,Abbotsford,2.0,h,1480000.0,1.0,1.0,12,2016,
1,Abbotsford,2.0,h,1035000.0,1.0,0.0,02,2016,
2,Abbotsford,3.0,h,1465000.0,2.0,0.0,03,2017,
3,Abbotsford,3.0,h,850000.0,2.0,1.0,03,2017,
4,Abbotsford,4.0,h,1600000.0,1.0,2.0,06,2016,
...,...,...,...,...,...,...,...,...,...
26072,wattle bank,,,,,,,2017,5.0
26073,wonthaggi,,,,,,,2017,647.0
26074,woolamai,,,,,,,2017,18.0
26075,inverloch,,,,,,,2017,192.0


In [46]:
house_2018_17_16_df = pd.concat([house_2018_17_df, crime_2016_df], axis=0, ignore_index=True)
house_2018_17_16_df

Unnamed: 0,suburb,rooms,type,price,bathroom,car,month,year,total_incidents
0,Abbotsford,2.0,h,1480000.0,1.0,1.0,12,2016,
1,Abbotsford,2.0,h,1035000.0,1.0,0.0,02,2016,
2,Abbotsford,3.0,h,1465000.0,2.0,0.0,03,2017,
3,Abbotsford,3.0,h,850000.0,2.0,1.0,03,2017,
4,Abbotsford,4.0,h,1600000.0,1.0,2.0,06,2016,
...,...,...,...,...,...,...,...,...,...
28899,wattle bank,,,,,,,2016,2.0
28900,wonthaggi,,,,,,,2016,610.0
28901,woolamai,,,,,,,2016,7.0
28902,inverloch,,,,,,,2016,164.0


In [48]:
house_2018_17_16_df = df.dropna() 
house_2018_17_16_df

Unnamed: 0,suburb,rooms,type,price,bathroom,car,month,year
1,Abbotsford,2,h,1480000.0,1.0,1.0,12,2016
2,Abbotsford,2,h,1035000.0,1.0,0.0,02,2016
4,Abbotsford,3,h,1465000.0,2.0,0.0,03,2017
5,Abbotsford,3,h,850000.0,2.0,1.0,03,2017
6,Abbotsford,4,h,1600000.0,1.0,2.0,06,2016
...,...,...,...,...,...,...,...,...
34849,Wollert,3,h,570000.0,2.0,2.0,02,2018
34852,Yarraville,4,h,1480000.0,1.0,3.0,02,2018
34853,Yarraville,2,h,888000.0,2.0,1.0,02,2018
34854,Yarraville,2,t,705000.0,1.0,2.0,02,2018


In [49]:
combined_len = len(house_2018_17_16_df)
print(f"Combined df length: {combined_len}")
print(f"We have lost {final_len - combined_len} rows of housing data when merging the 2 dataframes")

Combined df length: 20423
We have lost 0 rows of housing data when merging the 2 dataframes


In [50]:
# Number of suburbs in housing dataframe
house_subs = house_df['suburb'].nunique()
# Number of suburbs in combined dataframe
combined_subs = house_2018_17_16_df['suburb'].nunique()
print(f"Number of suburbs in combined df: {combined_subs}")
print(f'We have lost a total of {house_subs - combined_subs} suburbs from merging housing data with crime data.')

Number of suburbs in combined df: 338
We have lost a total of 0 suburbs from merging housing data with crime data.


# LOAD / EXPORT TO CSV

In [58]:
# Export final df to csv to create machine learning model
combined_df.to_csv("Resources/house_crime.csv")