In [1]:
import pandas as pd

### Importing housing csv

In [2]:
house_df = pd.read_csv("archive/Melbourne_housing_FULL.csv")
house_df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [4]:
# Checking NaN values
house_df.isna().sum()

Suburb               0
Address              0
Rooms                0
Type                 0
Price             7610
Method               0
SellerG              0
Date                 0
Distance             1
Postcode             1
Bedroom2          8217
Bathroom          8226
Car               8728
Landsize         11810
BuildingArea     21115
YearBuilt        19306
CouncilArea          3
Lattitude         7976
Longtitude        7976
Regionname           3
Propertycount        3
dtype: int64

### Getting rid of the things I actually don't want

In [5]:
# break down date into day month year
# maybe keep month and year
# find day of the week for the 
# 0 - 11 for the month
# use end two numbers for year

# for model to be able to find pattern 
# need to number the features
# might need to scale
# normalise
# min max scaler
#


# check the importance of the features after building the model
# then see which ones we don't need




In [6]:
## actually maybe split into day/month/year first

In [7]:
## need to check for duplicate houses first


In [8]:
# then drop columns

In [9]:
# Determining the columns in the dataframe

house_df.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [10]:
# Checking the difference between landsize and building area. 
# note: most property listings include landsize over building area so would prefer to keep that one.
house_df[['Landsize', 'BuildingArea']].head()

Unnamed: 0,Landsize,BuildingArea
0,126.0,
1,202.0,
2,156.0,79.0
3,0.0,
4,134.0,150.0


In [11]:
# Counting the NaN values in each column
# there are more in building area, so again better to drop

house_df[['Landsize', 'BuildingArea']].isna().sum()

Landsize        11810
BuildingArea    21115
dtype: int64

In [12]:
# Checking the difference between Rooms and Bedroom2. 
# note: the kaggle information explains that both are the number of bedrooms for a property but from different sources

house_df[['Rooms', 'Bedroom2']].head()

Unnamed: 0,Rooms,Bedroom2
0,2,2.0
1,2,2.0
2,2,2.0
3,3,3.0
4,3,3.0


In [13]:
# Counting the NaN values in each column
house_df[['Rooms', 'Bedroom2']].isna().sum()

# The Rooms column has no NaN values whereas the Bedroom2 column has over 8000, so will drop the Bedroom2 column

Rooms          0
Bedroom2    8217
dtype: int64

In [14]:
# Will also be dropping the 'Address', 'Method', 'SellerG','YearBuilt', 'Lattitude', 'Longtitude', 'Propertycount' columns
# These are either irrelevant or they are not available on property listing websites so would not be suitable for trainingte model

house_df = house_df.drop(columns=['Address', 'Method', 'SellerG', 'Bedroom2', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude', 'Propertycount'])
house_df.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Date,Distance,Postcode,Bathroom,Car,Landsize,CouncilArea,Regionname
0,Abbotsford,2,h,,3/09/2016,2.5,3067.0,1.0,1.0,126.0,Yarra City Council,Northern Metropolitan
1,Abbotsford,2,h,1480000.0,3/12/2016,2.5,3067.0,1.0,1.0,202.0,Yarra City Council,Northern Metropolitan
2,Abbotsford,2,h,1035000.0,4/02/2016,2.5,3067.0,1.0,0.0,156.0,Yarra City Council,Northern Metropolitan
3,Abbotsford,3,u,,4/02/2016,2.5,3067.0,2.0,1.0,0.0,Yarra City Council,Northern Metropolitan
4,Abbotsford,3,h,1465000.0,4/03/2017,2.5,3067.0,2.0,0.0,134.0,Yarra City Council,Northern Metropolitan


In [15]:
# Keeping CouncilArea and Regionname for now as these may be needed when joining to crime data

In [16]:
# could also drop 'Distance' because this will require calculation to get it from domain address
# seems in this table it is based on suburb
# so maybe not saying anything different

# same with postcode
# is this needed if I have suburb??

In [17]:
# Checking NaN values for each column
house_df.isna().sum()

Suburb             0
Rooms              0
Type               0
Price           7610
Date               0
Distance           1
Postcode           1
Bathroom        8226
Car             8728
Landsize       11810
CouncilArea        3
Regionname         3
dtype: int64

### Changing 'Type' column to numerical

In [18]:
house_df["Type"].unique()

# house is set as house, unit, townhouse

array(['h', 'u', 't'], dtype=object)

In [19]:
# replacing the values with numbers

house_df = house_df.replace({"Type": {'h':0, 'u': 1,'t':2 }})

In [20]:
house_df.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Date,Distance,Postcode,Bathroom,Car,Landsize,CouncilArea,Regionname
0,Abbotsford,2,0,,3/09/2016,2.5,3067.0,1.0,1.0,126.0,Yarra City Council,Northern Metropolitan
1,Abbotsford,2,0,1480000.0,3/12/2016,2.5,3067.0,1.0,1.0,202.0,Yarra City Council,Northern Metropolitan
2,Abbotsford,2,0,1035000.0,4/02/2016,2.5,3067.0,1.0,0.0,156.0,Yarra City Council,Northern Metropolitan
3,Abbotsford,3,1,,4/02/2016,2.5,3067.0,2.0,1.0,0.0,Yarra City Council,Northern Metropolitan
4,Abbotsford,3,0,1465000.0,4/03/2017,2.5,3067.0,2.0,0.0,134.0,Yarra City Council,Northern Metropolitan


# breaking up date into three columns

In [21]:
import datetime
#pandas datetimeindex docs: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DatetimeIndex.html

In [22]:
# Adding a 'Year' column
house_df['Year'] = pd.DatetimeIndex(house_df['Date']).year
house_df.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Date,Distance,Postcode,Bathroom,Car,Landsize,CouncilArea,Regionname,Year
0,Abbotsford,2,0,,3/09/2016,2.5,3067.0,1.0,1.0,126.0,Yarra City Council,Northern Metropolitan,2016
1,Abbotsford,2,0,1480000.0,3/12/2016,2.5,3067.0,1.0,1.0,202.0,Yarra City Council,Northern Metropolitan,2016
2,Abbotsford,2,0,1035000.0,4/02/2016,2.5,3067.0,1.0,0.0,156.0,Yarra City Council,Northern Metropolitan,2016
3,Abbotsford,3,1,,4/02/2016,2.5,3067.0,2.0,1.0,0.0,Yarra City Council,Northern Metropolitan,2016
4,Abbotsford,3,0,1465000.0,4/03/2017,2.5,3067.0,2.0,0.0,134.0,Yarra City Council,Northern Metropolitan,2017


In [23]:
house_df["Year"].unique()

array([2016, 2017, 2018])

In [24]:
# Adding a 'Month' column
house_df['Month'] = pd.DatetimeIndex(house_df['Date']).month
house_df.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Date,Distance,Postcode,Bathroom,Car,Landsize,CouncilArea,Regionname,Year,Month
0,Abbotsford,2,0,,3/09/2016,2.5,3067.0,1.0,1.0,126.0,Yarra City Council,Northern Metropolitan,2016,3
1,Abbotsford,2,0,1480000.0,3/12/2016,2.5,3067.0,1.0,1.0,202.0,Yarra City Council,Northern Metropolitan,2016,3
2,Abbotsford,2,0,1035000.0,4/02/2016,2.5,3067.0,1.0,0.0,156.0,Yarra City Council,Northern Metropolitan,2016,4
3,Abbotsford,3,1,,4/02/2016,2.5,3067.0,2.0,1.0,0.0,Yarra City Council,Northern Metropolitan,2016,4
4,Abbotsford,3,0,1465000.0,4/03/2017,2.5,3067.0,2.0,0.0,134.0,Yarra City Council,Northern Metropolitan,2017,4


In [25]:
house_df["Month"].dtype

dtype('int64')

In [26]:
# subtracting one from each month so that they start with Jan as 0, all the way through to december as 11

house_df["Month"] = house_df["Month"].sub(1)
house_df.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Date,Distance,Postcode,Bathroom,Car,Landsize,CouncilArea,Regionname,Year,Month
0,Abbotsford,2,0,,3/09/2016,2.5,3067.0,1.0,1.0,126.0,Yarra City Council,Northern Metropolitan,2016,2
1,Abbotsford,2,0,1480000.0,3/12/2016,2.5,3067.0,1.0,1.0,202.0,Yarra City Council,Northern Metropolitan,2016,2
2,Abbotsford,2,0,1035000.0,4/02/2016,2.5,3067.0,1.0,0.0,156.0,Yarra City Council,Northern Metropolitan,2016,3
3,Abbotsford,3,1,,4/02/2016,2.5,3067.0,2.0,1.0,0.0,Yarra City Council,Northern Metropolitan,2016,3
4,Abbotsford,3,0,1465000.0,4/03/2017,2.5,3067.0,2.0,0.0,134.0,Yarra City Council,Northern Metropolitan,2017,3


In [27]:
# Adding a 'Day' column
house_df['Day'] = pd.DatetimeIndex(house_df['Date']).day
house_df.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Date,Distance,Postcode,Bathroom,Car,Landsize,CouncilArea,Regionname,Year,Month,Day
0,Abbotsford,2,0,,3/09/2016,2.5,3067.0,1.0,1.0,126.0,Yarra City Council,Northern Metropolitan,2016,2,9
1,Abbotsford,2,0,1480000.0,3/12/2016,2.5,3067.0,1.0,1.0,202.0,Yarra City Council,Northern Metropolitan,2016,2,12
2,Abbotsford,2,0,1035000.0,4/02/2016,2.5,3067.0,1.0,0.0,156.0,Yarra City Council,Northern Metropolitan,2016,3,2
3,Abbotsford,3,1,,4/02/2016,2.5,3067.0,2.0,1.0,0.0,Yarra City Council,Northern Metropolitan,2016,3,2
4,Abbotsford,3,0,1465000.0,4/03/2017,2.5,3067.0,2.0,0.0,134.0,Yarra City Council,Northern Metropolitan,2017,3,3


In [28]:
house_suburbs = house_df["Suburb"].unique()
len(house_suburbs)

351

Now would drop columns 

In [None]:
##house_no_date_df = house_df(columns=['Date'])

In [None]:
# Drop the null columns where all values are null
house_df = house_df.dropna(axis='columns', how='all')
# Drop the null rows
house_df = house_df.dropna()

### Crime Data

In [29]:
crime_df = pd.read_csv("suburbCrimeDataClean.csv")
crime_df.head()

Unnamed: 0,lga_name,postcode,suburb,crime_id,crime_name,average_incident
0,Banyule,3079,ivanhoe,A20,Assault and related offences,43.6
1,Banyule,3079,ivanhoe,A50,Robbery,3.0
2,Banyule,3079,ivanhoe,A70,"Stalking, harassment and threatening behaviour",8.1
3,Banyule,3079,ivanhoe,A80,Dangerous and negligent acts endangering people,3.333333
4,Banyule,3079,ivanhoe,B10,Arson,3.4


In [30]:
crime_suburbs = crime_df["suburb"].unique()

In [31]:
set(house_suburbs) == set(crime_suburbs)

False

In [32]:
# so they don't ahve the same suburbs

In [33]:
sorted(house_suburbs) == sorted(crime_suburbs)

False

In [34]:
from collections import Counter
Counter(house_suburbs) == Counter(crime_suburbs)

False

In [35]:
house_suburbs.count

AttributeError: 'numpy.ndarray' object has no attribute 'count'

In [36]:
house_suburbs = list(house_suburbs)

In [37]:
crime_suburbs = list(crime_suburbs)

In [38]:
set(house_suburbs) == set(crime_suburbs)

False

In [39]:
sorted(house_suburbs) == sorted(crime_suburbs)

False

In [40]:
from collections import Counter
Counter(house_suburbs) == Counter(crime_suburbs)

False

In [41]:
len(house_suburbs)

351

In [42]:
len(crime_suburbs)

313

In [43]:
## is this data set just melbourne??

In [44]:
## melbourne has 321 suburbs apparently

In [None]:
crime_df = crime_df.groupby(by="lga_name").mean()

#### Second one

In [45]:
# really not sure what this second one is saying though
# like what are the columns

In [46]:
crime_df_2 = pd.read_csv("CrimeSuburbYear.csv")
crime_df_2.head()

Unnamed: 0,postcode,suburb,lat,lon,Local Government Area,Region,Year,A20,A50,A70,...,F20,F30,F90,Total,A,B,C,D,E,F
0,3000,melbourne,-37.814563,144.970267,Melbourne,Northern Metropolitan,2011,1032,116,99,...,13,36,3,14175,1414,7331,404,3764,1210,52
1,3002,east melbourne,-37.81664,144.987811,Melbourne,Northern Metropolitan,2011,53,12,4,...,0,9,0,753,76,476,32,149,11,9
2,3003,west melbourne,-37.806255,144.941123,Melbourne,Northern Metropolitan,2011,54,9,3,...,2,1,2,633,80,403,32,107,6,5
3,3006,southbank,-37.823258,144.965926,Melbourne,Southern Metropolitan,2011,237,21,14,...,0,14,2,2059,310,1103,60,545,25,16
4,3008,docklands,-37.814719,144.948039,Melbourne,Southern Metropolitan,2011,113,7,8,...,4,6,3,1244,149,641,35,389,17,13


In [47]:
crime_df_2

Unnamed: 0,postcode,suburb,lat,lon,Local Government Area,Region,Year,A20,A50,A70,...,F20,F30,F90,Total,A,B,C,D,E,F
0,3000,melbourne,-37.814563,144.970267,Melbourne,Northern Metropolitan,2011,1032,116,99,...,13,36,3,14175,1414,7331,404,3764,1210,52
1,3002,east melbourne,-37.816640,144.987811,Melbourne,Northern Metropolitan,2011,53,12,4,...,0,9,0,753,76,476,32,149,11,9
2,3003,west melbourne,-37.806255,144.941123,Melbourne,Northern Metropolitan,2011,54,9,3,...,2,1,2,633,80,403,32,107,6,5
3,3006,southbank,-37.823258,144.965926,Melbourne,Southern Metropolitan,2011,237,21,14,...,0,14,2,2059,310,1103,60,545,25,16
4,3008,docklands,-37.814719,144.948039,Melbourne,Southern Metropolitan,2011,113,7,8,...,4,6,3,1244,149,641,35,389,17,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28265,3995,wattle bank,-38.576309,145.700931,Bass Coast,Eastern Victoria,2020,0,0,0,...,0,0,0,4,0,3,0,0,1,0
28266,3995,wonthaggi,-38.606312,145.588583,Bass Coast,Eastern Victoria,2020,73,0,19,...,0,1,6,509,112,223,23,23,121,7
28267,3995,woolamai,-38.498651,145.520561,Bass Coast,Eastern Victoria,2020,3,0,1,...,0,0,0,15,4,8,0,0,3,0
28268,3996,inverloch,-38.632958,145.729641,Bass Coast,Eastern Victoria,2020,26,1,4,...,0,0,5,177,41,87,9,7,28,5


In [48]:
crime_df_2.columns

Index(['postcode', 'suburb', 'lat', 'lon', 'Local Government Area', 'Region',
       'Year', 'A20', 'A50', 'A70', 'A80', 'A90', 'B10', 'B20', 'B30', 'B40',
       'B50', 'B60', 'C10', 'C20', 'C30', 'C90', 'D10', 'D20', 'D30', 'D40',
       'E10', 'E20', 'F10', 'F20', 'F30', 'F90', 'Total', 'A', 'B', 'C', 'D',
       'E', 'F'],
      dtype='object')

In [49]:
crime_df_2["suburb"].value_counts()

brimboal            10
north wangaratta    10
harcourt            10
colac west          10
mount emu           10
                    ..
warrandyte south    10
balliang east       10
yambuk              10
moorooduc           10
banyena             10
Name: suburb, Length: 2827, dtype: int64

In [50]:
crime_df_2["postcode"].value_counts()

3352    560
3678    340
3401    330
3551    320
3691    310
       ... 
3273     10
3705     10
3753     10
3785     10
3025     10
Name: postcode, Length: 675, dtype: int64

In [None]:
## this has 2827 suburbs what??
## THIS MUST BE FOR ALL OF VICTORIA NOT JUST MELBOURNE
# didn't mean the caps

In [51]:
crime_df_2.head()

Unnamed: 0,postcode,suburb,lat,lon,Local Government Area,Region,Year,A20,A50,A70,...,F20,F30,F90,Total,A,B,C,D,E,F
0,3000,melbourne,-37.814563,144.970267,Melbourne,Northern Metropolitan,2011,1032,116,99,...,13,36,3,14175,1414,7331,404,3764,1210,52
1,3002,east melbourne,-37.81664,144.987811,Melbourne,Northern Metropolitan,2011,53,12,4,...,0,9,0,753,76,476,32,149,11,9
2,3003,west melbourne,-37.806255,144.941123,Melbourne,Northern Metropolitan,2011,54,9,3,...,2,1,2,633,80,403,32,107,6,5
3,3006,southbank,-37.823258,144.965926,Melbourne,Southern Metropolitan,2011,237,21,14,...,0,14,2,2059,310,1103,60,545,25,16
4,3008,docklands,-37.814719,144.948039,Melbourne,Southern Metropolitan,2011,113,7,8,...,4,6,3,1244,149,641,35,389,17,13


In [52]:
crime_df_2["Region"].unique()

array(['Northern Metropolitan', 'Southern Metropolitan',
       'Western Metropolitan', 'Eastern Metropolitan',
       'Northern Victoria', 'Eastern Victoria',
       'South-Eastern Metropolitan', 'Western Victoria'], dtype=object)

In [53]:
crime_df_2["Local Government Area"].unique()

array(['Melbourne', 'Maribyrnong', 'Brimbank', 'Hobsons Bay', 'Melton',
       'Wyndham', 'Moonee Valley', 'Hume', 'Moreland', 'Darebin', 'Yarra',
       'Whittlesea', 'Banyule', 'Nillumbik', 'Boroondara', 'Manningham',
       'Yarra Ranges', 'Monash', 'Whitehorse', 'Maroondah', 'Stonnington',
       'Glen Eira', 'Knox', 'Cardinia', 'Kingston', 'Greater Dandenong',
       'Casey', 'Port Phillip', 'Bayside', 'Frankston', 'Greater Geelong',
       'Surf Coast', 'Colac Otway', 'Corangamite', 'Moyne', 'Glenelg',
       'Ararat', 'Southern Grampians', 'West Wimmera', 'Golden Plains',
       'Moorabool', 'Ballarat', 'Hepburn', 'Central Goldfields',
       'Pyrenees', 'Northern Grampians', 'Horsham', 'Yarriambiack',
       'Hindmarsh', 'Macedon Ranges', 'Mount Alexander', 'Buloke',
       'Mildura', 'Greater Bendigo', 'Loddon', 'Mitchell', 'Gannawarra',
       'Swan Hill', 'Campaspe', 'Strathbogie', 'Greater Shepparton',
       'Moira', 'Murrindindi', 'Benalla', 'Wangaratta', 'Indigo',
      

In [54]:
# Specifying wanted columns
crime_df_2_updated = crime_df_2[['suburb', 'Year', 'Total']]
crime_df_2_updated.head()

Unnamed: 0,suburb,Year,Total
0,melbourne,2011,14175
1,east melbourne,2011,753
2,west melbourne,2011,633
3,southbank,2011,2059
4,docklands,2011,1244
...,...,...,...
28265,wattle bank,2020,4
28266,wonthaggi,2020,509
28267,woolamai,2020,15
28268,inverloch,2020,177


In [56]:
##crime_df_group_by = crime_df_2.groupby(by="suburb").mean()

Doing a loc for each year, that way will have suburb crime data for each year needed.

In [57]:
crime_2016_df = crime_df_2_updated.loc[(crime_df_2_updated["Year"] == 2016), :]
crime_2016_df

Unnamed: 0,suburb,Year,Total
14135,melbourne,2016,15485
14136,east melbourne,2016,818
14137,west melbourne,2016,705
14138,southbank,2016,2197
14139,docklands,2016,1578
...,...,...,...
16957,wattle bank,2016,2
16958,wonthaggi,2016,610
16959,woolamai,2016,7
16960,inverloch,2016,164


In [58]:
# Updating column name
crime_2016_df = crime_2016_df.rename(columns = {"Total" : "Total: 2016"})
crime_2016_df

Unnamed: 0,suburb,Year,Total: 2016
14135,melbourne,2016,15485
14136,east melbourne,2016,818
14137,west melbourne,2016,705
14138,southbank,2016,2197
14139,docklands,2016,1578
...,...,...,...
16957,wattle bank,2016,2
16958,wonthaggi,2016,610
16959,woolamai,2016,7
16960,inverloch,2016,164


In [59]:
# Removing year column
crime_2016_df = crime_2016_df[["suburb", "Total: 2016"]]
crime_2016_df.head()

Unnamed: 0,suburb,Total: 2016
14135,melbourne,15485
14136,east melbourne,818
14137,west melbourne,705
14138,southbank,2197
14139,docklands,1578


In [60]:
crime_2017_df = crime_df_2_updated.loc[(crime_df_2_updated["Year"] == 2017), :]
crime_2017_df

Unnamed: 0,suburb,Year,Total
16962,melbourne,2017,15162
16963,east melbourne,2017,666
16964,west melbourne,2017,743
16965,southbank,2017,2309
16966,docklands,2017,1777
...,...,...,...
19784,wattle bank,2017,5
19785,wonthaggi,2017,647
19786,woolamai,2017,18
19787,inverloch,2017,192


In [61]:
# Updating column name
crime_2017_df = crime_2017_df.rename(columns = {"Total" : "Total: 2017"})
crime_2017_df.head()

Unnamed: 0,suburb,Year,Total: 2017
16962,melbourne,2017,15162
16963,east melbourne,2017,666
16964,west melbourne,2017,743
16965,southbank,2017,2309
16966,docklands,2017,1777


In [62]:
# Removing year column
crime_2017_df = crime_2017_df[["suburb", "Total: 2017"]]
crime_2017_df.head()

Unnamed: 0,suburb,Total: 2017
16962,melbourne,15162
16963,east melbourne,666
16964,west melbourne,743
16965,southbank,2309
16966,docklands,1777


In [63]:
crime_2018_df = crime_df_2_updated.loc[(crime_df_2_updated["Year"] == 2018), :]
crime_2018_df

Unnamed: 0,suburb,Year,Total
19789,melbourne,2018,14780
19790,east melbourne,2018,782
19791,west melbourne,2018,864
19792,southbank,2018,2476
19793,docklands,2018,2317
...,...,...,...
22611,wattle bank,2018,4
22612,wonthaggi,2018,498
22613,woolamai,2018,3
22614,inverloch,2018,177


In [64]:
# Updating column name
crime_2018_df = crime_2018_df.rename(columns = {"Total" : "Total: 2018"})
crime_2018_df.head()

Unnamed: 0,suburb,Year,Total: 2018
19789,melbourne,2018,14780
19790,east melbourne,2018,782
19791,west melbourne,2018,864
19792,southbank,2018,2476
19793,docklands,2018,2317


In [65]:
# Removing year column
crime_2018_df = crime_2018_df[["suburb", "Total: 2018"]]
crime_2018_df.head()

Unnamed: 0,suburb,Total: 2018
19789,melbourne,14780
19790,east melbourne,782
19791,west melbourne,864
19792,southbank,2476
19793,docklands,2317


In [66]:
crime_2020_df = crime_df_2_updated.loc[(crime_df_2_updated["Year"] == 2020), :]
crime_2020_df

Unnamed: 0,suburb,Year,Total
25443,melbourne,2020,14174
25444,east melbourne,2020,907
25445,west melbourne,2020,1082
25446,southbank,2020,2794
25447,docklands,2020,2439
...,...,...,...
28265,wattle bank,2020,4
28266,wonthaggi,2020,509
28267,woolamai,2020,15
28268,inverloch,2020,177


In [67]:
# Updating column name
crime_2020_df = crime_2020_df.rename(columns = {"Total" : "Total: 2020"})
crime_2020_df.head()

Unnamed: 0,suburb,Year,Total: 2020
25443,melbourne,2020,14174
25444,east melbourne,2020,907
25445,west melbourne,2020,1082
25446,southbank,2020,2794
25447,docklands,2020,2439


In [68]:
# Removing year column
crime_2020_df = crime_2020_df[["suburb", "Total: 2020"]]
crime_2020_df.head()

Unnamed: 0,suburb,Total: 2020
25443,melbourne,14174
25444,east melbourne,907
25445,west melbourne,1082
25446,southbank,2794
25447,docklands,2439


In [99]:
print(f"max: {crime_2020_df['Total: 2020'].max()}")
print(f"min: {crime_2020_df['Total: 2020'].min()}")

max: 14174
min: 0


### Merging 2016, 2017, 2018

In [69]:
# They all have 2827 rows so they really should be able to be merged on the suburb name

In [None]:
crime_2016_df
crime_2017_df
crime_2018_df

In [70]:
crime_16_17_df = crime_2016_df.merge(crime_2017_df)

In [71]:
crime_16_17_df.head()

Unnamed: 0,suburb,Total: 2016,Total: 2017
0,melbourne,15485,15162
1,east melbourne,818,666
2,west melbourne,705,743
3,southbank,2197,2309
4,docklands,1578,1777


In [72]:
crime_16_17_18_df = crime_16_17_df.merge(crime_2018_df)
crime_16_17_18_df

Unnamed: 0,suburb,Total: 2016,Total: 2017,Total: 2018
0,melbourne,15485,15162,14780
1,east melbourne,818,666,782
2,west melbourne,705,743,864
3,southbank,2197,2309,2476
4,docklands,1578,1777,2317
...,...,...,...,...
2822,wattle bank,2,5,4
2823,wonthaggi,610,647,498
2824,woolamai,7,18,3
2825,inverloch,164,192,177


In [73]:
crime_16_17_18_df.head()

Unnamed: 0,suburb,Total: 2016,Total: 2017,Total: 2018
0,melbourne,15485,15162,14780
1,east melbourne,818,666,782
2,west melbourne,705,743,864
3,southbank,2197,2309,2476
4,docklands,1578,1777,2317


Now will take average of the three total columns

In [74]:
crime_16_17_18_df['Crime Average'] = ((crime_16_17_18_df['Total: 2016'] + crime_16_17_18_df['Total: 2017'] + crime_16_17_18_df['Total: 2018'])/3)
crime_16_17_18_df.head()

Unnamed: 0,suburb,Total: 2016,Total: 2017,Total: 2018,Crime Average
0,melbourne,15485,15162,14780,15142.333333
1,east melbourne,818,666,782,755.333333
2,west melbourne,705,743,864,770.666667
3,southbank,2197,2309,2476,2327.333333
4,docklands,1578,1777,2317,1890.666667


In [75]:
crime_round_df = crime_16_17_18_df.round({'Crime Average':0})
crime_round_df.head()

Unnamed: 0,suburb,Total: 2016,Total: 2017,Total: 2018,Crime Average
0,melbourne,15485,15162,14780,15142.0
1,east melbourne,818,666,782,755.0
2,west melbourne,705,743,864,771.0
3,southbank,2197,2309,2476,2327.0
4,docklands,1578,1777,2317,1891.0


In [76]:
average_crime_df = crime_round_df[['suburb', 'Crime Average']]
average_crime_df.head()

Unnamed: 0,suburb,Crime Average
0,melbourne,15142.0
1,east melbourne,755.0
2,west melbourne,771.0
3,southbank,2327.0
4,docklands,1891.0


In [77]:
# renaming suburb
average_crime_df = average_crime_df.rename(columns = {"suburb" : "Suburb"})
average_crime_df.head()


Unnamed: 0,Suburb,Crime Average
0,melbourne,15142.0
1,east melbourne,755.0
2,west melbourne,771.0
3,southbank,2327.0
4,docklands,1891.0


### Joining the house data and crime data

In [78]:
# previewing House df
house_df.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Date,Distance,Postcode,Bathroom,Car,Landsize,CouncilArea,Regionname,Year,Month,Day
0,Abbotsford,2,0,,3/09/2016,2.5,3067.0,1.0,1.0,126.0,Yarra City Council,Northern Metropolitan,2016,2,9
1,Abbotsford,2,0,1480000.0,3/12/2016,2.5,3067.0,1.0,1.0,202.0,Yarra City Council,Northern Metropolitan,2016,2,12
2,Abbotsford,2,0,1035000.0,4/02/2016,2.5,3067.0,1.0,0.0,156.0,Yarra City Council,Northern Metropolitan,2016,3,2
3,Abbotsford,3,1,,4/02/2016,2.5,3067.0,2.0,1.0,0.0,Yarra City Council,Northern Metropolitan,2016,3,2
4,Abbotsford,3,0,1465000.0,4/03/2017,2.5,3067.0,2.0,0.0,134.0,Yarra City Council,Northern Metropolitan,2017,3,3


In [79]:
average_crime_df.head()

Unnamed: 0,Suburb,Crime Average
0,melbourne,15142.0
1,east melbourne,755.0
2,west melbourne,771.0
3,southbank,2327.0
4,docklands,1891.0


In [80]:
average_crime_df["Suburb"].value_counts()

brimboal           1
enfield            1
watsons creek      1
hawthorn           1
myrrhee            1
                  ..
newborough         1
darriman           1
fyansford          1
cochranes creek    1
banyena            1
Name: Suburb, Length: 2827, dtype: int64

In [81]:
# No sure that this will work
# but hoping that the crime average will add to each column with that suburb in the house data

In [82]:
combined_df = house_df.merge(average_crime_df, how='left', on='Suburb')
combined_df

Unnamed: 0,Suburb,Rooms,Type,Price,Date,Distance,Postcode,Bathroom,Car,Landsize,CouncilArea,Regionname,Year,Month,Day,Crime Average
0,Abbotsford,2,0,,3/09/2016,2.5,3067.0,1.0,1.0,126.0,Yarra City Council,Northern Metropolitan,2016,2,9,
1,Abbotsford,2,0,1480000.0,3/12/2016,2.5,3067.0,1.0,1.0,202.0,Yarra City Council,Northern Metropolitan,2016,2,12,
2,Abbotsford,2,0,1035000.0,4/02/2016,2.5,3067.0,1.0,0.0,156.0,Yarra City Council,Northern Metropolitan,2016,3,2,
3,Abbotsford,3,1,,4/02/2016,2.5,3067.0,2.0,1.0,0.0,Yarra City Council,Northern Metropolitan,2016,3,2,
4,Abbotsford,3,0,1465000.0,4/03/2017,2.5,3067.0,2.0,0.0,134.0,Yarra City Council,Northern Metropolitan,2017,3,3,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34852,Yarraville,4,0,1480000.0,24/02/2018,6.3,3013.0,1.0,3.0,593.0,Maribyrnong City Council,Western Metropolitan,2018,1,24,
34853,Yarraville,2,0,888000.0,24/02/2018,6.3,3013.0,2.0,1.0,98.0,Maribyrnong City Council,Western Metropolitan,2018,1,24,
34854,Yarraville,2,2,705000.0,24/02/2018,6.3,3013.0,1.0,2.0,220.0,Maribyrnong City Council,Western Metropolitan,2018,1,24,
34855,Yarraville,3,0,1140000.0,24/02/2018,6.3,3013.0,,,,Maribyrnong City Council,Western Metropolitan,2018,1,24,


In [83]:
# so that didn't work



In [84]:
# Checking NaN values
combined_df.isna().sum()

Suburb               0
Rooms                0
Type                 0
Price             7610
Date                 0
Distance             1
Postcode             1
Bathroom          8226
Car               8728
Landsize         11810
CouncilArea          3
Regionname           3
Year                 0
Month                0
Day                  0
Crime Average    34855
dtype: int64

In [85]:
combined_2_df = house_df.merge(average_crime_df)
combined_2_df

Unnamed: 0,Suburb,Rooms,Type,Price,Date,Distance,Postcode,Bathroom,Car,Landsize,CouncilArea,Regionname,Year,Month,Day,Crime Average
0,croydon,3,0,730000.0,26/08/2017,23.0,3136.0,,,,Maroondah City Council,Eastern Metropolitan,2017,7,26,1364.0
1,viewbank,3,2,885000.0,28/10/2017,8.9,3084.0,,,,Banyule City Council,Eastern Metropolitan,2017,9,28,170.0


In [86]:
# OMG THEY HAVE A CAPITAL LETTER!!!

In [87]:
house_df['Suburb'] = house_df['Suburb'].str.lower()
house_df.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Date,Distance,Postcode,Bathroom,Car,Landsize,CouncilArea,Regionname,Year,Month,Day
0,abbotsford,2,0,,3/09/2016,2.5,3067.0,1.0,1.0,126.0,Yarra City Council,Northern Metropolitan,2016,2,9
1,abbotsford,2,0,1480000.0,3/12/2016,2.5,3067.0,1.0,1.0,202.0,Yarra City Council,Northern Metropolitan,2016,2,12
2,abbotsford,2,0,1035000.0,4/02/2016,2.5,3067.0,1.0,0.0,156.0,Yarra City Council,Northern Metropolitan,2016,3,2
3,abbotsford,3,1,,4/02/2016,2.5,3067.0,2.0,1.0,0.0,Yarra City Council,Northern Metropolitan,2016,3,2
4,abbotsford,3,0,1465000.0,4/03/2017,2.5,3067.0,2.0,0.0,134.0,Yarra City Council,Northern Metropolitan,2017,3,3


In [None]:
#trying again

In [88]:
combined_2_df = house_df.merge(average_crime_df, how='left', on='Suburb')
combined_2_df

Unnamed: 0,Suburb,Rooms,Type,Price,Date,Distance,Postcode,Bathroom,Car,Landsize,CouncilArea,Regionname,Year,Month,Day,Crime Average
0,abbotsford,2,0,,3/09/2016,2.5,3067.0,1.0,1.0,126.0,Yarra City Council,Northern Metropolitan,2016,2,9,1113.0
1,abbotsford,2,0,1480000.0,3/12/2016,2.5,3067.0,1.0,1.0,202.0,Yarra City Council,Northern Metropolitan,2016,2,12,1113.0
2,abbotsford,2,0,1035000.0,4/02/2016,2.5,3067.0,1.0,0.0,156.0,Yarra City Council,Northern Metropolitan,2016,3,2,1113.0
3,abbotsford,3,1,,4/02/2016,2.5,3067.0,2.0,1.0,0.0,Yarra City Council,Northern Metropolitan,2016,3,2,1113.0
4,abbotsford,3,0,1465000.0,4/03/2017,2.5,3067.0,2.0,0.0,134.0,Yarra City Council,Northern Metropolitan,2017,3,3,1113.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34852,yarraville,4,0,1480000.0,24/02/2018,6.3,3013.0,1.0,3.0,593.0,Maribyrnong City Council,Western Metropolitan,2018,1,24,716.0
34853,yarraville,2,0,888000.0,24/02/2018,6.3,3013.0,2.0,1.0,98.0,Maribyrnong City Council,Western Metropolitan,2018,1,24,716.0
34854,yarraville,2,2,705000.0,24/02/2018,6.3,3013.0,1.0,2.0,220.0,Maribyrnong City Council,Western Metropolitan,2018,1,24,716.0
34855,yarraville,3,0,1140000.0,24/02/2018,6.3,3013.0,,,,Maribyrnong City Council,Western Metropolitan,2018,1,24,716.0


In [None]:
# hallelujah

In [89]:
# Dropping unwanted columns
# will add these to earlier drops later

final_df = combined_2_df.drop(columns=['Date', 'Distance', 'CouncilArea', 'Regionname'])
final_df.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Postcode,Bathroom,Car,Landsize,Year,Month,Day,Crime Average
0,abbotsford,2,0,,3067.0,1.0,1.0,126.0,2016,2,9,1113.0
1,abbotsford,2,0,1480000.0,3067.0,1.0,1.0,202.0,2016,2,12,1113.0
2,abbotsford,2,0,1035000.0,3067.0,1.0,0.0,156.0,2016,3,2,1113.0
3,abbotsford,3,1,,3067.0,2.0,1.0,0.0,2016,3,2,1113.0
4,abbotsford,3,0,1465000.0,3067.0,2.0,0.0,134.0,2017,3,3,1113.0


In [90]:
# Checking NaN values
final_df.isna().sum()

Suburb               0
Rooms                0
Type                 0
Price             7610
Postcode             1
Bathroom          8226
Car               8728
Landsize         11810
Year                 0
Month                0
Day                  0
Crime Average      513
dtype: int64

In [92]:
# Drop the null rows
finalised_df = final_df.dropna()
finalised_df

Unnamed: 0,Suburb,Rooms,Type,Price,Postcode,Bathroom,Car,Landsize,Year,Month,Day,Crime Average
1,abbotsford,2,0,1480000.0,3067.0,1.0,1.0,202.0,2016,2,12,1113.0
2,abbotsford,2,0,1035000.0,3067.0,1.0,0.0,156.0,2016,3,2,1113.0
4,abbotsford,3,0,1465000.0,3067.0,2.0,0.0,134.0,2017,3,3,1113.0
5,abbotsford,3,0,850000.0,3067.0,2.0,1.0,94.0,2017,3,3,1113.0
6,abbotsford,4,0,1600000.0,3067.0,1.0,2.0,120.0,2016,3,6,1113.0
...,...,...,...,...,...,...,...,...,...,...,...,...
34849,wollert,3,0,570000.0,3750.0,2.0,2.0,404.0,2018,1,24,499.0
34852,yarraville,4,0,1480000.0,3013.0,1.0,3.0,593.0,2018,1,24,716.0
34853,yarraville,2,0,888000.0,3013.0,2.0,1.0,98.0,2018,1,24,716.0
34854,yarraville,2,2,705000.0,3013.0,1.0,2.0,220.0,2018,1,24,716.0


In [None]:
# Still 17394 rows

In [93]:
# Confirming all null values have been removed.
finalised_df.isna().sum()

Suburb           0
Rooms            0
Type             0
Price            0
Postcode         0
Bathroom         0
Car              0
Landsize         0
Year             0
Month            0
Day              0
Crime Average    0
dtype: int64

In [None]:
# but what to do about the suburbs??
# do I convert now?
# do I use one hot encoding later
# how will we know which number represents which suburb when adding in the domain data?

In [94]:
finalised_df["Suburb"].unique()

array(['abbotsford', 'airport west', 'albert park', 'alphington',
       'altona', 'altona north', 'armadale', 'ascot vale', 'ashburton',
       'ashwood', 'avondale heights', 'balaclava', 'balwyn',
       'balwyn north', 'bentleigh', 'bentleigh east', 'box hill',
       'braybrook', 'brighton', 'brighton east', 'brunswick',
       'brunswick west', 'bulleen', 'burwood', 'camberwell', 'canterbury',
       'carlton north', 'carnegie', 'caulfield', 'caulfield north',
       'caulfield south', 'chadstone', 'clifton hill', 'coburg',
       'coburg north', 'collingwood', 'doncaster', 'eaglemont',
       'elsternwick', 'elwood', 'essendon', 'essendon north', 'fairfield',
       'fitzroy', 'fitzroy north', 'flemington', 'footscray', 'glen iris',
       'glenroy', 'gowanbrae', 'hadfield', 'hampton east', 'hawthorn',
       'heidelberg heights', 'heidelberg west', 'hughesdale', 'ivanhoe',
       'kealba', 'keilor east', 'kensington', 'kew', 'kew east',
       'kooyong', 'maidstone', 'malvern', 

In [95]:
finalised_df.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Postcode,Bathroom,Car,Landsize,Year,Month,Day,Crime Average
1,abbotsford,2,0,1480000.0,3067.0,1.0,1.0,202.0,2016,2,12,1113.0
2,abbotsford,2,0,1035000.0,3067.0,1.0,0.0,156.0,2016,3,2,1113.0
4,abbotsford,3,0,1465000.0,3067.0,2.0,0.0,134.0,2017,3,3,1113.0
5,abbotsford,3,0,850000.0,3067.0,2.0,1.0,94.0,2017,3,3,1113.0
6,abbotsford,4,0,1600000.0,3067.0,1.0,2.0,120.0,2016,3,6,1113.0


In [96]:
finalised_df.tail()

Unnamed: 0,Suburb,Rooms,Type,Price,Postcode,Bathroom,Car,Landsize,Year,Month,Day,Crime Average
34849,wollert,3,0,570000.0,3750.0,2.0,2.0,404.0,2018,1,24,499.0
34852,yarraville,4,0,1480000.0,3013.0,1.0,3.0,593.0,2018,1,24,716.0
34853,yarraville,2,0,888000.0,3013.0,2.0,1.0,98.0,2018,1,24,716.0
34854,yarraville,2,2,705000.0,3013.0,1.0,2.0,220.0,2018,1,24,716.0
34856,yarraville,2,0,1020000.0,3013.0,1.0,0.0,250.0,2018,1,24,716.0


In [None]:
# are they alphabetical in the dataframe because when I print out the unique columns they aren't alphabetical??

In [98]:
finalised_df.to_csv('home_crime_data.csv', index = False)