In [35]:
import pandas as pd

In [41]:
df = pd.read_csv('melb_data.csv')
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [42]:
df.dtypes

Suburb            object
Address           object
Rooms              int64
Type              object
Price            float64
Method            object
SellerG           object
Date              object
Distance         float64
Postcode         float64
Bedroom2         float64
Bathroom         float64
Car              float64
Landsize         float64
BuildingArea     float64
YearBuilt        float64
CouncilArea       object
Lattitude        float64
Longtitude       float64
Regionname        object
Propertycount    float64
dtype: object

In [43]:
# Fix date column
df['Date'] = pd.to_datetime(df['Date'])
df.dtypes

Suburb                   object
Address                  object
Rooms                     int64
Type                     object
Price                   float64
Method                   object
SellerG                  object
Date             datetime64[ns]
Distance                float64
Postcode                float64
Bedroom2                float64
Bathroom                float64
Car                     float64
Landsize                float64
BuildingArea            float64
YearBuilt               float64
CouncilArea              object
Lattitude               float64
Longtitude              float64
Regionname               object
Propertycount           float64
dtype: object

In [44]:
df.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,first,last,mean,std,min,25%,50%,75%,max
Suburb,13580,314.0,Reservoir,359.0,NaT,NaT,,,,,,,
Address,13580,13378.0,53 William St,3.0,NaT,NaT,,,,,,,
Rooms,13580,,,,NaT,NaT,2.938,0.955748,1.0,2.0,3.0,3.0,10.0
Type,13580,3.0,h,9449.0,NaT,NaT,,,,,,,
Price,13580,,,,NaT,NaT,1075680.0,639311.0,85000.0,650000.0,903000.0,1330000.0,9000000.0
Method,13580,5.0,S,9022.0,NaT,NaT,,,,,,,
SellerG,13580,268.0,Nelson,1565.0,NaT,NaT,,,,,,,
Date,13580,58.0,2017-05-27 00:00:00,473.0,2016-01-28,2017-12-08,,,,,,,
Distance,13580,,,,NaT,NaT,10.1378,5.86872,0.0,6.1,9.2,13.0,48.1
Postcode,13580,,,,NaT,NaT,3105.3,90.677,3000.0,3044.0,3084.0,3148.0,3977.0


In [45]:
# [Car, BuildingArea, YearBuilt, CouncilArea] all have some null values.  For now, we'll just drop these
df.dropna(inplace=True)
df.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,first,last,mean,std,min,25%,50%,75%,max
Suburb,6196,287.0,Reservoir,154.0,NaT,NaT,,,,,,,
Address,6196,6117.0,1/1 Clarendon St,3.0,NaT,NaT,,,,,,,
Rooms,6196,,,,NaT,NaT,2.93141,0.971079,1.0,2.0,3.0,4.0,8.0
Type,6196,3.0,h,4088.0,NaT,NaT,,,,,,,
Price,6196,,,,NaT,NaT,1068830.0,675156.0,131000.0,620000.0,880000.0,1325000.0,9000000.0
Method,6196,5.0,S,3957.0,NaT,NaT,,,,,,,
SellerG,6196,203.0,Nelson,763.0,NaT,NaT,,,,,,,
Date,6196,51.0,2017-05-27 00:00:00,225.0,2016-03-09,2017-12-08,,,,,,,
Distance,6196,,,,NaT,NaT,9.7511,5.61207,0.0,5.9,9.0,12.4,47.4
Postcode,6196,,,,NaT,NaT,3101.95,86.4216,3000.0,3044.0,3081.0,3147.0,3977.0


# Setting up the Model  
We'll need to grab our prediction target (or y), as well as our features (or X)

In [72]:
y = df['Price']
X = df[['Rooms','Bathroom','Landsize','Lattitude','Longtitude']]

In [73]:
# Next, we need to split our data into a training set and a testing  set
from sklearn.model_selection import train_test_split

# For the fitting of our model, we'll use the training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [75]:
# This is all the features we'll be using to predict price
X_train.head() 

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
10385,3,1.0,206.0,-37.87107,145.04991
5805,2,1.0,0.0,-37.859,144.9767
8488,2,1.0,2701.0,-37.8109,144.8684
6672,3,1.0,670.0,-37.8134,144.8745
776,6,3.0,708.0,-37.9181,145.044


In [76]:
X_train.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,4647.0,4647.0,4647.0,4647.0,4647.0
mean,2.942113,1.577147,478.649451,-37.808104,144.99097
std,0.972881,0.713032,963.944034,0.076552,0.099908
min,1.0,1.0,0.0,-38.16492,144.54237
25%,2.0,1.0,158.0,-37.85589,144.92629
50%,3.0,1.0,375.0,-37.8013,144.9961
75%,4.0,2.0,629.0,-37.758,145.05385
max,8.0,8.0,37000.0,-37.48381,145.52635


In [57]:
from sklearn.tree import DecisionTreeRegressor 
# We'll be using a simple decision tree regressor to predict the price the house will be listed at

In [77]:
# First define the model
df_model = DecisionTreeRegressor(random_state=1)

# Then fit the model
df_model.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=1, splitter='best')

In [81]:
# Let's predict the first 5 home values
print(X_train.head(),'\n\nThe predictions are: \n', df_model.predict(X_train.head()))
print('\nThe actual values are:\n', y_train.head())

       Rooms  Bathroom  Landsize  Lattitude  Longtitude
10385      3       1.0     206.0  -37.87107   145.04991
5805       2       1.0       0.0  -37.85900   144.97670
8488       2       1.0    2701.0  -37.81090   144.86840
6672       3       1.0     670.0  -37.81340   144.87450
776        6       3.0     708.0  -37.91810   145.04400 

The predictions are: 
 [1060000.  410000.  502000. 1055000. 1900000.]

The actual values are:
 10385    1060000.0
5805      390000.0
8488      502000.0
6672     1055000.0
776      1900000.0
Name: Price, dtype: float64


In [82]:
# But, the goal is to predict the testing data!  Let's give it a try
print(X_test.head(),'\n\nThe testing data predictions are: \n', df_model.predict(X_test.head()))
print('\nThe actual values are:\n', y_test.head())

       Rooms  Bathroom  Landsize  Lattitude  Longtitude
4850       2       1.0      96.0  -37.85010   144.99530
2307       2       1.0       0.0  -37.89020   144.99070
10090      2       1.0     136.0  -37.85542   144.99571
3645       3       2.0     205.0  -37.79930   145.02670
4930       2       1.0     400.0  -37.73520   144.98520 

The testing data predictions are: 
 [ 900000.  696750. 1120000. 1590000.  630000.]

The actual values are:
 4850      815000.0
2307      655000.0
10090     957500.0
3645     1330000.0
4930      722000.0
Name: Price, dtype: float64


In [80]:
# We need a way to validate our models.  With regression problems, we can use the MAE (Mean Absolute Error) approach
# To sum up what this is, we can use the error formula (error = actual - predicted) for each prediction, and 
# take the average of this

from sklearn.metrics import mean_absolute_error # sklearn has a module for this

# For scoring our models, we want to use the testing data vice the training
predicted_prices = df_model.predict(X_test)
mae = mean_absolute_error(y_test, predicted_prices)
print('MAE:', mae)

MAE: 273518.01872175594


In [91]:
# We can improve our Decision Tree Regressor using the 'max_leaf_nodes' parameter

def get_mae(max_leaf_nodes, X_train, X_test, y_train, y_test):
    ''' Utility function for obtaining the mean absolute error '''
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(X_train, y_train)
    test_predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, test_predictions)
    return mae

for nodes in [2,5,25,50,100,500,1000,5000]:
    print(str(nodes), 'nodes:' ,get_mae(nodes, X_train, X_test, y_train, y_test))

2 nodes: 439583.61347217765
5 nodes: 385696.54278937966
25 nodes: 307919.7001056724
50 nodes: 279794.61143891385
100 nodes: 269191.989429751
500 nodes: 261718.1134423186
1000 nodes: 262426.6407636801
5000 nodes: 271996.1207230471


In [92]:
# Now that we know the best number of leaves to use on our model, we can apply it to the whole dataset (including test set)
df_refined_model = DecisionTreeRegressor(max_leaf_nodes=500, random_state=0)
df_refined_model.fit(X,y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=500, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=0, splitter='best')

In [93]:
# Now, let's move into the Random Forest Regressor
# It's similar to Decision Tree Regressor, except it uses several different trees in its calculations
from sklearn.ensemble import RandomForestRegressor

In [94]:
# Train it the exact same way (like most models, we will define fit (on training), predict (on testing), 
# get our metrics, and refine))
forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(X_train, y_train)
forest_predictions = forest_model.predict(X_test)
print('MAE:', mean_absolute_error(y_test, forest_predictions))

# This isn't necessarily a GOOD value, but just the default parameters already did better than any of our tests with the 
# Decision Tree Regressor

MAE: 218482.25517538196


