In [72]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


In [3]:
# Location of the Melbourne housing snapshot CSV.
# NOTE(review): this is an absolute path on one specific machine; point it at
# your own copy of melb_data.csv before re-running the notebook.
melbourne_file_path = "C:/Users/user/Desktop/School/Data Projects/Melbourne Housing Snapshot/melb_data.csv"
# Read the whole CSV file into a pandas DataFrame.
melbourne_data = pd.read_csv(melbourne_file_path)

              Rooms         Price      Distance      Postcode      Bedroom2  \
count  13580.000000  1.358000e+04  13580.000000  13580.000000  13580.000000   
mean       2.937997  1.075684e+06     10.137776   3105.301915      2.914728   
std        0.955748  6.393107e+05      5.868725     90.676964      0.965921   
min        1.000000  8.500000e+04      0.000000   3000.000000      0.000000   
25%        2.000000  6.500000e+05      6.100000   3044.000000      2.000000   
50%        3.000000  9.030000e+05      9.200000   3084.000000      3.000000   
75%        3.000000  1.330000e+06     13.000000   3148.000000      3.000000   
max       10.000000  9.000000e+06     48.100000   3977.000000     20.000000   

           Bathroom           Car       Landsize  BuildingArea    YearBuilt  \
count  13580.000000  13518.000000   13580.000000   7130.000000  8205.000000   
mean       1.534242      1.610075     558.416127    151.967650  1964.684217   
std        0.691712      0.962634    3990.669241   

# Exploratory data analysis (EDA)

In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
table_summary = melbourne_data.describe()
print(table_summary)

# Average Landsize for Melbourne properties: the 'mean' row of the 'Landsize' column.
avg_lot_size = table_summary.loc['mean', 'Landsize']
print("Average Landsize is " , avg_lot_size)

# Age of the most recently built property: the largest YearBuilt value,
# measured against the hard-coded reference year 2023.
newest_home_age = 2023 - table_summary.loc['max', 'YearBuilt']
print("Newest Unit's age is " , newest_home_age)



             Rooms         Price     Distance     Postcode     Bedroom2  \
count  6196.000000  6.196000e+03  6196.000000  6196.000000  6196.000000   
mean      2.931407  1.068828e+06     9.751097  3101.947708     2.902034   
std       0.971079  6.751564e+05     5.612065    86.421604     0.970055   
min       1.000000  1.310000e+05     0.000000  3000.000000     0.000000   
25%       2.000000  6.200000e+05     5.900000  3044.000000     2.000000   
50%       3.000000  8.800000e+05     9.000000  3081.000000     3.000000   
75%       4.000000  1.325000e+06    12.400000  3147.000000     3.000000   
max       8.000000  9.000000e+06    47.400000  3977.000000     9.000000   

          Bathroom          Car      Landsize  BuildingArea    YearBuilt  \
count  6196.000000  6196.000000   6196.000000   6196.000000  6196.000000   
mean      1.576340     1.573596    471.006940    141.568645  1964.081988   
std       0.711362     0.929947    897.449881     90.834824    38.105673   
min       1.000000  

In [11]:
# Check whether the dataset has any missing data: isna() flags every missing cell,
# the first sum() counts the flags per column and the second sum() adds the column counts.
melbourne_data.isna().sum().sum()

13256

In [12]:
# The dataset contains missing (NA) values, so for now we simply drop them.
# axis=0 means rows are dropped (not columns); melbourne_data is rebound to the result.
melbourne_data = melbourne_data.dropna(axis= 0)

In [14]:
# Select the target variable (the value we want to predict).
# In this notebook that is the Price column of the property data.
y = melbourne_data['Price']
print(y)

1        1035000.0
2        1465000.0
4        1600000.0
6        1876000.0
7        1636000.0
           ...    
12205     601000.0
12206    1050000.0
12207     385000.0
12209     560000.0
12212    2450000.0
Name: Price, Length: 6196, dtype: float64


In [18]:
# Next, let's list all the columns of the dataset we are working with.
melbourne_data.columns


Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [34]:
# I don't know much about housing prices, but to predict the target variable we
# must select some of the columns above as predictors.
# Let's start with the obvious ones that should affect a house's price.
# ('Lattitude' and 'Longtitude' are spelled exactly as in the dataset's column index.)
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']

# We call all these predictors X (capital letter).
X = melbourne_data[melbourne_features]
# Python names are case-sensitive: the frame is bound to X, so printing a
# lowercase x would raise a NameError on a fresh kernel.
print(X)

       Rooms  Bathroom  Landsize  Lattitude  Longtitude
1          2       1.0     156.0  -37.80790   144.99340
2          3       2.0     134.0  -37.80930   144.99440
4          4       1.0     120.0  -37.80720   144.99410
6          3       2.0     245.0  -37.80240   144.99930
7          2       1.0     256.0  -37.80600   144.99540
...      ...       ...       ...        ...         ...
12205      3       2.0     972.0  -37.51232   145.13282
12206      3       1.0     179.0  -37.86558   144.90474
12207      1       1.0       0.0  -37.85588   144.89936
12209      2       1.0       0.0  -37.85581   144.99025
12212      6       3.0    1087.0  -37.81038   144.89389

[6196 rows x 5 columns]


In [35]:
#Great. Essentially, a machine learning model is a mathematical function that calculates y from X.
#For example, Linear Regression, the easiest to understand, is y = m * X + c (m is the gradient of the line, c is the constant, i.e. the intercept).

# Let's build a simple model using a Decision Tree Regressor

In [38]:
# Split the predictors (X) and the target (y) into training and validation data.
# random_state=1 is passed so that the split is the same on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

In [40]:
# Create the model. We set random_state=1 so that the model gives the same result on every run.
melbourne_model = DecisionTreeRegressor(random_state=1)

# Fit the model on the training data only; the validation data is not used here.
melbourne_model.fit(train_X, train_y)

DecisionTreeRegressor(random_state=1)

In [None]:
#That is it: a simple Decision Tree Regressor model (see the README for an explanation of the Decision Tree logic).

In [44]:
# Let's test our model on the first five houses of the validation predictors.
print("Making predictions for the following 5 houses:")
test1 = val_X.iloc[:5]
print(test1)


Making predictions for the following 5 houses:
      Rooms  Bathroom  Landsize  Lattitude  Longtitude
6048      3       3.0     221.0  -37.77080    144.8401
9186      4       2.0     528.0  -37.83539    145.0431
3991      3       2.0       0.0  -37.80950    144.9691
5829      3       2.0    1039.0  -37.86380    144.9820
3616      6       6.0    1334.0  -37.80290    145.0267


In [53]:
# Ask the fitted decision tree for the prices of the five sample houses in test1.
print("The predictions are")
prediction1 = melbourne_model.predict(test1)
print(prediction1)

The predictions are
[ 503000. 1857000.  760000. 1395000. 4250000.]


In [55]:
# Compare with the actual prices of the same five houses, i.e. the first five
# entries of the validation target (val_y).
val_test = val_y.head(5)
print(val_test)

6048     620000.0
9186    2320000.0
3991     750000.0
5829    1120000.0
3616    6500000.0
Name: Price, dtype: float64


In [None]:
#As you can see, when we compare the predictions made from the validation predictors (val_X) with the actual validation prices (val_y), the predictions are not very accurate.
#Let's see if we can improve this.

# Model Optimization part 1

In [48]:
#A Decision Tree model essentially keeps asking Yes/No questions to split the dataset into smaller and smaller leaves, so that at the nth level each leaf holds houses with similar features and therefore similar prices.

In [None]:
#For example: the first-level split can be "Does the house have more than 2 bedrooms?"
#The next level can be: "Is the lot size larger than 8500 square feet?" for the houses with 2 bedrooms or fewer,
#and "Is the lot size larger than 11500 square feet?" for the houses with more than 2 bedrooms.
#The tree keeps splitting the data like this, segmenting the houses by the 5 predictors.

In [50]:
#Overfitting: when you segment the dataset into leaves that are too small, each leaf contains only a small number of houses.
#A brand-new house falls into exactly one leaf (the one whose split conditions it satisfies),
# and its predicted price may not be accurate, because it is based on only the few houses inside that small leaf.

#Underfitting: the opposite issue. The splits are too shallow, so a single leaf contains far too many houses.
#A brand-new house then lands in a big leaf that mixes very different houses, which also results in an inaccurate prediction.

#The ideal scenario is the sweet spot between underfitting and overfitting. We find it by measuring the Mean Absolute Error (MAE) on the validation data and choosing the tree size that gives the lowest value.

In [None]:
#Let's use another tool from scikit-learn: mean_absolute_error
#The better Decision Tree model will have the lower MAE

In [59]:
# MAE between the actual prices (val_test) and the predicted prices (prediction1).
# Both hold only the five sample houses, so this is not the MAE of the whole validation set.
val_mae = mean_absolute_error(y_true=val_test, y_pred=prediction1)
print("Mean Absolute Error", val_mae, sep="\n")

Mean Absolute Error
623000.0


# Model Optimization part 2

In [None]:
#The model above gives us only ONE MAE, which we cannot compare against any other model.
#That's why, when we build ML models as beginners, it's best to wrap the steps in a function so we can re-run them with different settings.

In [None]:
#Our data are the same: train_X, val_X, train_y, val_y

In [69]:
def get_mae(max_leaf_nodes , train_x , val_x , train_y , val_y):
    """Return the validation MAE of a decision tree limited to ``max_leaf_nodes`` leaves.

    A DecisionTreeRegressor (random_state=0) is fitted on ``train_x`` / ``train_y``
    and then scored with mean_absolute_error on ``val_x`` / ``val_y``.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0).fit(train_x, train_y)
    return mean_absolute_error(val_y, tree.predict(val_x))

In [61]:
#now we can apply this function above to multiple numbers of nodes (splits)
#remember that a leaf is a node without children.

# Binary tree Theorem:
# no. of internal node = i
# number of nodes (both internal nodes and leaves) : n =2i + 1
# (or vice versa : number of internal node = (n-1) / 2 )
# number of leaves: l = (n+1)/ 2
# vice versa: number of  nodes from leaves :  n = 2l -1 

# hence relationship between internal nodes vs leaves 
# 2i + 1 = 2l -1 => i = l -1 

# maximum number of leaves: max-l = 2^(h-1), where h is the height of the tree counted in levels (a tree that is only a root has h = 1)

In [70]:
# Try decision trees of increasing size and report the validation MAE of each one.
for max_leaf_nodes in (5, 50, 500, 5000):
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("For max leaf node: " , max_leaf_nodes , "MAE is " , my_mae )
    

#As you can see, a max leaf node count of 500 gives the lowest MAE of the four values tried, so the optimal value lies somewhere between 50 and 5000.

For max leaf node:  5 MAE is  369673.0400167675
For max leaf node:  50 MAE is  266644.21831092256
For max leaf node:  500 MAE is  243613.31456921576
For max leaf node:  5000 MAE is  256227.639767592


# Random Forest

### A decision tree is a simple, yet powerful algorithm that creates a tree-like model of decisions and their possible consequences. The algorithm splits the data into smaller subsets based on the features that are most informative for predicting the outcome. Each decision is made based on the values of one feature, and the outcome is determined by the final leaf node of the tree.

### Random Forest is an extension of decision trees that improves the accuracy and robustness of the algorithm by using an ensemble approach. Random Forest creates multiple decision trees, each with a different subset of the training data and features. During the prediction phase, the algorithm combines the predictions of all the decision trees to produce a final prediction. By creating multiple trees, Random Forest reduces the risk of overfitting and increases the overall accuracy of the model.

In [71]:
# Basically, a single decision tree can only grow so deep before it starts overfitting.
# Hence, if we build multiple decision trees and combine their results, we can reduce the risk of overfitting.
# Let's build a Random Forest model with the same sets of data.

In [73]:
def get_mae2(max_leaf_nodes , train_x , val_x , train_y , val_y):
    """Return the validation MAE of a random forest whose trees have at most ``max_leaf_nodes`` leaves.

    A RandomForestRegressor (random_state=1) is fitted on ``train_x`` / ``train_y``
    and then scored with mean_absolute_error on ``val_x`` / ``val_y``.
    """
    forest = RandomForestRegressor(max_leaf_nodes=max_leaf_nodes , random_state=1).fit(train_x, train_y)
    return mean_absolute_error(val_y, forest.predict(val_x))

In [75]:
# Tree sizes to compare.
leaf_nodes = [5,50,500,5000]
# list1 collects the decision-tree MAEs, list2 the random-forest MAEs,
# both in the same order as leaf_nodes.
list1, list2 = [], []
for max_leaf_nodes in leaf_nodes:
    decisiontree_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    randomforest_mae = get_mae2(max_leaf_nodes, train_X, val_X, train_y, val_y)
    list1.append(decisiontree_mae)
    list2.append(randomforest_mae)

    

[369673.0400167675, 266644.21831092256, 243613.31456921576, 256227.639767592]
[352296.73817633535, 232778.04369765415, 193927.13612872423, 190556.59504186007]


In [80]:
# One row of MAEs per model: decision tree first, random forest second.
list3 = [list1 , list2]
# Row labels without stray whitespace, so the printed index lines up and
# comparison_table.loc["Random Forest"] works as expected.
rows_name = ["Decision Trees", "Random Forest"]
# Columns are the max_leaf_nodes values tried; each cell is the validation MAE.
comparison_table = pd.DataFrame(list3, columns = leaf_nodes , index =rows_name)
print(comparison_table)

                         5              50             500            5000
Decision Trees  369673.040017  266644.218311  243613.314569  256227.639768
 Random Forest  352296.738176  232778.043698  193927.136129  190556.595042


In [None]:
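#Based on MAE, the Random Forest has a significantly lower error than the Decision Tree at every tree size tried.
#Even at 5000 leaf nodes the Random Forest MAE is still decreasing, so the forest keeps benefiting from larger trees where the single tree already overfits.
#Note: a tree cannot have more leaves than training rows, and only part of the 6196 rows are used for training, so a limit of 5000 is already close to "no limit" on this data.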

### Remember that the above models use only FIVE predictors out of the many available in the original dataset. In theory, adding more predictors should increase the accuracy of the models, but that is not always the case.
### We need to do more in the EDA step to explore the predictors and transform / eliminate those that suffer from issues such as imbalanced data, high skew, outliers, etc.