In [24]:
# Initial imports.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn import tree

In [3]:
#load in the csv file for model
df = pd.read_csv('resources/GTA_houses_aftercleaning.csv')
df.head()

Unnamed: 0,Address,zipcode,city,latitude,longitude,price,bathrooms,bedrooms,homeType
0,120 Church St E,L6V1G8,Brampton,43.69446,-79.756256,1,2,3,SINGLE_FAMILY
1,128 Church St E,L6V1G8,Brampton,43.694798,-79.75577,1,2,2,SINGLE_FAMILY
2,3 Sophia St,L6V1T8,Brampton,43.69498,-79.75685,1,2,3,SINGLE_FAMILY
3,1A Sophia St,L6V1T8,Brampton,43.694664,-79.75647,1,2,2,SINGLE_FAMILY
4,1 Sophia St,L6V1T8,Brampton,43.694763,-79.75668,1,3,4,SINGLE_FAMILY


In [4]:
#check how many rows and columns we have
df.shape

(2000, 9)

In [5]:
df = pd.get_dummies(df, columns=["city", "homeType", "bathrooms", "bedrooms"])
df.head()

Unnamed: 0,Address,zipcode,latitude,longitude,price,city_Brampton,city_Etobicoke,city_Markham,city_Mississauga,city_Thornhill,...,bedrooms_0,bedrooms_1,bedrooms_2,bedrooms_3,bedrooms_4,bedrooms_5,bedrooms_6,bedrooms_7,bedrooms_9,bedrooms_12
0,120 Church St E,L6V1G8,43.69446,-79.756256,1,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,128 Church St E,L6V1G8,43.694798,-79.75577,1,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,3 Sophia St,L6V1T8,43.69498,-79.75685,1,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,1A Sophia St,L6V1T8,43.694664,-79.75647,1,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,1 Sophia St,L6V1T8,43.694763,-79.75668,1,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [9]:
#delete first column as it is unneeded
df = df.iloc[: , 4:]
df.head()

Unnamed: 0,price,city_Brampton,city_Etobicoke,city_Markham,city_Mississauga,city_Thornhill,city_Toronto,city_Vaughan,homeType_APARTMENT,homeType_CONDO,...,bedrooms_0,bedrooms_1,bedrooms_2,bedrooms_3,bedrooms_4,bedrooms_5,bedrooms_6,bedrooms_7,bedrooms_9,bedrooms_12
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [16]:
# Define the features set.
X = df.copy()
X = X.drop(['price'], axis=1)
X.head()

Unnamed: 0,city_Brampton,city_Etobicoke,city_Markham,city_Mississauga,city_Thornhill,city_Toronto,city_Vaughan,homeType_APARTMENT,homeType_CONDO,homeType_LOT,...,bedrooms_0,bedrooms_1,bedrooms_2,bedrooms_3,bedrooms_4,bedrooms_5,bedrooms_6,bedrooms_7,bedrooms_9,bedrooms_12
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [31]:
X.shape

(2000, 40)

In [34]:
# Define the target set.
y = df['price'].values
y[:5]

array([1, 1, 1, 1, 1], dtype=int64)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [36]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1500, 40)
(500, 40)
(1500,)
(500,)


In [22]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [25]:
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
# Fitting the model.
model = model.fit(X_train_scaled, y_train)

In [28]:
# Making predictions using the testing data.
predictions = model.predict(X_test_scaled)
print(predictions)

[ 999000  699000  399000  739999  659000  559000 4488888  999000  999000
 7998000  559000 1499900 1235000  995000 6498000 1398800  499000 1599000
   45000  970000  899000       1 1235000  899000  499000 1698000  999000
  449900  780000  499000  499000  699000 6888000 2389900 1280000 1399000
  699000  999000  669900 1299000 1299900 1299000  669900 1299000 1499900
  499000 1099998  740000 1088000  899000  695000  479000 2100000 1599786
  899000 2100000  449900 1300000  999000       1  899900  999000  999000
  699000 7998000 1398800  749000  499000  699000  999000  699000  739999
 7880000 1079000  799999  599900  999000 1599786 1299000 1199000  999000
 6288000  899000 1299900  599900  499000  749000 1399000  488000 1299000
 1849000 1300000  999000  588000  749000 1235000  799900 1399000 4488000
  999000  999000 1399000 2100000  799900  999000  999000 1080000  588000
 1398800  699900 1299900  449900  999000  989800 1235000  999000   20000
 1299900 1698000 1399000  999000  999000  999000  5

In [10]:
from sklearn.preprocessing import StandardScaler
data_scaler = StandardScaler()
scaled = data_scaler.fit_transform(df)
scaled[:5]

array([[-0.83131836,  2.17879013, -0.0316386 , -0.44372074, -0.4669334 ,
        -0.02236627, -0.68993823, -0.43399785, -0.02236627, -0.50935827,
        -0.03875891, -0.02236627, -0.03875891,  0.72334587, -0.39251508,
        -0.10301175, -0.34069257,  1.67616342, -0.54267705, -0.56811207,
        -0.30085302, -0.18181818, -0.12547523, -0.10786004, -0.06337243,
        -0.05006262, -0.02236627, -0.02236627, -0.02236627, -0.0316386 ,
        -0.02236627, -0.12129857, -0.30085302, -0.37969035,  1.36127405,
        -0.68914893, -0.29986373, -0.12340351, -0.07088812, -0.0316386 ,
        -0.02236627],
       [-0.83131836,  2.17879013, -0.0316386 , -0.44372074, -0.4669334 ,
        -0.02236627, -0.68993823, -0.43399785, -0.02236627, -0.50935827,
        -0.03875891, -0.02236627, -0.03875891,  0.72334587, -0.39251508,
        -0.10301175, -0.34069257,  1.67616342, -0.54267705, -0.56811207,
        -0.30085302, -0.18181818, -0.12547523, -0.10786004, -0.06337243,
        -0.05006262, -0.02236

In [11]:
import numpy as np
print(np.mean(scaled[:,0]))
print(np.std(scaled[:,0]))

-1.4210854715202004e-17
1.0
