In [66]:
# Initial imports.
import pandas as pd
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [67]:
# Read the cleaned GTA housing listings into a DataFrame and preview the first rows.
listings_csv = 'resources/GTA_houses_cleaned.csv'
df = pd.read_csv(listings_csv)
df.head()

Unnamed: 0.1,Unnamed: 0,listing image,listing url,full address,city,zipcode,# of beds,# of bathrooms,price,latitude,longitude,home type
0,0,https://photos.zillowstatic.com/fp/23aacb3df57...,https://www.zillow.com/homedetails/6225-Lawren...,"6225 Lawrence Ave E, Toronto, ON M1C 5G4",Toronto,M1C5G4,3,4,799900,43.65572,-79.45745,SINGLE_FAMILY
1,2,https://photos.zillowstatic.com/fp/1cd388cbfce...,https://www.zillow.com/homedetails/3840-Bathur...,"3840 Bathurst St #702, Toronto, ON M3H 6C6",Toronto,M3H6C6,2,2,699000,43.62386,-79.488945,CONDO
2,3,https://photos.zillowstatic.com/fp/1f0037527b3...,https://www.zillow.com/homedetails/8-Littlelea...,"8 Littleleaf Dr, Toronto, ON M1B 1Z1",Toronto,M1B1Z1,3,3,1149900,43.72315,-79.44823,CONDO
3,4,https://photos.zillowstatic.com/fp/780ba993641...,https://www.zillow.com/homedetails/2301-Danfor...,"2301 Danforth Ave #306, Toronto, ON M4C 0A7",Toronto,M4C0A7,2,2,699000,43.64239,-79.424736,CONDO
4,5,https://photos.zillowstatic.com/fp/950875d10ea...,https://www.zillow.com/homedetails/16-Catalda-...,"16 Catalda Ct, Toronto, ON M2R 3X5",Toronto,M2R3X5,4,5,2690000,43.641495,-79.41007,CONDO


In [68]:
# Inspect the DataFrame's dimensions as a (rows, columns) tuple.
df.shape

(1908, 12)

In [69]:
# One-hot encode the listed columns: get_dummies replaces each of them with
# one indicator column per distinct value, then the result is previewed.
dummy_source_cols = ["home type", "# of beds"]
df = pd.get_dummies(df, columns=dummy_source_cols)
df.head()

Unnamed: 0.1,Unnamed: 0,listing image,listing url,full address,city,zipcode,# of bathrooms,price,latitude,longitude,home type_CONDO,home type_SINGLE_FAMILY,home type_TOWNHOUSE,# of beds_1,# of beds_2,# of beds_3,# of beds_4,# of beds_5
0,0,https://photos.zillowstatic.com/fp/23aacb3df57...,https://www.zillow.com/homedetails/6225-Lawren...,"6225 Lawrence Ave E, Toronto, ON M1C 5G4",Toronto,M1C5G4,4,799900,43.65572,-79.45745,0,1,0,0,0,1,0,0
1,2,https://photos.zillowstatic.com/fp/1cd388cbfce...,https://www.zillow.com/homedetails/3840-Bathur...,"3840 Bathurst St #702, Toronto, ON M3H 6C6",Toronto,M3H6C6,2,699000,43.62386,-79.488945,1,0,0,0,1,0,0,0
2,3,https://photos.zillowstatic.com/fp/1f0037527b3...,https://www.zillow.com/homedetails/8-Littlelea...,"8 Littleleaf Dr, Toronto, ON M1B 1Z1",Toronto,M1B1Z1,3,1149900,43.72315,-79.44823,1,0,0,0,0,1,0,0
3,4,https://photos.zillowstatic.com/fp/780ba993641...,https://www.zillow.com/homedetails/2301-Danfor...,"2301 Danforth Ave #306, Toronto, ON M4C 0A7",Toronto,M4C0A7,2,699000,43.64239,-79.424736,1,0,0,0,1,0,0,0
4,5,https://photos.zillowstatic.com/fp/950875d10ea...,https://www.zillow.com/homedetails/16-Catalda-...,"16 Catalda Ct, Toronto, ON M2R 3X5",Toronto,M2R3X5,5,2690000,43.641495,-79.41007,1,0,0,0,0,0,1,0


In [None]:
# Delete the unneeded leading 'Unnamed: 0' column (presumably an index saved
# into the CSV) by name rather than by position.
# A positional slice (df.iloc[:, 1:]) strips one more column every time the
# cell is re-run; a name-based drop with errors="ignore" can be run repeatedly
# without touching any other column.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")
df.head()

In [72]:
# Define the features set.
# Remove the target ('price') and every column that is not used as a model
# input, leaving only the one-hot indicator columns. DataFrame.drop returns a
# new frame, so df itself is left untouched.
non_feature_columns = ['price', 'listing image', 'listing url', 'full address', 'zipcode',
                       'latitude', 'longitude', 'city', '# of bathrooms']
X = df.drop(columns=non_feature_columns)
# 'Unnamed: 0' may already have been removed by the earlier column-deletion
# cell; errors='ignore' keeps this cell from raising KeyError in that case.
X = X.drop(columns=['Unnamed: 0'], errors='ignore')
X.head()

Unnamed: 0,home type_CONDO,home type_SINGLE_FAMILY,home type_TOWNHOUSE,# of beds_1,# of beds_2,# of beds_3,# of beds_4,# of beds_5
0,0,1,0,0,0,1,0,0
1,1,0,0,0,1,0,0,0
2,1,0,0,0,0,1,0,0
3,1,0,0,0,1,0,0,0
4,1,0,0,0,0,0,1,0


In [73]:
# Confirm the feature matrix dimensions as a (rows, columns) tuple.
X.shape

(1908, 8)

In [74]:
# The target is the listing price column; preview its first 15 values.
y = df['price']
y.head(15)

0      799900
1      699000
2     1149900
3      699000
4     2690000
5      829000
6      299999
7      924900
8      769000
9     7288000
10     649900
11    1185000
12     799000
13     999000
14     750000
Name: price, dtype: int64

In [75]:
# Split features and target into training (75%) and testing (remaining rows)
# sets; the fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=2,
)

In [76]:
# Report the dimensions of each split, in the order:
# X_train, X_test, y_train, y_test.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(1431, 8)
(477, 8)
(1431,)
(477,)


In [77]:
# Build the scaler and learn its statistics from the training features only,
# so nothing about the test rows is used when fitting.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the same fitted transformation to both the training and testing features.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(feature_part) for feature_part in (X_train, X_test)
)

In [78]:
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
# Fitting the model.
model = model.fit(X_train_scaled, y_train)

In [79]:
# Making predictions using the testing data.
predictions = model.predict(X_test_scaled)
print(predictions)

[ 699000 1199000 1199000  699900  599000  599000 1399000  699000  699000
 1399000  999000  749000  699000 1199000 1199000 1199000  699900 1199000
 1199000  699900  699000  699000 1199000  699000 1399000  699000  499000
  999000  699000 1199000 1199000 1199000  649900  699000  999000  699900
  999000  699900 1399000  699900  699000  499000  499000 1199000 1399000
  999000  699000  699000 1399000  699000 1199000 1399000  599000  999000
 1399000 1399000 1399000  699900  999000 1199000  499000 1199000 1399000
 1199000  699000  499000  699000 1199000  699000  999000 1399000 1399000
  499000 1199000 1199000  699000  499000  499000  699000  499000  999000
  699000  999000 1199000 1199000 1199000  499000 1399000  499000 1399000
 1199000 1399000 1399000 1399000 1399000 1199000  699000 1199000  499000
  699900 1399000  699000  999000  699000  999000  749000 1199000  649900
  699000  699000 1399000 1199000 1399000 1199000  699000 1399000 1399000
  699900 1199000 1199000 1399000  999000  749000  7

In [80]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
print(acc_score)
print(classification_report(y_test, predictions))

0.020964360587002098
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         2
        2900       0.00      0.00      0.00         1
       45000       0.00      0.00      0.00         1
       88888       0.00      0.00      0.00         1
      199900       0.00      0.00      0.00         1
      225000       0.00      0.00      0.00         1
      229000       0.00      0.00      0.00         1
      298000       0.00      0.00      0.00         1
      317000       0.00      0.00      0.00         1
      320000       0.00      0.00      0.00         1
      330000       0.00      0.00      0.00         1
      339900       0.00      0.00      0.00         1
      349000       0.00      0.00      0.00         1
      415000       0.00      0.00      0.00         1
      449900       0.00      0.00      0.00         1
      469900       0.00      0.00      0.00         0
      475000       0.00      0.00      0.00         1
      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Standardize the numeric columns of df and preview the first five scaled rows.
# StandardScaler cannot convert text columns (URLs, addresses, city, zipcode)
# to float, so only numeric and boolean columns are passed in; the cast to
# float64 gives the scaler a single uniform dtype.
# StandardScaler itself is already imported in the notebook's first cell.
numeric_df = df.select_dtypes(include=['number', 'bool']).astype('float64')
data_scaler = StandardScaler()
scaled = data_scaler.fit_transform(numeric_df)
scaled[:5]

In [None]:
import numpy as np

# Sanity-check the scaling: print the mean, then the standard deviation,
# of the first scaled column.
first_scaled_col = scaled[:, 0]
print(np.mean(first_scaled_col))
print(np.std(first_scaled_col))