In [11]:
# Initial imports.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [12]:
#load in the csv file for model
df = pd.read_csv('resources/GTA_houses_cleaned.csv')
df.head()

Unnamed: 0,listing image,listing url,full address,city,zipcode,# of beds,# of bathrooms,price,latitude,longitude,home type
0,https://photos.zillowstatic.com/fp/23aacb3df57...,https://www.zillow.com/homedetails/6225-Lawren...,"6225 Lawrence Ave E, Toronto, ON M1C 5G4",Toronto,M1C5G4,3,4,799900,43.65572,-79.45745,SINGLE_FAMILY
1,https://photos.zillowstatic.com/fp/1cd388cbfce...,https://www.zillow.com/homedetails/3840-Bathur...,"3840 Bathurst St #702, Toronto, ON M3H 6C6",Toronto,M3H6C6,2,2,699000,43.62386,-79.488945,CONDO
2,https://photos.zillowstatic.com/fp/1f0037527b3...,https://www.zillow.com/homedetails/8-Littlelea...,"8 Littleleaf Dr, Toronto, ON M1B 1Z1",Toronto,M1B1Z1,3,3,1149900,43.72315,-79.44823,CONDO
3,https://photos.zillowstatic.com/fp/780ba993641...,https://www.zillow.com/homedetails/2301-Danfor...,"2301 Danforth Ave #306, Toronto, ON M4C 0A7",Toronto,M4C0A7,2,2,699000,43.64239,-79.424736,CONDO
4,https://photos.zillowstatic.com/fp/950875d10ea...,https://www.zillow.com/homedetails/16-Catalda-...,"16 Catalda Ct, Toronto, ON M2R 3X5",Toronto,M2R3X5,4,5,2690000,43.641495,-79.41007,CONDO


In [13]:
#check how many rows and columns we have
df.shape

(1893, 11)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1893 entries, 0 to 1892
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   listing image   1893 non-null   object 
 1   listing url     1893 non-null   object 
 2   full address    1893 non-null   object 
 3   city            1893 non-null   object 
 4   zipcode         1893 non-null   object 
 5   # of beds       1893 non-null   int64  
 6   # of bathrooms  1893 non-null   int64  
 7   price           1893 non-null   int64  
 8   latitude        1893 non-null   float64
 9   longitude       1893 non-null   float64
 10  home type       1893 non-null   object 
dtypes: float64(2), int64(3), object(6)
memory usage: 162.8+ KB


In [15]:
#df = pd.get_dummies(df, columns=["homeType", "city"])
#df.head()

#from sklearn.preprocessing import LabelEncoder
#le = LabelEncoder()
#df = df.copy()
#df['city'] = le.fit_transform(df['city'])
#df['homeType'] = le.fit_transform(df['home type'])
#df.head()

df = pd.get_dummies(df, columns=['city', 'home type'])
df.head()

Unnamed: 0,listing image,listing url,full address,zipcode,# of beds,# of bathrooms,price,latitude,longitude,city_Brampton,city_Markham,city_Mississauga,city_Toronto,city_Vaughan,home type_CONDO,home type_SINGLE_FAMILY,home type_TOWNHOUSE
0,https://photos.zillowstatic.com/fp/23aacb3df57...,https://www.zillow.com/homedetails/6225-Lawren...,"6225 Lawrence Ave E, Toronto, ON M1C 5G4",M1C5G4,3,4,799900,43.65572,-79.45745,0,0,0,1,0,0,1,0
1,https://photos.zillowstatic.com/fp/1cd388cbfce...,https://www.zillow.com/homedetails/3840-Bathur...,"3840 Bathurst St #702, Toronto, ON M3H 6C6",M3H6C6,2,2,699000,43.62386,-79.488945,0,0,0,1,0,1,0,0
2,https://photos.zillowstatic.com/fp/1f0037527b3...,https://www.zillow.com/homedetails/8-Littlelea...,"8 Littleleaf Dr, Toronto, ON M1B 1Z1",M1B1Z1,3,3,1149900,43.72315,-79.44823,0,0,0,1,0,1,0,0
3,https://photos.zillowstatic.com/fp/780ba993641...,https://www.zillow.com/homedetails/2301-Danfor...,"2301 Danforth Ave #306, Toronto, ON M4C 0A7",M4C0A7,2,2,699000,43.64239,-79.424736,0,0,0,1,0,1,0,0
4,https://photos.zillowstatic.com/fp/950875d10ea...,https://www.zillow.com/homedetails/16-Catalda-...,"16 Catalda Ct, Toronto, ON M2R 3X5",M2R3X5,4,5,2690000,43.641495,-79.41007,0,0,0,1,0,1,0,0


In [16]:
#statistical overview of the dataset
print(df.describe().round(2).T)

                          count        mean         std      min        25%  \
# of beds                1893.0        3.17        1.06     1.00       3.00   
# of bathrooms           1893.0        3.13        1.43     1.00       2.00   
price                    1893.0  1596955.89  1434246.57  2900.00  849900.00   
latitude                 1893.0       43.73        0.10    43.50      43.65   
longitude                1893.0      -79.51        0.18   -79.86     -79.66   
city_Brampton            1893.0        0.17        0.38     0.00       0.00   
city_Markham             1893.0        0.17        0.38     0.00       0.00   
city_Mississauga         1893.0        0.18        0.38     0.00       0.00   
city_Toronto             1893.0        0.32        0.47     0.00       0.00   
city_Vaughan             1893.0        0.16        0.37     0.00       0.00   
home type_CONDO          1893.0        0.20        0.40     0.00       0.00   
home type_SINGLE_FAMILY  1893.0        0.66        0

In [17]:
# Define the features set.
X = df.drop(['price', 'listing image', 'listing url', 'full address', 'zipcode', 'latitude', 'longitude'], axis=1)
X.head()

Unnamed: 0,# of beds,# of bathrooms,city_Brampton,city_Markham,city_Mississauga,city_Toronto,city_Vaughan,home type_CONDO,home type_SINGLE_FAMILY,home type_TOWNHOUSE
0,3,4,0,0,0,1,0,0,1,0
1,2,2,0,0,0,1,0,1,0,0
2,3,3,0,0,0,1,0,1,0,0
3,2,2,0,0,0,1,0,1,0,0
4,4,5,0,0,0,1,0,1,0,0


In [18]:
X.shape

(1893, 10)

In [19]:
# Define the target set.
y = df['price']
y[:5]

0     799900
1     699000
2    1149900
3     699000
4    2690000
Name: price, dtype: int64

In [20]:
y.shape

(1893,)

In [22]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, train_size=0.80)

In [23]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1514, 10)
(379, 10)
(1514,)
(379,)


In [24]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [25]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=50)

In [28]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [29]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)
print(predictions)

[ 649900 1199990 1858000  849000 1150000 1399000 1288900 2288880 3250000
 1199990  699000  999000 1700000  999900 1499900 1688000 1599000  999000
  719000 1399900  899000 1950000  699000 1199990 1388000  998000 1399000
  999900 1199000 1250000  550000  609900 1428000  609900 1599900 1300000
   25000 1735000 1399000 1139900 1299000 1099000 1079000 1898800 1199000
  999000 1425000  999000 1428000  699000  609900 1399000  778000 3880000
  579900  699000 1199000  609900  739000  849900  565000  699000  849900
 1700000 2595000 7580000 1199000 1238800 1199990  999000  849900  849000
  999000 1188000  869888  849900  849900  651900 1350000 1399000  649900
 1449900 2490000  998000 1499000 1399000 1250000  999900  999900  999000
 1688000 1238800  579900 1428000 1199000  999000 1399000 1450000  849900
  849000  799900 5450000 1399000 1300000 1150000 1368000 1328000  879900
  649900  799900 1599000  579900  999000 1199000 1950000 1499900 1428000
 1599900 8888000 1299000 1139900 1499900 1139900  4

In [30]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
print(acc_score)
#print("Confusion Matrix")
#display(cm_df)
print(classification_report(y_test, predictions))

ValueError: Shape of passed values is (346, 346), indices imply (2, 2)

In [33]:
# importing necessary liberaries


from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


# Mean squared error (MSE)
mse = mean_squared_error(y_test.values.ravel(), predictions)


# R2 Score
r2 = r2_score(y_test.values.ravel(), predictions)


# Mean Absolute Error
mae = mean_absolute_error(y_test.values.ravel(), predictions)


#Print Results


print("Mean squared error (MSE): ", round(mse, 2))

print("R2 Score: ", round(r2, 2))
print("Mean Absolute Error (MAE): ", round(mae, 2))

Mean squared error (MSE):  2222174590432.49
R2 Score:  -0.15
Mean Absolute Error (MAE):  731821.89


In [None]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

In [None]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)