In [127]:
# Initial imports.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [128]:
#load in the csv file for model
df = pd.read_csv(Path('resources/GTA_houses_aftercleaning.csv'))
df.head()

Unnamed: 0,Address,zipcode,city,latitude,longitude,price,bathrooms,bedrooms,homeType
0,120 Church St E,L6V1G8,Brampton,43.69446,-79.756256,1,2,3,SINGLE_FAMILY
1,128 Church St E,L6V1G8,Brampton,43.694798,-79.75577,1,2,2,SINGLE_FAMILY
2,3 Sophia St,L6V1T8,Brampton,43.69498,-79.75685,1,2,3,SINGLE_FAMILY
3,1A Sophia St,L6V1T8,Brampton,43.694664,-79.75647,1,2,2,SINGLE_FAMILY
4,1 Sophia St,L6V1T8,Brampton,43.694763,-79.75668,1,3,4,SINGLE_FAMILY


In [129]:
#check how many rows and columns we have
df.shape

(2000, 9)

In [130]:
#df = pd.get_dummies(df, columns=["homeType", "city"])
#df.head()

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df = df.copy()
df['city'] = le.fit_transform(df['city'])
df['homeType'] = le.fit_transform(df['homeType'])
df.head()

Unnamed: 0,Address,zipcode,city,latitude,longitude,price,bathrooms,bedrooms,homeType
0,120 Church St E,L6V1G8,0,43.69446,-79.756256,1,2,3,5
1,128 Church St E,L6V1G8,0,43.694798,-79.75577,1,2,2,5
2,3 Sophia St,L6V1T8,0,43.69498,-79.75685,1,2,3,5
3,1A Sophia St,L6V1T8,0,43.694664,-79.75647,1,2,2,5
4,1 Sophia St,L6V1T8,0,43.694763,-79.75668,1,3,4,5


In [131]:
#statistical overview of the dataset
print(df.describe().round(2).T)

            count        mean         std    min        25%         50%  \
city       2000.0        3.43        2.07   0.00       2.00        3.00   
latitude   2000.0       43.73        0.10  43.51      43.66       43.73   
longitude  2000.0      -79.52        0.18 -79.90     -79.66      -79.50   
price      2000.0  1680882.98  2022453.06   1.00  839974.25  1199000.00   
bathrooms  2000.0        3.20        1.65   0.00       2.00        3.00   
bedrooms   2000.0        3.23        1.22   0.00       3.00        3.00   
homeType   2000.0        4.30        1.72   0.00       5.00        5.00   

                  75%          max  
city             5.00         6.00  
latitude        43.82        44.00  
longitude      -79.38       -79.13  
price      1759000.00  37500000.00  
bathrooms        4.00        16.00  
bedrooms         4.00        12.00  
homeType         5.00         6.00  


In [132]:
# Define the features set.
X = df.copy()
X = X[['city', 'bathrooms', 'bedrooms', 'homeType']]
X.head()

Unnamed: 0,city,bathrooms,bedrooms,homeType
0,0,2,3,5
1,0,2,2,5
2,0,2,3,5
3,0,2,2,5
4,0,3,4,5


In [133]:
# Define the target set.
y = df['price'].ravel()
y[:5]

array([1, 1, 1, 1, 1], dtype=int64)

In [159]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=50, train_size=0.80)

In [160]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1600, 4)
(400, 4)
(1600,)
(400,)


In [161]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [162]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=50)

In [163]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [164]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)
print(predictions)

[ 1549000   849000  1300000  1599999  1599000   998888   999000   889000
  1299000   609900  1849000   587000   799900  1100000   999000   999000
   699000   799900   699000  1399000  1300000  1599000   999900   899000
   499900   599900   699000  1399000  7299000   699000  1830888  3999900
   699900   649000  1479900  1599999  4958000  1299000  1888888  1599000
   999900  1399000   999000  1099000  7299000  1100000   699000  7580000
   488000  1350000  1299000  1300000   999900  1399000   899000   649000
   799900  1825000  1549000  4995000   999900  2549888   849000  1399000
  1399000   849000   799900   368000   899000  1588000  1300000   699000
  1299000  4958000  1549000  1849000   599900  1599000   899900  2199000
   799900   998888  1099000  1299000 19250000   998000  3350000 12895000
  1299000  1099000  1980000  1599000   699000   699000   609900  1599000
   649000   999900   999000  1599999   475000   899000   699900   999000
  1599000  1479900  1300000  2099000  7499000  3499

In [165]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
print(acc_score)
print("Confusion Matrix")
display(cm_df)
print(classification_report(y_test, predictions))

ValueError: Shape of passed values is (341, 341), indices imply (2, 2)

In [166]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.36484215, 0.28094664, 0.25956382, 0.09464739])

In [167]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.3648421531979317, 'city'),
 (0.2809466408405542, 'bathrooms'),
 (0.259563817924565, 'bedrooms'),
 (0.09464738803694916, 'homeType')]