In [3]:
import re
import io

#General libraries needed
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Libraries for data pre-processing
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import preprocessing
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

#For Decision Tree implementation
from scipy.stats import entropy
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn import tree

#For KNN implementation
from sklearn.neighbors import KNeighborsClassifier

#For Bagging implementation
from sklearn.ensemble import BaggingClassifier

#For AdaBoost implementation
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

#Settings
%matplotlib inline
pd.options.display.float_format = '{:.2f}'.format
# np.set_printoptions(threshold=np.nan)
sns.set()


In [4]:
df = pd.read_csv('resale-flat-prices-based-on-approval-date-1990-1999.csv')

In [5]:
# Check the columns using dtypes
print(df.dtypes)

month                   object
town                    object
flat_type               object
block                   object
street_name             object
storey_range            object
floor_area_sqm         float64
flat_model              object
lease_commence_date      int64
resale_price             int64
dtype: object


In [6]:
# Randomly sample 5 records with .sample(5)
df.sample(5)

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price
8819,1990-09,ANG MO KIO,4 ROOM,535,ANG MO KIO AVE 5,01 TO 03,91.0,NEW GENERATION,1980,70500
202826,1998-07,PASIR RIS,5 ROOM,553,PASIR RIS ST 51,07 TO 09,123.0,IMPROVED,1992,370000
14535,1991-02,YISHUN,3 ROOM,778,YISHUN AVE 2,10 TO 12,74.0,MODEL A,1989,56000
104379,1995-10,SERANGOON,3 ROOM,124,SERANGOON NTH AVE 1,01 TO 03,73.0,NEW GENERATION,1985,131000
83879,1994-12,TAMPINES,5 ROOM,298,TAMPINES ST 22,10 TO 12,130.0,IMPROVED,1985,265000


# 2. Baseline Model (Decision Tree)
A simple/initial model that you compare your later/more complex models against. It is basically the benchmark for your problem statement.

In [7]:
#Decision Tree in SKLearn don't take in string well. So we use a label encoder to change that string to a numeric value
for column in df.columns:
    if df[column].dtype == type(object):
        #Create the label encoder
        le = preprocessing.LabelEncoder()
        #Convert the non numeric data to numeric
        df[column] = le.fit_transform(df[column])

In [8]:
# Randomly sample 5 records with .sample(5)
df.sample(5)

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price
193830,100,10,4,42,121,1,117.0,11,1977,330000
183301,97,17,3,460,261,2,105.0,5,1990,305000
266599,114,21,3,581,303,0,104.0,5,1992,248000
137866,81,1,2,581,27,0,68.0,8,1980,195000
140697,81,25,2,24,404,1,64.0,10,1986,165000


### Building Decision Tree Model

In [9]:
df['flat_type'].unique()

array([0, 2, 3, 4, 1, 5, 6])

In [10]:
#split dataset in features and target variable
feature_cols = ['month','town','block','street_name','storey_range','floor_area_sqm','flat_model','lease_commence_date','resale_price']
X = df[feature_cols]
#y = df[['flat_type']]
y = df['flat_type']

In [11]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) # 80% training and 20% test

In [12]:
# initialise Decision Tree
#Using gini currently, can also use entropy or miclassification error.
clf = DecisionTreeClassifier(criterion='gini',random_state=0) 

# train model
clf.fit(X_train, y_train)

DecisionTreeClassifier(random_state=0)

In [13]:
# get predictions and compare results
predictions = clf.predict(X_test)
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})
results_df.sample(10)

Unnamed: 0,Actual,Predicted
64597,3,3
184883,3,3
138980,2,2
129604,3,3
208612,2,2
176715,4,4
79660,4,4
270961,5,5
65428,4,4
268763,3,3


In [35]:
# Accuracy
accuracy = accuracy_score(y_test,predictions)
print("Accuracy:",accuracy)

Accuracy: 0.9993036211699164


In [36]:
# Precision = (True Positive)/(Total Predicted Positive)
# To determine if costs of False Positive is high.
precision = metrics.precision_score(y_test, predictions, average='micro')
precision1 = metrics.precision_score(y_test, predictions, average='macro')
precision2 = metrics.precision_score(y_test, predictions, average='weighted')

#Recall = (True Positive)/(Total Actual Positive)
# calculates the no. of Actual Positives our model capture through labeling it as Positive
recall = metrics.recall_score(y_test, predictions, average='micro')
recall1 = metrics.recall_score(y_test, predictions, average='macro')
recall2 = metrics.recall_score(y_test, predictions, average='weighted')

#F1-score = 2* (precision*recall)/(precision + recall)
# F1 Score is needed when you want to seek a balance between Precision and Recall.
f_measure = metrics.f1_score(y_test, predictions, average='micro')
f_measure1 = metrics.f1_score(y_test, predictions, average='macro')
f_measure2 = metrics.f1_score(y_test, predictions, average='weighted')

# https://towardsdatascience.com/accuracy-precision-recall-or-f1-331fb37c5cb9
print("Precision_micro:",precision)
print("Recall_micro:",recall)
print("F-measure_micro:",f_measure)

print("Precision1_macro:",precision1)
print("Recall1_macro:",recall1)
print("F-measure1_macro:",f_measure1)

print("Precision2_weighted:",precision2)
print("Recall2_weighted:",recall2)
print("F-measure2_weighted:",f_measure2)

Precision: 0.9993036211699164
Recall: 0.9993036211699164
F-measure: 0.9993036211699164
Precision1: 0.9945692936025551
Recall1: 0.9990440600463923
F-measure1: 0.9967700896889797
Precision2: 0.9993047478626799
Recall2: 0.9993036211699164
F-measure2: 0.9993038938254236


In [38]:
###################################################################################################################################


In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=0)


In [40]:
#Create the kNN classifier and set the number of neighbors to be 3. Note that you can tune this number of neighbors
model = KNeighborsClassifier(n_neighbors=3)

#Fit the training feature Xs and training label Ys
model.fit(X_train,y_train)

KNeighborsClassifier(n_neighbors=3)

In [44]:
y_pred = model.predict(X_test)

In [33]:
#installed graphviz & pydotplus
from sklearn.tree import export_graphviz
import pydotplus
from sklearn.externals.six import StringIO
from IPython.display import Image

feature_cols = ['month','town','block','street_name','storey_range','floor_area_sqm','flat_model','lease_commence_date','resale_price']


ModuleNotFoundError: No module named 'sklearn.externals.six'

In [37]:
# Find the confusion matrix of the result
cm = confusion_matrix(y_pred, y_test)
print(cm)

# Find the accuracy and F1 score of the result
asr = accuracy_score(y_pred, y_test)
f1 = f1_score(y_pred, y_test,average='micro')

# testing out different scores.
f2 = precision_score(y_pred, y_test,pos_label='positive',average='macro')
f3 = recall_score(y_pred, y_test,pos_label='positive',average='weighted')

# testing out differet form of accuracy.
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html
# f2 = f1_score(y_pred, y_test,average='macro')
# f3 = f1_score(y_pred, y_test,average='weighted')

#The best f1_score is 1 and the worst value is 0.

print('accuracy: ',asr)
print('f1-Score: ',f1)
print('precision: ',f2)
print('recall: ',f3)

[[  165     0     0     0     0     0     0]
 [    0   857     4     0     0     0     0]
 [    0     2 22711     6     0     0     0]
 [    0     0    16 19806     0     0     0]
 [    0     0     0     0  9478     0     0]
 [    0     0     0     0     0  4332     0]
 [    0     0     0     0     0     2    61]]
accuracy:  0.9994777158774373
f1-Score:  0.9994777158774373
precision:  0.9994325059089217
recall:  0.9994777158774373




## Bagging (with Decision Tree)

Refer to the following links on for detail explanation on the implementation:

In [25]:
#Create the Bagging classifier. Default base classifiers is Decision Tree. 
# - n_estimator is the number of base classifiers (i.e. weak learners)
model = BaggingClassifier(n_estimators=50)

#Fit the training feature Xs and training label Ys
model.fit(X_train, y_train)

BaggingClassifier(n_estimators=50)

In [26]:
#Use the trained model to predict the test data
y_pred = model.predict(X_test)

In [43]:
# Find the confusion matrix of the result
cm = confusion_matrix(y_pred, y_test)
print(cm)

# Find the accuracy and F1 score of the result
asr = accuracy_score(y_pred, y_test)
f1 = f1_score(y_pred, y_test,average='micro')

# testing out differet form of accuracy.
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html
# f2 = f1_score(y_pred, y_test,average='macro')
# f3 = f1_score(y_pred, y_test,average='weighted')
f2 = precision_score(y_pred, y_test,pos_label='positive',average='micro')
f3 = recall_score(y_pred, y_test,pos_label='positive',average='micro')
#The best f1_score is 1 and the worst value is 0.

print('accuracy: ',asr)
print('f1-Score: ',f1)
print('precision: ',f2)
print('recall: ',f3)

[[  165     0     0     0     0     0     0]
 [    0   857     4     0     0     0     0]
 [    0     2 22711     6     0     0     0]
 [    0     0    16 19806     0     0     0]
 [    0     0     0     0  9478     0     0]
 [    0     0     0     0     0  4332     0]
 [    0     0     0     0     0     2    61]]
accuracy:  0.9994777158774373
f1-Score:  0.9994777158774373
precision:  0.9994777158774373
recall:  0.9994777158774373


In [28]:
# get predictions and compare results
predictions = model.predict(X_test)
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})
results_df.sample(10)

Unnamed: 0,Actual,Predicted
205536,5,5
6638,0,0
90530,2,2
106084,2,2
52036,2,2
212010,5,5
26219,1,1
54762,4,4
119423,2,2
160095,2,2


## Bagging (with kNN)

In [None]:
#Create the kNN base classifier
knn = KNeighborsClassifier(n_neighbors=5)

#Create the Bagging classifier. Default base classifiers is Decision Tree. 
# - n_estimator is the number of base classifiers (i.e. weak learners)
model = BaggingClassifier(n_estimators=50, base_estimator=knn)

#Fit the training feature Xs and training label Ys
model.fit(X_train, y_train)

In [42]:
# Find the confusion matrix of the result
cm = confusion_matrix(y_pred, y_test)
print(cm)

# Find the accuracy and F1 score of the result
asr = accuracy_score(y_pred, y_test)
f1 = f1_score(y_pred, y_test,average='micro')

# testing out differet form of accuracy.
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html
# f2 = f1_score(y_pred, y_test,average='macro')
# f3 = f1_score(y_pred, y_test,average='weighted')
f2 = precision_score(y_pred, y_test,pos_label='positive',average='micro')
f3 = recall_score(y_pred, y_test,pos_label='positive',average='micro')
#The best f1_score is 1 and the worst value is 0.

print('accuracy: ',asr)
print('f1-Score: ',f1)
print('precision: ',f2)
print('recall: ',f3)

[[  165     0     0     0     0     0     0]
 [    0   857     4     0     0     0     0]
 [    0     2 22711     6     0     0     0]
 [    0     0    16 19806     0     0     0]
 [    0     0     0     0  9478     0     0]
 [    0     0     0     0     0  4332     0]
 [    0     0     0     0     0     2    61]]
accuracy:  0.9994777158774373
f1-Score:  0.9994777158774373
precision:  0.9994777158774373
recall:  0.9994777158774373


## AdaBoost (with Decision Tree)


In [34]:
#Create the AdaBoost classifier. Default base classifiers is Decision Tree. 
# - n_estimator is the number of base classifiers (i.e. weak learners)
# - learning_rate controls the weight adjustments of each base classifiers. Default is 1
model = AdaBoostClassifier(n_estimators=50,learning_rate=0.1)

#Fit the training feature Xs and training label Ys
model.fit(X_train, y_train)

AdaBoostClassifier(learning_rate=0.1)

In [41]:
# Find the confusion matrix of the result
cm = confusion_matrix(y_pred, y_test)
print(cm)

# Find the accuracy and F1 score of the result
# Find the accuracy and F1 score of the result
asr = accuracy_score(y_pred, y_test)
f1 = f1_score(y_pred, y_test,average='micro')

# testing out differet form of accuracy.
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html
# f2 = f1_score(y_pred, y_test,average='macro')
# f3 = f1_score(y_pred, y_test,average='weighted')
f2 = precision_score(y_pred, y_test,pos_label='positive',average='micro')
f3 = recall_score(y_pred, y_test,pos_label='positive',average='micro')
#The best f1_score is 1 and the worst value is 0.

print('accuracy: ',asr)
print('f1-Score: ',f1)
print('precision: ',f2)
print('recall: ',f3)

[[  165     0     0     0     0     0     0]
 [    0   857     4     0     0     0     0]
 [    0     2 22711     6     0     0     0]
 [    0     0    16 19806     0     0     0]
 [    0     0     0     0  9478     0     0]
 [    0     0     0     0     0  4332     0]
 [    0     0     0     0     0     2    61]]
accuracy:  0.9994777158774373
f1-Score:  0.9994777158774373
precision:  0.9994777158774373
recall:  0.9994777158774373




## AdaBoost (with Gaussian Navie Bayes)
Refer to the following links on for detail explanation on the implementation:

In [114]:
model = GaussianNB()
#Fit the training feature Xs and training label Ys
model.fit(X_train, y_train)

GaussianNB()

In [119]:
#Use the trained model to predict the test data
y_pred = model.predict(X_test)
# Find the confusion matrix of the result
cm = confusion_matrix(y_pred, y_test)
print(cm)

# Find the accuracy and F1 score of the result
asr = accuracy_score(y_pred, y_test)
f1 = f1_score(y_pred, y_test,average='micro')

# testing out differet form of accuracy.
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html
f2 = f1_score(y_pred, y_test,average='macro')
f3 = f1_score(y_pred, y_test,average='weighted')

#The best f1_score is 1 and the worst value is 0.

print('accuracy: ',asr)
print('f1-Score: ',f1)
print('precision: ',f2)
print('recall: ',f3)

[[  403     0     0     0     0     0     0]
 [    2  1784   317     0     0     0     0]
 [    0   256 54464  2713     0     0     0]
 [    0     0  1795 45692   686     0     0]
 [    0     0    24   743 22577    37    15]
 [    0     0     9     0   945 10859    94]
 [    0     0    19     0     0   131    35]]
asr:  0.9457799442896936
f1:  0.9457799442896936
f2:  0.8372868390668112
f3:  0.945701144101097


In [120]:
nb = GaussianNB()

model = AdaBoostClassifier(n_estimators=50,learning_rate=1, base_estimator=nb)
#model = BaggingClassifier(n_estimators=50, base_estimator=knn)


#Fit the training feature Xs and training label Ys
model.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=GaussianNB(), learning_rate=1)

In [124]:
#Use the trained model to predict the test data
y_pred = model.predict(X_test)
# Find the confusion matrix of the result
cm = confusion_matrix(y_pred, y_test)
print(cm)

# Find the accuracy and F1 score of the result
asr = accuracy_score(y_pred, y_test)
f1 = f1_score(y_pred, y_test,average='micro')

# testing out differet form of accuracy.
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html
f2 = f1_score(y_pred, y_test,average='macro')
f3 = f1_score(y_pred, y_test,average='weighted')

#The best f1_score is 1 and the worst value is 0.

print('asr: ',asr)
print('f1: ',f1)
print('f2: ',f2)
print('f3: ',f3)

[[  336     0     0     0     0     0     0]
 [   51  1340   883     0     0     0     0]
 [   18   700 54470 14346   154     0     0]
 [    0     0  1256 30205  3582     0     0]
 [    0     0    14  4591 17629  1430    17]
 [    0     0     5     6  2843  9588   106]
 [    0     0     0     0     0     9    21]]
asr:  0.7910097493036212
f1:  0.7910097493036212
f2:  0.6998741020796836
f3:  0.7981375336134957
