In [105]:
import re
import io

#General libraries needed
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Libraries for data pre-processing
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import preprocessing
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

#For Decision Tree implementation
from scipy.stats import entropy
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn import tree

#For KNN implementation
from sklearn.neighbors import KNeighborsClassifier

#For Bagging implementation
from sklearn.ensemble import BaggingClassifier

#For AdaBoost implementation
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

#Settings
%matplotlib inline
pd.options.display.float_format = '{:.2f}'.format
# np.set_printoptions(threshold=np.nan)
sns.set()


In [72]:
df = pd.read_csv('resale-flat-prices-based-on-approval-date-1990-1999.csv')

In [73]:
# Check the columns using dtypes
print(df.dtypes)

month                   object
town                    object
flat_type               object
block                   object
street_name             object
storey_range            object
floor_area_sqm         float64
flat_model              object
lease_commence_date      int64
resale_price             int64
dtype: object


In [74]:
# Randomly sample 5 records with .sample(5)
df.sample(5)

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price
120430,1996-04,BEDOK,3 ROOM,135,BEDOK NTH ST 2,01 TO 03,67.0,NEW GENERATION,1978,162000
236196,1999-02,GEYLANG,4 ROOM,348,UBI AVE 1,10 TO 12,84.0,SIMPLIFIED,1986,191000
253888,1999-05,JURONG EAST,4 ROOM,319,JURONG EAST ST 31,10 TO 12,104.0,MODEL A,1984,250000
273928,1999-09,BEDOK,5 ROOM,146,BEDOK RESERVOIR RD,13 TO 15,122.0,IMPROVED,1986,313000
161239,1997-06,HOUGANG,4 ROOM,461,HOUGANG AVE 10,10 TO 12,105.0,MODEL A,1992,380000


# 2. Baseline Model (Decision Tree)
A simple/initial model that you compare your later/more complex models against. It is basically the benchmark for your problem statement.

In [75]:
#Decision Tree in SKLearn don't take in string well. So we use a label encoder to change that string to a numeric value
for column in df.columns:
    if df[column].dtype == type(object):
        #Create the label encoder
        le = preprocessing.LabelEncoder()
        #Convert the non numeric data to numeric
        df[column] = le.fit_transform(df[column])

In [76]:
# Randomly sample 5 records with .sample(5)
df.sample(5)

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price
218219,105,22,3,935,346,2,84.0,10,1987,227000
148253,84,18,4,225,116,6,123.0,11,1976,500000
3220,2,16,2,694,237,1,65.0,2,1976,38000
236811,109,13,3,413,191,2,103.0,5,1985,216000
111939,72,1,2,48,26,0,67.0,8,1978,133500


### Building Decision Tree Model
Let's create a Decision Tree Model using Scikit-learn.

In [77]:
df['flat_type'].unique()

array([0, 2, 3, 4, 1, 5, 6])

In [84]:
#split dataset in features and target variable
feature_cols = ['month','town','block','street_name','storey_range','floor_area_sqm','flat_model','lease_commence_date','resale_price']
X = df[feature_cols]
#y = df[['flat_type']]
y = df['flat_type']

In [89]:
# Split dataset into training set and test set
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test

###################################################################################################################################


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=0)


In [90]:
# # Create Decision Tree classifer object
# clf = DecisionTreeClassifier()

# # Train Decision Tree Classifer
# clf = clf.fit(X_train,y_train)

# #Predict the response for test dataset
# y_pred = clf.predict(X_test)

###################################################################################################################################

#Create the kNN classifier and set the number of neighbors to be 3. Note that you can tune this number of neighbors
model = KNeighborsClassifier(n_neighbors=3)

#Fit the training feature Xs and training label Ys
model.fit(X_train,y_train)

KNeighborsClassifier(n_neighbors=3)

In [87]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9993964716805942


In [88]:
#installed graphviz & pydotplus

In [91]:
#Use the trained model to predict the test data
y_pred = model.predict(X_test)

In [108]:
# Find the confusion matrix of the result
cm = confusion_matrix(y_pred, y_test)
print(cm)

# Find the accuracy and F1 score of the result
asr = accuracy_score(y_pred, y_test)
f1 = f1_score(y_pred, y_test,average='micro')

# testing out different scores.
f2 = precision_score(y_pred, y_test,pos_label='positive',average='macro')
f3 = recall_score(y_pred, y_test,pos_label='positive',average='weighted')

# testing out differet form of accuracy.
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html
# f2 = f1_score(y_pred, y_test,average='macro')
# f3 = f1_score(y_pred, y_test,average='weighted')

#The best f1_score is 1 and the worst value is 0.

print('asr: ',asr)
print('f1: ',f1)
print('f2: ',f2)
print('f3: ',f3)

[[  256    48    53    14     0     0     0]
 [   29   753   426    45     2     0     0]
 [  116  1204 52148  6056   609    76     1]
 [    4    35  3881 40186  5817   746    28]
 [    0     0    86  2810 15479  3001    35]
 [    0     0    34    37  2296  7192    76]
 [    0     0     0     0     5    12     4]]
asr:  0.807924791086351
f1:  0.807924791086351
f2:  0.5798811706072399
f3:  0.807924791086351




## Bagging (with Decision Tree)

Refer to the following links on for detail explanation on the implementation:

In [109]:
#Create the Bagging classifier. Default base classifiers is Decision Tree. 
# - n_estimator is the number of base classifiers (i.e. weak learners)
model = BaggingClassifier(n_estimators=50)

#Fit the training feature Xs and training label Ys
model.fit(X_train, y_train)

BaggingClassifier(n_estimators=50)

In [110]:
#Use the trained model to predict the test data
y_pred = model.predict(X_test)

In [118]:
# Find the confusion matrix of the result
cm = confusion_matrix(y_pred, y_test)
print(cm)

# Find the accuracy and F1 score of the result
asr = accuracy_score(y_pred, y_test)
f1 = f1_score(y_pred, y_test,average='micro')

# testing out differet form of accuracy.
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html
f2 = f1_score(y_pred, y_test,average='macro')
f3 = f1_score(y_pred, y_test,average='weighted')

#The best f1_score is 1 and the worst value is 0.

print('asr: ',asr)
print('f1: ',f1)
print('f2: ',f2)
print('f3: ',f3)

[[  403     0     0     0     0     0     0]
 [    2  1784   317     0     0     0     0]
 [    0   256 54464  2713     0     0     0]
 [    0     0  1795 45692   686     0     0]
 [    0     0    24   743 22577    37    15]
 [    0     0     9     0   945 10859    94]
 [    0     0    19     0     0   131    35]]
asr:  0.9457799442896936
f1:  0.9457799442896936
f2:  0.8372868390668112
f3:  0.945701144101097


## AdaBoost (with Gaussian Navie Bayes)
Refer to the following links on for detail explanation on the implementation:

In [114]:
model = GaussianNB()
#Fit the training feature Xs and training label Ys
model.fit(X_train, y_train)

GaussianNB()

In [119]:
#Use the trained model to predict the test data
y_pred = model.predict(X_test)
# Find the confusion matrix of the result
cm = confusion_matrix(y_pred, y_test)
print(cm)

# Find the accuracy and F1 score of the result
asr = accuracy_score(y_pred, y_test)
f1 = f1_score(y_pred, y_test,average='micro')

# testing out differet form of accuracy.
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html
f2 = f1_score(y_pred, y_test,average='macro')
f3 = f1_score(y_pred, y_test,average='weighted')

#The best f1_score is 1 and the worst value is 0.

print('asr: ',asr)
print('f1: ',f1)
print('f2: ',f2)
print('f3: ',f3)

[[  403     0     0     0     0     0     0]
 [    2  1784   317     0     0     0     0]
 [    0   256 54464  2713     0     0     0]
 [    0     0  1795 45692   686     0     0]
 [    0     0    24   743 22577    37    15]
 [    0     0     9     0   945 10859    94]
 [    0     0    19     0     0   131    35]]
asr:  0.9457799442896936
f1:  0.9457799442896936
f2:  0.8372868390668112
f3:  0.945701144101097


In [120]:
nb = GaussianNB()

model = AdaBoostClassifier(n_estimators=50,learning_rate=1, base_estimator=nb)
#model = BaggingClassifier(n_estimators=50, base_estimator=knn)


#Fit the training feature Xs and training label Ys
model.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=GaussianNB(), learning_rate=1)

In [124]:
#Use the trained model to predict the test data
y_pred = model.predict(X_test)
# Find the confusion matrix of the result
cm = confusion_matrix(y_pred, y_test)
print(cm)

# Find the accuracy and F1 score of the result
asr = accuracy_score(y_pred, y_test)
f1 = f1_score(y_pred, y_test,average='micro')

# testing out differet form of accuracy.
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html
f2 = f1_score(y_pred, y_test,average='macro')
f3 = f1_score(y_pred, y_test,average='weighted')

#The best f1_score is 1 and the worst value is 0.

print('asr: ',asr)
print('f1: ',f1)
print('f2: ',f2)
print('f3: ',f3)

[[  336     0     0     0     0     0     0]
 [   51  1340   883     0     0     0     0]
 [   18   700 54470 14346   154     0     0]
 [    0     0  1256 30205  3582     0     0]
 [    0     0    14  4591 17629  1430    17]
 [    0     0     5     6  2843  9588   106]
 [    0     0     0     0     0     9    21]]
asr:  0.7910097493036212
f1:  0.7910097493036212
f2:  0.6998741020796836
f3:  0.7981375336134957
