In [11]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/bins.csv


In [12]:
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
from scipy.io import arff
import pandas as pd

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [13]:
import pandas as pd
df = pd.read_csv("../input/bins.csv")
df1 = df.iloc[:,1:8]
df1.head()

Unnamed: 0,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,0.272926,0.730594,0.009513,1.0,0.616941,0.719323,Medium
1,0.272926,0.445205,0.043809,0.9,0.584949,0.711451,Medium
2,1.0,0.303653,0.083315,0.5,0.671231,0.758896,Medium
3,0.909389,0.303653,0.083315,0.5,0.671231,0.758896,Moderately High
4,0.181223,0.114155,0.056799,0.5,0.573194,0.743153,Medium


In [14]:
#feature selection
column_names = ['X1 transaction date','X2 house age','X3 distance to the nearest MRT station','X4 number of convenience stores','X5 latitude','X6 longitude','Y house price of unit area']
def get_feature_names(X, col = column_names):
  try:
    mask = X.get_support() #list of booleans
  except AttributeError:
    mask = X.support_  #Boruta has different attributes from scikit-learn API
  new_features = [] # The list of your K best features
  for bool, feature in zip(mask, col):
    if bool:
      new_features.append(feature)
  return new_features

In [15]:
!pip install Boruta



In [16]:
#To divide data into attributes and labels
X = df1.drop('Y house price of unit area',axis=1).values
y = df1['Y house price of unit area'].values

In [17]:
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
#Boruta feature selection
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

boruta_selector = BorutaPy(rf, n_estimators='auto', random_state=1)
boruta_selector.fit(X, y)
boruta_set = boruta_selector.transform(X)

print(get_feature_names(boruta_selector))

['X2 house age', 'X3 distance to the nearest MRT station', 'X5 latitude', 'X6 longitude']


In [18]:
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.ensemble import ExtraTreesClassifier
#L1-based feature selection
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False, max_iter = 2000).fit(X, y)
l_model = SelectFromModel(lsvc, prefit=True)
l1_set = l_model.transform(X)
print(get_feature_names(l_model))

#tree-based feature selection
clf = ExtraTreesClassifier(n_estimators=50)
clf = clf.fit(X, y)
tb_model = SelectFromModel(clf, prefit=True)
tr_set = tb_model.transform(X)
print(get_feature_names(tb_model))

['X4 number of convenience stores']
['X2 house age', 'X3 distance to the nearest MRT station', 'X6 longitude']


In [19]:
from collections import Counter
from imblearn.over_sampling import RandomOverSampler

#rebalance the dataset using oversampling (random oversampling)
ros = RandomOverSampler(random_state=42)

ros_boruta_set, ros_boruta_labels = ros.fit_resample(boruta_set, y)
print("Class distribution of oversampling with train_set_boruta " + str(sorted(Counter(ros_boruta_labels).items())))

ros_tr_set, ros_tr_labels = ros.fit_resample(tr_set, y)
print("Class distribution of oversampling with train_set_tr " + str(sorted(Counter(ros_tr_labels).items())))

Using TensorFlow backend.


Class distribution of oversampling with train_set_boruta [('High', 226), ('Low', 226), ('Medium', 226), ('Moderately High', 226)]
Class distribution of oversampling with train_set_tr [('High', 226), ('Low', 226), ('Medium', 226), ('Moderately High', 226)]


In [20]:
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
#rebalance the dataset using undersampling (nearest neightbours)
renn = RepeatedEditedNearestNeighbours()

renn_boruta_set, renn_boruta_labels = renn.fit_resample(boruta_set, y)
print("Class distribution of undersampling with boruta_set " + str(sorted(Counter(renn_boruta_labels).items())))

renn_tr_set, renn_tr_labels = renn.fit_resample(tr_set, y)
print("Class distribution of undersampling with tr_set " + str(sorted(Counter(renn_tr_labels).items())))

Class distribution of undersampling with boruta_set [('High', 4), ('Low', 84), ('Medium', 123), ('Moderately High', 23)]
Class distribution of undersampling with tr_set [('High', 4), ('Low', 80), ('Medium', 112), ('Moderately High', 23)]


In [21]:
#split the dataset into train and test set
from sklearn.model_selection import train_test_split

#1. split original_set and original_labels
original_set_train, original_set_test, original_labels_train, original_labels_test = train_test_split(X, y, test_size=0.2,random_state=42)

#2. split ros_boruta_set and ros_boruta_labels
ros_boruta_set_train, ros_boruta_set_test, ros_boruta_labels_train, ros_boruta_labels_test = train_test_split(ros_boruta_set,ros_boruta_labels, test_size=0.2, random_state=42)

#3. split ros_tr_set and ros_tr_labels
ros_tr_set_train, ros_tr_set_test, ros_tr_labels_train, ros_tr_labels_test = train_test_split(ros_tr_set,ros_tr_labels, test_size=0.2, random_state=42)

#4. split renn_boruta_set and renn_boruta_labels
renn_boruta_set_train, renn_boruta_set_test, renn_boruta_labels_train, renn_boruta_labels_test = train_test_split(renn_boruta_set,renn_boruta_labels, test_size=0.2, random_state=42)

#5. split renn_tr_set and renn_tr_labels
renn_tr_set_train, renn_tr_set_test, renn_tr_labels_train, renn_tr_labels_test = train_test_split(renn_tr_set,renn_tr_labels, test_size=0.2, random_state=42)

In [22]:
#define the dateset list
train_set_list = [original_set_train,ros_boruta_set_train, ros_tr_set_train,renn_boruta_set_train, renn_tr_set_train]
train_labels_list = [original_labels_train,ros_boruta_labels_train, ros_tr_labels_train,renn_boruta_labels_train, renn_tr_labels_train]
test_set_list = [original_set_test,ros_boruta_set_test,ros_tr_set_test,renn_boruta_set_test,renn_tr_set_test]
test_labels_list = [original_labels_test,ros_boruta_labels_test,ros_tr_labels_test,renn_boruta_labels_test,renn_tr_labels_test]
dataset_name_list = ["original","ros_boruta","ros_tr","renn_boruta","renn_tr"]

In [35]:
#Tree models
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report,confusion_matrix

dtc = DecisionTreeClassifier()

In [37]:
for i in range(5):
  #train the model against 5 dataset
  dtc.fit(train_set_list[i],train_labels_list[i])
  
  dtc_predicted = dtc.predict(test_set_list[i])
  #dtc_confusion = confusion_matrix(test_labels_list[i], dtc_predicted)

  print(classification_report(test_labels_list[i], dtc_predicted))

  print('Micro-averaged precision = {:.2f} (treat instances equally)'
      .format(precision_score(test_labels_list[i], dtc_predicted, average = 'micro')))
  print('Macro-averaged precision = {:.2f} (treat classes equally)'
      .format(precision_score(test_labels_list[i], dtc_predicted, average = 'macro')))

  print('Micro-averaged f1 = {:.2f} (treat instances equally)'
        .format(f1_score(test_labels_list[i], dtc_predicted, average = 'micro')))
  print('Macro-averaged f1 = {:.2f} (treat classes equally)'
        .format(f1_score(test_labels_list[i], dtc_predicted, average = 'macro')))
  print("--------------------------------------------------------------------------")


                 precision    recall  f1-score   support

            Low       0.83      0.86      0.84        28
         Medium       0.83      0.80      0.81        44
Moderately High       0.67      0.73      0.70        11

       accuracy                           0.81        83
      macro avg       0.78      0.79      0.78        83
   weighted avg       0.81      0.81      0.81        83

Micro-averaged precision = 0.81 (treat instances equally)
Macro-averaged precision = 0.78 (treat classes equally)
Micro-averaged f1 = 0.81 (treat instances equally)
Macro-averaged f1 = 0.78 (treat classes equally)
--------------------------------------------------------------------------
                 precision    recall  f1-score   support

           High       0.92      1.00      0.96        47
            Low       0.86      0.95      0.90        40
         Medium       0.90      0.59      0.71        46
Moderately High       0.77      0.90      0.83        48

       accuracy       

  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
#Linear Model
from sklearn.svm import SVC

for i in range(5):
  #train the model against 5 dataset
  svm = SVC(kernel = 'linear').fit(train_set_list[i], train_labels_list[i])
  svm_predicted = svm.predict(test_set_list[i])

  print(classification_report(test_labels_list[i], svm_predicted))

  print('Micro-averaged precision = {:.2f}'
      .format(precision_score(test_labels_list[i], svm_predicted, average = 'micro')))
  print('Macro-averaged precision = {:.2f}'
      .format(precision_score(test_labels_list[i], svm_predicted, average = 'macro')))

  print('Micro-averaged f1 = {:.2f}'
        .format(f1_score(test_labels_list[i], svm_predicted, average = 'micro')))
  print('Macro-averaged f1 = {:.2f} '
        .format(f1_score(test_labels_list[i], svm_predicted, average = 'macro')))
  print("--------------------------------------------------------------------------")


                 precision    recall  f1-score   support

            Low       0.88      0.79      0.83        28
         Medium       0.71      0.93      0.80        44
Moderately High       0.00      0.00      0.00        11

       accuracy                           0.76        83
      macro avg       0.53      0.57      0.54        83
   weighted avg       0.67      0.76      0.71        83

Micro-averaged precision = 0.76
Macro-averaged precision = 0.53
Micro-averaged f1 = 0.76
Macro-averaged f1 = 0.54 
--------------------------------------------------------------------------
                 precision    recall  f1-score   support

           High       0.44      0.40      0.42        47
            Low       0.91      0.80      0.85        40
         Medium       0.40      0.04      0.08        46
Moderately High       0.48      0.98      0.64        48

       accuracy                           0.55       181
      macro avg       0.56      0.56      0.50       181
   weig

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [42]:
#Probabilistic models
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

In [43]:
for i in range(5):
  #train the model against 5 dataset
  gnb.fit(train_set_list[i],train_labels_list[i])
  
  gnb_predicted = gnb.predict(test_set_list[i])

  print(classification_report(test_labels_list[i], gnb_predicted))

  print('Micro-averaged precision = {:.2f}'
      .format(precision_score(test_labels_list[i], gnb_predicted, average = 'micro')))
  print('Macro-averaged precision = {:.2f}'
      .format(precision_score(test_labels_list[i], gnb_predicted, average = 'macro')))

  print('Micro-averaged f1 = {:.2f}'
        .format(f1_score(test_labels_list[i], gnb_predicted, average = 'micro')))
  print('Macro-averaged f1 = {:.2f}'
        .format(f1_score(test_labels_list[i], gnb_predicted, average = 'macro')))
  print("--------------------------------------------------------------------------")


                 precision    recall  f1-score   support

            Low       0.86      0.89      0.88        28
         Medium       0.80      0.64      0.71        44
Moderately High       0.32      0.55      0.40        11

       accuracy                           0.71        83
      macro avg       0.66      0.69      0.66        83
   weighted avg       0.76      0.71      0.72        83

Micro-averaged precision = 0.71
Macro-averaged precision = 0.66
Micro-averaged f1 = 0.71
Macro-averaged f1 = 0.66
--------------------------------------------------------------------------
                 precision    recall  f1-score   support

           High       0.73      1.00      0.85        47
            Low       0.92      0.82      0.87        40
         Medium       0.71      0.26      0.38        46
Moderately High       0.64      0.85      0.73        48

       accuracy                           0.73       181
      macro avg       0.75      0.74      0.71       181
   weigh

In [44]:
#Adaboost
from sklearn.ensemble import AdaBoostClassifier

adab = AdaBoostClassifier()

for i in range(5):
  #train the model against 5 dataset
  adab.fit(train_set_list[i],train_labels_list[i])
  
  adab_predicted = adab.predict(test_set_list[i])

  print(classification_report(test_labels_list[i], adab_predicted))

  print('Micro-averaged precision = {:.2f}'
      .format(precision_score(test_labels_list[i], adab_predicted, average = 'micro')))
  print('Macro-averaged precision = {:.2f}'
      .format(precision_score(test_labels_list[i], adab_predicted, average = 'macro')))

  print('Micro-averaged f1 = {:.2f}'
        .format(f1_score(test_labels_list[i], adab_predicted, average = 'micro')))
  print('Macro-averaged f1 = {:.2f}'
        .format(f1_score(test_labels_list[i], adab_predicted, average = 'macro')))
  print("--------------------------------------------------------------------------")


  _warn_prf(average, modifier, msg_start, len(result))


                 precision    recall  f1-score   support

           High       0.00      0.00      0.00         0
            Low       0.86      0.86      0.86        28
         Medium       0.81      0.89      0.85        44
Moderately High       0.80      0.36      0.50        11

       accuracy                           0.81        83
      macro avg       0.62      0.53      0.55        83
   weighted avg       0.83      0.81      0.80        83

Micro-averaged precision = 0.81
Macro-averaged precision = 0.62
Micro-averaged f1 = 0.81
Macro-averaged f1 = 0.55
--------------------------------------------------------------------------
                 precision    recall  f1-score   support

           High       0.86      0.13      0.22        47
            Low       0.74      0.93      0.82        40
         Medium       0.48      0.35      0.41        46
Moderately High       0.52      0.98      0.68        48

       accuracy                           0.59       181
      ma

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                 precision    recall  f1-score   support

           High       0.00      0.00      0.00        47
            Low       0.84      0.65      0.73        40
         Medium       0.44      0.65      0.53        46
Moderately High       0.52      0.90      0.66        48

       accuracy                           0.55       181
      macro avg       0.45      0.55      0.48       181
   weighted avg       0.44      0.55      0.47       181

Micro-averaged precision = 0.55
Macro-averaged precision = 0.45
Micro-averaged f1 = 0.55
Macro-averaged f1 = 0.48
--------------------------------------------------------------------------
                 precision    recall  f1-score   support

            Low       1.00      0.12      0.21        17
         Medium       0.55      0.88      0.68        24
Moderately High       0.57      0.67      0.62         6

       accuracy                           0.57        47
      macro avg       0.71      0.55      0.50        47
   weigh

  _warn_prf(average, modifier, msg_start, len(result))
