# Multilayer Perceptron

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
from sklearn import metrics

import statsmodels.api as sm
from statsmodels.stats import diagnostic as diag
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.datasets import make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

In [3]:
data = pd.read_csv("property24.csv")

data.head()

Unnamed: 0,Type,Location,Bedrooms,Bathrooms,Floor,Floors Area,Lot Area,Garage,Garden,Pet Friendly,Reservation Fee,Price,Link
0,House and Lot,Las Pinas City,2,1,2,67.0,55.0,1,yes,yes,0,3900001,2 bedroom House / Lot for sale in Las Pinas Ci...
1,House and Lot,Las Pinas City,3,2,2,78.0,58.0,1,yes,yes,30000,4987000,3 bedroom House / Lot for sale in Las Pinas Ci...
2,House and Lot,Quezon City,3,2,2,65.0,60.0,1,yes,yes,50000,4200000,3 bedroom House / Lot for sale in Quezon City ...
3,Townhouse,Las Pinas City,3,2,2,70.0,52.0,1,yes,yes,30000,4480000,3 bedroom Townhouse for sale in Las Pinas City...
4,House and Lot,Quezon City,3,2,2,65.0,48.0,1,yes,yes,50000,3800000,3 bedroom House / Lot for sale in Quezon City ...


### Check for Nulls

In [4]:
display('-'*100)
display(data.isnull().any())

'----------------------------------------------------------------------------------------------------'

Type               False
Location           False
Bedrooms           False
Bathrooms          False
Floor              False
Floors Area        False
Lot Area           False
Garage             False
Garden             False
Pet Friendly       False
Reservation Fee    False
Price              False
Link               False
dtype: bool

### Drop not needed column/s

In [5]:
prefinal_data = data.drop(['Link'], axis = 1)

prefinal_data.head()

Unnamed: 0,Type,Location,Bedrooms,Bathrooms,Floor,Floors Area,Lot Area,Garage,Garden,Pet Friendly,Reservation Fee,Price
0,House and Lot,Las Pinas City,2,1,2,67.0,55.0,1,yes,yes,0,3900001
1,House and Lot,Las Pinas City,3,2,2,78.0,58.0,1,yes,yes,30000,4987000
2,House and Lot,Quezon City,3,2,2,65.0,60.0,1,yes,yes,50000,4200000
3,Townhouse,Las Pinas City,3,2,2,70.0,52.0,1,yes,yes,30000,4480000
4,House and Lot,Quezon City,3,2,2,65.0,48.0,1,yes,yes,50000,3800000


### Identify unique values of the categorical columns

In [6]:
type_values = prefinal_data['Type'].unique()
location_values = prefinal_data['Location'].unique()
garden_values = prefinal_data['Garden'].unique()
pet_friendly_values = prefinal_data['Pet Friendly'].unique()

print('Categorical Unique Values')

print('-'*100)
print('Location unique values: ')
print(location_values)

print('-'*100)
print('Type unique values: ')
print(type_values)

print('-'*100)
print('Garden: ')
print(garden_values)

print('-'*100)
print('Pet Friendly: ')
print(pet_friendly_values)

Categorical Unique Values
----------------------------------------------------------------------------------------------------
Location unique values: 
['Las Pinas City' 'Quezon City' 'Pasig City' 'Muntinlupa City'
 'Marikina City' 'Paranaque City' 'Caloocan City' 'Taguig City'
 'Manila City']
----------------------------------------------------------------------------------------------------
Type unique values: 
['House and Lot' 'Townhouse']
----------------------------------------------------------------------------------------------------
Garden: 
['yes' 'no']
----------------------------------------------------------------------------------------------------
Pet Friendly: 
['yes' 'no']


### Check for duplicates

In [7]:
# removed duplicates
num_duplicates = len(prefinal_data) - len(prefinal_data.drop_duplicates(keep=False))

# before removing duplicates
print('Before:')
print('duplicates: {0}'.format(num_duplicates))
print('total instances: ', len(prefinal_data))

# removed duplicates
remaining_data = prefinal_data.drop_duplicates()

# afer removing
print('-'*100)
print('After:')
print('total instances: ', len(remaining_data))

Before:
duplicates: 73
total instances:  365
----------------------------------------------------------------------------------------------------
After:
total instances:  325


### Variable Creation

In [8]:
# encode categorical variables
dummy_type = pd.get_dummies(remaining_data['Type'], prefix='Type')
dummy_location = pd.get_dummies(remaining_data['Location'], prefix='Location')
dummy_pet_friendly = pd.get_dummies(remaining_data['Pet Friendly'], prefix='Pet Friendly')
dummy_garden = pd.get_dummies(remaining_data['Garden'], prefix='Garden')


cleaned_df = remaining_data.drop(['Location', 'Type', 'Pet Friendly', 'Garden'], axis=1)

# only include n-1 for the created columns for the categorical variable
# to avoid dummy variable trap

cleaned_df['Type_Townhouse'] = dummy_type['Type_Townhouse']

cleaned_df['Location_Quezon City'] = dummy_location['Location_Quezon City']
cleaned_df['Location_Pasig City'] = dummy_location['Location_Pasig City']
cleaned_df['Location_Muntinlupa City'] = dummy_location['Location_Muntinlupa City']
cleaned_df['Location_Marikina City'] = dummy_location['Location_Marikina City']
cleaned_df['Location_Paranaque City'] = dummy_location['Location_Paranaque City']
cleaned_df['Location_Caloocan City'] = dummy_location['Location_Caloocan City']
cleaned_df['Location_Taguig City'] = dummy_location['Location_Taguig City']
cleaned_df['Location_Manila City'] = dummy_location['Location_Manila City']

cleaned_df['Type_Townhouse'] = dummy_type['Type_Townhouse']

cleaned_df['Garden_yes'] = dummy_garden['Garden_yes']
cleaned_df['Pet_Friendly_yes'] = dummy_pet_friendly['Pet Friendly_yes']

# Location
#   1. Las Pinas City
#   2. Quezon City
#   3. Pasig City
#   4. Muntinlupa City
#   5. Marikina City
#   6. Paranaque City
#   7. Caloocan City
#   8. Taguig City
#   9. Manila City

# Type
#   1. House and Lot
#   2. Townhouse

# Garden
#   1. Yes
#   2. No

# Pet Friendly
#   1. Yes
#   2. No

cleaned_df.head()

Unnamed: 0,Bedrooms,Bathrooms,Floor,Floors Area,Lot Area,Garage,Reservation Fee,Price,Type_Townhouse,Location_Quezon City,Location_Pasig City,Location_Muntinlupa City,Location_Marikina City,Location_Paranaque City,Location_Caloocan City,Location_Taguig City,Location_Manila City,Garden_yes,Pet_Friendly_yes
0,2,1,2,67.0,55.0,1,0,3900001,0,0,0,0,0,0,0,0,0,1,1
1,3,2,2,78.0,58.0,1,30000,4987000,0,0,0,0,0,0,0,0,0,1,1
2,3,2,2,65.0,60.0,1,50000,4200000,0,1,0,0,0,0,0,0,0,1,1
3,3,2,2,70.0,52.0,1,30000,4480000,1,0,0,0,0,0,0,0,0,1,1
4,3,2,2,65.0,48.0,1,50000,3800000,0,1,0,0,0,0,0,0,0,1,1


### Convert data type

In [9]:
cleaned_df = cleaned_df.astype(float)

### Variance Inflation Factor

In [10]:
# Price is drop because it is dependent variable
cleaned_df_check = cleaned_df.drop(columns=['Price'])

# the VFI does expect a constant term in the data, so we need to add one using the add_constant method
X1 = sm.tools.add_constant(cleaned_df_check)

# create the series for before the drop
series = pd.Series([variance_inflation_factor(X1.values, i) for i in range(X1.shape[1])], index=X1.columns)

# display the series
print('VIF score')
print('-'*100)
display(series)

VIF score
----------------------------------------------------------------------------------------------------


const                       71.960705
Bedrooms                     2.101989
Bathrooms                    1.341946
Floor                        1.239830
Floors Area                  1.049454
Lot Area                     1.149684
Garage                       1.038128
Reservation Fee              1.114123
Type_Townhouse               1.105361
Location_Quezon City         1.448772
Location_Pasig City          1.136628
Location_Muntinlupa City     1.040956
Location_Marikina City       1.463591
Location_Paranaque City      1.181385
Location_Caloocan City       1.431717
Location_Taguig City         1.215829
Location_Manila City         1.094219
Garden_yes                   1.276444
Pet_Friendly_yes             1.254238
dtype: float64

### Remove the outliers

In [11]:
# before removing outliers
print('Before:')
print('total instances: {0}'.format(len(cleaned_df)))

# filter the data frame to remove the values exceeding 3 standard deviations
cleaned_remove_df = cleaned_df[(np.abs(stats.zscore(cleaned_df)) < 3).all(axis=1)]

# what rows were removed
cleaned_outliers_df = cleaned_df.index.difference(cleaned_remove_df.index)

# total outliers
total_outliers = len(cleaned_outliers_df)

# assign the cleaned data (without outliers)
cleaned_df = cleaned_remove_df

# after removing outliers
print('-'*100)
print('After:')
print('outliers: ', cleaned_outliers_df.values)
print('total outliers index: ', len(cleaned_outliers_df))
print('cleaned instances: ', len(cleaned_df))

Before:
total instances: 325
----------------------------------------------------------------------------------------------------
After:
outliers:  [  8  11  17  26  28  30  33  44  45  49  54  56  67  71  72  74  84  86
  89  90  98 101 111 117 120 122 124 134 138 142 151 166 182 184 185 189
 192 199 203 230 258 265 288 300 301 306 314 315 325 331 332 334 341 342
 347 348 353]
total outliers index:  57
cleaned instances:  268


### Define the Xs and Y

In [12]:
# define our input variable (X) & output variable
X = cleaned_df.drop('Price', axis = 1)
Y = cleaned_df[['Price']]

# Generate a random n-class classification problem
X, Y = make_classification(n_samples=100, random_state=1)

# Split X and y into X_
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=1)

### Multi-layer Perceptron classifier

In [13]:
# max_iter = Maximum number of iterations, given 300 iterations.

clf = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train)

In [14]:
# Probability estimates.
clf.predict_proba(X_test[:1])

array([[0.96005677, 0.03994323]])

In [15]:
# Predict using the multi-layer perceptron classifier
clf.predict(X_test[:5, :])

array([0, 0, 0, 0, 0])

In [17]:
# accuracy percentage
accuracy_train = clf.score(X_train,y_train)
accuracy_test = clf.score(X_test,y_test)

print('Accuracy Train: {0}%'.format(accuracy_train * 100))
print('Accuracy Test: {0}%'.format(accuracy_test * 100))

Accuracy Train: 100.0%
Accuracy Test: 90.0%
