# Aparments and Houses Price Prediction
### Authors: 
Cagampang, Joseph Donee Y.<br>
Gucio, Maria Angelica<br>
Mondejar, Yanni Jan<br>
Rosalijos, Joshua<br>
Verdida, Kenneth Mae<br>

#### Date: March 18, 2021

In [1]:
# import libraries
import numpy as np
import pandas as pd

from scipy import stats

In [2]:
# read the dataset
dataset_df = pd.read_csv('apartments-houses.csv')

# display the first five rows
display(dataset_df.head())

Unnamed: 0,Type,Location,Bedrooms,Bathrooms,Floors,Lot Area,Usable Area,Price
0,House,Luzon,7,7,2.0,450.0,700.0,150000000
1,House,Luzon,5,5,3.0,157.0,380.0,18000000
2,House,Luzon,5,5,2.0,709.0,1000.0,160000000
3,House,Luzon,5,3,3.0,233.0,595.0,120000000
4,House,Luzon,3,3,2.0,157.0,300.0,15000000


### Data Cleaning

#### 1. Remove null values

In [3]:
# count total rows with null value
num_null =  dataset_df.isna().sum().sum()

# before removing null values
print('Before:')
print('null values: {0}'.format(num_null))
print('total instances: ', len(dataset_df))

# display what columns has null values
display(dataset_df.isnull().any())

# drop nulls
dataset_df = dataset_df.dropna()

# redisplay
display('-'*100)
display(dataset_df.isnull().any())

# after removing
num_null =  dataset_df.isna().sum().sum()

print('After:')
print('null values: {0}'.format(num_null))
print('total instances: ', len(dataset_df))

Before:
null values: 1
total instances:  300


Type           False
Location       False
Bedrooms       False
Bathrooms      False
Floors         False
Lot Area        True
Usable Area    False
Price          False
dtype: bool

'----------------------------------------------------------------------------------------------------'

Type           False
Location       False
Bedrooms       False
Bathrooms      False
Floors         False
Lot Area       False
Usable Area    False
Price          False
dtype: bool

After:
null values: 0
total instances:  299


#### 2. Remove duplicates

In [4]:
# removed duplicates
num_duplicates = len(dataset_df) - len(dataset_df.drop_duplicates(keep=False))

# before removing duplicates
print('Before:')
print('duplicates: {0}'.format(num_duplicates))
print('total instances: ', len(dataset_df))

# removed duplicates
dataset_df = dataset_df.drop_duplicates()

# afer removing
print('-'*100)
print('After:')
print('total instances: ', len(dataset_df))

Before:
duplicates: 27
total instances:  299
----------------------------------------------------------------------------------------------------
After:
total instances:  285


#### 3. Encoding Categorical

In [5]:
# encode categorical variables
dummy_type = pd.get_dummies(dataset_df.Type, prefix='Type')
dummy_location = pd.get_dummies(dataset_df.Location, prefix='Location')

dataset_df = dataset_df.drop(['Location', 'Type' ], axis=1)

# only include n-1 for the created columns for the categorical variable
dataset_df['Type_House'] = dummy_type['Type_House']
dataset_df['Location_Visayas'] = dummy_location['Location_Visayas']
dataset_df['Location_Mindanao'] = dummy_location['Location_Mindanao']

#### 4. Remove outliers

In [6]:
# before removing outliers
print('Before:')
print('total instances: {0}'.format(len(dataset_df)))

# filter the data frame to remove the values exceeding 3 standard deviations
dataset_remove_df = dataset_df[(np.abs(stats.zscore(dataset_df)) < 3).all(axis=1)]

# what rows were removed
dataset_outliers_df = dataset_df.index.difference(dataset_remove_df.index)

# total outliers
total_outliers = len(dataset_outliers_df)

# assign the cleaned data (without outliers)
dataset_df = dataset_remove_df

# after removing outliers
print('-'*100)
print('After:')
print('outliers: ', dataset_outliers_df.values)
print('total outliers index: ', len(dataset_outliers_df))
print('cleaned instances: ', len(dataset_df))

Before:
total instances: 285
----------------------------------------------------------------------------------------------------
After:
outliers:  [  0   2   7   8  28  33  37 177 191 194 220 251 253 256 258 260 261 265
 274 277 279 291]
total outliers index:  22
cleaned instances:  263
