# Apartments and Houses Price Prediction
### Authors: 
Cagampang, Joseph Donee Y.<br>
Gucio, Maria Angelica<br>
Mondejar, Yanni Jan<br>
Rosalijos, Joshua<br>
Verdida, Kenneth Mae<br>

#### Date: March 18, 2021

In [1]:
# import libraries
import numpy as np
import pandas as pd
import seaborn as sns 
from scipy import stats
import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.stats import diagnostic as diag
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


from sklearn.metrics import confusion_matrix 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the dataset from the CSV file in the current working directory.
dataset_df = pd.read_csv('property24.csv')

# Preview the first five rows as a quick sanity check of the load.
display(dataset_df.head(5))

FileNotFoundError: [Errno 2] No such file or directory: 'property24.csv'

### Data Cleaning

#### Check null values

In [None]:
# Total number of missing cells across the whole frame (cells, not rows).
num_null = dataset_df.isna().values.sum()

# Snapshot before anything is dropped.
print('Before:')
print(f'null values: {num_null}')
print('total instances: ', len(dataset_df))

# Per-column flag: True when the column holds at least one null.
display(dataset_df.isna().any())

# Discard every row that contains at least one null.
dataset_df = dataset_df.dropna(axis=0, how='any')

# Show the per-column flags again; all should now be False.
display('-' * 100)
display(dataset_df.isna().any())

# Recount after the drop.
num_null = dataset_df.isna().values.sum()

print('After:')
print(f'null values: {num_null}')
print('total instances: ', len(dataset_df))

#### Drop unneeded column/s

In [None]:
# The 'Link' column is not used in the analysis, so remove it.
dataset_df = dataset_df.drop(columns=['Link'])

#### Identify unique values of the categorical columns

In [None]:
# Distinct values of each categorical column.
type_values, location_values, garden_values, pet_friendly_values = (
    dataset_df[column].unique()
    for column in ('Type', 'Location', 'Garden', 'Pet Friendly')
)

print('Categorical Unique Values')

# Each section is printed as: separator, heading, values.
for heading, values in (
    ('Location unique values: ', location_values),
    ('Type unique values: ', type_values),
    ('Garden: ', garden_values),
    ('Pet Friendly: ', pet_friendly_values),
):
    print('-' * 100)
    print(heading)
    print(values)

#### Check for duplicates

In [None]:
# Count the rows that drop_duplicates() will actually remove: every repeat
# after the first occurrence of a row. Counting with keep=False would also
# include the first occurrence of each duplicated row and overstate the
# number of rows removed below.
num_duplicates = int(dataset_df.duplicated().sum())

# before removing duplicates
print('Before:')
print('duplicates: {0}'.format(num_duplicates))
print('total instances: ', len(dataset_df))

# remove duplicates, keeping the first occurrence of each row
dataset_df = dataset_df.drop_duplicates()

# after removing
print('-'*100)
print('After:')
print('total instances: ', len(dataset_df))

#### Variable Creation

In [None]:
# One-hot encode every categorical column; generated column names have the
# form "<prefix>_<value>".
dummy_type = pd.get_dummies(dataset_df['Type'], prefix='Type')
dummy_location = pd.get_dummies(dataset_df['Location'], prefix='Location')
dummy_pet_friendly = pd.get_dummies(dataset_df['Pet Friendly'], prefix='Pet Friendly')
dummy_garden = pd.get_dummies(dataset_df['Garden'], prefix='Garden')

# Start from the frame without the raw categorical columns.
cleaned_df = dataset_df.drop(columns=['Location', 'Type', 'Pet Friendly', 'Garden'])

# Add back only n-1 indicator columns per categorical variable so the
# indicators are not perfectly collinear (dummy variable trap).
cleaned_df['Type_Townhouse'] = dummy_type['Type_Townhouse']

for city in (
    'Quezon City',
    'Pasig City',
    'Muntinlupa City',
    'Marikina City',
    'Paranaque City',
    'Caloocan City',
    'Taguig City',
    'Manila City',
):
    location_column = 'Location_' + city
    cleaned_df[location_column] = dummy_location[location_column]

cleaned_df['Garden_yes'] = dummy_garden['Garden_yes']
cleaned_df['Pet_Friendly_yes'] = dummy_pet_friendly['Pet Friendly_yes']


# Category levels, listed for reference.
#
# Location
#   1. Las Pinas City
#   2. Quezon City
#   3. Pasig City
#   4. Muntinlupa City
#   5. Marikina City
#   6. Paranaque City
#   7. Caloocan City
#   8. Taguig City
#   9. Manila City
#
# Type
#   1. House and Lot
#   2. Townhouse
#
# Garden
#   1. Yes
#   2. No
#
# Pet Friendly
#   1. Yes
#   2. No

#### Convert data type

In [None]:
# Cast every column to 64-bit float.
cleaned_df = cleaned_df.astype('float64')

#### Check Multicollinearity

##### 1. Correlation Matrix

In [None]:
# Pairwise correlation between all columns of the cleaned frame.
corr = cleaned_df.corr()

# Show the matrix as a table.
display(corr)

# Show the same matrix as a heatmap with column names on both axes.
axis_labels = corr.columns
sns.heatmap(
    corr,
    xticklabels=axis_labels,
    yticklabels=axis_labels,
    cmap='RdBu',
)

##### 2. Variance Inflation Factor

In [None]:
# Price is the dependent variable, so it is left out of the VIF check.
cleaned_df_check = cleaned_df.drop('Price', axis=1)

# Add an explicit constant column before computing the VIF scores.
X1 = sm.tools.add_constant(cleaned_df_check)

# One VIF score per column of X1 (constant included), labelled by column name.
design_matrix = X1.values
vif_scores = [
    variance_inflation_factor(design_matrix, position)
    for position in range(design_matrix.shape[1])
]
series = pd.Series(vif_scores, index=X1.columns)

# Report the scores.
print('VIF score')
print('-' * 100)
display(series)

#### Check Outliers

In [None]:
# Row count before filtering.
print('Before:')
print(f'total instances: {len(cleaned_df)}')

# Keep only the rows whose absolute z-score is below 3 in every column.
# NOTE(review): the z-score is computed over all columns, including the 0/1
# indicator columns - confirm that filtering on those is intended.
absolute_z = np.abs(stats.zscore(cleaned_df))
within_three_sd = (absolute_z < 3).all(axis=1)
cleaned_remove_df = cleaned_df[within_three_sd]

# Index labels of the rows that were filtered out.
cleaned_outliers_df = cleaned_df.index.difference(cleaned_remove_df.index)

# Number of rows filtered out.
total_outliers = len(cleaned_outliers_df)

# Continue with the filtered frame.
cleaned_df = cleaned_remove_df

# Row counts after filtering.
print('-' * 100)
print('After:')
print('outliers: ', cleaned_outliers_df.values)
print('total outliers index: ', len(cleaned_outliers_df))
print('cleaned instances: ', len(cleaned_df))

cleaned_df
    
1. Check Null Values
2. Drop not needed column/s
3. Identify unique values of the categorical columns
4. Check for duplicates
5. Variable Creation
6. Check Multicollinearity
7. Check Outliers

In [None]:
# Function importing Dataset
def importdata():
    """Return the cleaned dataset after printing its length and shape.

    Reads the module-level ``cleaned_df`` and returns that same object
    (no copy is made).
    """
    frame = cleaned_df

    # Report the size of the data the model will be built on.
    print ("Dataset Length: ", len(frame))
    print ("Dataset Shape: ", frame.shape)

    return frame


In [None]:
  
# Function to split the dataset
def splitdataset(proerty):
    """Separate features from target and make a 70/30 train/test split.

    Column 0 of ``proerty.values`` is taken as the target and columns 1-4
    as the features.
    NOTE(review): this presumes the target is the first column and that only
    the next four columns are wanted as predictors - confirm against the
    column order of the frame being passed in.

    Returns:
        Tuple ``(X, Y, X_train, X_test, y_train, y_test)``.
    """
    raw = proerty.values
    Y = raw[:, 0]
    X = raw[:, 1:5]

    # Fixed random_state keeps the partition reproducible between runs.
    split_parts = train_test_split(X, Y, test_size=0.3, random_state=100)
    X_train, X_test, y_train, y_test = split_parts

    return X, Y, X_train, X_test, y_train, y_test

In [None]:
# Function to perform training with giniIndex.
def train_using_gini(X_train, X_test, y_train):
    """Fit a Gini-criterion decision tree on the training split.

    ``X_test`` is accepted but not used.

    Returns:
        The fitted ``DecisionTreeClassifier``.
    """
    tree_settings = {
        "criterion": "gini",
        "random_state": 100,
        "max_depth": 3,
        "min_samples_leaf": 5,
    }
    gini_tree = DecisionTreeClassifier(**tree_settings)
    gini_tree.fit(X_train, y_train)
    return gini_tree


In [None]:
# Function to fit a giniIndex tree on the test split.
def test_using_gini(X_test,X_train, y_test):
    """Fit a Gini-criterion decision tree on ``X_test`` / ``y_test``.

    ``X_train`` is accepted but not used.
    NOTE(review): the model is trained on the arrays named "test"; scoring it
    on that same data will not measure how well it generalises.

    Returns:
        The fitted ``DecisionTreeClassifier``.
    """
    tree_settings = {
        "criterion": "gini",
        "random_state": 100,
        "max_depth": 3,
        "min_samples_leaf": 5,
    }
    gini_tree = DecisionTreeClassifier(**tree_settings)
    gini_tree.fit(X_test, y_test)
    return gini_tree

In [None]:
      
# Function to perform training with entropy.
def tarin_using_entropy(X_train, X_test, y_train):
    """Fit an entropy-criterion decision tree on the training split.

    ``X_test`` is accepted but not used.

    Returns:
        The fitted ``DecisionTreeClassifier``.
    """
    tree_settings = {
        "criterion": "entropy",
        "random_state": 100,
        "max_depth": 3,
        "min_samples_leaf": 5,
    }
    entropy_tree = DecisionTreeClassifier(**tree_settings)
    entropy_tree.fit(X_train, y_train)
    return entropy_tree


In [None]:
      
# Function to fit an entropy tree on the test split.
def test_using_entropy(X_test,X_train, y_test):
    """Fit an entropy-criterion decision tree on ``X_test`` / ``y_test``.

    ``X_train`` is accepted but not used.
    NOTE(review): the model is trained on the arrays named "test"; scoring it
    on that same data will not measure how well it generalises.

    Returns:
        The fitted ``DecisionTreeClassifier``.
    """
    tree_settings = {
        "criterion": "entropy",
        "random_state": 100,
        "max_depth": 3,
        "min_samples_leaf": 5,
    }
    entropy_tree = DecisionTreeClassifier(**tree_settings)
    entropy_tree.fit(X_test, y_test)
    return entropy_tree


In [None]:
# Function to make predictions on the test split
def test_prediction(X_test, clf_object):
    """Predict with ``clf_object`` on ``X_test``, print and return the result.

    ``clf_object`` only needs a ``predict`` method.
    """
    predictions = clf_object.predict(X_test)

    # Echo the predictions so they appear in the cell output.
    print("Test predicted values:")
    print(predictions)

    return predictions


In [None]:
# Function to make predictions on the training split
def train_prediction(X_train, clf_object):
    """Predict with ``clf_object`` on ``X_train``, print and return the result.

    ``clf_object`` only needs a ``predict`` method.
    """
    predictions = clf_object.predict(X_train)

    # Echo the predictions so they appear in the cell output.
    print("Train Predicted values:")
    print(predictions)

    return predictions


In [None]:
      
# Function to calculate accuracy
def cal_accuracy(y_test, y_train, y_pred, train_pred_gini):
    """Print evaluation metrics for one classifier.

    Args:
        y_test: true labels of the test split.
        y_train: true labels of the training split.
        y_pred: predictions for the test split.
        train_pred_gini: predictions for the training split (despite the
            name, callers also pass entropy-tree predictions here).

    Prints the test confusion matrix, the test and train accuracy as
    percentages, and the classification report for the test split.
    """
    print("Test Confusion Matrix: ")
    test_matrix = confusion_matrix(y_test, y_pred)
    print(test_matrix)

    # accuracy_score returns a fraction; scale to a percentage.
    test_accuracy = accuracy_score(y_test, y_pred) * 100
    print ("Test accuracy : ", test_accuracy)

    train_accuracy = accuracy_score(y_train, train_pred_gini) * 100
    print ("Train accuracy : ", train_accuracy)

    report_text = classification_report(y_test, y_pred)
    print("Report : ", report_text)


In [None]:
  
# Driver code
def main():
    """Train Gini and entropy decision trees and report their accuracy.

    Each tree is fitted on the training split only and is then scored on
    both the training split and the held-out test split, so the reported
    test accuracy reflects data the model has not seen during fitting.
    """
    # Building Phase
    data = importdata()
    _, _, X_train, X_test, y_train, y_test = splitdataset(data)

    # Fit on the training split only. A model fitted on the test split and
    # then scored on that same split would overstate the test accuracy.
    clf_gini = train_using_gini(X_train, X_test, y_train)
    clf_entropy = tarin_using_entropy(X_train, X_test, y_train)

    # Operational Phase
    print("\nResults Using Gini Index:")

    # Prediction using gini
    test_pred_gini = test_prediction(X_test, clf_gini)
    train_pred_gini = train_prediction(X_train, clf_gini)
    cal_accuracy(y_test, y_train, test_pred_gini, train_pred_gini)

    print("\nResults Using Entropy:")

    # Prediction using entropy
    test_pred_entropy = test_prediction(X_test, clf_entropy)
    train_pred_entropy = train_prediction(X_train, clf_entropy)
    cal_accuracy(y_test, y_train, test_pred_entropy, train_pred_entropy)


In [None]:
# Run the driver only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()
    

In [None]:
# Names of the predictor columns.
FEATURE_NAMES = [
    'Bedrooms',
    'Bathrooms',
    'Floor',
    'Floors Area',
    'Lot Area',
    'Garage',
    'Reservation Fee',
    'Type_Townhouse',
    'Location_Quezon City',
    'Location_Pasig City',
    'Location_Muntinlupa City',
    'Location_Marikina City',
    'Location_Paranaque City',
    'Location_Caloocan City',
    'Location_Taguig City',
    'Location_Manila City',
    'Garden_yes',
    'Pet_Friendly_yes',
]

# Predictors are every column except 'Price'; the target is 'Price' itself.
X = cleaned_df.drop(columns=['Price'])
y = cleaned_df['Price']


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with the library's default settings, fitted on all of X and y.
model = DecisionTreeClassifier()
model.fit(X, y)

In [None]:
from sklearn.tree import export_graphviz

# Write the fitted tree to a Graphviz .dot file. The node labels are taken
# from the columns the model was fitted on, so they cannot drift out of sync
# with (or differ in length from) a separately maintained list of names.
export_graphviz(model, out_file='tree.dot', feature_names=list(X.columns))

In [None]:
from subprocess import check_call

# Render tree.dot to tree.png with the Graphviz `dot` executable; a non-zero
# exit status raises CalledProcessError.
dot_command = ['dot', '-Tpng', 'tree.dot', '-o', 'tree.png']
check_call(dot_command)

In [None]:
import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 20))
plt.imshow(img)