# House Price Prediction Model Using Decision Tree Regression

Essentially, the aim is to:

- Identify the variables affecting house prices, e.g., the area, the number of rooms, the number of bathrooms, etc.
- Create a decision tree regression model that quantitatively relates house prices to variables such as the number of rooms, the area, and the number of bathrooms; and
- Know the variables that significantly contribute towards predicting house prices.

### Data Preparation and Understanding

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt 

import warnings  
# Silence every warning for the rest of the session to keep the cell output tidy.
# NOTE(review): this also hides deprecation notices and pandas chained-assignment warnings.
warnings.filterwarnings('ignore')

In [None]:
# Load the housing dataset from the CSV file in the working directory.
df = pd.read_csv('housing.csv')

In [None]:
# Preview the first rows.
df.head()

In [None]:
# Preview the last rows.
df.tail()

In [None]:
# (rows, columns) of the full dataset.
df.shape

In [None]:
# Column labels.
df.columns

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Missing-value count per column.
df.isnull().sum()

#### Handle Categorical Variables

In [None]:
# Binary yes/no columns are recoded as 1/0 so the model receives numeric inputs.
# Any value other than 'yes'/'no' has no mapping entry and becomes NaN.
cat_col = [
    'mainroad',
    'guestroom',
    'basement',
    'hotwaterheating',
    'airconditioning',
    'prefarea',
]
df[cat_col] = df[cat_col].apply(lambda column: column.map({'yes': 1, 'no': 0}))


In [None]:
# Encode furnishing status as integer codes
# (1 = furnished, 2 = semi-furnished, 3 = unfurnished).
df['furnishingstatus'] = df['furnishingstatus'].map({
    'furnished': 1,
    'semi-furnished': 2,
    'unfurnished': 3,
})


In [None]:
# Check the frame after encoding the categorical columns.
df.head()

In [None]:
# Histogram of the encoded furnishing-status values.
plt.figure(figsize=(5,5))
 
df.furnishingstatus.hist()

In [None]:
df.head()

In [None]:
# (rows, columns) after encoding.
df.shape

In [None]:
# Model building
import sklearn 
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score , confusion_matrix,r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Hold out 30% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
# Explicit copies make both frames independent of `df`, so the column assignments
# performed on them later do not rely on (warning-suppressed) chained assignment.
df_train, df_test = (
    part.copy()
    for part in train_test_split(df, train_size=0.7, test_size=0.3, random_state=100)
)


In [None]:
# Peek at the training split.
df_train.head()

In [None]:

# Peek at the test split.
df_test.head()

In [None]:
# Shape of the training split.
df_train.shape

In [None]:
# Shape of the test split.
df_test.shape

In [None]:
# Scale the training dataset: fit the min-max scaler on the training rows only,
# then separate the target from the features.
num_col = ['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking']
scaler = MinMaxScaler().fit(df_train[num_col])
df_train[num_col] = scaler.transform(df_train[num_col])
# pop() removes 'price' from df_train in place, so X_train (the same object)
# ends up holding only the feature columns.
X_train = df_train
y_train = X_train.pop('price')

In [None]:
# Create the testing data: reuse the scaler fitted on the training split
# (transform only, no refit) so no test-set statistics enter the scaling.
df_test[num_col] = scaler.transform(df_test[num_col])
# pop() removes 'price' from df_test in place; X_test is that same object.
X_test = df_test
y_test = X_test.pop('price')

In [None]:
# Training frame after scaling ('price' has been popped out above).
df_train.head()

In [None]:
# Test frame after scaling ('price' has been popped out above).
df_test.head()

##### Building the Decision Tree Regressor

In [None]:
# Annotated heatmap of the pairwise correlations between the training features.
plt.figure(figsize=(10, 8))
sns.heatmap(data=X_train.corr(), annot=True)
plt.show()

In [None]:
# Base regressor handed to the grid search; the fixed random_state makes the
# fitted tree reproducible.
dt = DecisionTreeRegressor(
    max_depth=4,
    min_samples_leaf=10,
    random_state=100,
)

In [None]:
# Decision trees split on feature thresholds, so the features do not need scaling;
# start again from a fresh split of `df` and rescale only the target afterwards.
np.random.seed(0)
# Explicit copies make both frames independent of `df`, so the column assignments
# that follow do not rely on (warning-suppressed) chained assignment.
df_train, df_test = (
    part.copy()
    for part in train_test_split(df, train_size=0.7, test_size=0.3, random_state=100)
)
df_train.shape, df_test.shape


In [None]:
# Fit the min-max scaler on the training prices only, then apply that same
# mapping to both splits so the test target is scaled consistently.
scaler = MinMaxScaler().fit(df_train[['price']])
df_train[['price']] = scaler.transform(df_train[['price']])
df_test[['price']] = scaler.transform(df_test[['price']])

In [None]:
# Confirm the shapes of both splits after scaling the target.
df_train.shape,df_test.shape

In [None]:
# Inspect the scaled target values of both splits.
df_train.price,df_test.price

In [None]:
# Separate the target from the features. pop() removes 'price' from each frame
# in place, so X_train / X_test (the same objects) keep only the feature columns.
y_train, y_test = df_train.pop('price'), df_test.pop('price')
X_train, X_test = df_train, df_test


In [None]:
# Hyper-parameter grid: every max_depth / min_samples_leaf combination is
# scored on R^2 with 4-fold cross-validation.
params = dict(
    max_depth=[2, 3, 4, 5, 6, 7, 8, 10, 15, 20, 30],
    min_samples_leaf=[10, 15, 20, 30, 40, 50, 100],
)
gridSearch = GridSearchCV(
    estimator=dt,
    param_grid=params,
    scoring='r2',
    cv=4,
    verbose=True,
)

In [None]:
# Fit the DT model: runs the cross-validated search over every parameter combination.
gridSearch.fit(X_train,y_train)

In [None]:
# Collect the cross-validation results into a DataFrame for inspection.
res = pd.DataFrame(gridSearch.cv_results_)

In [None]:
res.head()

In [None]:
# Five parameter combinations with the highest mean cross-validated R^2.
res.nlargest(5,'mean_test_score')

In [None]:
# Best mean cross-validated R^2 found by the search.
gridSearch.best_score_