## Import modules

In [1]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np # linear algebra

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(root, file_name)
    for root, _subdirs, files in os.walk('/kaggle/input')
    for file_name in files
)
for path in input_files:
    print(path)

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

## Import dataset

In [2]:
# Load the housing data set; the path is relative to the notebook's working directory.
df = pd.read_csv('../input/housing/Housing.csv')

In [4]:
# Summary of the frame: column names, dtypes and non-null counts.
df.info()

In [5]:
# Descriptive statistics (count, mean, std, quartiles, ...) of the numeric columns.
df.describe()

In [6]:
# Count the missing values in each column.
df.isnull().sum()

There are no null values

## Looking for outliers in the numerical data

### 1. Price

In [8]:
# Box plot of the `price` column to spot outliers.
sns.boxplot(data=df, x='price')

### 2. Area

In [9]:
# Box plot of the `area` column to spot outliers.
sns.boxplot(data=df, x='area')

### 3. Bedrooms

In [10]:
# Box plot of the `bedrooms` column to spot outliers.
sns.boxplot(data=df, x='bedrooms')

### 4. Bathrooms

In [11]:
# Box plot of the `bathrooms` column to spot outliers.
sns.boxplot(data=df, x='bathrooms')

### 5. Stories

In [12]:
# Box plot of the `stories` column to spot outliers.
sns.boxplot(data=df, x='stories')

### 6. Parking

In [13]:
# Box plot of the `parking` column to spot outliers.
sns.boxplot(data=df, x='parking')

## Understanding data features

In [14]:
# List the column names available in the data set.
df.columns

In [15]:
# Preview the first rows of the raw data.
df.head()

## Convert all categorical features to numerical values

In [18]:
# Columns holding 'yes'/'no' strings that are encoded as 1/0 further down.
df_categorical = [
    'mainroad',
    'guestroom',
    'basement',
    'hotwaterheating',
    'airconditioning',
    'prefarea',
]

In [19]:
# Encode the binary 'yes'/'no' columns as 1/0.
#
# The result is assigned back to the frame explicitly: calling an inplace
# method on `df[x]` operates on a temporary Series, which raises a warning on
# recent pandas and leaves `df` untouched once Copy-on-Write is enabled.
yes_no_codes = {'no': 0, 'yes': 1}
for x in df_categorical:
    if pd.api.types.is_numeric_dtype(df[x]):
        # Already encoded (e.g. the cell was run twice) -> nothing to do.
        continue
    # Any value other than 'yes'/'no' maps to NaN, so the integer cast fails
    # loudly instead of letting an unexpected category slip through.
    df[x] = df[x].map(yes_no_codes).astype('int64')

In [20]:
# Preview the data again to check the encoded yes/no columns.
df.head()

In [21]:
# How often each furnishing category occurs.
df['furnishingstatus'].value_counts()

In [22]:
# Encode `furnishingstatus` as an ordered integer (unfurnished < semi < furnished).
#
# The encoded column is assigned back explicitly: an inplace `replace` on
# `df.furnishingstatus` acts on a temporary Series, which raises a warning on
# recent pandas and leaves `df` untouched once Copy-on-Write is enabled.
furnishing_codes = {'furnished': 2, 'semi-furnished': 1, 'unfurnished': 0}
if not pd.api.types.is_numeric_dtype(df['furnishingstatus']):
    # Unknown categories map to NaN, so the integer cast fails loudly rather
    # than silently producing missing values. The dtype guard makes a re-run
    # of this cell a no-op.
    df['furnishingstatus'] = df['furnishingstatus'].map(furnishing_codes).astype('int64')

In [23]:
# Preview the data again to check the encoded `furnishingstatus` column.
df.head()

All columns should now be numerical; let's confirm this with `dtypes`.

In [24]:
# Show the dtype of every column.
df.dtypes

We want to predict the price from the other features, so let's build some models.

## Train Test Split

In [26]:
from sklearn.model_selection import train_test_split

# Target is the price; the features are every remaining column.
y = df['price']
X = df.drop(columns='price')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

In [27]:
# Number of rows in the full data set (before splitting).
len(df)

## Model

### 1. KNN

In [31]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# KNN is distance-based, so a feature with a large numeric range (presumably
# `area`) would dominate the 0/1-encoded columns if left unscaled. The scaler
# lives in the same pipeline as the regressor, so it is fitted on the training
# rows only and applied automatically whenever `knn.predict` is called.
knn = make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=5))
knn.fit(X_train, y_train)

# In-sample predictions: these are made on the rows the model was fitted on.
y_pred_knn = knn.predict(X_train)

In [33]:
# Training-set MSE for KNN. Arguments follow scikit-learn's (y_true, y_pred)
# order; the value is the same either way because squared error is symmetric.
mean_squared_error(y_train, y_pred_knn)

### 2. Random Forest

In [35]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 80 trees and a fixed seed; n_jobs=-1 trains on all cores.
# `fit` returns the estimator itself, so construction and fitting are chained.
rf = RandomForestRegressor(
    n_estimators=80,
    n_jobs=-1,
    random_state=21,
).fit(X_train, y_train)

# In-sample predictions: these are made on the rows the model was fitted on.
y_pred_rf = rf.predict(X_train)

In [36]:
# Training-set MSE for the random forest. Arguments follow scikit-learn's
# (y_true, y_pred) order; the value is the same either way because squared
# error is symmetric.
mean_squared_error(y_train, y_pred_rf)

### 3. AdaBoost

In [37]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost with 80 boosting rounds, a 0.7 learning rate and a fixed seed.
# `fit` returns the estimator itself, so construction and fitting are chained.
boost = AdaBoostRegressor(
    n_estimators=80,
    learning_rate=0.7,
    random_state=21,
).fit(X_train, y_train)

# In-sample predictions: these are made on the rows the model was fitted on.
y_pred_boost = boost.predict(X_train)

In [38]:
# Training-set MSE for AdaBoost. Arguments follow scikit-learn's
# (y_true, y_pred) order; the value is the same either way because squared
# error is symmetric.
mean_squared_error(y_train, y_pred_boost)

## Visualizing

In [46]:
# Results table: one row per model, one column per data split.
# An explicit float dtype keeps the NaN placeholders, and the errors written
# into the table later, numeric instead of generic Python objects, so sorting
# and plotting do not depend on dtype inference.
mse = pd.DataFrame(
    index=['KNN', 'RF', 'AdaBoost'],
    columns=['train', 'test'],
    dtype=float,
)

In [47]:
# Display the (still empty) results table.
mse

In [48]:
# Map each row label of the results table to its fitted estimator.
model_dict = dict(KNN=knn, RF=rf, AdaBoost=boost)

In [49]:
# Fill the results table: for every model, the MSE on each split, divided by
# 1e3 so the stored numbers are smaller.
splits = {'train': (X_train, y_train), 'test': (X_test, y_test)}
for name, model in model_dict.items():
    for split, (features, target) in splits.items():
        predictions = model.predict(features)
        mse.loc[name, split] = mean_squared_error(target, y_pred=predictions) / 1e3

In [50]:
# Display the filled results table (values are MSE divided by 1e3).
mse

In [52]:
# Bar chart of both errors per model, ordered by training error (worst first).
fig, ax = plt.subplots()
(
    mse
    .astype(float)  # the table may hold object-dtype cells; plot needs numbers
    .sort_values(by='train', ascending=False)
    .plot(kind='bar', ax=ax, zorder=1)
)
# Title and axis labels so the figure can be read on its own and told apart
# from the chart that is sorted by test error.
ax.set_title('Model error, sorted by training MSE')
ax.set_xlabel('Model')
ax.set_ylabel('MSE / 1e3')
# Grid is drawn with a lower zorder than the bars so it stays behind them.
ax.grid(zorder=0)

In [53]:
# Bar chart of both errors per model, ordered by test error (worst first).
fig, ax = plt.subplots()
(
    mse
    .astype(float)  # the table may hold object-dtype cells; plot needs numbers
    .sort_values(by='test', ascending=False)
    .plot(kind='bar', ax=ax, zorder=1)
)
# Title and axis labels so the figure can be read on its own and told apart
# from the chart that is sorted by training error.
ax.set_title('Model error, sorted by test MSE')
ax.set_xlabel('Model')
ax.set_ylabel('MSE / 1e3')
# Grid is drawn with a lower zorder than the bars so it stays behind them.
ax.grid(zorder=0)

## Conclusion

Based on the MSE comparison above, KNN is the best model for predicting house prices on this dataset.