## Importing Libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import pandas as pd 
import os
import warnings
warnings.filterwarnings('ignore')

## Data Analysis

In [None]:
# Read the training and test splits from the local data directory.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

In [None]:
# Print a summary of the training frame (columns, non-null counts, dtypes)
# to spot features with missing values before processing.
train_df.info()

In [None]:
# Scatter plot of sale price against living area (GrLivArea).
data = train_df[['SalePrice', 'GrLivArea']]
data.plot.scatter(x='GrLivArea', y='SalePrice', ylim=(0, 900000));

Positive correlation between SalePrice and living area square footage

In [None]:
# Scatter plot of sale price against construction year (YearBuilt).
data = train_df[['SalePrice', 'YearBuilt']]
data.plot.scatter(x='YearBuilt', y='SalePrice');

Exponential-like correlation between SalePrice and YearBuilt

In [None]:
# Distribution of sale price for each overall-quality rating.
data = train_df[['SalePrice', 'OverallQual']]
f, ax = plt.subplots(figsize=(8, 6))
# Draw on the axes created above and cap the y-range at 800k.
fig = sns.boxplot(data=data, x='OverallQual', y='SalePrice', ax=ax)
fig.set_ylim(0, 800000);

Positive correlation between SalePrice and OverallQual

In [None]:
# Pairwise correlation matrix of the numeric features.
# The frame still holds categorical (string) columns at this point, and
# DataFrame.corr() raises on them in pandas >= 2.0 (older releases skipped them
# silently), so restrict to numeric columns explicitly. This form works on
# every pandas version.
corrmat = train_df.select_dtypes(include=[np.number]).corr()
f, ax = plt.subplots(figsize=(12, 9))
# Pass the axes explicitly so the heatmap is drawn on the figure created above.
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax);

Correlation between features

In [None]:
k = 10  # number of variables for heatmap
# Names of the k columns with the largest correlation to SalePrice.
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
# rowvar=False treats each column as one variable (same as transposing first).
cm = np.corrcoef(train_df[cols].values, rowvar=False)
sns.set(font_scale=1.25)
tick_labels = cols.values
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=tick_labels,
    xticklabels=tick_labels,
)
plt.show()

Correlation between top 10 most correlated features with respect to SalePrice

## Data Processing

In [None]:
# Per-column missing-value count and fraction, most affected columns first.
null_mask = train_df.isnull()
null_counts = null_mask.sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat(
    [total, percent],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data.head(25)

In [None]:
# Drop every feature that has more than one missing value. The axis is given by
# keyword (`columns=`) because pandas 2.0 removed the positional `axis`
# argument of DataFrame.drop; the original `drop(labels, 1)` raises there.
sparse_columns = missing_data[missing_data['Total'] > 1].index
train_df = train_df.drop(columns=sparse_columns)
# The surviving columns have at most one missing value each; remove the row(s)
# where Electrical is missing.
train_df = train_df.drop(train_df.loc[train_df['Electrical'].isnull()].index)
# Sanity check: the largest per-column missing count should now be 0.
print("Number of missing data in dataframe:", train_df.isnull().sum().max())

Removing features with more than one missing value, then dropping the rows where Electrical is missing

In [None]:
# One-hot encode the categorical columns of the cleaned training frame;
# numeric columns are passed through unchanged.
# NOTE(review): test_df is not encoded in the visible cells - confirm it gets
# the same treatment (and column alignment) before it is used for prediction.
train = pd.get_dummies(train_df)

Converting categorical data to numerical data

## Trying Base Models

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Target vector.
y = train['SalePrice']
# Feature matrix: remove the target and the row identifier in ONE call.
# The original did two separate `train.drop(...)` assignments, so the second
# one (dropping only 'Id') overwrote the first and left SalePrice inside the
# features - the models were being trained on the value they had to predict.
x = train.drop(columns=['SalePrice', 'Id'])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [None]:
# Candidate hyperparameters for the random-forest search.
n_est_params = list(range(10, 101, 10))  # 10, 20, ..., 100 trees
max_depth_params = list(range(5, 16))    # depths 5 through 15
# Best result found so far, stored as [score, n_estimators, max_depth].
f_model_acc = [0] * 3

In [None]:
# Disabled random-forest grid search, kept as a bare string literal so it is
# not executed (the cell only echoes the string).
# It tries every (n_estimators, max_depth) pair and records the best score in
# f_model_acc as [score, n_estimators, max_depth].
# NOTE(review): the selection uses the score on X_test/y_test, so the chosen
# hyperparameters are tuned on the held-out test data; the score is also
# computed up to three times per combination.
'''for i in n_est_params:
    for j in max_depth_params:
        f_model = RandomForestRegressor(n_estimators=i, max_depth=j)
        f_model.fit(X_train, y_train)
        print(f_model.score(X_test, y_test))
        if f_model.score(X_test, y_test) > f_model_acc[0]:
            f_model_acc[0] = f_model.score(X_test, y_test)
            f_model_acc[1] = i
            f_model_acc[2] = j
'''

In [None]:
#print("Highest acc:", f_model_acc[0], "with n_est:", f_model_acc[1], "and max_depth:", f_model_acc[2])

In [None]:
# Random forest with the hyperparameters chosen for this notebook.
# random_state is fixed (matching the seed used for the train/test split) so
# that the fitted model and every downstream plot are reproducible on a fresh
# Restart & Run All; without it each run builds a different forest.
forest_model = RandomForestRegressor(n_estimators=70, max_depth=10, random_state=0)
forest_model.fit(X_train, y_train)

In [None]:
# Candidate hyperparameters for the XGBoost search.
n_est_params = list(range(50, 151, 10))  # 50, 60, ..., 150 boosting rounds
max_depth_params = list(range(5, 16))    # depths 5 through 15
learning_rate = [1e-4, 1e-3, 1e-2, 0.1, 0.2, 0.3]
# Best result found so far, stored as [score, n_estimators, max_depth, learning_rate].
x_model_acc = [0] * 4

In [None]:
from sklearn.model_selection import cross_val_score, KFold

In [None]:
# Disabled XGBoost grid search, kept as a bare string literal so it is not
# executed (the cell only echoes the string).
# It scores every (n_estimators, max_depth, learning_rate) triple with
# shuffled 10-fold cross-validation on the training split and records the best
# mean score in x_model_acc.
# NOTE(review): the learning rate `k` is looped over and stored in
# x_model_acc[3], but it is never passed to XGBRegressor, so all learning-rate
# iterations evaluate the same model configuration.
# NOTE(review): the loop variable `k` would overwrite the module-level `k`
# used by the heatmap cell if this code were re-enabled.
'''for i in n_est_params:
    for j in max_depth_params:
        for k in learning_rate:
            x_model = XGBRegressor(n_estimators=i, max_depth=j)
            x_model.fit(X_train, y_train)
            kfold = KFold(n_splits=10, shuffle=True)
            kf_cv_scores = cross_val_score(x_model, X_train, y_train, cv=kfold)
            print(kf_cv_scores.mean())
            if kf_cv_scores.mean() > x_model_acc[0]:
                x_model_acc[0] = kf_cv_scores.mean()
                x_model_acc[1] = i
                x_model_acc[2] = j
                x_model_acc[3] = k
'''

In [None]:
#print("Highest acc:", x_model_acc[0], "\nn_est:", x_model_acc[1], "\nmax_depth:", x_model_acc[2], "\nlearning rate:", x_model_acc[3])

In [None]:
# Gradient-boosted trees with the hyperparameters chosen for this notebook.
xg_params = {'n_estimators': 140, 'max_depth': 5, 'learning_rate': 0.2}
xg_model = XGBRegressor(**xg_params)
xg_model.fit(X_train, y_train)

In [None]:
# 2x2 grid of predicted-vs-true plots: one column per model, training data on
# the top row and test data on the bottom row. The four panels differ only in
# model and data split, so they are drawn in a loop instead of four copies.
f, ax = plt.subplots(2, 2, figsize=(15, 15))
fitted_models = (('Random Forest', forest_model), ('XG Boost', xg_model))
data_splits = (('Training', X_train, y_train), ('Test', X_test, y_test))

for col, (model_name, model) in enumerate(fitted_models):
    for row, (split_name, features, target) in enumerate(data_splits):
        panel = ax[row, col]
        # Red y = x reference line: a perfect prediction lies exactly on it.
        panel.plot(target, target, 'r-')
        panel.set(
            title=f'{model_name} Model {split_name} Data Accuracy',
            xlabel='True Values',
            ylabel='Predicted Values',
        )
        panel.scatter(target, model.predict(features))