In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# input data files are available in the read-only "kaggle/input" directory
# for example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for folder, _subdirs, files in os.walk('kaggle/input'):
    # Echo the full path of every file found below the input directory.
    for name in files:
        print(os.path.join(folder, name))

# you can write up to 20GB to the current directory (kaggle/working) that gets preserved as output when you create a version using "Save & Run All"

# you can also write temporary files to 'kaggle/temp', but they won't be saved outside of the current session

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA 
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
# Training data; includes the SalePrice target that the models below are fitted on.
df = pd.read_csv('kaggle/input/house-prices-advanced-regression-techniques/train.csv')

# Test data; predictions for these rows are written to the submission file at the end.
test = pd.read_csv('kaggle/input/house-prices-advanced-regression-techniques/test.csv')

In [None]:
# Preview only the first rows instead of dumping the whole frame into the output.
df.head()

In [None]:
# PREPARING THE DATA

# Promote the 'Id' column to the row index of the training frame.
df = df.set_index('Id')

In [None]:
# Column dtypes and non-null counts for the training frame.
df.info()

In [None]:
# Number of training rows.
df.shape[0]

In [None]:
# Count missing values once per column, then report only the affected columns.
null_counts = df.isnull().sum()

for name, n_missing in null_counts[null_counts > 0].items():
    print('%s has %d null values'%(name, n_missing))

In [None]:
more_than_1000 = ['Alley', 'PoolQC', 'Fence', 'MiscFeature']

# Remove the same columns from both frames so train and test stay aligned.
df = df.drop(columns = more_than_1000)
test = test.drop(columns = more_than_1000)

In [None]:
# For rows without a GarageType, count how many of the other garage fields are missing too.
df.loc[df['GarageType'].isna(), ['GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond']].isna().sum()

In [None]:
garage = ['GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond']

for frame in (df, test):

    # Mark missing garage attributes with the explicit label 'NA'.
    for name in garage:
        frame[name] = frame[name].fillna('NA')

    # The year column now mixes numbers with 'NA', so keep it as text.
    frame['GarageYrBlt'] = frame['GarageYrBlt'].astype('str')

In [None]:
# Top correlates of LotFrontage (position 0 is LotFrontage itself, so skip it).
# Only numeric columns are correlated: calling corr() on a frame that still holds
# string columns raises in pandas >= 2.0.
df.select_dtypes(include = 'number').corr()['LotFrontage'].sort_values(ascending = False).iloc[1:5]

In [None]:
# Apply the same fills to train and test.
for frame in (df, test):

    frame['MasVnrType'] = frame['MasVnrType'].fillna('None')
    frame['MasVnrArea'] = frame['MasVnrArea'].fillna(0)

    frame['Electrical'] = frame['Electrical'].fillna('Mix')

In [None]:
# For rows without a BsmtQual, count how many of the other basement fields are missing too.
df.loc[df['BsmtQual'].isnull(), ['BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']].isna().sum()

In [None]:
# Print each basement column together with its positional index.
for position, name in enumerate(df.columns):
    if 'Bsmt' in name:
        print('%s-->%d'%(name, position))

In [None]:
# Rows that have a basement quality but no basement exposure.
df[df['BsmtQual'].notnull() & df['BsmtExposure'].isnull()]

In [None]:
# Houses that have a basement (BsmtQual present) but no BsmtExposure get 'No'.
# Addressed by label/mask rather than by hard-coded (row, column) position:
# positions shift whenever columns are dropped or reordered (columns are dropped
# earlier in this notebook), which would silently write into the wrong cell.
df.loc[df['BsmtQual'].notnull() & df['BsmtExposure'].isnull(), 'BsmtExposure'] = 'No'

In [None]:
# Rows that have a basement quality but no second finish type.
df[df['BsmtQual'].notnull() & df['BsmtFinType2'].isnull()]

In [None]:
# Houses that have a basement (BsmtQual present) but no BsmtFinType2 get 'Unf'.
# Addressed by label/mask rather than by hard-coded (row, column) position:
# positions shift whenever columns are dropped or reordered (columns are dropped
# earlier in this notebook), which would silently write into the wrong cell.
df.loc[df['BsmtQual'].notnull() & df['BsmtFinType2'].isnull(), 'BsmtFinType2'] = 'Unf'

In [None]:
basement = ['Qual', 'Cond', 'Exposure', 'FinType1', 'FinType2']

# Label the remaining missing basement attributes as 'NA' in both frames.
for frame in (df, test):

    for suffix in basement:

        column = 'Bsmt' + suffix
        frame[column] = frame[column].fillna('NA')

In [None]:
# Missing-value counts for the test frame, restricted to columns that have any.
test_null_counts = test.isna().sum()
test_null_counts[test_null_counts > 0]

In [None]:
# Promote the 'Id' column to the row index of the test frame.
test = test.set_index('Id')

In [None]:
# mode() returns a Series indexed 0..k-1. Passing that Series to fillna aligns it
# on the row index instead of broadcasting, so missing rows were left unfilled.
# Take the scalar most-frequent value instead.
test['MSZoning'] = test['MSZoning'].fillna(test['MSZoning'].mode().iloc[0])
test['Utilities'] = test['Utilities'].fillna('AllPub')

test['Exterior1st'] = test['Exterior1st'].fillna('VinylSd')
test['Exterior2nd'] = test['Exterior2nd'].fillna('VinylSd')

In [None]:
# Basement fields for the test rows that lack BsmtFinSF1.
test.loc[test['BsmtFinSF1'].isna(), ['BsmtUnfSF', 'BsmtFinSF2', 'TotalBsmtSF', 'BsmtCond', 'BsmtFullBath']]

In [None]:
# Ids of the test rows that lack BsmtFullBath.
test.loc[test['BsmtFullBath'].isna()].index

In [None]:
# Set both basement bath counts to 0 for row 2189.
test.loc[2189, ['BsmtHalfBath', 'BsmtFullBath']] = 0

In [None]:
# Missing basement measurements are filled with 0.
# TotalBsmtSF is included: it is inspected alongside the other basement areas
# above and is used as a model feature later, so a leftover NaN would propagate
# into the LotFrontage imputation and the final prediction.
for bsmt_col in ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']:

    test[bsmt_col] = test[bsmt_col].fillna(0)

In [None]:
# Fill these test columns with the most frequent value seen in the training data.
for name in ('KitchenQual', 'Functional', 'GarageCars', 'SaleType'):

    test[name] = test[name].fillna(df[name].mode().loc[0])

# GarageArea is filled with the training mean instead.
test['GarageArea'] = test['GarageArea'].fillna(df['GarageArea'].mean())

In [None]:
# APPLYING LINEAR REGRESSION FOR DATA IMPUTATION

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

cols = ['1stFlrSF', 'LotArea', 'GrLivArea', 'TotalBsmtSF', 'LotFrontage']

# Split the working copy into rows to predict and complete rows to learn from.
lotfront_df = df[cols].copy()
null_lotfront_df = lotfront_df.loc[lotfront_df['LotFrontage'].isnull()].copy()
lotfront_df = lotfront_df.dropna()

y = lotfront_df['LotFrontage']
X = lotfront_df.drop(columns = 'LotFrontage')

# Standardise the predictors and fit the regression on the complete rows.
sc = StandardScaler()
X_sc = sc.fit_transform(X)
lr = LinearRegression().fit(X_sc, y)

# Predict the missing frontages, rounded to two decimals.
test_X = null_lotfront_df.drop(columns = 'LotFrontage')
fill_null = np.round(lr.predict(sc.transform(test_X)), 2)
null_lotfront_df['LotFrontage'] = fill_null

# Recombine and write back; the assignment aligns on the row index.
lotfront_df = pd.concat([lotfront_df, null_lotfront_df], axis = 0)
df['LotFrontage'] = lotfront_df['LotFrontage']

In [None]:
# Impute missing LotFrontage in the test set with the scaler (`sc`) and the
# regression (`lr`) fitted on the training data; nothing is refitted here.
missing_front = test['LotFrontage'].isnull()

if missing_front.any():

    test_X = test.loc[missing_front, cols].drop('LotFrontage', axis = 1)
    fill_null = np.round(lr.predict(sc.transform(test_X)), 2)

    # Write back only the rows that were missing. The previous dropna/concat
    # round-trip also discarded rows with a NaN in any other feature column,
    # which reset their valid LotFrontage to NaN on re-assignment.
    test.loc[missing_front, 'LotFrontage'] = fill_null

In [None]:
# Missing fireplace quality becomes the explicit label 'NA' in both frames.
for frame in (df, test):
    frame['FireplaceQu'] = frame['FireplaceQu'].fillna('NA')

In [None]:
# Any MSZoning value still missing in the test set is set to 'RL'.
test.loc[test['MSZoning'].isna(), 'MSZoning'] = 'RL'

In [None]:
# Re-check the training frame for remaining missing values.
train_nulls = df.isnull().sum()

for name, n_missing in train_nulls[train_nulls > 0].items():

    print('%s has %d null values'%(name, n_missing))

In [None]:
# Re-check the test frame for remaining missing values.
test_nulls = test.isnull().sum()

for name, n_missing in test_nulls[test_nulls > 0].items():

    print('%s has %d null values'%(name, n_missing))

In [None]:
# Numeric codes that are treated as categories from here on.
to_str = ['MSSubClass','OverallQual','OverallCond','YearBuilt','YearRemodAdd','YrSold','MoSold','GarageCars','FullBath','Fireplaces','TotRmsAbvGrd']

for c in to_str:

    # A test column that contained NaN when loaded is float (e.g. GarageCars, filled
    # above), so it would stringify as '2.0' while train gives '2'. Align its dtype
    # with train first so both frames produce the same category labels.
    if test[c].dtype != df[c].dtype and test[c].notna().all():
        test[c] = test[c].astype(df[c].dtype)

    df[c] = df[c].astype(str)
    test[c] = test[c].astype(str)

In [None]:
# EDA AND DATA TRANSFORMATION

from copy import copy

# Copy the colormap so values outside [vmin, vmax] can be drawn in white
# without modifying the shared matplotlib colormap.
my_cmap = copy(plt.cm.YlGnBu)

my_cmap.set_over("white")
my_cmap.set_under("white")

plt.figure(figsize = (10,6), dpi = 200)

# Correlate numeric columns only: corr() on a frame that still holds string
# columns raises in pandas >= 2.0.
sns.heatmap(df.select_dtypes(include = 'number').corr(), vmin = 0.3, vmax = 0.99, cmap = my_cmap, linewidths = 1.0)

In [None]:
small_corr_feats = ['LotArea','BsmtFinSF2','BsmtUnfSF','LowQualFinSF','BsmtFullBath','BsmtHalfBath','HalfBath','BedroomAbvGr','KitchenAbvGr','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal']

# Build the reduced frames as new objects; df and test themselves are left untouched.
reduced_df = df.drop(columns = small_corr_feats)
reduced_test = test.drop(columns = small_corr_feats)

In [None]:
plt.figure(figsize = (10,6), dpi = 200)

# Correlate numeric columns only: corr() on a frame that still holds string
# columns raises in pandas >= 2.0.
sns.heatmap(reduced_df.select_dtypes(include = 'number').corr(), cmap = my_cmap, vmin = 0.2, linewidths = 1.0)

In [None]:
# Numeric predictors: every numeric column except the target. The column dtype is
# used instead of inspecting the Python type of the first row's value.
num_cols = [c for c in reduced_df.select_dtypes(include = 'number').columns if c != 'SalePrice']

fig, ax_arr = plt.subplots(nrows = 5, ncols = 2, figsize = (25,25))

# Pair axes with columns in row-major order. zip stops at the shorter of the two,
# so fewer than 10 numeric columns no longer raises an IndexError.
for ax, col in zip(ax_arr.ravel(), num_cols):

    sns.scatterplot(x = col, y = 'SalePrice', data = reduced_df, ax = ax)


In [None]:
fig, ax_arr = plt.subplots(nrows = 5, ncols = 2, figsize = (25,25))

# One histogram (with KDE) per numeric column, filling the grid row by row.
for k, ax in enumerate(ax_arr.flat):

    sns.histplot(x = num_cols[k], kde = True, data = reduced_df, ax = ax)

In [None]:
# List the columns whose names mark them as build or sale dates.
year_markers = ('Blt', 'Sold', 'Built')

for name in reduced_df.columns:

    if any(marker in name for marker in year_markers):

        print(name)

In [None]:
# Years between construction and sale. The year columns were converted to strings
# earlier, hence the int cast; the result is stored as float, as before.
# Vectorised: the previous row-by-row loop built a Series per row and wrote to
# "the last column" by position, which breaks if the cell is re-run after another
# column has been appended.
reduced_df['YrsBtw'] = (reduced_df['YrSold'].astype(int) - reduced_df['YearBuilt'].astype(int)).astype(float)
reduced_test['YrsBtw'] = (reduced_test['YrSold'].astype(int) - reduced_test['YearBuilt'].astype(int)).astype(float)

In [None]:
# Inspect the new feature.
reduced_df.loc[:, 'YrsBtw']

In [None]:
# Sale price against the years between construction and sale.
sns.scatterplot(data = reduced_df, x = 'YrsBtw', y = 'SalePrice')

In [None]:
# Remod = 1.0 when the remodel year is later than the build year, else 0.0.
# The previous test was `YearBuilt - YearRemodAdd > 0`, which only fires when a
# house was remodelled *before* it was built, so the flag came out 0 for any house
# whose remodel year is not earlier than its build year. Vectorising also removes
# the `at[i+1]` lookup, which assumed the Id index is exactly 1..n with no gaps.
reduced_df['Remod'] = (reduced_df['YearRemodAdd'].astype(int) > reduced_df['YearBuilt'].astype(int)).astype(float)
reduced_test['Remod'] = (reduced_test['YearRemodAdd'].astype(int) > reduced_test['YearBuilt'].astype(int)).astype(float)

In [None]:
# Same scatter, coloured by the remodel flag.
sns.scatterplot(data = reduced_df, x = 'YrsBtw', y = 'SalePrice', hue = 'Remod')

In [None]:
# Drop the remodel flag and the remodel year from the reduced frames...
remod_cols = ['Remod', 'YearRemodAdd']

reduced_df = reduced_df.drop(columns = remod_cols)
reduced_test = reduced_test.drop(columns = remod_cols)

# ...and the remodel year from the full frames as well.
df = df.drop(columns = 'YearRemodAdd')
test = test.drop(columns = 'YearRemodAdd')

In [None]:
# Names of the object-dtype (categorical) columns.
cat_cols = reduced_df.select_dtypes(include = [object]).columns


In [None]:
# TQC ANALYSIS - TYPE, QUALITY AND CONDITION

# Keep the categorical columns whose name mentions type, quality or condition
# (case-insensitive).
tqc_cols = [name for name in cat_cols if any(tag in name.lower() for tag in ('type', 'qual', 'cond'))]

print(len(tqc_cols))

In [None]:
fig, ax_arr = plt.subplots(nrows = 9, ncols = 2, figsize = (25,30))

# One count plot per TQC column, filling the 9x2 grid row by row.
for k, ax in enumerate(ax_arr.flat):

    sns.countplot(y = tqc_cols[k], data = reduced_df, ax = ax)

fig.tight_layout()

In [None]:
fig, ax_arr = plt.subplots(nrows = 9, ncols = 2, figsize = (25,30))

# One SalePrice box plot per TQC column, filling the 9x2 grid row by row.
for k, ax in enumerate(ax_arr.flat):

    feature = tqc_cols[k]

    sns.boxplot(x = feature, y = 'SalePrice', data = reduced_df, ax = ax)
    ax.set_title("%s v SalePrice"%(feature))

fig.tight_layout()



In [None]:
# MODEL PREPARATION

# Drop the build year from both reduced frames.
reduced_df = reduced_df.drop(columns = 'YearBuilt')
reduced_test = reduced_test.drop(columns = 'YearBuilt')

In [None]:
# Drop the garage build year from both reduced frames.
reduced_df = reduced_df.drop(columns = 'GarageYrBlt')
reduced_test = reduced_test.drop(columns = 'GarageYrBlt')

In [None]:
# Drop the sale year from both reduced frames.
reduced_df = reduced_df.drop(columns = 'YrSold')
reduced_test = reduced_test.drop(columns = 'YrSold')

In [None]:
# (rows, columns) of the train and test frames before encoding.
tuple(frame.shape for frame in (reduced_df, reduced_test))

In [None]:
# One-hot encode each frame independently, dropping the first level of every category.
reduced_df_enc, reduced_test_enc = (
    pd.get_dummies(frame, drop_first = True) for frame in (reduced_df, reduced_test)
)

In [None]:
# (rows, columns) of the train and test frames after encoding.
tuple(frame.shape for frame in (reduced_df_enc, reduced_test_enc))

In [None]:
set1 = set(reduced_df_enc.columns)
set2 = set(reduced_test_enc.columns)

# Encoded columns that exist in train but not in test.
set1 - set2

In [None]:
# Remove the training rows whose Electrical value is 'Mix'.
mix_rows = reduced_df.index[reduced_df['Electrical'] == 'Mix']
reduced_df = reduced_df.drop(index = mix_rows)

In [None]:
to_remove_cols = {'BsmtFinType1','Condition2','Exterior1st','Exterior2nd','GarageCars','Heating','HouseStyle','RoofMatl','TotRmsAbvGrd','Utilities','GarageQual','MSSubClass','FullBath','Fireplaces'}

# Drop these columns from both frames, then rebuild the one-hot encodings.
reduced_df = reduced_df.drop(columns = list(to_remove_cols))
reduced_test = reduced_test.drop(columns = list(to_remove_cols))

reduced_df_enc, reduced_test_enc = (
    pd.get_dummies(frame, drop_first = True) for frame in (reduced_df, reduced_test)
)

print(reduced_df_enc.shape, reduced_test_enc.shape)

In [None]:
# Separate the target from the predictors.
y = reduced_df_enc['SalePrice']
X = reduced_df_enc.drop(columns = 'SalePrice')

# Hold out 20% of the rows; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [None]:
# Fit the scaler on the training split only, then standardise that split.
sc = StandardScaler().fit(X_train)

X_train_sc = sc.transform(X_train)

In [None]:
# Cumulative explained-variance ratio for 70..150 principal components.
# One fit at the largest size is enough: the variance explained by the first k
# components is the running sum of the individual ratios, so refitting PCA 81
# times is unnecessary. (With an approximate SVD solver the values may differ
# marginally from per-size refits.)
pca = PCA(n_components = 150)
pca.fit(X_train_sc)

# Index 69 of the running sum corresponds to 70 components.
epvr = np.cumsum(pca.explained_variance_ratio_)[69:150].tolist()

In [None]:
# Number of component counts evaluated.
len(epvr)

In [None]:
# Explained variance against the number of components (70..150).
plt.plot(list(range(70,151)), epvr, linestyle = '--')

In [None]:
# Project the standardised training split onto 150 principal components.
pca = PCA(n_components = 150)

X_train_sc_pca = pca.fit_transform(X_train_sc)

In [None]:
models = [LinearRegression(),Ridge(),Lasso(),ElasticNet(),KNeighborsRegressor(),SVR(),DecisionTreeRegressor(),RandomForestRegressor()]

def base_model_errors(models, X, y, cv = 5):
    """Cross-validate each model and tabulate its per-fold scores.

    Despite the name, the values are whatever `cross_val_score` returns for the
    estimator's default scorer, not error magnitudes. Scores are sorted in
    ascending order within each row, so column S1 is a model's worst fold.

    Parameters
    ----------
    models : iterable of estimators
        Unfitted estimators to evaluate.
    X, y : array-like
        Training features and target passed through to `cross_val_score`.
    cv : int or cross-validation splitter, default 5
        Forwarded to `cross_val_score`.

    Returns
    -------
    pandas.DataFrame
        One row per model (indexed by class name), one column per fold,
        named S1..Sk.
    """

    ind = []
    srs = []

    for model in models:

        scores = np.sort(cross_val_score(estimator = model, X = X, y = y, cv = cv))

        srs.append(scores)
        # Use the class name rather than str(model)[:-2], which only yields a clean
        # label when the estimator's repr ends in "()" (all-default parameters).
        ind.append(type(model).__name__)

    # Derive the column labels from the number of folds actually returned instead
    # of hard-coding five of them.
    n_folds = len(srs[0]) if srs else 0
    fold_cols = ['S%d'%(fold + 1) for fold in range(n_folds)]

    return pd.DataFrame(index = ind, data = srs, columns = fold_cols)

error_df = base_model_errors(models = models, X = X_train_sc_pca, y = y_train)


In [None]:
# Per-fold cross-validation scores for every baseline model.
error_df

In [None]:
# n_neighbors of the fifth entry in `models` (the KNeighborsRegressor).
models[4].n_neighbors

In [None]:
# Parameters of the seventh entry in `models` (the DecisionTreeRegressor).
models[6].get_params()

In [None]:
# Parameters of the eighth entry in `models` (the RandomForestRegressor).
models[7].get_params()

In [None]:
models1 = [ElasticNet(),KNeighborsRegressor(),DecisionTreeRegressor(),RandomForestRegressor()]

# One grid per entry of models1, in the same order.
elastic_net_grid1 = {
    'alpha': [0.5,1,2,3,4,10,20,30,40,50,60,70,80,90,95,100],
    'l1_ratio': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
}
knn_grid1 = {'n_neighbors': [4,5,6,7]}
tree_grid1 = {'max_depth': [4,5,6,None]}
forest_grid1 = {
    'n_estimators': [100,150,200],
    'max_depth': [4,5,6,None]
}

param_grid1 = [elastic_net_grid1, knn_grid1, tree_grid1, forest_grid1]

def models_best_params(models,param_grids, X, y):
    """Grid-search every model over its matching grid and print the best result.

    `models[i]` is searched over `param_grids[i]` with 5-fold cross-validation;
    the best parameters and best score of each search are printed. Nothing is
    returned.
    """

    for position, estimator in enumerate(models):

        search = GridSearchCV(estimator = estimator, param_grid = param_grids[position], cv = 5)
        search.fit(X,y)

        label = str(estimator)

        print("%s's best parameters are.."%(label))
        print(search.best_params_)

        print("%s's best score is.."%(label))
        print(search.best_score_)

models_best_params(models = models1, param_grids = param_grid1, X = X_train_sc_pca, y = y_train)


In [None]:
models2 = [ElasticNet(), RandomForestRegressor()]

# Second-round grids, one per entry of models2.
elastic_net_grid2 = {
    'alpha': [0.2,0.3,0.4,0.5,0.6,0.7],
    'l1_ratio': [0.1,0.2,0.3,0.4,0.5]
}
forest_grid2 = {
    'n_estimators': [80,90,100,110,120],
    'max_depth': [4,5,6,7,None]
}

param_grid2 = [elastic_net_grid2, forest_grid2]

models_best_params(models = models2, param_grids = param_grid2, X = X_train_sc_pca, y = y_train)

In [None]:
models3 = [RandomForestRegressor()]

# Third-round grid for the random forest only.
forest_grid3 = {
    'n_estimators': [115,120,125,130],
    'max_depth': [6,7,None]
}

param_grid3 = [forest_grid3]

models_best_params(models = models3, param_grids = param_grid3, X = X_train_sc_pca, y = y_train)

In [None]:
# FINAL MODEL

# Seeded so the fitted forest, and therefore the submission, is reproducible.
best_model = RandomForestRegressor(n_estimators = 130, random_state = 1)
best_pca = PCA(n_components = 150)

sc = StandardScaler()

# Scale -> project -> regress, refitted on the full training data.
model_pipe = Pipeline(steps = [('sc',sc),('pca',best_pca),('model',best_model)])

X = reduced_df_enc.drop('SalePrice', axis = 1)
y = reduced_df_enc['SalePrice']

model_pipe.fit(X,y)

# Train and test were one-hot encoded independently, so their dummy columns can
# differ in membership and order. Align test to the training layout: dummies
# absent from test become 0, and columns unseen in training are dropped.
X_submit = reduced_test_enc.reindex(columns = X.columns, fill_value = 0)

pred = model_pipe.predict(X_submit)

In [None]:
# Bring 'Id' back as a column; guarded so re-running the cell does not reset twice.
if 'Id' not in test.columns:
    test.reset_index(inplace = True)

test['SalePrice'] = pred

# Select the two submission columns from `test`. The previous code bound
# `submission` to a plain nested list, so `.set_index` raised AttributeError.
submission = test[['SalePrice','Id']].set_index('Id')

submission.to_csv('submission.csv')