# House Prices - Advanced Regression Techniques




The following are some of the terms used in the notebook:

train - Kaggle training data

test - Kaggle data for testing

X - train data independent variables (columns)

y - train data dependent variable (column)

data - a mix of training and testing data

X_ - train data with missing values removed

test_ - test data after missing values have been removed.

X_scaled - data that has been scaled from X_.

x_train_90, x_test_10, y_train_90, y_test_10 - data separated into train and test with test_size=0.10 from X_

x_train_75, x_test_25, y_train_75, y_test_25 - data separated into train and test with test_size=0.25 from X_

x_train_scaled_90, x_test_scaled_10, y_train_scaled_90, y_test_scaled_10 - train test split with test_size=0.10 from the X_scaled data.

x_train_scaled_75, x_test_scaled_25, y_train_scaled_75, y_test_scaled_25 - train test split with test_size=0.25 from the X_scaled data.

score_test - stores the scores of all models evaluated on the held-out test splits.

model - stores the names of the models/algorithms being used.

best_model - the best model's name

y_predict_best - predictions on the Kaggle test data, used for the submission

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns #for visualisation

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the competition CSV files from the (Kaggle-specific) input directory.
train=pd.read_csv('/kaggle/input/house-price/train.csv')
test=pd.read_csv('/kaggle/input/house-price/test.csv')

In [None]:
# (rows, columns) of the train and test frames.
train.shape, test.shape

In [None]:
# Preview the first 10 rows of the train frame.
train.head(10)

In [None]:
# Preview the first 10 rows of the test frame.
test.head(10)

## Starting preliminary analysis

In [None]:
train.dtypes  # data type of each column in the train dataset

In [None]:
# Summary statistics for every column; include="all" also covers the non-numeric columns.
train.describe(include="all")

In [None]:
# Call the method: a bare `train.info` only displays the bound-method repr,
# not the per-column dtype / non-null summary.
train.info()

From this we get the gist of our dataset.

### Finding the missing values and dealing with them

In [None]:
# Boolean mask of the train frame: True wherever a value is missing.
count = train.isna()
count

In [None]:
# Total number of missing cells in the train dataset (the test dataset is counted in the next cell).
train.isna().sum().sum()

In [None]:
# Total number of missing cells in the test dataset.
test.isna().sum().sum()

***There are 6965 missing values in the train dataset and 7000 missing values in the test dataset.***

## Splitting the dataset for training later

In [None]:
# Separate the target (SalePrice) from the predictors so it can be kept aside for training later.
y = train['SalePrice']
X = train.drop(columns='SalePrice')

In [None]:
# Shapes of the target, the train predictors and the test frame.
y.shape ,X.shape, test.shape

In [None]:
# Tag every row with its origin so the combined frame can be split back later.
X['Type'] = 'train'
test['Type'] = 'test'
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
# (original row labels are kept, so the combined index contains duplicates).
data = pd.concat([X, test])

In [None]:
# Keep only the columns of the combined frame that contain at least one missing value.
has_missing = data.isnull().any()
columns_with_null_values = data.loc[:, has_missing]
columns_with_null_values

***We have 34 columns which have null values out of 81 columns***

## Dealing with missing values

In [None]:
# Fill each column with a value appropriate to its meaning so the data is not distorted.
# Start by looking at how often each 'Electrical' category occurs.
data['Electrical'].value_counts()

##### In the value counts above, we can see that "SBrkr" is the most common 'Electrical' value. A house must have an electrical system, so "None" is not a sensible fill value; we therefore fill the null values in this column with "SBrkr".

In [None]:
# Fill missing 'Electrical' entries with the most common category.
# The dataset spells it "SBrkr"; a differently-cased fill value would create a
# separate, spurious category. Assign the result back instead of using a chained
# inplace fill, which is deprecated and does not update `data` under pandas Copy-on-Write.
data['Electrical'] = data['Electrical'].fillna("SBrkr")

We have to fill in appropriate values according to each column's specification.

In [None]:
# Show the category frequencies that motivate the 'RL' fill for MSZoning
# (printed explicitly because this is not the last expression of the cell).
print(data['MSZoning'].value_counts())

# Replacement value for the missing entries of each column.
# Text columns get either a frequent category or a sentinel string; numeric columns get 0,
# except LotFrontage, which gets the mean of the combined (train + test) column.
# NOTE(review): confirm that "Con" (SaleType) and "None" (SaleCondition) are the intended fills.
fill_values = {
    'MSZoning': "RL",
    'LotFrontage': data['LotFrontage'].mean(),
    'Alley': "Nothing",
    'Utilities': "AllPub",
    'Exterior1st': "VinylSd",
    'Exterior2nd': "VinylSd",
    'MasVnrArea': 0,
    'MasVnrType': "None",
    'BsmtCond': "No",
    'BsmtExposure': "NB",
    'BsmtFinType1': "NB",
    'BsmtFinSF1': 0.0,
    'BsmtFinSF2': 0.0,
    'BsmtUnfSF': 0.0,
    'TotalBsmtSF': 0.0,
    'BsmtFullBath': 0.0,
    'BsmtHalfBath': 0.0,
    'KitchenQual': "TA",
    'Functional': "Typ",
    'FireplaceQu': "None",
    'GarageType': "No",
    'GarageYrBlt': 0,
    'GarageFinish': "No",
    'GarageCars': 0,
    'GarageArea': 0,
    'GarageQual': "No",
    'GarageCond': "No",
    'PoolQC': "No",
    'Fence': "No",
    'MiscFeature': "No",
    'SaleType': "Con",
    'SaleCondition': "None",
    'BsmtQual': "TA",
    'BsmtFinType2': "Unf",
}

# Assign each filled column back to the frame. The chained
# `data[col].fillna(..., inplace=True)` form is deprecated and silently leaves
# `data` unchanged once pandas Copy-on-Write is active.
for column, value in fill_values.items():
    data[column] = data[column].fillna(value)

### checking for null values

In [None]:
# Total number of missing values still present in the combined frame.
data.isnull().sum().sum()

### Checking different datatype of columns

In [None]:
# Sub-frame holding only the numeric columns of the combined data.
data_Num = data.select_dtypes(include=np.number)

print('Numerical features :')
print(data_Num.columns, '\n')


In [None]:

# Sub-frame holding only the object-dtype (categorical text) columns.
data_Cat = data.select_dtypes(include=object)

print('Categorical features :')
print(data_Cat.columns)

In [None]:
# Sub-frame holding only the floating-point columns.
# `np.float` was a deprecated alias of the builtin `float` and was removed in
# NumPy 1.24, so the builtin is used directly.
data_float = data.select_dtypes(include=float)

print('decimal point value features :')
print(data_float.columns,'\n')


## Data Preprocessing

In [None]:
# Variance of each numeric column. `data` still contains object columns here, and
# pandas >= 2.0 no longer drops them silently, so restrict the reduction explicitly.
data.var(numeric_only=True)

In [None]:
# Pairwise Pearson correlation between the numeric columns. `numeric_only=True`
# (available from pandas 1.5) is required on pandas >= 2.0, where object columns
# would otherwise raise instead of being skipped.
corr = data.corr(numeric_only=True)
corr

In [None]:
# Visualise the correlation matrix as an annotated heatmap.
fig, ax = plt.subplots(figsize=(25, 19))
sns.heatmap(corr, annot=True, vmin=-1.0, cmap='mako', ax=ax)
ax.set_title('Correlation heatmap')
plt.show()

**As we know all the diagonal elements will be 1 so let's take the upper triangular matrix**


In [None]:
# Keep only the strict upper triangle (k=1 excludes the diagonal) so every pair of
# columns appears once; all other cells become NaN.
# `np.bool` was removed in NumPy 1.24, so the builtin `bool` is used for the mask dtype.
upper_mask = np.triu(np.ones(corr.shape), k=1).astype(bool)
upper_matrix = corr.where(upper_mask)
upper_matrix

In [None]:
# Collect every column whose correlation with some earlier column exceeds 0.8.
drop_columns = [
    column
    for column in upper_matrix.columns
    if (upper_matrix[column] > 0.8).any()
]
drop_columns

In [None]:
# Remove the highly correlated columns. Pass the column labels themselves rather than
# a DataFrame slice, and ignore labels that are already gone so that re-running this
# cell does not fail with a KeyError.
data = data.drop(columns=drop_columns, errors='ignore')
data.head()

## Label Encoding for categorical variables

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every object column listed in data_Cat with integer codes, using a fresh
# encoder per column. 'Type' is an object column too, so it is encoded as well
# (LabelEncoder sorts its classes, giving 'test' -> 0 and 'train' -> 1).
for column in data_Cat.columns:
    data[column] = LabelEncoder().fit_transform(data[column].values)

In [None]:
# Re-derive the object-dtype columns after encoding and list their names.
is_object = data.dtypes == 'object'
data_Cat = data.loc[:, is_object]
data_Cat.columns

In [None]:
# Columns whose dtype compares equal to 'int', and their names.
is_int = data.dtypes == 'int'
data_Num = data.loc[:, is_int]
data_Num.columns

In [None]:
# Preview the combined frame after label encoding.
data.head()

All the object columns are in numerical format now.

In [None]:
# Split the combined frame back into its train rows (Type == 1) and test rows
# (Type == 0), dropping the helper 'Type' column from both.
is_train = data['Type'] == 1
is_test = data['Type'] == 0

X_ = data.loc[is_train].drop(columns='Type')
test_ = data.loc[is_test].drop(columns='Type')

In [None]:
# Shapes of the processed train predictors, the target and the processed test frame.
X_.shape, y.shape, test_.shape 


### Scaling the data

***Scaling is required because the dataset has columns which vary highly in magnitude. If scaling is not performed, then high-magnitude values will have more impact on the modelling.***

In [None]:
from sklearn import preprocessing

# Put every feature column on a comparable scale (zero mean, unit variance).
# preprocessing.normalize() is not suitable for this: by default it rescales each
# ROW (sample) to unit norm, so large-magnitude columns still dominate each row.
names = X_.columns
scaler = preprocessing.StandardScaler()
preprocess = scaler.fit_transform(X_)
# Keep X_'s row labels so the scaled frame stays aligned with X_ and y.
X_scaled = pd.DataFrame(preprocess, columns=names, index=X_.index)

In [None]:
# Preview the scaled feature frame.
X_scaled.head()

## Data Visualisation

In [None]:
# Pairwise scatter plots / histograms for a few selected features.
sns.set()
cols = ['OverallQual', 'TotalBsmtSF', 'YearBuilt']
# seaborn renamed pairplot's `size` argument to `height` (0.9); the old name is no
# longer accepted by current releases.
sns.pairplot(X_[cols], height=3)
plt.show()

In [None]:
# Correlation heatmap of the processed train features (colour scale capped at 0.8).
correl = X_.corr()
fig, ax = plt.subplots(figsize=(23, 18))
sns.heatmap(correl, vmax=0.8, square=True, cmap="RdYlGn", ax=ax)

## Modelling, a.k.a. training on our dataset

In [None]:
# Parallel lists: model[i] is the name of the model whose test score is score_test[i].
score_test, model = [], []


In [None]:
from sklearn.model_selection import train_test_split

# Two hold-out splits of the unscaled features with the same seed:
# one keeps 10% of the rows for testing, the other 25%.
x_train_90, x_test_10, y_train_90, y_test_10 = train_test_split(
    X_, y, test_size=0.10, random_state=1
)
x_train_75, x_test_25, y_train_75, y_test_25 = train_test_split(
    X_, y, test_size=0.25, random_state=1
)



In [None]:
from sklearn.model_selection import train_test_split

# The same two hold-out splits (10% and 25% test rows, seed 1), taken from the scaled features.
x_train_scaled_90, x_test_scaled_10, y_train_scaled_90, y_test_scaled_10 = train_test_split(
    X_scaled, y, test_size=0.10, random_state=1
)
x_train_scaled_75, x_test_scaled_25, y_train_scaled_75, y_test_scaled_25 = train_test_split(
    X_scaled, y, test_size=0.25, random_state=1
)

### Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor


def _fit_random_forest(features, target):
    """Return a 500-tree RandomForestRegressor (n_jobs=-1, random_state=13) fitted on the given split."""
    forest = RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=13)
    return forest.fit(features, target)


# One forest per training split: unscaled 90% / 75%, then scaled 90% / 75%.
model_randomforest_train90 = _fit_random_forest(x_train_90, y_train_90)
model_randomforest_train75 = _fit_random_forest(x_train_75, y_train_75)
model_randomforest_scaled_train90 = _fit_random_forest(x_train_scaled_90, y_train_scaled_90)
model_randomforest_scaled_train75 = _fit_random_forest(x_train_scaled_75, y_train_scaled_75)

# Show the last fitted estimator as the cell output.
model_randomforest_scaled_train75


In [None]:
# Record each forest's score on the hold-out split that matches its training split.
for name, estimator, features, target in (
    ("model_randomforest_train90", model_randomforest_train90, x_test_10, y_test_10),
    ("model_randomforest_train75", model_randomforest_train75, x_test_25, y_test_25),
    ("model_randomforest_scaled_train90", model_randomforest_scaled_train90, x_test_scaled_10, y_test_scaled_10),
    ("model_randomforest_scaled_train75", model_randomforest_scaled_train75, x_test_scaled_25, y_test_scaled_25),
):
    score_test.append(estimator.score(features, target))
    model.append(name)

**Xgboost**

In [None]:
import xgboost as xgb

# Hyper-parameters shared by all four XGBoost models (previously repeated per model).
# The legacy `silent=1` and `nthread=-1` arguments are replaced by their current
# equivalents in the scikit-learn wrapper: `verbosity=0` and `n_jobs=-1`.
XGB_PARAMS = dict(
    colsample_bytree=0.4603, gamma=0.0468,
    learning_rate=0.05, max_depth=3,
    min_child_weight=1.7817, n_estimators=2200,
    reg_alpha=0.4640, reg_lambda=0.8571,
    subsample=0.5213, verbosity=0,
    random_state=7, n_jobs=-1,
)

# One model per training split: unscaled 90% / 75%, then scaled 90% / 75%.
model_xgboost_train90 = xgb.XGBRegressor(**XGB_PARAMS)
model_xgboost_train90.fit(x_train_90, y_train_90)

model_xgboost_train75 = xgb.XGBRegressor(**XGB_PARAMS)
model_xgboost_train75.fit(x_train_75, y_train_75)

model_xgboost_scaled_train90 = xgb.XGBRegressor(**XGB_PARAMS)
model_xgboost_scaled_train90.fit(x_train_scaled_90, y_train_scaled_90)

model_xgboost_scaled_train75 = xgb.XGBRegressor(**XGB_PARAMS)
model_xgboost_scaled_train75.fit(x_train_scaled_75, y_train_scaled_75)

In [None]:
# Record each XGBoost model's score on the hold-out split that matches its training split.
for name, estimator, features, target in (
    ("model_xgboost_train90", model_xgboost_train90, x_test_10, y_test_10),
    ("model_xgboost_train75", model_xgboost_train75, x_test_25, y_test_25),
    ("model_xgboost_scaled_train90", model_xgboost_scaled_train90, x_test_scaled_10, y_test_scaled_10),
    ("model_xgboost_scaled_train75", model_xgboost_scaled_train75, x_test_scaled_25, y_test_scaled_25),
):
    score_test.append(estimator.score(features, target))
    model.append(name)

**Decision Tree**

In [None]:
from sklearn.tree import DecisionTreeRegressor

# One decision tree (seed 0) per training split: unscaled 90% / 75%, then scaled 90% / 75%.
model_decisiontree_train90 = DecisionTreeRegressor(random_state=0).fit(x_train_90, y_train_90)
model_decisiontree_train75 = DecisionTreeRegressor(random_state=0).fit(x_train_75, y_train_75)
model_decisiontree_scaled_train90 = DecisionTreeRegressor(random_state=0).fit(
    x_train_scaled_90, y_train_scaled_90
)
model_decisiontree_scaled_train75 = DecisionTreeRegressor(random_state=0).fit(
    x_train_scaled_75, y_train_scaled_75
)

# Show the last fitted estimator as the cell output.
model_decisiontree_scaled_train75

In [None]:
# Record each decision tree's score on the hold-out split that matches its training split.
score_test.append(model_decisiontree_train90.score(x_test_10, y_test_10))
model.append("model_decisiontree_train90")

score_test.append(model_decisiontree_train75.score(x_test_25, y_test_25))
model.append("model_decisiontree_train75")

# The two trees below were trained on the scaled features, so they must be evaluated
# on the scaled hold-out sets as well (they were previously scored on unscaled data).
score_test.append(model_decisiontree_scaled_train90.score(x_test_scaled_10, y_test_scaled_10))
model.append("model_decisiontree_scaled_train90")

score_test.append(model_decisiontree_scaled_train75.score(x_test_scaled_25, y_test_scaled_25))
model.append("model_decisiontree_scaled_train75")

#### LASSO

In [None]:
from sklearn.linear_model import Lasso

# One Lasso model (alpha=0.0005) per unscaled training split.
model_lasso_train90 = Lasso(alpha=0.0005).fit(x_train_90, y_train_90)
model_lasso_train75 = Lasso(alpha=0.0005).fit(x_train_75, y_train_75)

# Show the last fitted estimator as the cell output.
model_lasso_train75


In [None]:
# Record each Lasso model's score on the hold-out split that matches its training split.
score_test.append(model_lasso_train90.score(x_test_10, y_test_10))
model.append("model_lasso_train90")

# The second entry belongs to the 75% model and its 25% hold-out set; the train90
# model was previously scored and recorded twice, so train75 never entered the comparison.
score_test.append(model_lasso_train75.score(x_test_25, y_test_25))
model.append("model_lasso_train75")

#### Score of all the models

In [None]:
# Tabulate the collected model names next to their test scores.
final_scores = pd.DataFrame({'model_name': model, 'score_test': score_test})
final_scores

#### Finding the best model

In [None]:
# Position of the (first) highest test score, and the model name recorded at that position.
best_index = max(range(len(score_test)), key=score_test.__getitem__)
best_model = final_scores.loc[best_index, 'model_name']
best_model

#### Predicting the test data with best model

In [None]:
# Predict with the model selected by the score comparison instead of a hard-coded one.
# test_ holds unscaled features, so only models trained on the unscaled splits are
# eligible; among those, take the one with the highest test score.
unscaled_scores = final_scores[~final_scores['model_name'].str.contains('_scaled_')]
submission_model_name = unscaled_scores.loc[unscaled_scores['score_test'].idxmax(), 'model_name']

# Each recorded model name is also the name of the notebook variable that holds the
# fitted estimator, so it can be looked up in the global namespace.
submission_model = globals()[submission_model_name]
y_predict_best = submission_model.predict(test_)

#### Submission

In [None]:
# Submission frame: the test set's Id column next to the predicted SalePrice.
result = pd.DataFrame({'Id': test['Id'], 'SalePrice': y_predict_best})

In [None]:
# Preview the first 10 rows of the submission frame.
result.head(10)

In [None]:
# Write the submission file without the index column.
result.to_csv('submission.csv', index=False)