In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
from scipy.stats import skew
import matplotlib.pyplot as plt

## Importing the data

In [2]:
# Read the training and test CSV files from the ../input directory.
traindata=pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
testdata=pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

In [3]:
# Preview the first rows of the training frame.
traindata.head()

### Shape (dimensions) of the train and test data

In [4]:
# (rows, columns) of the training frame and of the test frame, shown as a tuple.
traindata.shape  , testdata.shape

In [5]:
# Column labels of the training frame.
traindata.columns

### Linear models make several assumptions:
1. Linearity
2. No endogeneity
3. Normality and homoscedasticity
4. No autocorrelation
5. No multicollinearity

We will take these assumptions into consideration while building the model.

In [6]:
# Per-column dtype and non-null counts for the training frame.
traindata.info()

In [7]:
# Summary statistics of the training frame.
traindata.describe()

### Let's look at the dependent variable, i.e. SalePrice

In [8]:
# Summary statistics of the target column only.
traindata['SalePrice'].describe()

In [9]:
# Distribution of the target. sns.distplot is deprecated (and removed in recent
# seaborn releases); histplot with a KDE overlay on a density axis is its
# documented replacement and draws the same kind of figure.
sns.histplot(traindata['SalePrice'], kde=True, stat='density');

From the graph above we can say that SalePrice is positively skewed; let's calculate the skewness and kurtosis.

In [10]:
# Skewness of SalePrice as computed by pandas' Series.skew().
print('Skewness :%f' %traindata['SalePrice'].skew())

Kurtosis is a statistical measure that defines how heavily the tails of a distribution differ from the tails of a normal distribution. In other words, kurtosis identifies whether the tails of a given distribution contain extreme values(Outlier).

In [11]:
# Kurtosis of SalePrice as computed by pandas' Series.kurt().
print('Kurtosis : %f' %traindata['SalePrice'].kurt())

### Let's take a look at the correlations in the training data set

In [12]:
traindata.corr()

In [13]:
corrmat = traindata.corr()
plt.figure(figsize=(12,12))
sns.heatmap(corrmat,square = True )

In [14]:
corrmat=traindata.corr()
top_corr=corrmat.index[abs(corrmat['SalePrice'])>0.5]
plt.figure(figsize=(10,10))
sns.heatmap(traindata[top_corr].corr() ,annot=True)

In [15]:
traindata.corr()['SalePrice']

#### From the above correlation heatmap we can say that OverallQual, GrLivArea, GarageCars, GarageArea, TotalBsmtSF and 1stFlrSF are highly correlated with SalePrice
If we look at multicollinearity, the following pairs of features are strongly correlated with each other:
1. TotalBsmtSF vs 1stFlrSF
2. TotRmsAbvGrd vs GrLivArea
3. GarageArea vs GarageCars

#### Let's draw pairwise scatter plots of SalePrice with OverallQual, GrLivArea, GarageArea, TotalBsmtSF, 1stFlrSF, FullBath and YearBuilt

In [16]:
# Columns to compare pairwise: the target plus a hand-picked set of features.
col=['SalePrice','OverallQual' , 'GrLivArea' , 'GarageArea' ,'TotalBsmtSF', '1stFlrSF','FullBath', 'YearBuilt']
# Grid of scatter plots (and per-column distributions on the diagonal) for those columns.
sns.pairplot(traindata[col])

'GrLivArea', 'GarageArea', 'TotalBsmtSF', '1stFlrSF' and 'FullBath' show a roughly linear relationship with SalePrice.

In [17]:
# Separate the target from the predictors.
# The guard keeps the cell re-runnable: after the first execution 'SalePrice'
# is no longer a column of traindata, and an unguarded second run would fail.
if 'SalePrice' in traindata.columns:
    Y = traindata['SalePrice']
    traindata = traindata.drop(columns=['SalePrice'])
traindata.shape

### Missing values
First we will look at the missing values in the combined train and test data.

In [18]:
AllData=pd.concat([traindata ,testdata],axis=0)
AllData.shape

In [19]:
total= AllData.isnull().sum().sort_values(ascending = False)
percent= (AllData.isnull().sum()/AllData.isnull().count()).sort_values(ascending= False)
missingdata = pd.concat([total,percent],axis=1,keys=['total','percent'])
missingdata.head(40)

As we can see, PoolQC, MiscFeature, Alley and Fence have the most missing values (more than 80%), so we will drop these features, as well as FireplaceQu. Because of multicollinearity we will also drop the features '1stFlrSF', 'GrLivArea' and 'GarageCars'.

In [20]:
AllData= AllData.drop(['PoolQC' , 'MiscFeature' ,'Alley' ,'Fence','Id',  'FireplaceQu','1stFlrSF','GrLivArea','GarageCars'],axis=1)

In [21]:
AllData.shape

In the dataset, a house with no garage has NA in the garage columns, so we fill these values with 0, meaning "no garage". The same applies to the basement columns such as BsmtExposure and BsmtFinType2.

In [22]:
# Impute missing values on the combined frame.
#
# Every column is filled with `AllData[col] = AllData[col].fillna(value)`
# instead of `AllData[col].fillna(value, inplace=True)`. The in-place call on a
# column selection is chained assignment: pandas warns about it, and with
# Copy-on-Write enabled it silently leaves AllData unchanged.
Garage_feature=['GarageYrBlt','GarageCond','GarageType','GarageFinish','GarageQual']
Basment_feature=['BsmtExposure','BsmtFinType2','BsmtQual','BsmtCond','BsmtFinType1']

# Missing garage/basement attributes are treated as "feature absent" and encoded as 0.
# NOTE(review): GarageYrBlt is a calendar year; the next cell converts it to
# 2021 - year, so a 0 here becomes an "age" of 2021 -- confirm this is intended.
fill_values = {feature: 0 for feature in Garage_feature + Basment_feature}

# Fixed fill values.
fill_values.update({
    'MasVnrType': 0,
    'BsmtHalfBath': 0,
    'Utilities': 'AllPub',
    'Functional': 'Typ',
})

# Filled from statistics of the combined (train + test) frame.
fill_values['LotFrontage'] = AllData['LotFrontage'].mean()
fill_values['MasVnrArea'] = AllData['MasVnrArea'].mean()
fill_values['Electrical'] = AllData['Electrical'].mode()[0]

# Filled with the most frequent value observed in the training data only.
train_mode_features = [
    'BsmtFinSF1', 'BsmtFinSF2', 'KitchenQual', 'TotalBsmtSF', 'Exterior2nd',
    'Exterior1st', 'GarageArea', 'SaleType', 'MSZoning', 'BsmtFullBath', 'BsmtUnfSF',
]
for feature in train_mode_features:
    fill_values[feature] = traindata[feature].mode()[0]

# Each fill value was computed from the still-unfilled column above, so the
# order in which the columns are written back does not matter.
for feature, fill_value in fill_values.items():
    AllData[feature] = AllData[feature].fillna(fill_value)

In [23]:
AllData['GarageYrBlt'] = 2021-AllData['GarageYrBlt']
AllData['YearBuilt'] = 2021-AllData['YearBuilt']
AllData['YearRemodAdd'] = 2021-AllData['YearRemodAdd']
AllData['YrSold'] = 2021-AllData['YrSold']
AllData[['GarageYrBlt','YearBuilt','YearRemodAdd','YrSold']].head()

In [24]:
AllData.isnull().sum().max()

### Skewness
Let's check the skewness of the features and correct it, since normality is one of the assumptions of linear regression.

First we separate the numerical and categorical variables.

In [25]:
numerical_feature=AllData.select_dtypes(exclude=['object']).columns
cataorical_feature=AllData.select_dtypes(include=['object']).columns
numerical_feature

In [26]:
len(numerical_feature)

In [27]:
len(cataorical_feature)

In [28]:
All_num = AllData[numerical_feature]
All_cat = AllData[cataorical_feature]
All_num.shape

In [29]:
# checkin skewmess of all features
skewness = All_num.apply(lambda x: skew(x))
skewness.sort_values(ascending=False)

In [30]:
skewness_Y=skew(Y)
skewness_Y

### We will select the features whose absolute skewness is greater than 0.5

In [31]:
# Keep only the features whose absolute skewness exceeds 0.5.
# NOTE(review): this overwrites `skewness`, so re-running the cell filters the
# already-filtered series.
skewness=skewness[abs(skewness)>0.5]
skewness.index

In [32]:
# Apply log(1 + x) to the selected skewed features and to the target.
# Because Y is log1p-transformed here, predictions must be mapped back with the
# inverse of log1p before they are reported as prices.
All_num[skewness.index]=np.log1p(All_num[skewness.index])
Y=np.log1p(Y)

### Dealing with categorical variables

In [33]:
All_cat.shape

In [34]:
# One-hot encode the categorical columns; drop_first=True omits the first level
# of each column so the indicator columns of one feature are not collinear.
All_cat=pd.get_dummies(All_cat,drop_first=True)
All_cat

In [35]:
# Recombine the encoded categorical columns with the numerical columns, side by side.
AllData_1=pd.concat([All_cat,All_num],axis=1)

In [36]:
AllData_1.shape

### Checking and removing Outliers

In [37]:
# Box plot of the (log-transformed) target. The values are passed by keyword:
# older seaborn warned about a positional x, and newer releases treat the only
# positional argument as `data`, so x=Y keeps the horizontal box plot on all versions.
sns.boxplot(x=Y)

##### As we can see from the boxplot, there are many outliers in SalePrice. We will use a Z-score treatment to detect and cap the outliers.

As an example, LotArea currently has max: 12.279537 and min: 7.170888; we will look at these values again after the treatment.

In [38]:
# Summary of LotArea before capping, for comparison with the values after it.
AllData_1['LotArea'].describe()

In [39]:
def outlier(z):
    """Cap column ``z`` of the global ``AllData_1`` at mean +/- 3 standard deviations.

    Values above the upper limit are replaced by the upper limit, values below
    the lower limit by the lower limit; all other values are left as they are.
    The column is overwritten in ``AllData_1`` and the limits are printed.

    Parameters
    ----------
    z : column label in ``AllData_1``

    Returns
    -------
    None
    """
    column = AllData_1[z]
    center = column.mean()
    spread = column.std()
    upper_limit = center + 3 * spread
    lower_limit = center - 3 * spread

    # The limits were computed from the untouched column above; both caps
    # compare against that original column, so they cannot influence each other.
    capped = np.where(column < lower_limit, lower_limit, column)
    capped = np.where(column > upper_limit, upper_limit, capped)
    AllData_1[z] = capped

    print('Upperlimit : {} and lowerlimit : {} and Columns name is: {}'.format(upper_limit,lower_limit,z))
    

In [40]:
# Cap every numerical feature at mean +/- 3 standard deviations; outlier()
# overwrites the column in AllData_1 and prints the limits it used.
for i in numerical_feature:
    outlier(i)
    

  

In [41]:
# Summary of LotArea after capping.
AllData_1['LotArea'].describe()

As we can see, LotArea now has max: 10.624562 and min: 7.565269, which shows how the capping works. Instead of dropping the outliers we replace them with the limit values, so no rows are lost.

In [42]:
# Summary of the log-transformed target before capping.
Y.describe()

In [43]:
Y

In [44]:
# Cap the target at mean +/- 3 standard deviations, the same rule that
# outlier() applies to the features.
max_limit= Y.mean()+ 3*Y.std()
min_limit= Y.mean()- 3*Y.std()
print(min_limit , max_limit)
# np.where returns a NumPy array, so from here on Y is an ndarray rather than
# a pandas Series (Series-only methods such as .describe() no longer apply).
Y=np.where( Y > max_limit , max_limit ,
                   np.where( Y< min_limit, min_limit ,
                             Y
                   )
        )

In [45]:
Y

In [46]:
traindata.shape ,testdata.shape

In [47]:
traindata_1=AllData_1.iloc[:1460,:]
#pd.concat([traindata_1,Y],axis=1)
traindata_1

In [48]:
testdata_1=AllData_1.iloc[1460:,:]
testdata_1

In [49]:
X = traindata_1

In [50]:
from sklearn.ensemble import RandomForestRegressor

In [51]:
# Random forest regressor; random_state is fixed for reproducibility and
# n_jobs=-1 uses all available cores.
RFR =RandomForestRegressor(n_jobs=-1,random_state=1)

In [52]:
from sklearn.model_selection import GridSearchCV

In [53]:
# Hyper-parameter grid for the search: 6 x 6 x 6 = 216 combinations.
para={
    'max_depth':[2,5,10,50,100,150],
    'min_samples_leaf':[2,5,7,50,100,200],
    'n_estimators':[5,10,30,50,100,200]
}

In [54]:
%%time
grid_serch=GridSearchCV(estimator=RFR,
                       param_grid=para,
                      cv=5,
                      n_jobs=-1, scoring="neg_mean_squared_error" ,verbose=1
                      )

In [55]:
grid_serch.fit(X,Y)

In [56]:
# Best random forest found by the grid search.
rfbest_estimator=grid_serch.best_estimator_
rfbest_estimator

In [57]:
# Raw importance score per feature, in the column order of X.
rfbest_estimator.feature_importances_

In [58]:
# Pair each importance score with its column name and rank the features by importance.
imp_fea=pd.DataFrame({
    'variable':X.columns,
    'imp': rfbest_estimator.feature_importances_
})
feature=imp_fea.sort_values(by='imp',ascending= False)
feature.head(50)

### Scaling the Data

In [59]:
from sklearn.preprocessing import StandardScaler

In [60]:
sc=StandardScaler()

In [61]:
col=X.columns
X=pd.DataFrame(sc.fit_transform(X))
X.columns = col
X

In [62]:
# Standardise the test features with the scaler that was fitted on the
# training features. Calling fit_transform here would re-estimate the mean and
# variance from the test rows, so train and test would be scaled differently
# and the fitted model coefficients would no longer apply to the test matrix.
col = testdata_1.columns
testdata_1 = pd.DataFrame(sc.transform(testdata_1), columns=col)
testdata_1

In [63]:
from sklearn. linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split as tts

Using RFE for Initial Feature Selection

In [64]:
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split as tts

In [65]:
x_train,x_test,y_train,y_test=tts(X,Y,test_size=0.2,random_state=42)

In [66]:
x_train

In [67]:
lm = LinearRegression()
lm.fit(x_train, y_train)

rfe = RFE(lm, 50) # running RFE
rfe = rfe.fit(x_train, y_train)

In [68]:
col = x_train.columns[rfe.support_]
col

In [69]:
# Creating X_train & X_test dataframes with RFE selected variables
x_train_rfe = x_train[col]
x_test_rfe = x_test[col]
lm.fit(x_train_rfe,y_train)
y_pred_rfe=lm.predict(x_test_rfe)
mean_squared_error(y_test,y_pred_rfe)

**Building model using Ridge Regression with RFE feature**

In [70]:

params = {'alpha': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.02, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 
                    9.0, 10.0, 20, 50, 100, 500, 1000 ]}

ridge = Ridge()

folds = 5
ridge_model_cv = GridSearchCV(estimator = ridge, 
                        param_grid = params, 
                        scoring= 'neg_mean_absolute_error', 
                        cv = folds, 
                        return_train_score=True,
                        verbose = 1)            
ridge_model_cv.fit(x_train_rfe, y_train)

In [71]:
ridge_cv_results = pd.DataFrame(ridge_model_cv.cv_results_)
ridge_cv_results[['param_alpha', 'mean_train_score', 'mean_test_score', 'rank_test_score']].sort_values(by = ['rank_test_score'])

In [72]:
#checking the value of optimum number of parameters
print(ridge_model_cv.best_params_)

In [73]:
# Building the model with alpha
ridge = Ridge(alpha=ridge_model_cv.best_params_['alpha'])

ridge.fit(x_train_rfe, y_train)
y_train_pred = ridge.predict(x_train_rfe)
y_test_pred = ridge.predict(x_test_rfe)

print(r2_score(y_true=y_train,y_pred=y_train_pred))
print(r2_score(y_true=y_test,y_pred=y_test_pred))

**Building model using Ridge Regression with all Data**

In [74]:
# Same Ridge workflow as the RFE-based section, this time fitted on every
# feature in x_train.
# Candidate regularisation strengths; `params` is also reused by the Lasso search below.
params = {'alpha': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.02, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 
                    9.0, 10.0, 20, 50, 100, 500, 1000 ]}

ridge = Ridge()

folds = 5
# 5-fold cross-validated search over alpha, scored by negative mean absolute error.
ridge_model_cv = GridSearchCV(estimator = ridge, 
                        param_grid = params, 
                        scoring= 'neg_mean_absolute_error', 
                        cv = folds, 
                        return_train_score=True,
                        verbose = 1)            
ridge_model_cv.fit(x_train, y_train)

In [75]:
# Train and validation score per alpha, best-ranked first.
ridge_cv_results = pd.DataFrame(ridge_model_cv.cv_results_)
ridge_cv_results[['param_alpha', 'mean_train_score', 'mean_test_score', 'rank_test_score']].sort_values(by = ['rank_test_score'])

In [76]:
# Alpha selected by the search.
print(ridge_model_cv.best_params_)

In [77]:
# Refit a single Ridge model at the selected alpha.
ridge = Ridge(alpha=ridge_model_cv.best_params_['alpha'])

ridge.fit(x_train, y_train)
y_train_pred = ridge.predict(x_train)
y_test_pred = ridge.predict(x_test)

# R^2 on the training rows, then on the held-out rows.
print(r2_score(y_true=y_train,y_pred=y_train_pred))
print(r2_score(y_true=y_test,y_pred=y_test_pred))

In [78]:
# Mean squared error on the held-out rows (in log1p(SalePrice) units, since Y was log-transformed).
mean_squared_error(y_test,y_test_pred)

In [79]:
# Table pairing every fitted Ridge parameter with the name of its feature.
model_param = list(ridge.coef_)
model_param.insert(0,ridge.intercept_)
# The intercept was prepended to the parameters, so a matching label is
# prepended to the feature names. The names come from x_train, the frame this
# ridge model was fitted on; the previous code zipped against `col`, which
# holds only the 50 RFE-selected names, so the labels were wrong and the table
# was truncated.
cols = ['intercept'] + list(x_train.columns)
model_rigi_coe=pd.DataFrame(list(zip(model_param ,cols)))
model_rigi_coe.head()

**Building model using Lasso Regression**

In [80]:
lasso = Lasso()

folds = 10
# 10-fold cross-validated search over the alpha grid in `params` (defined in
# the Ridge section), scored by negative mean absolute error.
lasso_model_cv = GridSearchCV(estimator = lasso, 
                        param_grid = params, 
                        scoring= 'neg_mean_absolute_error', 
                        cv = folds, 
                        return_train_score=True,
                        verbose = 1)            
lasso_model_cv.fit(x_train, y_train)

In [81]:
# Train and validation score per alpha, best-ranked first.
lasso_cv_results = pd.DataFrame(lasso_model_cv.cv_results_)
lasso_cv_results[['param_alpha', 'mean_train_score', 'mean_test_score', 'rank_test_score']].sort_values(by = ['rank_test_score'])

In [82]:
# Alpha selected by the search.
print(lasso_model_cv.best_params_)

In [83]:

# Refit a single Lasso model at the selected alpha; this `lasso` object is the
# one used for the final prediction.
lasso = Lasso(alpha=lasso_model_cv.best_params_['alpha'])

lasso.fit(x_train, y_train)
y_train_pred = lasso.predict(x_train)
y_test_pred = lasso.predict(x_test)

# R^2 on the training rows, then on the held-out rows.
print(r2_score(y_true=y_train,y_pred=y_train_pred))
print(r2_score(y_true=y_test,y_pred=y_test_pred))

In [84]:
# Mean squared error on the held-out rows (in log1p(SalePrice) units).
mean_squared_error(y_test,y_test_pred)

**Findings:**
The optimal regularisation strength (lambda, the `alpha` parameter in scikit-learn) for Ridge and Lasso is as follows:

Ridge - 5.0

Lasso - 0.005

The Mean Squared error in case of Ridge and Lasso are:

Ridge - 0.02255

Lasso - 0.01906


The r2_score for test data in case of Ridge and Lasso are:

Ridge - 87.9%

Lasso - 89.02%

In [85]:
# Count missing values per column in the scaled test matrix, worst column first.
testdata_1.isnull().sum().sort_values(ascending=False)


In [86]:
# Fill any value that is still missing in the scaled test matrix.
# testdata_1 has already been transformed and standardised, while
# traindata['GarageYrBlt'] still holds raw calendar years, so filling with the
# training mode would insert a value on a completely different scale. The
# per-column means of X (the scaled training features, same column names) are
# on the correct scale. fillna returns a new frame, which also avoids the
# chained `inplace=True` call that does not update the frame under Copy-on-Write.
testdata_1 = testdata_1.fillna(X.mean())
testdata_1.isnull().sum().sort_values(ascending=False)

**So for Final Prediction we use Lasso Regression**

In [87]:
# Predict on the prepared test matrix. The model was trained on
# log1p(SalePrice), so the predictions are mapped back with expm1, the exact
# inverse of log1p; np.exp would leave every predicted price too high by 1.
y_pred=lasso.predict(testdata_1)
y_pred_final=np.expm1(y_pred)

In [88]:
# Load the sample submission file to reuse its layout.
submission=pd.read_csv('../input/house-prices-advanced-regression-techniques/sample_submission.csv')
submission.head()

In [89]:
# Overwrite the second column with the final predictions and write the result
# to submission.csv without the index column.
# NOTE(review): assumes the second column is the prediction column and that the
# rows are in the same order as testdata -- confirm against the sample file.
submission.iloc[:, 1] =y_pred_final
submission.to_csv('submission.csv', index=False)

In [90]:
# Sanity check: first predicted value.
y_pred_final[0]