## 1. Importing Required Libraries

In [None]:
# importing Required libraries for our Analysis

import sklearn
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Filter warnings: import warnings to avoid unnecessary runtime warning messages
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Lift pandas' display limits so every column and every row is rendered in later steps

for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

## 2.Reading the train.csv file 

In [None]:
# Load train.csv into `data` and preview its first five rows

data = pd.read_csv(filepath_or_buffer='train.csv')
data.head(n=5)

## 3.Data Description

In [None]:
#Shape of the dataframe

data.shape

## Observation:
- We have 1460 rows and 81 columns

In [None]:
# statistical Description of the data
data.describe()

## Insights:
- From the above statistical description, the count of some numerical columns is less than the full row count of 1460.
- `LotFrontage`, `MasVnrArea` and `GarageYrBlt` features require null value treatment, which will be done in further steps

In [None]:
# Display non-null count ,datatype of each feature
data.info()

## Insights:
- Total number of features we have is 81.
- We have Features containing datatype of `int`,`object` and `float`
- We can see that some of the columns have null values; we will treat them in further steps. Some of them are `PoolQC`, `Fence`, `MiscFeature`

## 4.Data Preparation and EDA

In [None]:
# Display the share of null values in each column/feature (a fraction between 0 and 1)

missing_counts = data.isnull().sum()
missing_counts / len(data)

## Insights:
- In the above step we have clear view of null value percentage of each feature in our dataset

In [None]:
# `nullval_per` holds the null-value fraction of each column.
# Sort it in descending order and show the 20 columns with the most missing data.

nullval_per = data.isnull().sum().div(len(data))
nullval_per.sort_values(ascending=False).head(20)

## Insights:

- In the above step we are displaying the features null value percentage in descending order.
- Lets drop features/columns whose null value percentage is 30% or more (a fraction of 0.3 or more in the output above)

In [None]:
# Keep only the columns whose null fraction is below 0.3 (less than 30 percent nulls)
below_threshold = nullval_per.lt(0.3)
newcolumns = nullval_per.loc[below_threshold]
newcolumns

In [None]:
# Create a new dataframe `data2` from the features whose null fraction is below 30 percent.
# `.copy()` makes `data2` an independent frame, so the column assignments in the
# later cleaning steps modify `data2` itself rather than a slice of `data`.
data2 = data[newcolumns.index].copy()
data2.head()

## Insights:
- We have created new dataframe called `data2` which holds features where null value percentage is less than 30

In [None]:
# Drop the `Id` column by name rather than by position: a positional slice would
# silently remove a different feature if the column order changed or the cell were re-run.
data2 = data2.drop(columns=['Id'])
data2.head()

In [None]:
data2.shape

## Insights:
- The `Id` column has been dropped since it will not help our analysis and may even mislead it
- We now have 75 columns/features; lets continue with further analysis

## Null value Treatment

In [None]:
# `NullValuecolumns` holds the features of `data2` whose null fraction is greater than 0, highest first
nullval_per = data2.isnull().sum().div(len(data2))
has_nulls = nullval_per.gt(0.0)
NullValuecolumns = nullval_per.loc[has_nulls].sort_values(ascending=False)
print(NullValuecolumns, end='\n\n')
print("Count of features having null values:",len(NullValuecolumns))

## Insights:
- We have 14 features having null values in them.
- In the next step all the features names will be displayed 

In [None]:
NullValuecolumns.index

In [None]:
# Lets understand the datatype of each column, which helps us in treating the null values in it

data2[NullValuecolumns.index].dtypes

## Insights:
- We can now treat the null values present in the above columns, since we are aware of the datatype of each feature
- We will follow the following approach :
    - Columns having null values in  `Numerical` columns will be replaced with Median values of that feature.
    - Columns having null values in  `Categorical` columns will be replaced based on the information we have from data dictionary

In [None]:
# Treat the numeric columns by filling missing values with the column median.
# `fillna` targets missing values directly and avoids the `np.NaN` alias,
# which was removed in NumPy 2.0.

for numeric_col in ('LotFrontage', 'MasVnrArea', 'GarageYrBlt'):
    data2[numeric_col] = data2[numeric_col].fillna(data2[numeric_col].median())

In [None]:
# Lets verify the null value count after performing null value treatment in the previous step

data2[['LotFrontage','MasVnrArea','GarageYrBlt']].isnull().sum()

##  Insights:
- Null values in the above numerical columns have been treated successfully.

In [None]:
# Re-compute the null fraction of every column and keep only the columns that still contain nulls
nullval_per = data2.isnull().sum().div(len(data2))
still_missing = nullval_per.gt(0.0)
NullValuecolumns = nullval_per.loc[still_missing].sort_values(ascending=False)
NullValuecolumns

## Insights:
- Lets treat the above categorical values in the next steps in  which null values has been observed as shown above

## Lets visualize the categorical features

In [None]:
NullValuecolumns.index

In [None]:
## Visualise every column that still has nulls, to understand the spread of its categories.
# The grid height is derived from the number of columns (two plots per row) instead of a
# fixed 6x2 layout, so the loop keeps working if the number of columns changes.
n_plot_cols = 2
n_plot_rows = max(1, -(-len(NullValuecolumns) // n_plot_cols))   # ceiling division
plt.figure(figsize=(18,50))
for position, feature in enumerate(NullValuecolumns.index, start=1):
    plt.subplot(n_plot_rows, n_plot_cols, position)
    plt.title(feature,fontdict={'fontsize':15,'color':'green'})
    # pass the column through `x=`: seaborn 0.12+ no longer accepts it positionally
    sns.countplot(x=feature,data=data2)
    plt.xticks(rotation=90,size=12,color='red')
plt.show()

In [None]:
# Per the data dictionary, nan in these columns means the house has no garage,
# so nan is replaced with an explicit `No Garage` category.
# `fillna` is used instead of `replace(np.NaN, ...)` because the `np.NaN` alias was removed in NumPy 2.0.

for garage_col in ('GarageCond', 'GarageQual', 'GarageFinish', 'GarageType'):
    data2[garage_col] = data2[garage_col].fillna('No Garage')

## Insights:
- We have treated null values present in the Garage related columns and treated nan values with `No Garage` Category

In [None]:
# Per the data dictionary, nan in these columns means the house has no basement,
# so nan is replaced with an explicit `No Basement` category.
# `fillna` is used instead of `replace(np.NaN, ...)` because the `np.NaN` alias was removed in NumPy 2.0.

for basement_col in ('BsmtFinType2', 'BsmtExposure', 'BsmtFinType1', 'BsmtCond', 'BsmtQual'):
    data2[basement_col] = data2[basement_col].fillna('No Basement')

## Insights:
- We have treated null values present in the Basement related columns and replaced nan values with `No Basement` category

In [None]:
 # Replacing nan values with mode(Highests occurence) value

# Fill nan with the most frequent category of each column.
# `fillna` replaces `replace(np.NaN, ...)`; the `np.NaN` alias was removed in NumPy 2.0.
for mode_col in ('MasVnrType', 'Electrical'):
    data2[mode_col] = data2[mode_col].fillna(data2[mode_col].mode()[0])

## Insights:
- Replacing nan values in `MasVnrType` and `Electrical` with the most frequently occurring category of each feature

In [None]:
## Lets validate the null value count 
print(data2.isnull().sum())

## Insights:
- As shown above null value count is 0 in all the features

In [None]:
data2.info()

## Insights:
- As shown above all the 75 features are now validated for null values and no null values are present in the above features,lets continue our analysis

In [None]:
# MSSubClass is a categorical feature stored as numeric codes.
# Map every code back to its data-dictionary label, which makes the EDA plots readable.

mssubclass_labels = {
    20: '1-STORY 1946 & NEWER ALL STYLES',
    30: '1-STORY 1945 & OLDER',
    40: '1-STORY W/FINISHED ATTIC ALL AGES',
    45: '1-1/2 STORY - UNFINISHED ALL AGES',
    50: '1-1/2 STORY FINISHED ALL AGES',
    60: '2-STORY 1946 & NEWER',
    70: '2-STORY 1945 & OLDER',
    75: '2-1/2 STORY ALL AGES',
    80: 'SPLIT OR MULTI-LEVEL',
    85: 'SPLIT FOYER',
    90: 'DUPLEX - ALL STYLES AND AGES',
    120: '1-STORY PUD (Planned Unit Development) - 1946 & NEWER',
    150: '1-1/2 STORY PUD - ALL AGES',
    160: '2-STORY PUD - 1946 & NEWER',
    180: 'PUD - MULTILEVEL - INCL SPLIT LEV/FOYER',
    190: '2 FAMILY CONVERSION - ALL STYLES AND AGES',
}
data2['MSSubClass'] = data2['MSSubClass'].replace(mssubclass_labels)

## Insights:
- Feature: `MSSubClass` is a categorical column represented in numerical form lets convert back to original representation which helps in EDA


In [None]:
## `OverallQual` and `OverallCond` are stored as numbers, but per the data dictionary they are quality categories.
# `values` is a reusable mapping from the numeric rating (10 down to 1) to its category label.

quality_labels = (
    'Very Excellent',
    'Excellent',
    'Very Good',
    'Good',
    'Above Average',
    'Average',
    'Below Average',
    'Fair',
    'Poor',
    'Very Poor',
)
values = dict(zip(range(10, 0, -1), quality_labels))

## Insights:
- A reusable mapping (`values`) is created which helps in converting the features `OverallQual` and `OverallCond` to the original categories specified in the data dictionary

In [None]:
# OverallQual: rates the overall material and finish of the house
# OverallCond: rates the overall condition of the house
# Both numeric ratings are converted to their category labels through `values`.

for rating_col in ('OverallQual', 'OverallCond'):
    data2[rating_col] = data2[rating_col].replace(values)

In [None]:
data2[['MSSubClass','OverallQual','OverallCond']].head()

In [None]:
data2[['MSSubClass','OverallQual','OverallCond']].info()

## Insights:
- We have treated the nan values and converted features like `MsSubClass`,`OverallQual` and `OverallCond` to the required format for further analysis

In [None]:
data2.head()

In [None]:
## Inspect the target column `SalePrice` with a distribution plot

sale_price = data2['SalePrice']
sns.distplot(sale_price)
plt.show()

## Insights:
- As observed, the data is right skewed. This is quite common in `SalePrice` columns, where a few products/houses have a much higher price due to various factors.
- We need to treat this else model will get impacted because of the skewed data

In [None]:
# Replace SalePrice with its natural logarithm, then display the transformed column
log_sale_price = np.log(data2['SalePrice'])
data2['SalePrice'] = log_sale_price
data2['SalePrice']

In [None]:
# Distribution plot of SalePrice after applying the log transformation
transformed_price = data2['SalePrice']
sns.distplot(transformed_price)
plt.show()

## Insights:
- As observed in the above distribution plot, we have reduced the skewness of the data by applying a log transformation.
- Due to this transformation we can expect better results, and the results may not get deviated.
- The scale of the sale price has changed drastically after the transformation

In [None]:
data2.shape

### Lets validate the integer and categorical columns and visualise them seperately using the plots

In [None]:
# Print the Features having integer and float datatype
data2.select_dtypes(include=['int64','float64']).columns

In [None]:
# Length of the numericla features
len(data2.select_dtypes(include=['int64','float64']).columns)

In [None]:
# Print the features having numerical datatype 
data2[data2.select_dtypes(include=['int64','float64']).columns].head()

In [None]:
# Names of the features stored with the object (categorical) datatype

object_frame = data2.select_dtypes(include=['object'])
categorical_columns = object_frame.columns
categorical_columns

In [None]:
#Length of categirical columns in our dataset
len(categorical_columns)

## Insights:
- In total we have 41 categorical columns lets visualise them to understand the count of categories present in them 
- In total we have 34 numerical fearures including Target feature which is in numerical format

## Lets understand how SalePrice is varied for various categorical columns

In [None]:
# Reusable helper for count plots

def countplot(feature,rotation):
    """Draw a count plot of one categorical column of ``data2`` on the current axes.

    feature  : name of the column in ``data2`` whose categories are counted.
    rotation : rotation of the x-axis tick labels, in degrees.

    The function only draws; the caller creates the figure/subplot and calls
    ``plt.show()`` once all plots of the figure are drawn.
    """
    plt.title(feature,size=14,color='green')   # column name as the plot title
    sns.countplot(x=feature,data=data2)        # one bar per category of the column
    # Honour the caller's rotation (it was previously hard-coded to 90 and the argument ignored)
    plt.xticks(rotation=rotation)

In [None]:
# Countplot for `MSSubClass`
# The plot displays the categories present in the feature and its count

countplot('MSSubClass',90)

## Insights:
- As shown above, feature `MSSubClass` has multiple categories, and a few categories like `1-STORY W/FINISHED ATTIC ALL AGES` have a very low count in the feature

In [None]:
# Count plots for `MSZoning`, `Street`, `LotShape`, `LandContour`, `Utilities` and `LotConfig`
# Each plot shows the categories present in the feature and their counts

plt.figure(figsize=(20,14))

grid_features = ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig']
for slot, column_name in enumerate(grid_features, start=1):
    plt.subplot(2,3,slot)
    countplot(column_name,90)

plt.show()

## Insights:
- As shown above, `Street`, `Utilities` and `LandContour` are highly skewed to a single category, and considering them in our model may not bring good results.
- As shown above, `MSZoning` has categories called `RH` and `C (all)` which we can combine into a separate category called `others`, since their presence in the feature is very low
- As shown above, `LotConfig` has categories called `FR2` and `FR3` which we can combine into a separate category called `others`, since their presence in the feature is very low

In [None]:
# Group the rare categories of these features into a special category called `others`.
# The result is assigned back to the column: a chained `data2[col].replace(..., inplace=True)`
# acts on a temporary Series and does not update `data2` under pandas Copy-on-Write.

data2['MSZoning'] = data2['MSZoning'].replace(['RH','C (all)'],'others')
data2['LotConfig'] = data2['LotConfig'].replace(['FR2','FR3'],'others')

In [None]:
# Drop the Street, Utilities and LandContour columns: they are highly skewed to a single category and do not help the analysis

skewed_columns = ['Street','Utilities','LandContour']
data2.drop(columns=skewed_columns, inplace=True)

## Insights:
- we have dropped few columns wth the reason mentioned above and few catgeories has been treated seperately as shown above

In [None]:
# Count plots for 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle'

plt.figure(figsize=(20,18))

grid_features = ['LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle']
for slot, column_name in enumerate(grid_features, start=1):
    plt.subplot(2,3,slot)
    countplot(column_name,90)

plt.show()

## Insights:
- As shown above, `LandSlope` and `Condition2` are highly skewed to a single category, and considering them in our model may not bring good results.
- As shown above, the `Condition1` feature has categories called 'RRAe','PosA','RRNn','RRNe' which we can combine into a separate category called `others`, since their presence in the feature is very low


In [None]:
data2['Condition1'].value_counts()

In [None]:
# Group the rare Condition1 categories into `others`; assign back, because a chained
# `replace(..., inplace=True)` does not update `data2` under pandas Copy-on-Write.
data2['Condition1'] = data2['Condition1'].replace(['RRAe','PosA','RRNn','RRNe'],'others')

In [None]:
# Drop the LandSlope and Condition2 columns since they are highly skewed to a single category

skewed_columns = ['LandSlope','Condition2']
data2.drop(columns=skewed_columns, inplace=True)

## Insights:
- we have dropped few columns wth the reason that single category is biased in the categorical column few catgeories has been considered as `others` cateory which can be treated seperately.

In [None]:
 # 'OverallQual', 'OverallCond','RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd'
    
    
plt.figure(figsize=(20,14))

grid_features = ['OverallQual', 'OverallCond', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd']
for slot, column_name in enumerate(grid_features, start=1):
    plt.subplot(2,3,slot)
    countplot(column_name,90)

plt.show()

## Insights:
- As shown above, `RoofMatl` is highly skewed to a single category and considering it in our model may not bring good results, so lets drop this feature in the next steps.
- As shown above, the `Exterior1st` feature has categories called 'BrkComm','Stone','AsphShn','CBlock','ImStucc' which we can combine into a separate category called `others`, since their presence in the feature is very low. We need to do this because such rarely occurring categories can mislead or bias the model
- As shown above, the `Exterior2nd` feature has categories called 'ImStucc','BrkComm','Stone','AsphShn','CBlock' which we can combine into a separate category called `others`, since their presence in the feature is very low. We need to do this because such rarely occurring categories can mislead or bias the model


In [None]:
# Group the rare RoofStyle categories into `others`; assign back, because a chained
# `replace(..., inplace=True)` does not update `data2` under pandas Copy-on-Write.
data2['RoofStyle'] = data2['RoofStyle'].replace(['Flat','Gambrel','Mansard','Shed'],'others')

## Insights:
- As shown above `RoofStyle` feature has categories called 'Flat','Gambrel','Mansard','Shed' which we can consider as a seperate category called `others` since its presence is very less in the category.We need to do this because these low presence of categories in the data can mislead or bias the data


In [None]:
# Group the rare Exterior1st categories into `others`; assign back, because a chained
# `replace(..., inplace=True)` does not update `data2` under pandas Copy-on-Write.
data2['Exterior1st'] = data2['Exterior1st'].replace(['BrkComm','Stone','AsphShn','CBlock','ImStucc'],'others')

In [None]:
# Group the rare Exterior2nd categories into `others` (the duplicated 'ImStucc' entry is removed).
# Assign back, because a chained `replace(..., inplace=True)` does not update `data2`
# under pandas Copy-on-Write.
data2['Exterior2nd'] = data2['Exterior2nd'].replace(['ImStucc','BrkComm','Stone','AsphShn','CBlock'],'others')

In [None]:
# RoofMatl is dominated by a single category, so the column is removed
data2.drop(columns='RoofMatl', inplace=True)

## Insights:
- we have dropped few columns wth the reason that single category is biased in the categorical column few catgeories has been considered as `others` cateory which can be treated seperately.

In [None]:
 # 'MasVnrType','ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond' 
    
plt.figure(figsize=(20,14))

grid_features = ['MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond']
for slot, column_name in enumerate(grid_features, start=1):
    plt.subplot(2,3,slot)
    countplot(column_name,90)

plt.show()

## Insights:
- As shown above `Foundation` feature has categories called 'Slab','Stone','Wood' which we can consider as a seperate category called `others` since its presence is very less in the category.We need to do this because these low presence of categories in the data can mislead or bias the data
- Presenceof other categorical is not very biased so we havent considered any special tret=atment for them

In [None]:
# Group the rare Foundation categories into `others`; assign back, because a chained
# `replace(..., inplace=True)` does not update `data2` under pandas Copy-on-Write.
data2['Foundation'] = data2['Foundation'].replace(['Slab','Stone','Wood'],'others')

In [None]:
# Remove the BsmtCond and ExterCond columns from `data2`
data2.drop(columns=['BsmtCond','ExterCond'], inplace=True)

## Insights:
- we have dropped few columns with the reason that single category is biased in the categorical column few catgeories has been considered as `others` cateory which can be treated seperately.

In [None]:
 # 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC','CentralAir' 

plt.figure(figsize=(20,14))

grid_features = ['BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir']
for slot, column_name in enumerate(grid_features, start=1):
    plt.subplot(2,3,slot)
    countplot(column_name,90)

plt.show()

In [None]:
# Group the rare HeatingQC categories into `others`; assign back, because a chained
# `replace(..., inplace=True)` does not update `data2` under pandas Copy-on-Write.
data2['HeatingQC'] = data2['HeatingQC'].replace(['Fa','Po'],'others')

## Insights:
- As shown above `HeatingQC` feature has categories called 'Fa','Po' which we can consider as a seperate category called `others` since its presence is very less in the category.We need to do this because these low presence of categories in the data can mislead or bias the data
- Presenceof other categorical is not very biased so we havent considered any special tret=atment for them

In [None]:
# Remove the Heating, CentralAir and BsmtFinType2 columns from `data2`
data2.drop(columns=['Heating','CentralAir','BsmtFinType2'], inplace=True)

## Insights:
- we have dropped few columns with the reason that single category is biased in the categorical column few catgeories has been considered as `others` cateory which can be treated seperately.

In [None]:
# Count plots for 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageCond'
# (six plots placed in the first two rows of a 3x3 grid)

plt.figure(figsize=(20,20))

grid_features = ['Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageCond']
for slot, column_name in enumerate(grid_features, start=1):
    plt.subplot(3,3,slot)
    countplot(column_name,90)

plt.show()

## Insights:
- As observed from the plots, the `Functional` feature has highly skewed data, since the majority of the data belongs to a single category

In [None]:
# Cunt of different categories in `GarageType`
data2['GarageType'].value_counts()

In [None]:
# Group the rare GarageType categories into `others`; assign back, because a chained
# `replace(..., inplace=True)` does not update `data2` under pandas Copy-on-Write.
data2['GarageType'] = data2['GarageType'].replace(['Basment','CarPort','2Types'],'others')

## Insights:
- As shown above `GarageType` feature has categories called 'Basment','CarPort','2Types' which we can consider as a seperate category called `others` since its presence is very less in the category.We need to do this because these low presence of categories in the data can mislead or bias the data
- Presenceof other categorical is not very biased so we havent considered any special treatment for them

In [None]:
## As observed above it is better to drop the Functional column; Electrical and GarageCond are dropped with it

columns_to_remove = ['Electrical','Functional','GarageCond']
data2.drop(columns=columns_to_remove, inplace=True)

## Insights:
- we have dropped few columns with the reason that single category is biased in the categorical column few catgeories has been considered as `others` cateory which can be treated seperately.

In [None]:
# Count plots for 'PavedDrive', 'SaleType', 'SaleCondition'

plt.figure(figsize=(15,10))

grid_features = ['PavedDrive', 'SaleType', 'SaleCondition']
for slot, column_name in enumerate(grid_features, start=1):
    plt.subplot(2,2,slot)
    countplot(column_name,90)

plt.show()

In [None]:
# Group the rare SaleCondition categories into `Others`; assign back, because a chained
# `replace(..., inplace=True)` does not update `data2` under pandas Copy-on-Write.
data2['SaleCondition'] = data2['SaleCondition'].replace(['Family','Alloca','AdjLand'],'Others')

## Insights:
- As shown above `SaleCondition` feature has categories called 'Family','Alloca','AdjLand' which we can consider as a seperate category called `others` since its presence is very less in the category.We need to do this because these low presence of categories in the data can mislead or bias the data
- Presence of other categorical is not very biased so we havent considered any special treatment for them

In [None]:
# Drop SaleType and PavedDrive: the other categories have a very low count
# (in SaleType the category WD is by far the most frequent)

data2.drop(columns=['SaleType','PavedDrive'], inplace=True)

## Insights:
- As shown above we have dropped few columns with the reason that single category is biased in the categorical column few catgeories has been considered as `others` cateory which can be treated seperately.

In [None]:
# Names of the categorical (object dtype) columns still present in `data2`

object_frame = data2.select_dtypes(include=['object'])
categorical_columns = object_frame.columns
categorical_columns

In [None]:
len(categorical_columns)

## Insights:
- As shown above now we have 25 categorical columns earlier the count was 41 since we dropped some categorical columns we are left with 25.
- The reason behind dropping some categorical columns has been mentioned in the above steps: a column dominated by a single category tells almost the same information to the model for every row. Considering this, we have dropped a few of the features.

## Lets see the relation between categorical columns and  SalesPrice using BoxPlot

In [None]:
# Reusable helper for box plots of a categorical column against SalePrice (target)
def boxplot(inputfeature,rotation):
    """Draw a box plot of ``SalePrice`` per category of one column of ``data2``.

    inputfeature : name of the categorical column in ``data2`` (x-axis).
    rotation     : rotation of the x-axis tick labels, in degrees.

    The function only draws on the current axes; the caller creates the
    figure/subplot and calls ``plt.show()`` once the figure is complete.
    """
    plt.title(inputfeature+' v/s SalePrice',size=15,color='green')
    sns.boxplot(x=inputfeature,y='SalePrice',data=data2)
    plt.xticks(rotation=rotation)

In [None]:
plt.figure(figsize=(20,10))
boxplot('MSSubClass',45)

## Insights:
- As shown above, feature `MSSubClass` has many categories and shows a good relation with the SalePrice (target) column
- We can see that the sale price is higher when houses fall under these categories: `2-STORY 1946 & NEWER` and `1-STORY 1946 & NEWER ALL STYLES`

In [None]:
 #  Box plot for `Sale Price versus 'MSZoning', 'LotShape', 'LotConfig', 'Neighborhood','Condition1', 'BldgType',
plt.figure(figsize=(20,25))

box_features = ['MSZoning', 'LotShape', 'LotConfig', 'Neighborhood', 'Condition1', 'BldgType']
for slot, column_name in enumerate(box_features, start=1):
    plt.subplot(3,3,slot)
    boxplot(column_name,90)

plt.show()

## Insights:
- As shown above we have plotted  `Sale Price` versus `MSZoning`,`LotShape`,`LotConfig`,`Neighborhood`,`Condition1`,`BldgType`. We may get good information from these features and can obtain best results in predicting the Saleprice.

In [None]:
 #  Box plot for `Sale Price` versus 'HouseStyle', 'OverallQual', 'OverallCond','RoofStyle', 'Exterior1st', 'Exterior2nd'
plt.figure(figsize=(20,28))

box_features = ['HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle', 'Exterior1st', 'Exterior2nd']
for slot, column_name in enumerate(box_features, start=1):
    plt.subplot(3,3,slot)
    boxplot(column_name,90)

plt.show()

## Insights:
- As shown above we have plotted  `Sale Price` versus 'HouseStyle', 'OverallQual', 'OverallCond','RoofStyle', 'Exterior1st', 'Exterior2nd'.
- The `RoofStyle` medians look somewhat suspicious, and the feature has outliers as well.

In [None]:
 #  Box plot for `Sale Price` versus 'MasVnrType', 'ExterQual','Foundation', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1'

plt.figure(figsize=(20,25))

box_features = ['MasVnrType', 'ExterQual', 'Foundation', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1']
for slot, column_name in enumerate(box_features, start=1):
    plt.subplot(3,3,slot)
    boxplot(column_name,90)

plt.show()

## Insights:
- As shown above we have plotted `Sale Price` versus 'MasVnrType', 'ExterQual','Foundation', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1'

- We have many outliers as sown above but lets not treat this at this stage.And we cannot say that these are complete outliers since we are dealing with categorical versus numerical multiple catgeories can impact the target column.

In [None]:
 #  Box plot for `Sale Price` versus 'HeatingQC','KitchenQual', 'GarageType', 'GarageFinish', 'GarageQual','SaleCondition'
plt.figure(figsize=(20,28))

box_features = ['HeatingQC', 'KitchenQual', 'GarageType', 'GarageFinish', 'GarageQual', 'SaleCondition']
for slot, column_name in enumerate(box_features, start=1):
    plt.subplot(3,3,slot)
    boxplot(column_name,90)

plt.show()

## Insights:
- As shown above we have plotted `Sale Price` versus 'HeatingQC','KitchenQual', 'GarageType', 'GarageFinish', 'GarageQual','SaleCondition'
- We have many outliers as shown above but lets not treat this at this stage.And we cannot say that these are complete outliers since we are dealing with categorical versus numerical multiple catgeories can impact the target column.

## Lets visualise and treat numerical columns if required

In [None]:
data2.select_dtypes(include=['int64','float64']).columns

In [None]:
len(data2.select_dtypes(include=['int64','float64']).columns)

In [None]:
data2.select_dtypes(include=['int64','float64']).head()

## Insights:
- In total we have 34 columns conatining datatype as `integer` and `float`

In [None]:
# Distribution of the construction year
year_built = data2['YearBuilt']
sns.distplot(year_built)
plt.show()

## Insights:
- `YearBuilt` says the original construction year of the building
- As shown above `YearBuilt` feature has left skewed data and its not complete normally distributed.

In [None]:
# Distribution of the remodel year.
# `plt.show()` renders the figure the same way as the other plot cells and keeps
# the Axes repr out of the cell output.
sns.distplot(data2['YearRemodAdd'])
plt.show()

## Insights:
- `YearRemodAdd` is the remodel date of the building (per the data dictionary it is the same as the construction date if there was no remodelling or addition)
- As shown above `YearRemodAdd` feature has skewed data and its not complete normally distributed.

### `Extract features`: Lets create new features called `Remodelledyearsback` and `AgeOfBuilding`, which tell how many years back the building was remodelled and how many years back it was constructed

In [None]:
## Add a feature called `Remodelledyearsback`: the number of years between the remodel year and 2021.
# `YearRemodAdd` is read from `data2` (the frame being engineered) rather than the raw `data`,
# so the step no longer relies on implicit index alignment between the two frames.

data2['Remodelledyearsback']=2021-data2['YearRemodAdd']
data2['Remodelledyearsback']

In [None]:
## Add a feature called `AgeOfBuilding`: the number of years between the construction year and 2021.
# `YearBuilt` is read from `data2` (the frame being engineered) rather than the raw `data`,
# so the step no longer relies on implicit index alignment between the two frames.
data2['AgeOfBuilding']=2021-data2['YearBuilt']
data2['AgeOfBuilding']

In [None]:
# Distribution of the derived building age
building_age = data2['AgeOfBuilding']
sns.distplot(building_age)
plt.show()

## Insights:
- Age of building is right skewed, but the scale is better than `YearBuilt`
- As we know, the `AgeOfBuilding` plays a key role in the price of the building. Lets take this variable forward and do some modelling by considering this variable

In [None]:
# The raw year columns are superseded by the derived age features, so remove them
data2.drop(columns=['YearBuilt','YearRemodAdd'], inplace=True)

## Insights:
- Lets drop `YearBuilt` and `YearRemodAdd` since we have created new columns which indicates how old the buildings are from 2021

In [None]:
y=data2['SalePrice']
y

In [None]:
# Remove the SalePrice column from the feature frame `data2`
data2.drop(columns='SalePrice', inplace=True)

## Observation:
- As shown in the above two steps we have created a target variable(y) which is a `SalePrice` y and removed the same from data2 dataframe

In [None]:
# Display the top 5 rows of dataframe
data2.head()

In [None]:
# Displying the features having numerical(int and float) datatype 
data2.select_dtypes(include=['int64','float64']).head()

## Categorical Column Treatment

In [None]:
# Lets treat the categorical columns

CategoricalColumns=data2.select_dtypes(include=['object'])
CategoricalColumns.head()

In [None]:
### Display all the categorical columns

CategoricalColumns.columns

In [None]:
# CategoricalColumns stores dataframe of categorical columns obtained from data2
CategoricalColumns=data2[CategoricalColumns.columns]
CategoricalColumns

## Insights:
- As shown above `CategoricalColumns` stores dataframe containing the features having datatype of categorical features

In [None]:
# CategoricalColumns2 stores dataframe after converting categorical column into integer format
CategoricalColumns2=pd.get_dummies(CategoricalColumns,drop_first=True)
CategoricalColumns2.head()

## Insights:

- We have used `CategoricalColumns2` for storing dataframe after converting catgeries to numerical representation
- As shown above we have used get_dummies to convert categorical columns to numerical so that we can fit the data to the model.


In [None]:
CategoricalColumns2.shape

In [None]:
# Drop the original categorical columns, since dummy columns were created for them with get_dummies
encoded_source_columns = CategoricalColumns.columns
data2.drop(columns=encoded_source_columns, inplace=True)

## Insights:
- We have dropped our categorical columns from original dataframe `data2` and all the categorical column are stored in `CategoricalColumns2` after treating them

In [None]:
data2.head()

In [None]:
data2.shape

## Insights:
- As shown above we have 33 features which are numerical in nature

In [None]:
# Create new dataframe called `newdf` which is a combination of numerical fetures and Categorical columns after the treatment
newdf=pd.concat([data2,CategoricalColumns2],axis=1)
newdf.shape

In [None]:
newdf.head()

## Insights:
- In the previous steps we have obtained the dataframe which is suitable for model building so all the input features for our model is stored in the following variable`newdf`

## Perform Train_Test_Split for Training and evalating model

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split features and target: 75% of the rows for training, the rest for testing (fixed seed)
x_train, x_test, y_train, y_test = train_test_split(
    newdf,
    y,
    train_size=0.75,
    random_state=42,
)

In [None]:
print(x_train.shape)
print(x_test.shape)

In [None]:
# Initialise the standard scaler; it is fitted on the training data and
# reused to transform the test data in the cells below
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()

In [None]:
# Preview the training features before scaling
x_train.head()

In [None]:
# Column labels left in data2; these are the columns scaled in the cells below
data2.columns

## Insights:
- As shown above all the column/feature names are displayed where datatype of the feature is integer

In [None]:
# Standardise the data2 columns of the training set: learn the scaling
# statistics from the training rows, then apply them to those same rows.
# The dummy-encoded columns are left untouched.
x_train[data2.columns] = sc.fit(x_train[data2.columns]).transform(x_train[data2.columns])
x_train.head(5)

## Insights:
- As shown above the features are applied with standard scaling which are numerical in nature
- We didn't make any changes to the features where we have converted categories to a numerical representation using get_dummies

In [None]:
# Apply the scaler fitted on the training data to the same columns of the
# test set (transform only, no re-fitting on the test rows).
x_test[data2.columns] = sc.transform(x_test.loc[:, data2.columns])
x_test.head(5)

## Insights:
- As shown above we have applied transform on the test data for applying standard scaling 

## Model building 

In [None]:
# Importing required libraries which helps for Recursive feature elimination
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

In [None]:
# Initialise linear regression and fit it on the full training feature set
lr = LinearRegression()
lr.fit(x_train, y_train)

# Select the top 50 features using recursive feature elimination.
# The feature count is passed by keyword: recent scikit-learn releases accept
# `n_features_to_select` only as a keyword argument, so `RFE(lr, 50)` fails there.
rfe = RFE(estimator=lr, n_features_to_select=50)
rfe = rfe.fit(x_train, y_train)

## Insights:
- As shown above we have taken 50 top features which are best for our model building process using recursive feature elimination.

- This approach is taken to reduce the overall column count passed to the model, since earlier we had 181 features because of the large number of categorical columns

In [None]:
# List every training column together with its RFE selection flag
# (support_) and its elimination rank (ranking_)
[
    (column, selected, rank)
    for column, selected, rank in zip(x_train.columns, rfe.support_, rfe.ranking_)
]

In [None]:
# `newcol` holds the labels of the features RFE kept, obtained by applying
# the selector's boolean support mask to the training columns
newcol = x_train.columns[rfe.get_support()]
newcol

## Insights:
- As shown above we have obtained the best 50 features which can be fitted with lasso and ridge regression models
- Now we have obtained best features after applying recursive feature elimination

In [None]:
# Keep only the RFE-selected columns in both the training and the test set
x_train, x_test = x_train[newcol], x_test[newcol]

In [None]:
# Shape of the training features after restricting them to `newcol`
x_train.shape

In [None]:
# Shape of the test features after restricting them to `newcol`
x_test.shape

## Insights:
- As shown above we have assigned new feature names for the x_train and x_test variables
- Split of training and test is as expected with rows and columns also as expected

In [None]:
# Import Ridge and Lasso Regression models
from sklearn.linear_model import Lasso,Ridge

## Ridge Regression

In [None]:
# Import Grid search Cv
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate alpha values passed to GridSearchCV, which cross-validates each one.
# The earlier list contained the typo `0.,8` (parsed by Python as the two values
# 0.0 and 8) and listed 100 twice. It now uses 0.8 and unique values only, so the
# grid has 25 alphas, i.e. 125 fits with 5 folds rather than the earlier 135.
params = {'alpha': [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5,
                    0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
                    8.0, 9.0, 10.0, 50, 100]}
ridge = Ridge()   # Initialising Ridge
folds = 5         # Number of cross-validation folds
ridgemodel = GridSearchCV(estimator=ridge,  # estimator here is ridge
                          param_grid=params,
                          cv=folds,
                          scoring='neg_mean_absolute_error',
                          return_train_score=True,
                          verbose=1)
ridgemodel.fit(x_train, y_train)  # Fit the model

In [None]:
# Show the best parameter value found by the grid search, which is the best alpha in this case
ridgemodel.best_params_

## Insights:
- As shown in the previous two steps we have applied Gridsearch with ridge estimator and obtained 135 fits.
- We have obtained best alpha value as 4.0,Lets fit and train the model with this alpha value in next step

In [None]:
## Fit the Ridge model with the alpha value 6.0
# alpha must be a plain float: the earlier `alpha=6.0,` (trailing comma)
# silently created the one-element tuple (6.0,) instead.
alpha = 6.0
ridge = Ridge(alpha=alpha)
ridge.fit(x_train, y_train)
print(ridge.coef_)

In [None]:
# Pair every coefficient with its own feature name FIRST, then sort the rows.
# Sorting the raw coefficient array before attaching the column index (as was
# done previously) assigns the sorted values to features in their original
# column order, so each name ends up with the wrong coefficient.
RidgeCoeff = pd.DataFrame(data=ridge.coef_, index=x_train.columns, columns=['Coef'])
RidgeCoeff = RidgeCoeff.sort_values(by='Coef', ascending=False)  # largest coefficient first
RidgeCoeff

## Insights:
- As shown above all the coefficient values are shown along with its features.

In [None]:
# Score the fitted ridge model on the training data (estimator's default `score` metric)
ridge.score(x_train,y_train)

In [None]:
# Score the fitted ridge model on the held-out test data (estimator's default `score` metric)
ridge.score(x_test,y_test)

## Insights:
- Train and Test results with Ridge Regression
    - Training score: 0.87
    - Test score    : 0.87

In [None]:
## Top 5 features using Ridge: the first five rows of the RidgeCoeff table
RidgeCoeff.head(5)

## Lasso

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Candidate alpha values passed to GridSearchCV, which cross-validates each one.
# The earlier list contained the typo `0.,8` (parsed by Python as the two values
# 0.0 and 8) and listed 100 twice. alpha=0 is also discouraged for Lasso by
# scikit-learn. The list now uses 0.8 and unique values only, so the grid has
# 25 alphas, i.e. 125 fits with 5 folds rather than the earlier 135.
params = {'alpha': [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5,
                    0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
                    8.0, 9.0, 10.0, 50, 100]}
lasso = Lasso()   # Initialising Lasso
folds = 5         # Number of cross-validation folds
lassomodel = GridSearchCV(estimator=lasso,  # estimator here is lasso
                          param_grid=params,
                          cv=folds,
                          scoring='neg_mean_absolute_error',
                          return_train_score=True,
                          verbose=1)
lassomodel.fit(x_train, y_train)  # Fit the model

In [None]:
# Show the best parameter value (alpha) found by the Lasso grid search
lassomodel.best_params_

## Insights:
- As shown in the previous two steps, we applied grid search with the Lasso estimator and obtained 135 fits.
- We have obtained best alpha value as 0.001,Lets fit and train the model with this alpha value in next step

In [None]:
## Fit the Lasso model with the alpha value 0.001
# alpha must be a plain float: the earlier `alpha=0.001,` (trailing comma)
# silently created the one-element tuple (0.001,) instead.
alpha = 0.001
lasso = Lasso(alpha=alpha)
lasso.fit(x_train, y_train)
print(lasso.coef_)

In [None]:
# Pair every coefficient with its own feature name FIRST, then sort the rows.
# Sorting the raw coefficient array before attaching the column index (as was
# done previously) assigns the sorted values to features in their original
# column order, so each name ends up with the wrong coefficient.
LassoCoeff = pd.DataFrame(data=lasso.coef_, index=x_train.columns, columns=['Coef'])
LassoCoeff = LassoCoeff.sort_values(by='Coef', ascending=False)  # largest coefficient first
LassoCoeff

## Insights:
- As shown above all the coefficient values are shown along with its features.
- We can see that the coefficient values are 0 for variables like `Condition1_Norm`, `Condition1_RRAn`, `BldgType_Twnhs`, `OverallQual_Average`, `OverallQual_Below Average`, `OverallQual_Excellent`, `OverallQual_Fair`, `OverallQual_Good`, `OverallQual_Poor`, `OverallQual_Very Good`, indicating less significance in predicting the house prices.

In [None]:
# Score the fitted lasso model on the training data (estimator's default `score` metric)
lasso.score(x_train,y_train)

In [None]:
# Score the fitted lasso model on the held-out test data (estimator's default `score` metric)
lasso.score(x_test,y_test)

## Insights:
- Train and Test results with Lasso Regression
    - Training score: 0.866
    - Test score    : 0.87

In [None]:
## Top 5 features using lasso: the first five rows of the LassoCoeff table
LassoCoeff.head(5)

## Summary of the Analysis

## Optimal Value for Ridge and Lasso:

    - Optimal lambda value for Ridge : 6.0
    - Optimal lambda value for lasso : 0.001

## Variables significant in predicting the price of the house using Ridge Regression are as follows:

In [None]:
# First five rows of the Ridge coefficient table
RidgeCoeff.head(5)  

## Variables significant in predicting the price of the house using Lasso Regression are as follows:

In [None]:
# First rows of the Lasso coefficient table (head() with its default row count)
LassoCoeff.head()

## Metrics obtained using ridge and Lasso:
- Ridge regression:
    - Training: 0.87
    - Testing: 0.87
- lasso regression:
    - Training: 0.866
    - Testing:  0.87

## From both Ridge and Lasso, the variables below are strong predictors of house prices, so the company can concentrate on these variables:
- GrLivArea
- AgeOfBuilding
- MSSubClass_1-STORY 1946 & NEWER ALL STYLES
- MSSubClass_2-STORY PUD - 1946 & NEWER
- MSSubClass_PUD - MULTILEVEL - INCL SPLIT LEV/FOYER