# House Prices Prediction

## Importing:

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# get_description() is a function to get the description of any column name or a value.
# It takes the column/value name, or any list of them:
from data_description import get_description

# Notebook display configuration:
# IPython magic that renders matplotlib figures inline, below the cell that creates them.
%matplotlib inline
# Apply seaborn's 'darkgrid' style to the plots drawn after this point.
sns.set_style('darkgrid')

### Importing the 'train.csv' and 'test.csv' files:

In [4]:
# Load the training and test tables; the 'Id' column becomes the row index of each frame.
train, test = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [5]:
# Training features: every column of 'train' except the target.
X_train = train.drop(columns=['SalePrice'])

# Target variable [SalePrice], kept as a one-column DataFrame.
y = train['SalePrice'].to_frame()

# Features of 'train' and 'test' stacked row-wise.
# This combined frame is where the NaN values will be dealt with.
X = pd.concat([X_train, test])

## Exploratory Data Analysis

### Dividing the features into categorical and numerical:

In [6]:
# Split the column names by dtype: 'object' columns are treated as categorical,
# every other dtype as numerical. Column order follows X.columns.
categorical = [name for name, dtype in X.dtypes.items() if dtype == 'object']
numerical = [name for name, dtype in X.dtypes.items() if dtype != 'object']

# Report both groups, separated by a blank line.
print(f'Numerical variables [{len(numerical)}]:\n{numerical}', end='\n\n')
print(f'Categorical variables [{len(categorical)}]:\n{categorical}')

Numerical variables [36]:
['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']

Categorical variables [43]:
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu',

### The features sliced into 5 sets:

In [7]:
# First feature set: names of the first 16 columns of X.
feat_set1 = list(X.columns[:16])
feat_set1

['MSSubClass',
 'MSZoning',
 'LotFrontage',
 'LotArea',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle']

In [8]:
# Second feature set: names of columns 16 through 31 of X.
feat_set2 = list(X.columns[16:32])
feat_set2

['OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'MasVnrArea',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure']

In [9]:
# Third feature set: names of columns 32 through 47 of X.
feat_set3 = list(X.columns[32:48])
feat_set3

['BsmtFinType1',
 'BsmtFinSF1',
 'BsmtFinType2',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath']

In [10]:
# Fourth feature set: names of columns 48 through 63 of X.
feat_set4 = list(X.columns[48:64])
feat_set4

['FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Functional',
 'Fireplaces',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageCars',
 'GarageArea',
 'GarageQual',
 'GarageCond']

In [11]:
# Fifth feature set: names of the columns from position 64 onward.
# The slice end (80) may exceed the number of columns; slicing simply stops at the last one.
# Bound to its own name so the fourth set (columns 48:64) is not overwritten.
feat_set5 = X.columns[64:80].tolist()
feat_set5

['PavedDrive',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'MiscVal',
 'MoSold',
 'YrSold',
 'SaleType',
 'SaleCondition']

### Analysis:

In [12]:
# Take the block of features analysed in this section as an independent copy of X's
# columns, so that later column assignments on feat_set2 do not write into a slice
# of X (which raises SettingWithCopyWarning).
# NOTE(review): 17:33 is shifted by one relative to the feat_set2 name list built from
# X.columns[16:32] — 'OverallQual' is left out and 'BsmtFinType1' is included. Confirm
# that this offset is intended.
feat_set2 = X.iloc[:, 17:33].copy()

# (n_rows, n_columns) of the selection: 2919 rows and 16 columns for this data.
feat_set2.shape
  
     


(2919, 16)

In [13]:
# Boolean mask with the same shape as feat_set2: True wherever a value is missing.
feat_set2.isna()

Unnamed: 0_level_0,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
2916,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
2917,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
2918,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False


In [14]:
# Number of missing values in each column of feat_set2.
feat_set2.isna().sum()

OverallCond        0
YearBuilt          0
YearRemodAdd       0
RoofStyle          0
RoofMatl           0
Exterior1st        1
Exterior2nd        1
MasVnrType      1766
MasVnrArea        23
ExterQual          0
ExterCond          0
Foundation         0
BsmtQual          81
BsmtCond          82
BsmtExposure      82
BsmtFinType1      79
dtype: int64

In [15]:
# Total number of missing values across all of feat_set2.
feat_set2.isna().sum().sum()

2115

### Filling Null Values:

In [18]:
# Fill the missing values in feat_set2.
#
# Categorical columns are forward-filled: each gap takes the value of the previous row.
# DataFrame.ffill() replaces fillna(method='pad'), which is deprecated in pandas >= 2.1.
# NOTE(review): forward-filling copies the preceding house's value into the gap; for a
# column such as MasVnrType a missing entry may instead mean "no such feature" —
# confirm this is the intended imputation.
ffill_columns = [
    'MasVnrType', 'Exterior1st', 'Exterior2nd',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
]

# Rebind to an explicit copy so the assignments below never write into a slice of X.
feat_set2 = feat_set2.copy()
feat_set2[ffill_columns] = feat_set2[ffill_columns].ffill()

# The numeric MasVnrArea column is filled with its own minimum (min() skips NaN).
feat_set2['MasVnrArea'] = feat_set2['MasVnrArea'].fillna(feat_set2['MasVnrArea'].min())

feat_set2.head()


Unnamed: 0_level_0,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ
2,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,BrkFace,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ
3,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ
4,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,BrkFace,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ
5,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ


In [19]:
# Re-check the total number of missing values in feat_set2 after filling.
feat_set2.isna().sum().sum()

0

In [20]:
# Re-check the per-column count of missing values in feat_set2 after filling.
feat_set2.isna().sum()


OverallCond     0
YearBuilt       0
YearRemodAdd    0
RoofStyle       0
RoofMatl        0
Exterior1st     0
Exterior2nd     0
MasVnrType      0
MasVnrArea      0
ExterQual       0
ExterCond       0
Foundation      0
BsmtQual        0
BsmtCond        0
BsmtExposure    0
BsmtFinType1    0
dtype: int64

### Data Visualization:


In [21]:
import plotly

# Initialise plotly's notebook mode before drawing the figures below.
plotly.offline.init_notebook_mode(connected=True)

# Scatter plot of SalePrice against MasVnrArea, using the training data.
fig = px.scatter(
    data_frame=train,
    x='MasVnrArea',
    y='SalePrice',
)
fig.show()



In [22]:


# Bar chart of SalePrice by YearBuilt, using the training data.
fig = px.bar(
    data_frame=train,
    x="YearBuilt",
    y="SalePrice",
    barmode='group',
    height=600,
)
fig.show()



In [23]:


# Bar chart of SalePrice by YearRemodAdd, using the training data and the Set1 palette.
fig = px.bar(
    data_frame=train,
    x="YearRemodAdd",
    y="SalePrice",
    barmode='group',
    height=600,
    color_discrete_sequence=px.colors.qualitative.Set1,
)
fig.show()



In [24]:
# Pie chart with one slice per BsmtFinType1 value in the training data (Set3 palette).
fig = px.pie(
    data_frame=train,
    names="BsmtFinType1",
    title="Quality of basement finished area",
    color_discrete_sequence=px.colors.qualitative.Set3,
)
fig.show()

In [25]:
# Bar chart of SalePrice by ExterQual, using the training data.
fig = px.bar(
    data_frame=train,
    x='ExterQual',
    y='SalePrice',
    barmode='group',
    height=600,
)
fig.show()

### Missing Values:

In [12]:
# Per-column number of missing values in X, and the same figure as a share of all rows.
missing_count = X.isna().sum()
missing_percent = missing_count / len(X) * 100

# Summary table, worst columns first; only columns with at least one NaN are kept.
missing_data = (
    pd.DataFrame({'NaN Count': missing_count, 'Percentage [%]': missing_percent})
    .sort_values(by='NaN Count', ascending=False)
    .rename_axis('Column Name')
    .loc[lambda table: table['NaN Count'] > 0]
)
missing_data

Unnamed: 0_level_0,NaN Count,Percentage [%]
Column Name,Unnamed: 1_level_1,Unnamed: 2_level_1
PoolQC,2909,99.657417
MiscFeature,2814,96.402878
Alley,2721,93.216855
Fence,2348,80.438506
MasVnrType,1766,60.500171
FireplaceQu,1420,48.646797
LotFrontage,486,16.649538
GarageYrBlt,159,5.447071
GarageFinish,159,5.447071
GarageQual,159,5.447071
