# ML Project Revisited

### Import Packages

In [31]:
import pandas as pd
import numpy as np
from scipy import stats
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression

In [32]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

### Import Data

In [33]:
train = pd.read_csv("train.csv")
train1 = pd.read_csv('train.csv')

test = pd.read_csv("test.csv")
test1 = pd.read_csv("test.csv")

In [34]:
train.shape

(1460, 81)

In [35]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Exploratory Data Analysis

### Numerical vs Categorical variables

In [36]:
numerical_features = train.dtypes[train.dtypes != "object"].index
print("Number of Numerical features: ", len(numerical_features))

categorical_features = train.dtypes[train.dtypes == "object"].index
print("Number of Categorical features: ", len(categorical_features))

Number of Numerical features:  38
Number of Categorical features:  43


In [37]:
numerical_features

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')

In [38]:
categorical_features

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')

Some numerical data will actually be categorical, must review one by one. Looking through the data description of our numerical variables will help confirm which variables are in fact categorical.

Numerical Features that are in fact categorical: MSSubClass, OverallQual, OverallCond, MoSold

#### Create numerical and categorical dataframes for later

In [39]:
numerical_features_train = train[numerical_features]
categorical_features_train = train[categorical_features]

Add the categorical features from numerical_features_train to correct dataframe and drop them from the numerical dataframe. Also add Sale Price to categorical dataframe for usefulness later.

In [40]:
categorical_features_train = categorical_features_train.join(numerical_features_train[['MSSubClass', 'OverallQual', 'OverallCond', 'GarageYrBlt', 'MoSold', 'SalePrice']])
numerical_features_train = numerical_features_train.drop(['MSSubClass', 'OverallQual', 'OverallCond', 'GarageYrBlt', 'MoSold'], axis = 1)

In [41]:
# Create list of categorical and numerical variables
numerical_features_list = list(numerical_features_train.columns.values)
categorical_features_list = list(categorical_features_train.columns.values)

In [43]:
print('Now he have ', numerical_features_train.shape[1]-1, ' numerical features')
print('Now he have ',categorical_features_train.shape[1]-1, ' categorical features')

Now he have  32  numerical features
Now he have  48  categorical features


### Missing Values

We must find where we have missing values and resolve missing data appropriately.

In [44]:
total = train.isnull().sum().sort_values(ascending=False)
percent = (train.isnull().sum()/train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data[missing_data.Percent > 0]

Unnamed: 0,Total,Percent
PoolQC,1453,0.995205
MiscFeature,1406,0.963014
Alley,1369,0.937671
Fence,1179,0.807534
FireplaceQu,690,0.472603
LotFrontage,259,0.177397
GarageYrBlt,81,0.055479
GarageCond,81,0.055479
GarageType,81,0.055479
GarageFinish,81,0.055479


### Largely Missing Features

For many of our variables, missing data indicates that the property does not include such feature. This is the case for features describing pools, alleys, fences, fireplaces, basements, and miscellaneous. For categorical features, we should impute 'None' and for numerical variables describing these features, we should impute 0.

In [45]:
missing_columns = list(missing_data[missing_data.Percent > 0].index)

In [46]:
numerical_empties = list()
categorical_empties = list()

In [47]:
for column in missing_columns:
    if column in numerical_features_list:
        numerical_empties.append(column)
    else: categorical_empties.append(column)  

In [48]:
categorical_empties

['PoolQC',
 'MiscFeature',
 'Alley',
 'Fence',
 'FireplaceQu',
 'GarageYrBlt',
 'GarageCond',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'BsmtFinType2',
 'BsmtExposure',
 'BsmtQual',
 'BsmtCond',
 'BsmtFinType1',
 'MasVnrType',
 'Electrical']

In [49]:
for column in categorical_empties:
    train[column].fillna('None',inplace=True)
    test[column].fillna('None',inplace=True)

In [50]:
for column in numerical_empties:
    train[column].fillna(0,inplace=True)
    test[column].fillna(0,inplace=True)

Once missing values are resolved, show there is no more missingness. 

In [51]:
total = train.isnull().sum().sort_values(ascending=False)
percent = (train.isnull().sum()/train.isnull().count()).sort_values(ascending=False)
missing_data_after = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data_after[missing_data_after.Percent > 0]

Unnamed: 0,Total,Percent


### Create heatmap of the correlation between features and sale price

First find highly correlated features to sale price. Then create heatmap

.corr() method will get rid of columns not suited for correlation

In [64]:
train.corr()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
Id,1.0,0.011156,-0.019761,-0.033226,-0.028365,0.012609,-0.012713,-0.021998,-0.051071,-0.005024,...,-0.029643,-0.000477,0.002889,-0.046635,0.00133,0.057044,-0.006242,0.021172,0.000712,-0.021917
MSSubClass,0.011156,1.0,-0.215023,-0.139781,0.032628,-0.059316,0.02785,0.040581,0.023573,-0.069836,...,-0.012579,-0.0061,-0.012037,-0.043825,-0.02603,0.008283,-0.007683,-0.013585,-0.021407,-0.084284
LotFrontage,-0.019761,-0.215023,1.0,0.100739,0.176561,-0.053457,0.036853,0.078686,0.10501,0.07667,...,-0.01678,0.069605,0.027366,0.023499,0.022969,0.114106,-0.059606,0.018942,-0.012094,0.209624
LotArea,-0.033226,-0.139781,0.100739,1.0,0.105806,-0.005636,0.014228,0.013788,0.103321,0.214103,...,0.171698,0.084774,-0.01834,0.020423,0.04316,0.077672,0.038068,0.001205,-0.014261,0.263843
OverallQual,-0.028365,0.032628,0.176561,0.105806,1.0,-0.091932,0.572323,0.550684,0.407252,0.239666,...,0.238923,0.308819,-0.113937,0.030371,0.064886,0.065166,-0.031406,0.070815,-0.027347,0.790982
OverallCond,0.012609,-0.059316,-0.053457,-0.005636,-0.091932,1.0,-0.375983,0.073741,-0.125694,-0.046231,...,-0.003334,-0.032589,0.070356,0.025504,0.054811,-0.001985,0.068777,-0.003511,0.04395,-0.077856
YearBuilt,-0.012713,0.02785,0.036853,0.014228,0.572323,-0.375983,1.0,0.592855,0.3116,0.249503,...,0.22488,0.188686,-0.387268,0.031355,-0.050364,0.00495,-0.034383,0.012398,-0.013618,0.522897
YearRemodAdd,-0.021998,0.040581,0.078686,0.013788,0.550684,0.073741,0.592855,1.0,0.176529,0.128451,...,0.205726,0.226298,-0.193919,0.045286,-0.03874,0.005829,-0.010286,0.02149,0.035743,0.507101
MasVnrArea,-0.051071,0.023573,0.10501,0.103321,0.407252,-0.125694,0.3116,0.176529,1.0,0.261256,...,0.159991,0.122528,-0.109907,0.019144,0.062248,0.011928,-0.029512,-0.006723,-0.008317,0.472614
BsmtFinSF1,-0.005024,-0.069836,0.07667,0.214103,0.239666,-0.046231,0.249503,0.128451,0.261256,1.0,...,0.204306,0.111761,-0.102303,0.026451,0.062021,0.140491,0.003571,-0.015727,0.014359,0.38642


In [52]:
import seaborn as sns
import numpy as np
%matplotlib inline

In [57]:
categorical_features_train.columns

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition', 'MSSubClass', 'OverallQual', 'OverallCond',
       'GarageYrBlt', 'MoSold', 'SalePrice'],
      dtype='object')

In [61]:
categorical_features_pivot = categorical_features_train.pivot(columns='SalePrice')

In [63]:
sns.heatmap(categorical_features_pivot, annot=True, fmt='d')

ValueError: could not convert string to float: 'C (all)'

### Check skewness and kurtosis 

### Apply transformation for skewness

### Check for outliers 