In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input
for path in (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
):
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Lift pandas' display limits (rows, columns, total width, column width) so
# frames are rendered without truncation.
for option in ('display.max_rows', 'display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(option, None)

# Importing Essentials

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Read the training CSV and discard the 'Id' column in one expression.
housing = pd.read_csv(
    r"../input/house-prices-advanced-regression-techniques/train.csv"
).drop(columns=['Id'])

# 1. Dealing with missing values

In [None]:
# Number of missing values per column, columns with the most NaN first.
missing_per_column = housing.isnull().sum()
missing_per_column.sort_values(ascending=False)

# 2. Fixing missing values explicitly


In [None]:
# Fill missing values column by column.
# NOTE: `df` is an alias of `housing`, not a copy, so every fill below is
# also visible through `housing`.
df = housing

# Categorical columns are the object-dtype columns of the frame.
cat_columns = df.select_dtypes(include=['object']).columns

# 'Electrical' is imputed with its most frequent value. This has to run BEFORE
# the blanket "None" fill below: once NaN has been replaced by "None" there is
# nothing left for the mode imputation to fill.
df['Electrical'] = df['Electrical'].fillna(df['Electrical'].mode()[0])

# Every remaining categorical NaN becomes the literal category "None".
for col in cat_columns:
    df[col] = df[col].fillna("None")

# LotFrontage: fill NaN with the median LotFrontage of the same Neighborhood.
df['LotFrontage'] = df.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))

# Any NaN still left in a numerical column is replaced with 0.
# (num_columns holds non-object columns only, so 'Electrical' can never be in it.)
num_columns = df.select_dtypes(exclude=['object']).columns
for col in num_columns:
    df[col] = df[col].fillna(0)

# Dropping Utilities. drop() returns a new frame, so from here on `df` is a
# separate object and `housing` keeps its 'Utilities' column.
df = df.drop(['Utilities'], axis=1)

In [None]:
# Checking the count of null values again: the largest per-column NaN count
# (0 means no column contains a missing value any more).
# Uses the vectorized DataFrame.sum instead of routing the Python builtin
# `sum` through apply.
df.isnull().sum().max()

In [None]:
# Print the frame summary (column dtypes and non-null counts) for a visual check
df.info()

# 3. Dealing with Outliers

In [None]:
# Removing noisy data which is above the 0.999 quantile.
# The thresholds are computed once, and only over the numerical columns:
# calling quantile() on the whole frame would also hit the object columns.
high_quant = df[num_columns].quantile(.999)

# A row is an outlier when it exceeds the threshold in at least one numerical
# column; all such rows are dropped together (NaN comparisons evaluate to False).
is_outlier = (df[num_columns] > high_quant).any(axis=1)
df = df.drop(index=df.index[is_outlier])

df.info()

# 4. Dealing with correlated attributes

In [None]:
# Features flagged as highly correlated in the EDA notebook (scatter plots and
# correlation values were inspected there) are removed first.
attributes_drop = [
    'MiscVal', 'MoSold', 'YrSold',
    'BsmtFinSF2', 'BsmtHalfBath', 'MSSubClass',
    'GarageArea', 'GarageYrBlt', '3SsnPorch',
]
df.drop(columns=attributes_drop, inplace=True)

# Next, the columns that were mostly missing in the raw data
# (PoolQC: 1453, MiscFeature: 1406, Alley: 1369, Fence: 1179 missing values).
attributes_drop = ['PoolQC', 'MiscFeature', 'Alley', 'Fence']
df.drop(columns=attributes_drop, inplace=True)

# 5. Handling Text and Categorical Values

In [None]:
# Names of the object-dtype (categorical) columns still present in df
df.select_dtypes(include='object').columns

## One-hot encoding the categorical columns

In [None]:
# Transforming categorical variables using OneHotEncoder.
# Only the object-dtype columns are passed to the encoder: fitting it on the
# whole frame would also one-hot encode every distinct value of each numerical
# column, SalePrice included.
cat_encoder = OneHotEncoder()
df_cat_processed = cat_encoder.fit_transform(df.select_dtypes(include=['object']))
df_cat_processed

# 6. Data Transformation

In [None]:
# Target is the 'SalePrice' column; every other column is a feature.
housing_y = df['SalePrice']
housing_X = df.drop(columns='SalePrice')

# Partition the features by dtype: object columns are categorical, the rest numerical.
cat_attributes = housing_X.select_dtypes(include=['object'])
num_attributes = housing_X.select_dtypes(exclude=['object'])

# Plain lists of column names, as consumed by the column transformer.
cat_attribs = cat_attributes.columns.tolist()
num_attribs = num_attributes.columns.tolist()

# Numerical branch: median-impute whatever is still missing, then standardize.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

In [None]:
# Full pipeline that handles both numerical and categorical columns' transformation.
# handle_unknown="ignore" lets the fitted pipeline transform data containing a
# category that was not seen during fit (it is encoded as all zeros) instead of
# raising; the fit_transform output below is unaffected.
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attribs)
])

# Description before applying transforms
print("housing_y:\n",housing_y.describe())

# Applying log transformation to sales price - remember right-skewed data
housing_y_prepared = np.log(housing_y)

# Running transformation pipeline on all other attributes
housing_X_prepared = full_pipeline.fit_transform(housing_X)

# Description after applying the transform: summary statistics, mirroring the
# "before" print above, rather than a dump of the whole Series
print("\nhousing_y_prepared:\n",housing_y_prepared.describe())

housing_X_prepared