In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the old names; prefer the new name when it is
# available and fall back to the legacy one on older installations.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
# Enlarge seaborn's fonts for the plots that follow.
sns.set(font_scale=2.5)

import missingno as msno

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/seaborn/
# matplotlib, so API removals only surface as hard errors after an upgrade.
warnings.filterwarnings('ignore')

%matplotlib inline

# Import & Data Check

In [2]:
# Read the train and test CSVs into two DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/house-prices-advanced-regression-techniques/train.csv',
        '../input/house-prices-advanced-regression-techniques/test.csv',
    )
)

In [3]:
# Preview the first rows of the training frame to sanity-check the load.
df_train.head()

In [4]:
# Summary statistics of the training frame.
df_train.describe()

#### 1.2 수치형, 범주형 분리 

In [5]:
# Partition the training columns by dtype: "object" columns are treated as
# categorical, everything else as numerical.
is_object_col = (df_train.dtypes == "object").to_numpy()
numerical_feats = df_train.columns[~is_object_col]
categorical_feats = df_train.columns[is_object_col]

print("Number of Numerical features: ", len(numerical_feats))
print("Number of Categorical features: ", len(categorical_feats))

In [6]:
# List the column names of each group, separated by a horizontal rule.
print(numerical_feats)
print('-' * 80)
print(categorical_feats)

#### 1.3 결측치

In [7]:
# Report the share of missing values for every column of the training set.
n_train_rows = len(df_train)
for col, n_missing in df_train.isnull().sum().items():
    print(f'column: {col:>10}\t Percent of NaN value: {100 * (n_missing / n_train_rows):.2f}%')

In [8]:
# Report the share of missing values for every column of the test set.
n_test_rows = len(df_test)
for col, n_missing in df_test.isnull().sum().items():
    print(f'column: {col:>10}\t Percent of NaN value: {100 * (n_missing / n_test_rows):.2f}%')

In [9]:
# Bar chart of per-column missing-value counts, restricted to columns that
# have at least one missing value and ordered from fewest to most.
null_counts = df_train.isnull().sum()
missing = null_counts[null_counts > 0].sort_values()
missing.plot.bar(figsize=(12, 6))

#### 1.4 이상치 탐색 및 제거

In [10]:
def detect_outliers(df, n, features):
    """Return the index labels of rows that are outliers in more than ``n`` features.

    A value is an outlier for a column when it lies more than 1.5 * IQR below
    the first quartile or above the third quartile of that column (Tukey's
    rule).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns named in ``features``.
    n : int
        Threshold; a row is reported only when it is an outlier in strictly
        more than ``n`` of the given columns.
    features : iterable of str
        Names of the numeric columns to inspect.

    Returns
    -------
    list
        Index labels of ``df``, in order of first detection.
    """
    outlier_counts = Counter()
    for col in features:
        # nanpercentile ignores missing values. Plain np.percentile returns
        # NaN as soon as the column contains a NaN, which makes both
        # comparisons below False for every row and silently skips the column.
        q1, q3 = np.nanpercentile(df[col], [25, 75])
        outlier_step = 1.5 * (q3 - q1)

        # Comparisons against NaN cells are False, so rows with a missing
        # value in this column are never flagged for it.
        is_outlier = (df[col] < q1 - outlier_step) | (df[col] > q3 + outlier_step)
        outlier_counts.update(df.index[is_outlier.to_numpy()])

    return [label for label, hits in outlier_counts.items() if hits > n]


# Flag the training rows that are IQR outliers in more than 2 of these columns.
outlier_check_cols = [
    'Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
    'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
    'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
    'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
    'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
    'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
    'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
    'MoSold', 'YrSold',
]
Outliers_to_drop = detect_outliers(df_train, 2, outlier_check_cols)

In [11]:
df_train.loc[Outliers_to_drop] # rows that are outliers in more than 2 columns

In [12]:
# Remove the flagged rows and renumber the index from 0.
# NOTE: the labels in Outliers_to_drop refer to the index as it was before
# this reset, so re-running this cell alone would drop unrelated rows.
df_train = df_train.drop(index=Outliers_to_drop).reset_index(drop=True)
df_train.shape

# EDA

## Numerical Feature
* Correlation Heat Map
* Zoomed Heat Map
* Pair Plot

#### 2.1 Correlation Heat Map

In [13]:
# Heat map of the pairwise correlations among the numeric columns.
sns.set(font_scale=1.0)
colormap = plt.cm.PuBu
corr_data = df_train[numerical_feats]

f, ax = plt.subplots(figsize=(14, 12))
ax.set_title('Correlation of Numeric Features with Sale Price', y=1, size=18)
sns.heatmap(
    corr_data.corr(),
    vmax=0.8,
    cmap=colormap,
    square=True,
    linewidths=0.1,
    linecolor='white',
    ax=ax,
)

변수 사이의 상관 관계가 너무 강하면 다중 공선성(Multicollinearity) 상황이 나타날 수 있음

#### 2.2 Zoomed Heat Map

In [14]:
# Keep the 11 columns with the highest correlation to SalePrice
# (the SalePrice column itself is part of corr_data, so it is among them).
k = 11
saleprice_corr = corr_data.corr()['SalePrice']
cols = saleprice_corr.nlargest(k).index
print(cols)

In [15]:
# Annotated heat map of the correlation matrix restricted to `cols`.
# rowvar=False treats each column of the array as one variable.
cm = np.corrcoef(df_train[cols].values, rowvar=False)
tick_labels = cols.values

f, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    cm,
    annot=True,
    annot_kws={'size': 14},
    vmax=.8,
    cmap=colormap,
    square=True,
    linewidths=0.1,
    linecolor='white',
    xticklabels=tick_labels,
    yticklabels=tick_labels,
    ax=ax,
)

#### 2.3 Pair Plot

In [16]:
# Among the variables that looked multicollinear in the zoomed heat map, leave
# out the ones less related to SalePrice and look more closely at the pairwise
# relationships between the remaining variables.

sns.set()
columns = ['SalePrice','OverallQual','TotalBsmtSF','GrLivArea','GarageCars','FullBath','YearBuilt','YearRemodAdd']
# `height` is the current name of the per-facet size argument; the old `size`
# keyword was renamed in seaborn 0.9 and is rejected by later releases.
sns.pairplot(df_train[columns], height=2, kind='scatter', diag_kind='kde')

* 'TotalBsmtSF' 'GrLivArea'
* 'SalePrice' 'YearBuilt'

## Categorical Feature
* Boxplot

In [17]:
# Print the frequency table of every categorical column, each followed by a rule.
for catg in categorical_feats:
    print(df_train[catg].value_counts(), '-' * 80, sep='\n')

#### 2.4 Box Plot

In [18]:
# One SalePrice box plot per categorical column, laid out on a 3-column grid.
li_cat_feats = list(categorical_feats)
nr_cols = 3
# Derive the row count from the number of features (ceiling division) so that
# no feature is silently left out when the column set changes; keep at least
# one row because plt.subplots cannot build an empty grid.
nr_rows = max(1, (len(li_cat_feats) + nr_cols - 1) // nr_cols)

# squeeze=False keeps `axs` two-dimensional even for a single-row grid.
fig, axs = plt.subplots(nr_rows, nr_cols, figsize=(nr_cols*4, nr_rows*3), squeeze=False)

for ax, feat in zip(axs.flat, li_cat_feats):
    sns.boxplot(x=feat, y=df_train['SalePrice'], data=df_train, ax=ax)

# Hide the unused panels at the end of the last row instead of showing
# empty frames.
for ax in axs.flat[len(li_cat_feats):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()