In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm, skew
from scipy.stats import skew
from scipy.stats import boxcox
from scipy.special import boxcox1p

In [5]:
train = pd.read_csv("../input/House_Prices/train.csv")
test = pd.read_csv("../input/House_Prices/test.csv")

In [6]:
def fill_missing_values(df):
    df["LotFrontage"] = df.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))
    none_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType']
    for col in none_cols:
        df[col] = df[col].fillna("None")

    zero_cols = ['GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea']
    for col in zero_cols:
        df[col] = df[col].fillna(0)
        
    return df

In [7]:
def convert_dtypes(df):
    cols_to_str = ['MSSubClass', 'OverallCond', 'YrSold', 'MoSold']
    for col in cols_to_str:
        df[col] = df[col].apply(str)
    return df

In [None]:
# 1. 外れ値の削除
train = train.drop(train[(train['GrLivArea'] > 4000) & (train['SalePrice'] < 20000)].index)

# 2. データの結合
ntrain = train.shape[0] 
ntest = test.shape[0]
y_train = train.SalePrice.values
all_data = pd.concat((train, test)).reset_index(drop=True)
all_data.drop(['SalePrice'], axis=1, inplace=True)

# 3. 特徴量合成
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']


# 4. 数値特徴量の抽出。
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

# 5. 歪度の計算
skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewness = pd.DataFrame({'Skew' :skewed_feats})

# 6. Box-Cox変換の適用
skewness = skewness[abs(skewness) > 0.75]
skewed_features = skewness.index
l_opt = 0.15
for feat in skewed_features:
    all_data[feat] = boxcox1p(all_data[feat], l_opt)

# 7. ダミー変数化
all_data = pd.get_dummies(all_data)

# 8. 再分割 
train = all_data[:ntrain]
test = all_data[ntrain:]

(1460, 289)