In [None]:
import pandas as pd 
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 200)
import numpy as np 
from scipy.stats import skew
import scipy.stats as stats 

import seaborn as sns 
import matplotlib.pyplot as plt 

In [None]:
# Read the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Print the (rows, columns) shape of each split, train first and then test.
for split_frame in (train, test):
    print(split_frame.shape)

In [None]:
# Stack train on top of test into one frame so every cleaning step below is
# applied to both. ignore_index=True renumbers the rows 0..n-1, which means the
# first len(train) labels of `df` coincide with the labels of `train`.
frames_to_stack = [train, test]
df = pd.concat(frames_to_stack, axis=0, sort=False, ignore_index=True)
df.head()

In [None]:
# Display descriptive statistics for the combined train+test frame.
df.describe()

## 将nan值统一填充为 np.nan

In [None]:
# Replace every missing entry with np.nan.
# NOTE(review): for values that are already NaN this looks like a no-op — confirm
# whether this cell is still needed.
df = df.fillna(np.nan)

## MSZoning 根据Neighborhood 填充缺失值

In [None]:
# Fill missing MSZoning values with the most frequent zoning class observed in
# the same Neighborhood.
#
# The previous version wrote the neighbourhood *names* ('IDOTRR', 'Mitchel')
# into MSZoning, which created bogus zoning categories, and it relied on the
# missing rows appearing in a fixed order (first three vs. the rest).


def _most_common(values):
    """Return the most frequent non-null value of `values`, or NaN if there is none."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# Labels of the rows that need a value (kept under the original name `index`).
index = df.index[df['MSZoning'].isnull()]

zoning_by_neighborhood = df.groupby('Neighborhood')['MSZoning'].transform(_most_common)
df['MSZoning'] = df['MSZoning'].fillna(zoning_by_neighborhood)

# Fallback for a neighbourhood in which every MSZoning is missing: use the
# overall most frequent zoning class.
df['MSZoning'] = df['MSZoning'].fillna(df['MSZoning'].mode()[0])

## 将 YearBuilt, YearRemodAdd, GarageYrBlt, YrSold, MSSubClass 类型转为 object

In [None]:
# Treat the year columns and the MSSubClass code as labels rather than numbers
# by converting every value to its string form. str() is applied to each cell
# as-is, so any missing value becomes the literal string 'nan'.
label_like_cols = ('YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold', 'MSSubClass')
for col_name in label_like_cols:
    df[col_name] = df[col_name].apply(str)

In [None]:
# Partition the column names by dtype: object columns versus everything else.
column_dtypes = df.dtypes
is_object_col = column_dtypes == 'object'
num_cols = column_dtypes[~is_object_col].index
obj_cols = column_dtypes[is_object_col].index
num_cols

## GrLivArea 去除 4500 以上的两个离群点

In [None]:
# Skewness of the raw living area. It is not the last expression of the cell,
# so the value is computed but not displayed.
df['GrLivArea'].skew()

# Normal probability plot of log1p(GrLivArea), then keep the transformed values.
# Re-running this cell applies the log transform a second time.
fig, ax = plt.subplots()
log_living_area = df['GrLivArea'].apply(np.log1p)
stats.probplot(log_living_area, plot=ax)
df['GrLivArea'] = log_living_area

# The outliers are identified on the untransformed `train` frame (raw area
# above 4500). Their labels are dropped from `df`; this works because the
# train rows keep the same labels in `df`.
outlier_labels = train.index[train['GrLivArea'] > 4500]
df.drop(outlier_labels, axis=0, inplace=True)

In [None]:
# SalePrice against (log-transformed) GrLivArea: a regression fit on the left
# and a plain scatter plot on the right.
# x/y are passed by keyword and the target axes explicitly: recent seaborn
# releases reject positional x/y, and keyword arguments work on older ones too.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
sns.regplot(x='GrLivArea', y='SalePrice', data=df, ax=ax1)
sns.scatterplot(x='GrLivArea', y='SalePrice', data=df, ax=ax2)

## 画出与 SalePrice 相关系数较高的前几个特征与 SalePrice 的散点分布图

In [None]:
# Pairwise scatter plots of SalePrice and a few strongly correlated features.
# The training rows are selected by "has a SalePrice" instead of the positional
# slice df[:len(train)]: rows were dropped from df earlier, so that slice also
# picked up test rows whose missing SalePrice was then plotted as 0.
pairplot_cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageArea', 'TotalBsmtSF']
labelled_rows = df['SalePrice'].notna()
sns.pairplot(df.loc[labelled_rows, pairplot_cols].fillna(0))

## 缺失值处理

In [None]:
# Columns with only a handful (<= 10) of missing values.
#
# Each column is assigned back (df[col] = df[col].fillna(...)) instead of
# calling fillna(..., inplace=True) on the selected column: the in-place form
# operates on a temporary object and leaves `df` unchanged when pandas
# Copy-on-Write is active.

# Fixed replacement values.
small_gap_fills = {
    'SaleType': 'WD',
    'Electrical': 'SBrkr',
    'Exterior1st': 'VinylSd',
    'Exterior2nd': 'VinylSd',
    # Missing basement areas are filled with 0.
    'TotalBsmtSF': 0,
    'BsmtFinSF1': 0,
    'BsmtUnfSF': 0,
    'BsmtFinSF2': 0,
}

# Garage size columns: fill with the column mean.
for mean_col in ('GarageArea', 'GarageCars'):
    small_gap_fills[mean_col] = df[mean_col].mean()

# Remaining columns: fill with the most frequent value.
for mode_col in ('KitchenQual', 'Functional', 'BsmtHalfBath', 'BsmtFullBath', 'Utilities'):
    small_gap_fills[mode_col] = df[mode_col].mode()[0]

for fill_col, fill_value in small_gap_fills.items():
    df[fill_col] = df[fill_col].fillna(fill_value)

In [None]:
# Columns with many (>= 10) missing values: missing entries are replaced by the
# explicit category 'None'.
features = ['MasVnrType', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
            'GarageType', 'GarageCond', 'GarageQual', 'GarageFinish', 'FireplaceQu',
            'Fence', 'Alley', 'MiscFeature', 'PoolQC']

# Assign the filled columns back rather than using fillna(..., inplace=True) on
# each selected column, which leaves `df` unchanged under pandas Copy-on-Write.
df[features] = df[features].fillna('None')

In [None]:
# Fill missing LotFrontage with the column mean. The result is assigned back
# because fillna(..., inplace=True) on a selected column does not update `df`
# under pandas Copy-on-Write.
df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].mean())

In [None]:
# Fill missing MasVnrArea with the column mean. The result is assigned back
# because fillna(..., inplace=True) on a selected column does not update `df`
# under pandas Copy-on-Write.
# NOTE(review): missing MasVnrType is filled with 'None' elsewhere in this
# notebook; a mean (non-zero) area for those rows looks inconsistent — consider 0.
df['MasVnrArea'] = df['MasVnrArea'].fillna(df['MasVnrArea'].mean())

In [None]:
# Columns that still contain missing values, ordered by how many are missing.
missing_per_column = df.isnull().sum()
missing_per_column[missing_per_column != 0].sort_values()

In [None]:
# Correlation of every numeric column with SalePrice, strongest positive first.
# The frame is restricted to numeric columns first: DataFrame.corr() raises on
# object columns in pandas >= 2.0 (older versions dropped them silently).
df.select_dtypes(include='number').corr()['SalePrice'].sort_values(ascending=False)

## OverallQual

In [None]:
# Scatter plot of SalePrice against OverallQual.
# x/y are passed by keyword (recent seaborn releases reject positional x/y) and
# the plot is drawn on an explicit axes object.
fig, ax1 = plt.subplots()
sns.scatterplot(x='OverallQual', y='SalePrice', data=df, ax=ax1)

In [None]:
# Bar plot of SalePrice per GarageCars value.
# x/y are passed by keyword (recent seaborn releases reject positional x/y) and
# the plot is drawn on an explicit axes object.
fig, ax1 = plt.subplots()
sns.barplot(x='GarageCars', y='SalePrice', data=df, ax=ax1)

In [None]:
# Distribution of GarageArea (missing values plotted as 0), followed by the
# skewness of the mean-imputed column, which is the value this cell displays.
# NOTE(review): distplot is deprecated in recent seaborn releases — consider
# histplot/displot once the minimum seaborn version allows it.
garage_area = df['GarageArea']
sns.distplot(garage_area.fillna(0))
garage_area.fillna(garage_area.mean()).skew()

In [None]:
# Normal probability plot of TotalBsmtSF.
fig, ax = plt.subplots()
stats.probplot(df['TotalBsmtSF'], plot=ax)

## 1stFlrSF

In [None]:
# Plot the distribution of log1p(1stFlrSF), then overwrite the column with the
# transformed values. From here on '1stFlrSF' holds logs, not square feet, and
# re-running this cell transforms the column again.
log_first_floor = df['1stFlrSF'].apply(np.log1p)
sns.distplot(log_first_floor)
# Computed but not displayed: the cell ends with an assignment.
log_first_floor.skew()
df['1stFlrSF'] = log_first_floor

In [None]:
# Line plot of the mean SalePrice for each TotRmsAbvGrd value (training data only).
mean_price_by_rooms = train.groupby('TotRmsAbvGrd')['SalePrice'].mean()
mean_price_by_rooms.plot()

In [None]:
# Line plot of the mean SalePrice for each YearBuilt value (training data only).
mean_price_by_year = train.groupby(['YearBuilt'])['SalePrice'].mean()
mean_price_by_year.plot()

In [None]:
# Years between construction and remodelling. Both columns were converted to
# strings earlier, so they are cast back to integers for the subtraction.
# astype(int) replaces apply(np.int): the np.int alias was removed in NumPy 1.24.
df['YearRemodAdd-YearBuilt'] = df['YearRemodAdd'].astype(int) - df['YearBuilt'].astype(int)

# Number of rows with a SalePrice for each gap value.
df.groupby('YearRemodAdd-YearBuilt')['SalePrice'].count().plot()

In [None]:
# Replace LotArea with log1p(LotArea). Re-running this cell transforms the
# column again.
df['LotArea'] = np.log1p(df['LotArea'])

In [None]:
# Combined features.
df['Overall'] = df['OverallCond'] * df['OverallQual']
# Half baths count as 0.5.
df['NumOfBath'] = df['BsmtFullBath'] + df['BsmtHalfBath']*0.5 + df['FullBath'] + df['HalfBath']*0.5
df['PorchSF'] = df['OpenPorchSF'] + df['EnclosedPorch'] + df['3SsnPorch'] + df['ScreenPorch']

# '1stFlrSF' was overwritten with log1p(1stFlrSF) in an earlier cell, while
# '2ndFlrSF' and 'TotalBsmtSF' are still raw square feet. Undo the log before
# summing so that TotalSF is a real area instead of a log added to areas.
first_floor_sf = np.expm1(df['1stFlrSF'])
df['TotalSF'] = first_floor_sf + df['2ndFlrSF'] + df['TotalBsmtSF']

# Drop columns that are not used for modelling.
df = df.drop(['Utilities','Street','PoolQC'], axis=1)

In [None]:
# Display the column labels after feature engineering.
df.columns

In [None]:
# Binary indicator columns: 1 when the corresponding area is greater than zero,
# otherwise 0 (a missing area also yields 0 because NaN > 0 is False).
# NOTE(review): 'HasBamt' looks like a typo for 'HasBsmt'; the name is kept
# unchanged here.
indicator_sources = {
    'HasPool': 'PoolArea',
    'HasSecFlr': '2ndFlrSF',
    'HasGarage': 'GarageArea',
    'HasBamt': 'TotalBsmtSF',
}
for indicator_col, area_col in indicator_sources.items():
    df[indicator_col] = (df[area_col] > 0).astype('int64')

In [None]:
# Correlation of every numeric column (including the engineered ones) with
# SalePrice, strongest positive first. The frame is restricted to numeric
# columns because DataFrame.corr() raises on object columns in pandas >= 2.0.
df.select_dtypes(include='number').corr().sort_values(by=['SalePrice'], ascending=False).SalePrice

## Modeling

In [None]:
# GridSearchCV is imported from sklearn.model_selection: the old
# sklearn.grid_search module was removed in scikit-learn 0.20.
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.linear_model import LassoCV, Lasso
from sklearn.svm import SVR

import lightgbm as lgb
from xgboost import XGBRegressor

In [None]:
# Build the model matrices.
#
# Training rows are the ones that carry a SalePrice (assumes test.csv has no
# SalePrice column, so those rows are NaN after the concat — checked by the
# assertion below). Selecting by mask replaces the hard-coded
# `len(train)-2` slice, which silently breaks if the number of dropped
# outlier rows ever changes.
is_train_row = df['SalePrice'].notna()

# The target is modelled on the log1p scale; predictions are mapped back with expm1.
df_y = np.log1p(df['SalePrice'])
df_copy = df.drop(['Id', 'SalePrice'], axis=1)
# One-hot encode every object column.
df_x = pd.get_dummies(df_copy)

x = df_x[is_train_row]
y = df_y[is_train_row]
test_x = df_x[~is_train_row]
assert len(test_x) == len(test), "test rows were lost or train rows lack a SalePrice"

# Hold-out split.
train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size=0.2, random_state=42)

## lightgbm

In [None]:
# Grid for the LightGBM search (a single candidate per parameter).
# 'cat_smooth' used to appear twice in this dict ([1] and then [0]); Python
# keeps only the last entry, so the effective value [0] is stated once here.
paramaters = {
    'max_depth': [4],
    'num_leaves': [3],
    'feature_fraction': [0.2],
    'bagging_fraction': [0.9],
    'bagging_freq': [3],
    'cat_smooth': [0],
}

gbm = lgb.LGBMRegressor(
    objective='regression',
    learning_rate=0.01,
    n_estimators=5000,
    verbose=-1,
)

# 5-fold cross-validated search over the grid, fitted on all training rows.
gscv = GridSearchCV(gbm,
                    param_grid=paramaters,
                    cv=5,
                    n_jobs=-1)
gscv.fit(x, y)

In [None]:
# Parameter combination selected by the LightGBM grid search.
gscv.best_params_

In [None]:
# Cross-validated score of the selected LightGBM parameters (presumably R^2, the
# regressor default, since no `scoring` is passed to GridSearchCV — confirm).
gscv.best_score_

In [None]:
# RMSE of the LightGBM model on valid_x / valid_y (log1p price scale).
# NOTE(review): gscv was fitted on the full x / y, which contains the valid_x
# rows, so this is an in-sample figure rather than a true hold-out score.
lgb_residuals = gscv.best_estimator_.predict(valid_x) - valid_y
lgb_squared_errors = np.power(lgb_residuals, 2)
np.sqrt(np.sum(lgb_squared_errors) / len(valid_y))

## xgboost

In [None]:
# Grid for the XGBoost search (a single candidate).
params = {
    'max_depth': [3]
}

# The estimator used to receive both seed=27 and random_state=42 — two names
# for the same setting, so the effective seed depended on the xgboost version.
# Only random_state is kept. Likewise n_jobs replaces the deprecated nthread.
xgboost = XGBRegressor(learning_rate=0.01,
                       n_estimators=3460,
                       min_child_weight=0,
                       gamma=0,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:squarederror',
                       n_jobs=-1,
                       scale_pos_weight=1,
                       reg_alpha=0.00006,
                       random_state=42)

# 5-fold cross-validated search over the grid, fitted on all training rows.
gscv_xg = GridSearchCV(xgboost,
                       cv=5,
                       param_grid=params,
                       n_jobs=-1)
gscv_xg.fit(x, y)

In [None]:
# Cross-validated score of the selected XGBoost parameters (presumably R^2, the
# regressor default, since no `scoring` is passed to GridSearchCV — confirm).
gscv_xg.best_score_

In [None]:
# RMSE of the XGBoost model on valid_x / valid_y (log1p price scale).
# NOTE(review): gscv_xg was fitted on the full x / y, which contains the valid_x
# rows, so this is an in-sample figure rather than a true hold-out score.
xgb_residuals = gscv_xg.best_estimator_.predict(valid_x) - valid_y
xgb_squared_errors = np.power(xgb_residuals, 2)
np.sqrt(np.sum(xgb_squared_errors) / len(valid_y))

## svr

In [None]:
# Grid for the support-vector regressor (a single candidate per parameter).
params = dict(epsilon=[0.001], gamma=[0.0001])

svr = SVR(C=20)

# 5-fold cross-validated search over the grid, fitted on all training rows.
gscv_svr = GridSearchCV(estimator=svr, param_grid=params, cv=5, n_jobs=-1)
gscv_svr.fit(x, y)

In [None]:
# Cross-validated score of the selected SVR parameters (presumably R^2, the
# regressor default, since no `scoring` is passed to GridSearchCV — confirm).
gscv_svr.best_score_

In [None]:
# Parameter combination selected by the SVR grid search.
gscv_svr.best_params_

In [None]:
# RMSE of the SVR model on valid_x / valid_y (log1p price scale).
# NOTE(review): gscv_svr was fitted on the full x / y, which contains the
# valid_x rows, so this is an in-sample figure rather than a true hold-out score.
svr_residuals = gscv_svr.best_estimator_.predict(valid_x) - valid_y
svr_squared_errors = np.power(svr_residuals, 2)
np.sqrt(np.sum(svr_squared_errors) / len(valid_y))

## Lasso

In [None]:
# Candidate regularisation strengths for the Lasso search.
params = {
    'alpha': [0.0001, 0.0003, 0.001]
}

# max_iter is given as an integer: recent scikit-learn releases validate the
# parameter type and reject the float 1e5.
lasso = Lasso(max_iter=100000,
              random_state=42)

# 3-fold cross-validated search, fitted on all training rows. The comma after
# `cv=3` was missing before, which made the whole cell a SyntaxError.
gscv_lasso = GridSearchCV(lasso,
                          cv=3,
                          param_grid=params,
                          n_jobs=-1)
gscv_lasso.fit(x, y)

In [None]:
# Report the cross-validated score and the selected alpha of the Lasso search.
lasso_report = (
    ('Best score: ', gscv_lasso.best_score_),
    ('Best params: ', gscv_lasso.best_params_),
)
for report_label, report_value in lasso_report:
    print(report_label, report_value)

In [None]:
# RMSE of the Lasso model on valid_x / valid_y (log1p price scale).
# NOTE(review): gscv_lasso was fitted on the full x / y, which contains the
# valid_x rows, so this is an in-sample figure rather than a true hold-out score.
lasso_residuals = gscv_lasso.best_estimator_.predict(valid_x) - valid_y
lasso_squared_errors = np.power(lasso_residuals, 2)
np.sqrt(np.sum(lasso_squared_errors) / len(valid_y))

In [None]:
# Predict the test set with the two boosted models and blend them
# (30% LightGBM, 70% XGBoost). Predictions are on the log1p scale.
pred_lgb = gscv.predict(test_x)
pred_xg = gscv_xg.best_estimator_.predict(test_x)
pred = 0.3*pred_lgb + 0.7*pred_xg

# Map the blend back to prices with expm1 and write the submission file.
submission_columns = {'Id': test.Id, 'SalePrice': np.expm1(pred)}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission_2020_02_19.csv', index=False)