In [22]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

df = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
test_df = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

In [23]:
# 이상치 제거
df = df.drop(df[(df['GrLivArea']>4000) & (df['SalePrice']<300000)].index)

In [24]:
def DataPreProcessing(df, test=False):
    if not test:
        df.drop('Id',axis=1, inplace=True)
    garage_num_cols = ['GarageYrBlt','GarageCars','GarageArea']
    garage_str_cols = ['GarageType', 'GarageFinish','GarageQual','GarageCond']
    df[garage_num_cols] = df[garage_num_cols].fillna(0)
    df[garage_str_cols] = df[garage_str_cols].fillna('None')
    
    bsmt_num_cols = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']
    bsmt_str_cols =  ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
    df[bsmt_num_cols] = df[bsmt_num_cols].fillna(0)
    df[bsmt_str_cols] = df[bsmt_str_cols].fillna('None')
    
    df=df.dropna(axis=0, subset=['Electrical'])
    
    df[["MasVnrArea"]] = df[["MasVnrArea"]].fillna(0)
    df[["MasVnrType"]] = df[["MasVnrType"]].fillna("None")
    
    df[["PoolQC"]] = df[["PoolQC"]].fillna("None")
    df[["MiscFeature"]] = df[["MiscFeature"]].fillna("None")
    df[["Alley"]] = df[["Alley"]].fillna("None")
    df[["Fence"]] = df[["Fence"]].fillna("None")
    
    df[["FireplaceQu"]] = df[["FireplaceQu"]].fillna("None")
    
    df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].median())
    
    return df

In [25]:
def obj_mapping(df, test_df, column, print_mode=False):
    df[column].fillna('None', inplace=True)
    
    uniques = df[column].unique()
    if print_mode:
        # value별로 SalePrice가 어떻게 분포되어있는지 plot을 이용하여 보여줍니다.
        print(uniques)
        for v in uniques:
            df[df[column] == v]['SalePrice'].plot(label=v)
        plt.legend()
        plt.show()
    
    # value 별로 평균 saleprice를 계산하고, 그 평균을 기준으로 정렬합니다.
    grades = [[df[df[column] == v]['SalePrice'].mean(), v] for v in uniques]
    grades.sort()
    
    if print_mode:
        print(grades)
        
    # 정렬된 평균값 기준으로 mapping합니다 ex) 0, 1, 2, ...
    mapping = {v: i for i, (_, v) in enumerate(grades)}
    df[column] = df[column].map(mapping)
    test_df[column] = test_df[column].map(mapping)
    
    #print('----',column)
    #print(df[[column, 'SalePrice']].corr())
    
    #if df[[column, 'SalePrice']].corr()['SalePrice'][column] < 0.35:
        #df.drop(column,axis=1, inplace=True)
        #test_df.drop(column,axis=1, inplace=True)
        #print(column,' dropped')

In [26]:
df = DataPreProcessing(df)
test_id = test_df['Id'].copy()
test_df = DataPreProcessing(test_df, test=True)
test_df.drop('Id',axis=1, inplace=True)

object_columns = df.select_dtypes(include='object').columns
for col in object_columns:
    obj_mapping(df,test_df,col, print_mode=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [27]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
  ])

num_tr = num_pipeline.fit_transform(df)
X_submit_test = num_pipeline.fit_transform(test_df)

# 모델 구축 (Numerical Data를 이용한)

In [28]:
X = num_tr[:,:-1]
y = df['SalePrice'].copy()

In [29]:
y

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1457, dtype: int64

In [32]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.01, shuffle=True, random_state=42)

In [33]:
#1. linear regression

from sklearn.linear_model import LinearRegression

m_lr = LinearRegression()

m_lr.fit(X_train, y_train)

#predict

y_pred_lr = m_lr.predict(X_test)

# 4. 평가 MAE와 R^2

from sklearn.metrics import mean_squared_error as mse, r2_score as r2


print('linear regression R^2: ', r2(y_test,y_pred_lr))
print('linear regression RMSE: ', np.sqrt(mse(y_test,y_pred_lr)))

linear regression R^2:  0.8435660857144962
linear regression RMSE:  29412.641920131664


In [34]:
#predict

pred_submit_test = m_lr.predict(X_submit_test)

sub = pd.DataFrame()
sub['Id'] = test_id
sub['SalePrice'] = pred_submit_test
sub.to_csv('submission1.csv',index=False)

In [35]:
#2. Decision Tree Regressor

from sklearn.tree import DecisionTreeRegressor

tree_lr = DecisionTreeRegressor()

tree_lr.fit(X_train, y_train)

tree_y_pred_tree = tree_lr.predict(X_test)

print('linear regression R^2: ', r2(y_test,tree_y_pred_tree))

print('linear regression RMSE: ', np.sqrt(mse(y_test,tree_y_pred_tree)))

linear regression R^2:  0.668623764697664
linear regression RMSE:  42808.425961875


In [36]:
pred_submit_test = tree_lr.predict(X_submit_test)

sub = pd.DataFrame()
sub['Id'] = test_id
sub['SalePrice'] = pred_submit_test
sub.to_csv('submission2.csv',index=False)

In [37]:
import xgboost as xgb
import lightgbm as lgb

In [38]:
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)

In [39]:
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)

In [40]:
model_xgb.fit(X_train, y_train)

tree_y_pred_xgb = model_xgb.predict(X_test)

print('linear regression R^2: ', r2(y_test,tree_y_pred_xgb))

print('linear regression RMSE: ', np.sqrt(mse(y_test,tree_y_pred_xgb)))

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


linear regression R^2:  0.8512020041902523
linear regression RMSE:  28685.810186704137


In [41]:
model_lgb.fit(X_train, y_train)

tree_y_pred_lgb = model_lgb.predict(X_test)

print('linear regression R^2: ', r2(y_test,tree_y_pred_lgb))

print('linear regression RMSE: ', np.sqrt(mse(y_test,tree_y_pred_lgb)))

linear regression R^2:  0.8583586310653604
linear regression RMSE:  27987.46975650186


In [42]:
print('linear regression R^2: ', r2(y_test,tree_y_pred_xgb * 0.5 + tree_y_pred_lgb * 0.5))

print('linear regression RMSE: ', np.sqrt(mse(y_test,tree_y_pred_xgb * 0.5 + tree_y_pred_lgb * 0.5)))

linear regression R^2:  0.8564666195612326
linear regression RMSE:  28173.77462749721


In [43]:
print('linear regression R^2: ', r2(y_test,tree_y_pred_xgb * 0.4 + tree_y_pred_lgb * 0.4 + tree_y_pred_tree * 0.1 + y_pred_lr * 0.1))

print('linear regression RMSE: ', np.sqrt(mse(y_test,tree_y_pred_xgb * 0.4 + tree_y_pred_lgb * 0.4 + tree_y_pred_tree * 0.1 + y_pred_lr * 0.1)))

linear regression R^2:  0.8443761371747336
linear regression RMSE:  29336.390304243152


In [44]:
pred_submit_test = model_lgb.predict(X_submit_test)

sub = pd.DataFrame()
sub['Id'] = test_id
sub['SalePrice'] = pred_submit_test
sub.to_csv('submission3.csv',index=False)

In [45]:
pred_submit_test = model_xgb.predict(X_submit_test)

sub = pd.DataFrame()
sub['Id'] = test_id
sub['SalePrice'] = pred_submit_test
sub.to_csv('submission4.csv',index=False)

  "because it will generate extra copies and increase " +


In [46]:
ensemble = model_lgb.predict(X_submit_test) * 0.5 + model_xgb.predict(X_submit_test) * 0.5

sub = pd.DataFrame()
sub['Id'] = test_id
sub['SalePrice'] = ensemble
sub.to_csv('submission5.csv',index=False)

  "because it will generate extra copies and increase " +


In [47]:
# point 성능이 좋지 않은 모델이더라도 ensemble할 경우 긍정적인 효과가 나타난다.
ensemble = model_lgb.predict(X_submit_test) * 0.45 + model_xgb.predict(X_submit_test) * 0.45 + tree_lr.predict(X_submit_test) * 0.1

sub = pd.DataFrame()
sub['Id'] = test_id
sub['SalePrice'] = ensemble
sub.to_csv('submission6.csv',index=False)

  "because it will generate extra copies and increase " +
