# House Advanced Regression Lab

In [12]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce
import joblib
from scipy.stats.distributions import uniform, randint
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [2]:
# Load the raw training table; 'train.csv' is resolved relative to the
# notebook's working directory.
df = pd.read_csv('train.csv')

In [3]:
# Separate the regression target from the remaining columns.
y = df['SalePrice']
X = df.drop(columns=['SalePrice'])

In [4]:
# Names of the int64 columns used as numeric features. 'Id' and the target
# 'SalePrice' are excluded, as is 'MSSubClass' (presumably because it is a
# category code rather than a quantity -- confirm against the data dictionary).
# `to_numpy()` replaces the deprecated `Series.ravel()`; both yield the same
# 1-D boolean mask.
num_cols = df.columns[(df.dtypes == 'int64').to_numpy()].drop(['Id', 'MSSubClass', 'SalePrice'])

In [5]:
# Numeric-only view of the training data, restricted to `num_cols`.
df_num = df[num_cols]

In [6]:
# Exploratory ratio features: each new column is numerator / denominator,
# appended to a copy of the numeric frame in the order listed here.
df_t = df_num.assign(**{
    ratio_name: df_num[numerator] / df_num[denominator]
    for ratio_name, numerator, denominator in [
        ('1stFlrSF_Ratio', '1stFlrSF', 'GrLivArea'),
        ('2ndFlrSF_Ratio', '2ndFlrSF', 'GrLivArea'),
        ('Bedroom_Ratio', 'BedroomAbvGr', 'TotRmsAbvGrd'),
        ('Kitchen_Ratio', 'KitchenAbvGr', 'TotRmsAbvGrd'),
        ('LowQualFinSF_ratio', 'LowQualFinSF', 'GrLivArea'),
        ('Bsmt_FinSF1_ratio', 'BsmtFinSF1', 'TotalBsmtSF'),
        ('Bsmt_FinSF2_ratio', 'BsmtFinSF2', 'TotalBsmtSF'),
        ('Bsmt_UnfSF_ratio', 'BsmtUnfSF', 'TotalBsmtSF'),
        ('GarageArea_ratio', 'GarageArea', 'LotArea'),
    ]
})

# Rank original and derived columns by their correlation with the target.
df_t.corrwith(y).sort_values(ascending=False)

OverallQual           0.790982
GrLivArea             0.708624
GarageCars            0.640409
GarageArea            0.623431
TotalBsmtSF           0.613581
1stFlrSF              0.605852
FullBath              0.560664
TotRmsAbvGrd          0.533723
YearBuilt             0.522897
YearRemodAdd          0.507101
Fireplaces            0.466929
BsmtFinSF1            0.386420
WoodDeckSF            0.324413
2ndFlrSF              0.319334
OpenPorchSF           0.315856
HalfBath              0.284108
LotArea               0.263843
BsmtFullBath          0.227122
BsmtUnfSF             0.214479
BedroomAbvGr          0.168213
2ndFlrSF_Ratio        0.148097
Bsmt_FinSF1_ratio     0.139733
ScreenPorch           0.111447
GarageArea_ratio      0.094723
PoolArea              0.092404
MoSold                0.046432
3SsnPorch             0.044584
BsmtFinSF2           -0.011378
BsmtHalfBath         -0.016844
MiscVal              -0.021190
LowQualFinSF         -0.025606
YrSold               -0.028923
LowQualF

In [7]:
def write_test(mod,name):
    """Predict on 'test.csv' with `mod` and save an Id/SalePrice CSV to `name`.

    `mod` only needs a `predict` method that accepts the full test frame as
    read from disk; `name` is the output path. The file is written without the
    index column.
    """
    test_frame = pd.read_csv('test.csv')
    predicted = pd.Series(mod.predict(test_frame), name='SalePrice')
    submission = pd.concat([test_frame.Id, predicted], axis=1)
    submission.to_csv(name, index=False)

In [8]:
class Transform_Vars(TransformerMixin, BaseEstimator):
    """Stateless feature-engineering step for the numeric housing columns.

    When ``activate`` is true, ``transform`` overwrites ``YearBuilt`` with the
    mean of two ages measured at sale time (years since construction and years
    since the last remodel) and overwrites every "part" column listed in
    ``_RATIOS`` with its share of the paired "total" column. Whether or not
    ``activate`` is set, the columns in ``_DROPPED`` are removed from the
    returned frame.

    Parameters
    ----------
    activate : bool, default=True
        Whether to apply the age and ratio rewrites. Stored unchanged on the
        instance.
    """

    # (part, total) pairs: each `part` column is replaced by part / total.
    # No `total` column is ever overwritten, so the order is irrelevant.
    _RATIOS = (
        ('1stFlrSF', 'GrLivArea'),
        ('2ndFlrSF', 'GrLivArea'),
        ('LowQualFinSF', 'GrLivArea'),
        ('BedroomAbvGr', 'TotRmsAbvGrd'),
        ('KitchenAbvGr', 'TotRmsAbvGrd'),
        ('BsmtFinSF1', 'TotalBsmtSF'),
        ('BsmtFinSF2', 'TotalBsmtSF'),
        ('BsmtUnfSF', 'TotalBsmtSF'),
    )

    # Columns removed from every transformed frame.
    _DROPPED = ('YrSold', 'YearRemodAdd', 'MoSold', 'MiscVal',
                'PoolArea', '3SsnPorch')

    def __init__(self, activate=True):
        self.activate=activate

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a transformed copy of ``X``; the input frame is not modified.

        ``X`` is addressed by column label, so it must provide the columns
        named in ``_RATIOS`` and ``_DROPPED`` plus ``YearBuilt``; a missing
        label raises ``KeyError``. ``y`` is accepted for API compatibility and
        ignored.
        """
        Xt = X.copy()
        if self.activate:
            years_since_built = Xt['YrSold'] - Xt['YearBuilt']
            years_since_remodel = Xt['YrSold'] - Xt['YearRemodAdd']
            Xt['YearBuilt'] = (years_since_built + years_since_remodel)/2

            # Every ratio is computed from the frame being transformed. The
            # earlier version read KitchenAbvGr from a notebook-level frame
            # (wrong rows for any other input) and divided BsmtUnfSF by
            # TotalBsmtSF twice.
            for part, total in self._RATIOS:
                share = Xt[part]/Xt[total]
                # x/0 gives +/-inf, which a later scaler would reject; map it
                # to NaN (0/0 already is NaN) so an imputer can fill it.
                Xt[part] = share.replace([np.inf, -np.inf], np.nan)

        return Xt.drop(list(self._DROPPED), axis=1)

In [9]:
# Work on independent copies so later cells cannot alter df_num or the
# original target series.
X_num, y = df_num.copy(), y.copy()

In [29]:
# Modelling pipeline: feature rewrite -> fill missing values with 0 ->
# standardise -> regressor.
# NOTE(review): despite the `_RF` suffix the final step is an SVR, not a
# random forest; the name is kept because later cells refer to it.
# NOTE(review): a degree-30 polynomial kernel is unusually high -- confirm it
# is intentional.
pipe_RF = Pipeline(steps=[
    ('Preprocess', Transform_Vars()),
    ('Impute_NA', SimpleImputer(strategy='constant', fill_value=0)),
    ('Scale', StandardScaler()),
    ('Predict', SVR(kernel='poly', degree=30)),
])

In [None]:
# Mean 10-fold score; the metric is negated MSLE, so values closer to 0 are
# better.
cross_val_score(
    estimator=pipe_RF,
    X=X_num,
    y=y,
    scoring='neg_mean_squared_log_error',
    cv=10,
).mean()