In [1]:
import numpy as np  # linear algebra
import pandas as pd  #
from datetime import datetime

from scipy.stats import skew  # for some statistics
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error

from mlxtend.regressor import StackingCVRegressor

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import os

In [2]:
# Read the competition tables from the working directory and report their sizes.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print("Train set size:", train.shape)
print("Test set size:", test.shape)
print('START data processing', datetime.now())

Train set size: (1460, 81)
Test set size: (1459, 80)
START data processing 2019-12-20 15:02:59.615854


In [3]:
# Keep the 'Id' values aside and remove the column from both frames,
# since it carries no information for the prediction itself.
train_ID = train.pop('Id')
test_ID = test.pop('Id')

In [4]:
# Remove outliers: keep only houses whose GrLivArea is below 4500,
# then renumber the remaining rows from 0.
train = train.loc[train['GrLivArea'] < 4500].reset_index(drop=True)

In [5]:
# The target is modelled on the log(1 + x) scale (numpy's log1p).
# The transformed values go straight into `y` instead of being written back
# into `train`, so re-running this cell cannot apply the log a second time.
y = np.log1p(train["SalePrice"]).reset_index(drop=True)

# Predictors only: the training frame without the target, and the test frame as-is.
train_features = train.drop(['SalePrice'], axis=1)
test_features = test

In [6]:
# Stack train and test predictors so every transformation is applied to both.
features = pd.concat([train_features, test_features]).reset_index(drop=True)
print(features.shape)

# Some of the non-numeric predictors are stored as numbers; convert them to strings
# so they are treated as categories later on.
features = features.astype({'MSSubClass': str, 'YrSold': str, 'MoSold': str})

(2917, 79)


In [7]:
# Categorical gaps filled with a fixed value.
constant_fills = {'Functional': 'Typ', 'Electrical': "SBrkr", 'KitchenQual': "TA"}
for col, value in constant_fills.items():
    features[col] = features[col].fillna(value)

# Categorical gaps filled with the column's most frequent value.
for col in ('Exterior1st', 'Exterior2nd', 'SaleType'):
    features[col] = features[col].fillna(features[col].mode()[0])

In [8]:
# A missing PoolQC is replaced by the literal category "None".
features["PoolQC"] = features["PoolQC"].fillna("None")


In [9]:
# Garage / basement gaps: numeric garage columns get 0, the categorical
# garage and basement descriptors get the literal category 'None'.
zero_filled = ('GarageYrBlt', 'GarageArea', 'GarageCars')
none_filled = (
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
)
fill_values = {**dict.fromkeys(zero_filled, 0), **dict.fromkeys(none_filled, 'None')}
for col, value in fill_values.items():
    features[col] = features[col].fillna(value)

In [10]:
# Fill missing MSZoning with the most frequent zoning among rows that share
# the same MSSubClass. If every MSZoning value in a subclass is missing, that
# group has no mode (`x.mode()` is empty and `[0]` would raise), so fall back
# to the most frequent zoning over the whole column.
overall_zoning_mode = features['MSZoning'].mode()[0]
features['MSZoning'] = features.groupby('MSSubClass')['MSZoning'].transform(
    lambda x: x.fillna(x.mode()[0] if x.notna().any() else overall_zoning_mode)
)

In [11]:
# Names of all columns whose dtype is `object`.
objects = [col for col in features.columns if features[col].dtype == object]

In [12]:
# Every remaining gap in an object column becomes the literal category 'None'.
features[objects] = features[objects].fillna('None')


In [13]:
# Missing LotFrontage takes the median LotFrontage of the row's Neighborhood.
neighborhood_median = features.groupby('Neighborhood')['LotFrontage'].transform('median')
features['LotFrontage'] = features['LotFrontage'].fillna(neighborhood_median)


In [14]:
# Fill the remaining NAs: every numeric column gets 0 where a value is missing.
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics = [col for col in features.columns if features[col].dtype in numeric_dtypes]
features.update(features[numerics].fillna(0))

In [15]:
# Skewness of every numeric column, largest first.
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics2 = [col for col in features.columns if features[col].dtype in numeric_dtypes]
skew_features = features[numerics2].apply(skew).sort_values(ascending=False)


In [16]:
# Columns whose skewness exceeds 0.5 are the ones transformed in the next step.
high_skew = skew_features.loc[skew_features.gt(0.5)]
skew_index = high_skew.index

In [17]:
# Apply a Box-Cox transform of (1 + x) to every column flagged as highly skewed.
# The lambda used for each column is estimated by boxcox_normmax on the
# shifted values (features[i] + 1).
# NOTE(review): presumably the +1 shift keeps the input strictly positive for
# the lambda estimate — confirm these columns never hold negative values.
for i in skew_index:
    features[i] = boxcox1p(features[i], boxcox_normmax(features[i] + 1))




In [18]:
# These three columns are not used by the model.
features = features.drop(columns=['Utilities', 'Street', 'PoolQC'])


In [19]:
# Combined build/remodel year and total floor area (basement + both floors).
features['YrBltAndRemod'] = features['YearBuilt'].add(features['YearRemodAdd'])
features['TotalSF'] = (
    features['TotalBsmtSF']
    .add(features['1stFlrSF'])
    .add(features['2ndFlrSF'])
)


In [20]:
# Finished basement area plus the area of both floors.
features['Total_sqr_footage'] = (
    features['BsmtFinSF1']
    .add(features['BsmtFinSF2'])
    .add(features['1stFlrSF'])
    .add(features['2ndFlrSF'])
)


In [21]:
# Bathroom count where each half bath contributes 0.5.
half_baths = features['HalfBath'].mul(0.5)
bsmt_half_baths = features['BsmtHalfBath'].mul(0.5)
features['Total_Bathrooms'] = (
    features['FullBath']
    .add(half_baths)
    .add(features['BsmtFullBath'])
    .add(bsmt_half_baths)
)


In [22]:
# Combined area of all porch types and the wood deck.
features['Total_porch_sf'] = (
    features['OpenPorchSF']
    .add(features['3SsnPorch'])
    .add(features['EnclosedPorch'])
    .add(features['ScreenPorch'])
    .add(features['WoodDeckSF'])
)


In [23]:
# Simplified features: 1 when the source column is greater than 0, otherwise 0.
indicator_sources = {
    'haspool': 'PoolArea',
    'has2ndfloor': '2ndFlrSF',
    'hasgarage': 'GarageArea',
    'hasbsmt': 'TotalBsmtSF',
    'hasfireplace': 'Fireplaces',
}
for flag, source in indicator_sources.items():
    features[flag] = features[source].gt(0).astype('int64')


In [24]:
# One-hot encode the categorical columns; shapes are printed before and after.
print(features.shape)
final_features = pd.get_dummies(data=features)
final_features = final_features.reset_index(drop=True)
print(final_features.shape)

(2917, 86)
(2917, 333)


In [25]:
# The first len(y) rows are the training rows; everything after is the submission set.
n_train_rows = len(y)
X = final_features.iloc[:n_train_rows]
X_sub = final_features.iloc[n_train_rows:]

In [26]:
# Sanity check of the three shapes.
print(f"X {X.shape} y {y.shape} X_sub {X_sub.shape}")


X (1458, 333) y (1458,) X_sub (1459, 333)


In [27]:
# Row positions of the training observations to discard; the same positions
# are removed from the predictors and from the target.
outliers = [30, 88, 462, 631, 1322]
X = X.drop(index=X.index[outliers])
y = y.drop(index=y.index[outliers])

In [28]:
# Collect columns where the single most frequent value accounts for more than
# 99.94% of the training rows.
overfit = [
    col for col in X.columns
    if X[col].value_counts().iloc[0] / len(X) * 100 > 99.94
]

In [29]:
# This dummy column is dropped as well.
overfit = [*overfit, 'MSZoning_C (all)']

In [30]:
# Remove the flagged columns from both the training and the submission matrix.
X = X.drop(columns=overfit).copy()
X_sub = X_sub.drop(columns=overfit).copy()


In [31]:
# Shapes after the row and column removals.
print(f"X {X.shape} y {y.shape} X_sub {X_sub.shape}")


X (1453, 331) y (1453,) X_sub (1459, 331)


In [32]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training matrix and standardise it.
# NOTE: despite its name, `rs` holds a StandardScaler.
rs = StandardScaler().fit(X)
X_ = rs.transform(X)
pd.DataFrame(X_).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,321,322,323,324,325,326,327,328,329,330
count,1453.0,1453.0,1453.0,1453.0,1453.0,1453.0,1453.0,1453.0,1453.0,1453.0,...,1453.0,1453.0,1453.0,1453.0,1453.0,1453.0,1453.0,1453.0,1453.0,1453.0
mean,-5.018544e-16,-1.11007e-15,-2.6895970000000002e-17,3.679858e-16,1.072171e-15,5.486779e-15,6.357230000000001e-17,2.934106e-17,-3.423124e-17,-6.723994000000001e-17,...,1.9560710000000002e-17,-5.623704e-17,4.890177e-18,7.213011e-17,-1.222544e-18,4.890177e-18,7.335266e-18,-1.467053e-17,-9.780354e-18,5.868213000000001e-17
std,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,...,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344,1.000344
min,-2.773401,-3.483957,-3.710877,-4.568874,-3.291969,-1.689624,-0.7439096,-1.149345,-0.3503238,-1.852849,...,-0.05876256,-0.2986727,-0.04548588,-2.586084,-0.2718636,-0.05254069,-0.09125541,-0.1151071,-2.157243,-0.3027547
25%,-0.421364,-0.3716936,-0.798302,-0.5153023,-0.5726603,-0.8657418,-0.7439096,-1.149345,-0.3503238,-0.6851079,...,-0.05876256,-0.2986727,-0.04548588,0.386685,-0.2718636,-0.05254069,-0.09125541,-0.1151071,0.4635547,-0.3027547
50%,0.06490483,0.07129761,-0.07015837,-0.5153023,0.05742351,0.4427771,-0.7439096,0.06245443,-0.3503238,-0.01731944,...,-0.05876256,-0.2986727,-0.04548588,0.386685,-0.2718636,-0.05254069,-0.09125541,-0.1151071,0.4635547,-0.3027547
75%,0.52349,0.469306,0.6579853,0.3945464,0.9528057,0.9274138,0.8769014,0.736537,-0.3503238,0.666762,...,-0.05876256,-0.2986727,-0.04548588,0.386685,-0.2718636,-0.05254069,-0.09125541,-0.1151071,0.4635547,-0.3027547
max,8.001206,7.088191,2.842416,2.984197,1.284429,1.218196,3.811309,3.036459,4.138631,2.856733,...,17.01764,3.348146,21.98484,0.386685,3.678315,19.03287,10.95825,8.687559,0.4635547,3.303004


In [33]:
# Scale the submission matrix with the scaler already fitted on X (no re-fit),
# then display summary statistics of the scaled values.
X_sub_ = rs.transform(X_sub)
pd.DataFrame(X_sub_).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,321,322,323,324,325,326,327,328,329,330
count,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,...,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0
mean,-0.051213,-0.058916,-0.012765,-0.025245,0.002964,-0.058202,-0.023029,-0.007131,0.029083,-0.02904,...,-0.02365,-0.006227,0.014913,-0.020823,-0.0309,0.052109,-0.000375,0.04176,0.005499,-0.006188
std,1.008075,0.955867,1.046205,1.010888,1.007805,1.02406,0.998349,1.024249,1.042582,0.999701,...,0.773804,0.990779,1.152328,1.022781,0.945727,1.409853,0.998301,1.164978,0.995658,0.990994
min,-2.773401,-3.281694,-3.710877,-4.568874,-3.059833,-1.689624,-0.74391,-1.149345,-0.350324,-1.852849,...,-0.058763,-0.298673,-0.045486,-2.586084,-0.271864,-0.052541,-0.091255,-0.115107,-2.157243,-0.302755
25%,-0.421364,-0.408563,-0.798302,-0.515302,-0.605823,-1.059597,-0.74391,-1.149345,-0.350324,-0.697809,...,-0.058763,-0.298673,-0.045486,0.386685,-0.271864,-0.052541,-0.091255,-0.115107,0.463555,-0.302755
50%,0.064905,0.055175,-0.070158,-0.515302,0.057424,0.34585,-0.74391,-0.013169,-0.350324,-0.049835,...,-0.058763,-0.298673,-0.045486,0.386685,-0.271864,-0.052541,-0.091255,-0.115107,0.463555,-0.302755
75%,0.52349,0.455131,0.657985,0.394546,0.985968,0.927414,0.8722,0.811355,-0.350324,0.647325,...,-0.058763,-0.298673,-0.045486,0.386685,-0.271864,-0.052541,-0.091255,-0.115107,0.463555,-0.302755
max,4.862692,3.849003,2.842416,2.984197,1.284429,1.218196,3.400013,5.279539,4.175619,2.620588,...,17.017638,3.348146,21.984843,0.386685,3.678315,19.032866,10.958254,8.687559,0.463555,3.303004


In [34]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

# Show which TensorFlow version this notebook ran with.
print(tf.__version__)

2.0.0


In [35]:
from sklearn.model_selection import train_test_split

In [36]:
# Hold out 10% of the scaled training rows for evaluation (fixed random_state).
# NOTE(review): this rebinds `train` and `test`, which held the raw DataFrames
# read from CSV at the top of the notebook; from here on they are the split of
# the scaled matrix X_. Earlier cells that use `train`/`test` cannot be re-run
# after this one without reloading the data.
train, test, y_train, y_test = train_test_split(X_, y, test_size=0.1, random_state=0)

In [37]:
import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling

In [38]:
def build_model(train=train, y_train=y_train, test=test, y_test=y_test,
                batch_size=128, epochs=1001, patience=1000,
                nlayers=1, nneurals=64):
    """Fit a dense ReLU regression network and report its loss on held-out data.

    The network has hidden ``Dense`` layers of ``nneurals`` ReLU units (one is
    always created, plus ``nlayers - 1`` more) followed by a single linear
    output unit. It is compiled with the Adam optimizer and mean absolute
    error loss. ``model.fit`` is called with ``validation_split=0.1`` and the
    resulting ``val_loss`` drives early stopping.

    Note: the default data arguments are evaluated when this ``def`` runs, so
    the cell must be re-executed after the train/test split is re-created.

    Parameters
    ----------
    train, y_train
        Training features and targets passed to ``model.fit``.
    test, y_test
        Held-out features and targets, used only for the final ``evaluate``.
    batch_size, epochs
        Forwarded to ``model.fit``.
    patience
        Number of epochs without ``val_loss`` improvement before stopping.
    nlayers
        Number of hidden layers; values below 1 still give one hidden layer.
    nneurals
        Units in each hidden layer.

    Returns
    -------
    The fitted ``keras.Sequential`` model.
    """
    start = datetime.now()

    model = keras.Sequential()
    model.add(layers.Dense(nneurals, activation='relu', input_shape=[train.shape[1]]))
    for _ in range(nlayers - 1):
        model.add(layers.Dense(nneurals, activation='relu'))
    model.add(layers.Dense(1))

    model.compile(loss='mae', optimizer=tf.keras.optimizers.Adam())

    callbacks = [
        tfdocs.modeling.EpochDots(),
        # restore_best_weights: when early stopping fires, hand back the
        # weights of the best validation epoch instead of the last one.
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience,
                                         restore_best_weights=True),
    ]
    history = model.fit(pd.DataFrame(train), y_train,
                        epochs=epochs, batch_size=batch_size,
                        validation_split=0.1, verbose=0,
                        callbacks=callbacks)

    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch
    print(hist.tail())

    # No extra metrics are compiled, so evaluate() returns the scalar loss;
    # print it explicitly rather than relying on the progress-bar output.
    test_loss = model.evaluate(pd.DataFrame(test), y_test, verbose=0)
    print('test loss (mae)=', test_loss)
    print('time=', datetime.now() - start)
    return model

In [39]:
# Sweep over network depth and width. Each configuration's held-out loss is
# appended to `sweep_results` as soon as it is available, so the scores can be
# compared afterwards and survive an interrupted run.
# NOTE: this cell is very expensive (15 configurations, up to 128 layers of
# 1024 units, 1001 epochs each by default).
start = datetime.now()
sweep_results = []
for nlayers in [8, 16, 32, 64, 128]:
    for nneurals in [256, 512, 1024]:
        print('nlayers=', nlayers, 'nneurals=', nneurals)
        model = build_model(nlayers=nlayers, nneurals=nneurals)
        sweep_results.append({
            'nlayers': nlayers,
            'nneurals': nneurals,
            'test_loss': model.evaluate(pd.DataFrame(test), y_test, verbose=0),
        })
print('total time: ', datetime.now() - start)
pd.DataFrame(sweep_results).sort_values('test_loss')

nlayers= 8 nneurals= 256
..................................................
Epoch: 100, loss:0.5115,  val_loss:0.6215,  
....................................................................................................
Epoch: 200, loss:0.3067,  val_loss:0.4275,  
....................................................................................................
Epoch: 300, loss:0.1152,  val_loss:0.2607,  
....................................................................................................
Epoch: 400, loss:0.1149,  val_loss:0.2146,  
....................................................................................................
Epoch: 500, loss:0.1098,  val_loss:0.1095,  
....................................................................................................
Epoch: 600, loss:0.0761,  val_loss:0.1446,  
....................................................................................................
Epoch: 700, loss:0.0327,  val_loss:0.0905,  
...

KeyboardInterrupt: 

In [None]:
# Train the chosen architecture on the full scaled training matrix and write
# the submission file.
# NOTE: build_model still evaluates on its default held-out split, whose rows
# are part of X_ here, so the loss it reports is not an out-of-sample estimate.
start = datetime.now()
nlayers, nneurals = 16, 512
print('nlayers=', nlayers, 'nneurals=', nneurals)
model = build_model(train=X_, y_train=y, nlayers=nlayers, nneurals=nneurals)
submission = pd.read_csv("sample_submission.csv")

# predict() returns an (n_rows, 1) array; flatten it to 1-D before assigning
# it to a single column, and make sure it lines up with the sample file.
log_predictions = model.predict(pd.DataFrame(X_sub_)).ravel()
assert len(log_predictions) == len(submission), "prediction count does not match sample_submission.csv"

# The target was modelled as log1p(SalePrice); expm1 maps predictions back to prices.
submission.iloc[:, 1] = np.floor(np.expm1(log_predictions))

submission.to_csv("submission.csv", index=False)
print('submission time:', datetime.now() - start)

In [None]:
# Shell escape: upload submission.csv to the home-data-for-ml-course competition
# through the `kaggle` command-line tool.
!kaggle competitions submit -c home-data-for-ml-course -f submission.csv -m 'a submission'