In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

from sklearn.model_selection import train_test_split

from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import ExtraTreesRegressor
from mlxtend.regressor import StackingCVRegressor

from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats

from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
# Read train and test dataset
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Show all Columns in DF
pd.set_option('display.max_rows', train.shape[0])
pd.set_option('display.max_columns', train.shape[1])

# Train df missing data
total = train.isnull().sum().sort_values(ascending=False)
percent = (train.isnull().sum()/train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
train = train.drop((missing_data[missing_data['Total'] > 1]).index,1)
train = train.drop(train.loc[train['Electrical'].isnull()].index)
train.isnull().sum().max() #just checking that there's no missing data missing.

# Test df missing data
total = test.isnull().sum().sort_values(ascending=False)
percent = (test.isnull().sum()/test.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
test = test.drop((missing_data[missing_data['Total'] > 1]).index,1)
test = test.drop(test.loc[test['Electrical'].isnull()].index)
test.isnull().sum().max() #just checking that there's no missing data missing.

# Standardize features by removing the mean and scaling to unit variance
saleprice_scaled = StandardScaler().fit_transform(train['SalePrice'][:,np.newaxis]);
low_range = saleprice_scaled[saleprice_scaled[:,0].argsort()][:10]
high_range= saleprice_scaled[saleprice_scaled[:,0].argsort()][-10:]

# Delete outliers points
train.sort_values(by = 'GrLivArea', ascending = False)[:2]
train = train.drop(train[train['Id'] == 1299].index)
train = train.drop(train[train['Id'] == 524].index)

# Logarithmic transformation in some variables
train['SalePrice'] = np.log(train['SalePrice'])
train['GrLivArea'] = np.log(train['GrLivArea'])
test['GrLivArea'] = np.log(test['GrLivArea'])

#create column for new variable
train['HasBsmt'] = pd.Series(len(train['TotalBsmtSF']), index=train.index)
train['HasBsmt'] = 0 
train.loc[train['TotalBsmtSF'] > 0,'HasBsmt'] = 1

# transform data
train.loc[train['HasBsmt']==1,'TotalBsmtSF'] = np.log(train['TotalBsmtSF'])

#create column for new variable (one is enough because it's a binary categorical feature)
#if area>0 it gets 1, for area==0 it gets 0
test['HasBsmt'] = pd.Series(len(test['TotalBsmtSF']), index=test.index)
test['HasBsmt'] = 0 
test.loc[test['TotalBsmtSF']>0,'HasBsmt'] = 1

# transform data
test.loc[test['HasBsmt']==1,'TotalBsmtSF'] = np.log(test['TotalBsmtSF'])

# Transform qualitative feature into quantitative feature
train = pd.get_dummies(train)
test = pd.get_dummies(test)

# Separate price column from train table
y = train['SalePrice']
train = train.drop(["SalePrice"], axis = 1)

# Remove Missing Columns from train
missing_cols = set( train.columns ) - set( test.columns )
train.drop( missing_cols , inplace = True, axis = 1 )

# Features with no value is filled with the mean
mean = train.mean().astype(np.int32)
train.fillna( mean , inplace = True)
mean = test.mean().astype(np.int32)
test.fillna( mean , inplace = True)



In [3]:
# Check if dataframe has NULL elements
test.isnull().sum()
train.isnull().sum()

Id                       0
MSSubClass               0
LotArea                  0
OverallQual              0
OverallCond              0
YearBuilt                0
YearRemodAdd             0
BsmtFinSF1               0
BsmtFinSF2               0
BsmtUnfSF                0
TotalBsmtSF              0
1stFlrSF                 0
2ndFlrSF                 0
LowQualFinSF             0
GrLivArea                0
FullBath                 0
HalfBath                 0
BedroomAbvGr             0
KitchenAbvGr             0
TotRmsAbvGrd             0
Fireplaces               0
GarageCars               0
GarageArea               0
WoodDeckSF               0
OpenPorchSF              0
EnclosedPorch            0
3SsnPorch                0
ScreenPorch              0
PoolArea                 0
MiscVal                  0
MoSold                   0
YrSold                   0
HasBsmt                  0
Street_Grvl              0
Street_Pave              0
LotShape_IR1             0
LotShape_IR2             0
L

In [4]:
# See correlation between features and SalePrice
train.corrwith(y).sort_values(ascending=False)

OverallQual              0.821589
GrLivArea                0.737430
GarageCars               0.681053
GarageArea               0.656157
1stFlrSF                 0.620761
FullBath                 0.596021
YearBuilt                0.587301
YearRemodAdd             0.566208
TotRmsAbvGrd             0.537716
Foundation_PConc         0.531414
ExterQual_Gd             0.510033
Fireplaces               0.492159
HeatingQC_Ex             0.467098
KitchenQual_Ex           0.417482
KitchenQual_Gd           0.407084
BsmtFinSF1               0.392429
TotalBsmtSF              0.373552
ExterQual_Ex             0.362392
Neighborhood_NridgHt     0.351858
CentralAir_Y             0.351605
Exterior2nd_VinylSd      0.337860
Exterior1st_VinylSd      0.336551
WoodDeckSF               0.334250
SaleType_New             0.331362
SaleCondition_Partial    0.326547
OpenPorchSF              0.325277
2ndFlrSF                 0.319998
HalfBath                 0.314339
Electrical_SBrkr         0.305995
PavedDrive_Y  

In [5]:
# Create new features based on existent features

train['OverallRate'] = train['OverallQual'] / train['OverallCond']
train['OverallRate2'] = train['OverallQual'] + train['OverallCond']

train['AreaLivGarage'] = train['GrLivArea'] / train['GarageArea']
train['AreaLivGarage'] = train['GrLivArea'] + train['GarageArea']

train['areaPerCar'] = train['GarageArea'] / train['GarageCars']

# Create new features based on existent features

test['OverallRate'] = test['OverallQual'] / test['OverallCond']
test['OverallRate2'] = test['OverallQual'] + test['OverallCond']

test['AreaLivGarage'] = test['GrLivArea'] / test['GarageArea']
test['AreaLivGarage'] = test['GrLivArea'] + test['GarageArea']

test['areaPerCar'] = test['GarageArea'] / test['GarageCars']

# Fill Nan values with 0
train.fillna( 0 , inplace = True)
test.fillna( 0 , inplace = True)

In [6]:
# Initialize KMeans with 800 Clusters, 300 maximum iterations, 0.0001 of tolerance
cluster = KMeans(n_clusters= 800, max_iter=300, tol=0.0001, verbose=0, random_state = 0, n_jobs=-1)

# Execute Kmeans in train dataframe using fit()
kmeans_train = cluster.fit(train)

# Get Labels to kmeans
labels_train = kmeans_train.labels_

# Predict cluster index for each sample
kmeans_testing = cluster.predict(test)

# Insert values from kmeans to train and test dataframe
train['cluster'] = labels_train
test['cluster'] = kmeans_testing

print("Data points = %i " % train.shape[0])
print("Features = %i " % train.shape[1])

Data points = 1457 
Features = 197 


In [7]:
# Drop ID columns from dataframes
id = test['Id']
test.drop(['Id'], axis = 1, inplace = True)
train.drop(['Id'], axis = 1, inplace = True)

In [8]:
# 0.05% of data will be used to test
x_train, x_test, y_train, y_test = train_test_split(train, y, test_size=0.25, random_state=42 )

In [9]:
# Fit the model
et  = ExtraTreesRegressor(n_estimators=950 ,  max_features = 'auto', max_leaf_nodes=None, n_jobs= -1, random_state = 0, verbose = 0)
gbr = GradientBoostingRegressor()
lasso = Lasso()
xgbr = XGBRegressor()
svr = SVR(kernel= 'rbf', gamma= 'auto', tol=0.001, C=100.0, max_iter=-1)
rf = RandomForestRegressor(n_estimators=900,  random_state=0)
lr = LinearRegression(fit_intercept=True, normalize=True, copy_X=True, n_jobs=-1)
knnR = KNeighborsRegressor(n_neighbors=20, n_jobs=-1)
reg = StackingCVRegressor(regressors=[xgbr , gbr, lr, et], meta_regressor=lr)

reg.fit(x_train.values, y_train.values)

StackingCVRegressor(cv=5,
          meta_regressor=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=True),
          refit=True,
          regressors=[XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
    ..._estimators=950, n_jobs=-1,
          oob_score=False, random_state=0, verbose=0, warm_start=False)],
          shuffle=True, store_train_meta_features=False,
          use_features_in_secondary=False)

In [10]:
print('Test')
print(reg.score(x_test.values, y_test.values))
print('\nRMSLE')
print(sqrt(mean_squared_error(reg.predict(x_test.values), y_test)))
print('\nTrain')
print(reg.score(x_train.values, y_train.values))

Test
0.9084510306572695

RMSLE
0.12380894807740396

Train
0.9781730376688778
