In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import skew, norm
from scipy.special import boxcox1p, boxcox
import warnings
import datetime

warnings.filterwarnings('ignore')
%matplotlib inline



In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
test_id = pd.read_csv("test.csv")
len_train = train.shape[0]
combined = pd.concat([train,test], sort=False)
print(train.shape)
print(test.shape)

(1351, 13)
(338, 12)


In [3]:
#Delete Data-URL, irrelevant
train = train.drop('data-url', 1)
test = test.drop('data-url', 1)

In [4]:
#First try, fill with 0's
zeros = ('buildingSize','erfSize','garage')
ones = ('bathroom', 'bedroom')


for col in zeros:
    train[col] = train[col].fillna(0)
    test[col] = test[col].fillna(0)
    
for col in ones:
    train[col] = train[col].fillna(1)
    test[col] = test[col].fillna(1)

In [5]:
train.isnull().sum().sort_index()[train.isnull().sum() > 0]

Series([], dtype: int64)

In [6]:
test.isnull().sum().sort_index()[test.isnull().sum() > 0]

Series([], dtype: int64)

In [7]:
train = train.drop(train[(train['erfSize']>10000) & (train['data-price']<3000000)].index)

In [8]:
train.shape

(1349, 12)

In [9]:
#Correct for skewness in SalesPrice using nplog
train['data-price'] = np.log1p(train['data-price'])

In [10]:
len_train=train.shape[0]
combined = pd.concat([train,test], sort=False)
print(combined.shape)

(1687, 12)


In [12]:
#add_datepart(combined, 'data-date')

In [11]:
combined.head()

Unnamed: 0,house-id,area,bathroom,bedroom,buildingSize,data-date,data-isonshow,data-location,erfSize,garage,type,data-price
0,1,boland-winelands,3.0,3.0,300.0,2018-05-31 14:44:46,False,La Sandra,1240.0,2.0,house,15.598902
1,2,northern-suburbs,1.0,2.0,0.0,2018-05-16 12:23:46,False,Observatory,105.0,0.0,house,14.343194
2,3,boland-winelands,3.0,5.0,0.0,2018-05-29 15:38:01,False,Riebeek Valley,238.0,2.0,apartment,15.341567
3,4,boland-winelands,2.0,3.0,124.0,2016-07-04 10:26:20,False,Strand South,124.0,1.0,apartment,14.038655
4,5,boland-winelands,4.0,4.0,400.0,2016-10-21 09:30:10,False,Ceres,500.0,0.0,house,15.725053


In [12]:
num_feats = combined.dtypes[combined.dtypes != "object"].index

# Check the skew of all numerical features
skewed_feats = combined[num_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness = skewness.drop('data-price', 0)
skewness.head(15)


Skew in numerical features: 



Unnamed: 0,Skew
erfSize,7.44856
data-isonshow,5.429007
buildingSize,1.005642
bathroom,0.960948
bedroom,0.724898
garage,0.530627
house-id,0.000858


In [13]:
#Correct for skewness by using boxcox1p
skewness = skewness[abs(skewness) > 0.5]
skewness = skewness.drop('house-id' ,0)
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    combined[feat] = boxcox1p(combined[feat], lam)

There are 6 skewed numerical features to Box Cox transform


In [14]:
combined.head()

Unnamed: 0,house-id,area,bathroom,bedroom,buildingSize,data-date,data-isonshow,data-location,erfSize,garage,type,data-price
0,1,boland-winelands,1.540963,1.540963,9.02589,2018-05-31 14:44:46,0.0,La Sandra,12.741053,1.194318,house,15.598902
1,2,northern-suburbs,0.730463,1.194318,0.0,2018-05-16 12:23:46,0.0,Observatory,6.751853,0.0,house,14.343194
2,3,boland-winelands,1.540963,2.055642,0.0,2018-05-29 15:38:01,0.0,Riebeek Valley,8.492259,1.194318,apartment,15.341567
3,4,boland-winelands,1.194318,1.540963,7.087847,2016-07-04 10:26:20,0.0,Strand South,7.087847,0.730463,apartment,14.038655
4,5,boland-winelands,1.820334,1.820334,9.715842,2016-10-21 09:30:10,0.0,Ceres,10.272202,0.0,house,15.725053


In [15]:
combined['data-date'] = pd.to_datetime(combined['data-date'])

In [16]:
combined['Year'] = combined['data-date'].map(lambda x: x.year)

In [17]:
combined['Month'] = combined['data-date'].map(lambda x: x.month)

In [18]:
combined['Day'] = combined['data-date'].map(lambda x: x.day)

In [19]:
combined['Time'] = combined['data-date'].map(lambda x: x.strftime('%H:%M:%S'))

In [20]:
combined = combined.drop('data-date', 1)

In [21]:
combined.head()

Unnamed: 0,house-id,area,bathroom,bedroom,buildingSize,data-isonshow,data-location,erfSize,garage,type,data-price,Year,Month,Day,Time
0,1,boland-winelands,1.540963,1.540963,9.02589,0.0,La Sandra,12.741053,1.194318,house,15.598902,2018,5,31,14:44:46
1,2,northern-suburbs,0.730463,1.194318,0.0,0.0,Observatory,6.751853,0.0,house,14.343194,2018,5,16,12:23:46
2,3,boland-winelands,1.540963,2.055642,0.0,0.0,Riebeek Valley,8.492259,1.194318,apartment,15.341567,2018,5,29,15:38:01
3,4,boland-winelands,1.194318,1.540963,7.087847,0.0,Strand South,7.087847,0.730463,apartment,14.038655,2016,7,4,10:26:20
4,5,boland-winelands,1.820334,1.820334,9.715842,0.0,Ceres,10.272202,0.0,house,15.725053,2016,10,21,09:30:10


In [22]:
#Get Dummies
combined = pd.get_dummies(combined)
len_train = train.shape[0]

In [23]:
train.shape

(1349, 12)

In [24]:
train = combined[:len_train]
y_train = train['data-price'].values
train = train.drop(['house-id','data-price'], 1)
test = combined[len_train:]
test = test.drop(['house-id','data-price'], 1)
train.shape

(1349, 1198)

In [25]:
from sklearn.linear_model import Lasso, Ridge, LinearRegression, ElasticNet
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error




In [26]:
#Validation function - Courtesy of ....
n_folds = 5
def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
    rmse= np.sqrt(-cross_val_score(model, train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)


### Linear Regression

In [27]:
LinReg = LinearRegression()
score = rmsle_cv(LinReg)
print("Random Forest score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Random Forest score: 0.2186 (0.0251)



### Ridge Regression

In [28]:
RG = Ridge(alpha=0.01, random_state = 5)
score = rmsle_cv(RG)
print("Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Ridge score: 0.2143 (0.0204)



### Lasso Regression

In [29]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.00001, random_state = 5))
score = rmsle_cv(lasso)
print("Lasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Lasso score: 0.2142 (0.0234)



In [60]:
from sklearn.kernel_ridge import KernelRidge
KRR = KernelRidge(alpha=0.085, kernel='polynomial', degree=2, coef0=2.5)
score = rmsle_cv(KRR)
print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


Kernel Ridge score: 0.2139 (0.0213)



### XGBoost

In [34]:
'''data_dmatrix = xgb.DMatrix(data=train.values,label=y_train)
XGBoost = xgb.XGBRegressor(random_state = 5, max_depth = 3, alpha = 10, n_estimators = 1000,
                           learning_rate = 0.3, objective = 'reg:squarederror', colsample_bytree = 0.5,
                          subsample = 0.75)
score = rmsle_cv(XGBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))'''

'data_dmatrix = xgb.DMatrix(data=train.values,label=y_train)\nXGBoost = xgb.XGBRegressor(random_state = 5, max_depth = 3, alpha = 10, n_estimators = 1000,\n                           learning_rate = 0.3, objective = \'reg:squarederror\', colsample_bytree = 0.5,\n                          subsample = 0.75)\nscore = rmsle_cv(XGBoost)\nprint("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))'

## Elastic Net Regression

Elastic net is basically a combination of both L1 and L2 regularization. So if you know elastic net, you can implement both Ridge and Lasso by tuning the parameters.

In [41]:
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0001, l1_ratio=0.1, random_state=3))
score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

ElasticNet score: 0.2136 (0.0206)



### We then fit the training data to all models

In [61]:
#Fit the training dataset on every model
lr = LinReg.fit(train, y_train)
ls = lasso.fit(train, y_train)
rg = RG.fit(train, y_train)
kr = KRR.fit(train, y_train)
#rf = RandomForest.fit(train, y_train)
#gb = GBoost.fit(train,y_train)
#xg = XGBoost.fit(train,y_train)
en = ENet.fit(train, y_train)

In [63]:
pred_lr = np.expm1(lr.predict(test))
pred_ls = np.expm1(ls.predict(test))
pred_rg = np.expm1(rg.predict(test))
#pred_rf = np.expm1(rf.predict(test))
#pred_gb = np.expm1(gb.predict(test))
#pred_xg = np.expm1(xg.predict(test))
pred_en = np.expm1(en.predict(test))
pred_kr = np.expm1(kr.predict(test))

### Mean of all model's prediction.
np.expm1 ( ) is used to calculate exp(x) - 1 for all elements in the array. 

In [93]:
final_predictions = (pred_ls + pred_rg + pred_en + pred_kr) / 4
# Tried weighted average, scored less
final_weighted = (0.42 * pred_ls) + (0.18 * pred_rg) + (0.20 * pred_en) + (0.20 * pred_kr)

### Check the predictions

In [91]:
final_weighted

array([3264628.72153768, 1728451.73464089, 7345048.46581766,
       3650842.47285995, 7158527.25362357, 2779843.31964912,
       2468296.11121481, 5649936.89960853, 3062466.84409037,
       3433703.59641683, 3551823.77022157, 5546401.70070149,
       5519456.02820734, 1851346.74503876, 4102232.83689114,
       1915549.60786289, 3841511.85801393, 1418633.7399795 ,
       4387637.41912822, 4438381.78328385, 4132555.98806967,
       6225975.52703202, 6215186.47978419, 3347022.64858446,
       2279987.35772922, 1720067.4633333 , 2265223.41553767,
       3853754.68032787, 6382915.30433237, 3662541.08942197,
       5919660.52428469, 7460592.18355838, 3445297.41923015,
       5723753.62425276, 4519771.07922431, 4326536.65120466,
        863452.18073512, 2644342.79589836, 1638751.58836708,
       2964891.68042888, 3742746.84499017, 6047051.20282603,
       1671376.65879027, 4861840.42927601, 1923880.96119124,
       2729656.52248854, 2600130.43480169, 1601623.63043502,
       4686325.52478715,

***
<a id='section10'></a>
## 10. Output for Kaggle submission

In [94]:
#Output to CSV
'''output_avg = pd.DataFrame({'house-id':test_id['house-id'], 'price': final_predictions})
output_avg.to_csv('sub33.csv', index=False)'''

output_avg = pd.DataFrame({'house-id':test_id['house-id'], 'price': final_weighted})
output_avg.to_csv('sub-weighted310.csv', index=False)
'''
output_avg = pd.DataFrame({'house-id':test_id['house-id'], 'price': pred_ls})
output_avg.to_csv('sub-ls32.csv', index=False)'''

"\noutput_avg = pd.DataFrame({'house-id':test_id['house-id'], 'price': pred_ls})\noutput_avg.to_csv('sub-ls32.csv', index=False)"