In [1]:
#Importing Libraries or Packages that are needed throughout the Program 
import numpy as np 
import pandas as pd 
import xgboost as  xgb 
import random 
import datetime as dt 
import gc

# Load the Datasets #

DirBase = "/Volumes/wms_ssd/Qualifying Exam/"

# We need to load the datasets that will be needed to train our machine learning algorithms, handle our data and 
# make predictions. Note that these datasets are the ones that are already provided once you enter the competition
# by accepting terms and conditions

train = pd.read_csv(DirBase+'Input/data.csv')
properties_2016_df = pd.read_csv(DirBase+'Input/properties_2016_mod.csv')
properties_2017_df = pd.read_csv(DirBase+'Input/properties_2017_mod.csv')
test = pd.read_csv(DirBase+'Output/sample_submission.csv') 
test = test.rename(columns={'ParcelId': 'parcelid'}) # To make it easier for merging datasets on same column_id later
output = test.copy()

# Analyse the Dimensions of our Datasets.

print("Training Size:" + str(train.shape))
print("Sample Size:" + str(test.shape))


  return f(*args, **kwds)
  return f(*args, **kwds)


Training Size:(167888, 87)
Sample Size:(2985217, 7)


In [2]:

# Type Converting the DataSet 

# The processing of some of the algorithms can be made quick if data representation is made in int/float32 instead
# of int/float64. Therefore, in order to make sure that all of our columns types are in float32, we are
# implementing the following lines of code #

for c, dtype in zip(train.columns, train.dtypes):
    if dtype == np.float64:        
        train[c] = train[c].astype(np.float32)
    if dtype == np.int64:
        train[c] = train[c].astype(np.int32)

for column in test.columns:
    if test[column].dtype == int:
        test[column] = test[column].astype(np.int32)
    if test[column].dtype == float:
        test[column] = test[column].astype(np.float32)

df_test_2016 = test.merge(properties_2016_df, how='left', on='parcelid')
df_test_2017 = test.merge(properties_2017_df, how='left', on='parcelid')

### Remove previous variables to keep some memory
del properties_2016_df, properties_2017_df

print('Memory usage reduction...')
train[['latitude', 'longitude']] /= 1e6
df_test_2016[['latitude', 'longitude']] /= 1e6
df_test_2017[['latitude', 'longitude']] /= 1e6

train['censustractandblock'] /= 1e12
df_test_2016['censustractandblock'] /= 1e12
df_test_2017['censustractandblock'] /= 1e12

### Rearranging the DataSets ###

# We will now drop the features that serve no useful purpose. We will also split our data and divide it into the
# representation to make it clear which features are to be treated as determinants in predicting the outcome for
# our target feature. Make sure to include the same features in the test set as were included in the training set



x_train = train.drop(['parcelid', 'logerror', 'propertyzoningdesc', 
                         'propertycountylandusecode', 'month', 'year'], axis=1)

df_test_2016 = df_test_2016.drop(['parcelid', 'propertyzoningdesc',
                       'propertycountylandusecode', '201610', '201611', 
                       '201612', '201710', '201711', '201712'], axis = 1) 

df_test_2017 = df_test_2017.drop(['parcelid', 'propertyzoningdesc',
                       'propertycountylandusecode', '201610', '201611', 
                       '201612', '201710', '201711', '201712'], axis = 1) 

x_train = x_train.values
y_train = train['logerror'].values


Memory usage reduction...


In [13]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt

# split data into X and y
X = x_train
y = y_train

# grid search
model = XGBRegressor()
subsample = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0]
learning_rate = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0]
max_depth = [5, 6, 7, 8]
gamma = [0, 0.01, 0.1]
colsample_bytree = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0]
colsample_bylevel = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0]
reg_lambda = [1, 5, 10]
reg_alpha = [0, 0.01, 0.1]

param_grid = dict(subsample=subsample, 
                  learning_rate=learning_rate, 
                  max_depth=max_depth, 
                  gamma=gamma,
                  colsample_bytree=colsample_bytree,
                  colsample_bylevel=colsample_bylevel,
                  reg_lambda=reg_lambda,
                  reg_alpha=reg_alpha)

kfold = KFold(n_splits=10, shuffle=True, random_state=7)
random_search = RandomizedSearchCV(model, param_grid, scoring="neg_mean_absolute_error", cv=kfold, random_state=7, n_iter=20)
result = random_search.fit(X, y)


In [14]:
# summarize results
print("Best: %f using %s" % (result.best_score_, result.best_params_))
means = result.cv_results_['mean_test_score']
stds = result.cv_results_['std_test_score']
params = result.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

    # plot
# pyplot.errorbar(subsample, means, yerr=stds)
# pyplot.title("XGBoost subsample vs Log Loss")
# pyplot.xlabel('subsample')
# pyplot.ylabel('Log Loss')
# pyplot.savefig('subsample.png')

Best: -0.069379 using {'subsample': 0.7, 'reg_lambda': 5, 'reg_alpha': 0, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 0.4, 'colsample_bylevel': 0.3}
-0.083757 (0.001519) with: {'subsample': 0.4, 'reg_lambda': 1, 'reg_alpha': 0.01, 'max_depth': 6, 'learning_rate': 1.0, 'gamma': 0.1, 'colsample_bytree': 0.4, 'colsample_bylevel': 0.8}
-0.069379 (0.001005) with: {'subsample': 0.7, 'reg_lambda': 5, 'reg_alpha': 0, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 0.4, 'colsample_bylevel': 0.3}
-0.088760 (0.001441) with: {'subsample': 0.3, 'reg_lambda': 5, 'reg_alpha': 0, 'max_depth': 7, 'learning_rate': 0.8, 'gamma': 0.01, 'colsample_bytree': 0.3, 'colsample_bylevel': 0.8}
-0.073624 (0.001061) with: {'subsample': 0.8, 'reg_lambda': 10, 'reg_alpha': 0, 'max_depth': 6, 'learning_rate': 0.5, 'gamma': 0, 'colsample_bytree': 0.8, 'colsample_bylevel': 0.7}
-0.088085 (0.001204) with: {'subsample': 0.1, 'reg_lambda': 5, 'reg_alpha': 0.1, 'max_depth'

In [15]:
### Cross Validation ###

# We are dividing our datasets into the training and validation sets so that we could monitor and the test the
# progress of our machine learning algorithm. This would let us know when our model might be over or under fitting
# on the dataset that we have employed.

from sklearn.model_selection import train_test_split

X = x_train
y = y_train 

Xtrain, Xvalid, ytrain, yvalid = train_test_split(X, y, test_size=0.2, random_state=42)

# Implement the Xgboost #

# We can now select the parameters for Xgboost and monitor the progress of results on our validation set. The
# explanation of the xgboost parameters and what they do can be found on the following link
# http://xgboost.readthedocs.io/en/latest/parameter.html 

dtrain = xgb.DMatrix(Xtrain, label=ytrain)
dvalid = xgb.DMatrix(Xvalid, label=yvalid)

# Best: -0.069305 using {'subsample': 1.0, 'reg_lambda': 1, 'reg_alpha': 0, 'max_depth': 5, 'learning_rate': 0.5, 
# 'gamma': 10, 'colsample_bytree': 0.7, 'colsample_bylevel': 0.2}

# Best: -0.069555 using {'subsample': 0.4, 'reg_lambda': 1, 'reg_alpha': 1, 'max_depth': 3, 'learning_rate': 0.4,
# 'gamma': 1, 'colsample_bytree': 0.1, 'colsample_bylevel': 0.3}

# Best: -0.069379 using {'subsample': 0.7, 'reg_lambda': 5, 'reg_alpha': 0, 'max_depth': 6, 'learning_rate': 0.1,
# 'gamma': 0.1, 'colsample_bytree': 0.4, 'colsample_bylevel': 0.3}

# Try different parameters! 
xgb_params = {'min_child_weight': 5, 'nthread': -1, 'booster' : 'gbtree', 'silent': 1, 'eval_metric': 'mae', 
              'objective': 'reg:linear' , 'subsample': 0.7, 'reg_lambda': 5, 'reg_alpha': 0, 'max_depth': 6, 
              'learning_rate': 0.1,'gamma': 0.1, 'colsample_bytree': 0.4, 'colsample_bylevel': 0.3}           

watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

model_xgb = xgb.train(xgb_params, dtrain, 1000, watchlist, early_stopping_rounds=100,
                  maximize=False, verbose_eval=10)


[0]	train-mae:0.448501	valid-mae:0.448152
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 100 rounds.
[10]	train-mae:0.177229	valid-mae:0.177208
[20]	train-mae:0.094576	valid-mae:0.094483
[30]	train-mae:0.074119	valid-mae:0.074171
[40]	train-mae:0.069996	valid-mae:0.070106
[50]	train-mae:0.069006	valid-mae:0.069232
[60]	train-mae:0.068671	valid-mae:0.06899
[70]	train-mae:0.068506	valid-mae:0.068911
[80]	train-mae:0.068381	valid-mae:0.06892
[90]	train-mae:0.068296	valid-mae:0.068953
[100]	train-mae:0.068238	valid-mae:0.068997
[110]	train-mae:0.068167	valid-mae:0.069012
[120]	train-mae:0.068128	valid-mae:0.069074
[130]	train-mae:0.068051	valid-mae:0.069129
[140]	train-mae:0.067986	valid-mae:0.069161
[150]	train-mae:0.067914	valid-mae:0.069196
[160]	train-mae:0.067876	valid-mae:0.069261
[170]	train-mae:0.067803	valid-mae:0.069296
Stopping. Best iteration:
[71]	train-mae:0.068482	valid-mae:0.068909



In [17]:
# Predicting the results #

# Let us now predict the target variable for our test dataset. All we have to do now is just fit the already
# trained model on the test set that we had made merging the sample file with properties dataset #

months = [10, 11, 12]
years = [2016, 2017]


for y in years:
    if y == 2016:
        x_test = df_test_2016
    else:
        x_test = df_test_2017

    print("PREDICTING " + str(y) + " ...")
    
    dtest = xgb.DMatrix(x_test.values)

    Predicted_test_xgb = model_xgb.predict(dtest)

    
    for m in months:        
        name = str(y) + str(m)
        output[name] = Predicted_test_xgb

print(output.head(50))
# Submitting the Results 
print('Preparing the csv file ...')
output.to_csv(DirBase+'xgb_predicted_results_mod8.csv', index=False, float_format='%.4f')
print("Finished writing the file")

PREDICTING 2016 ...
PREDICTING 2017 ...
    parcelid    201610    201611    201612    201710    201711    201712
0   10754147 -0.117378 -0.117378 -0.117378 -0.058102 -0.058102 -0.058102
1   10759547 -0.054343 -0.054343 -0.054343 -0.061658 -0.061658 -0.061658
2   10843547 -0.146123 -0.146123 -0.146123  0.082087  0.082087  0.082087
3   10859147  0.043041  0.043041  0.043041  0.012807  0.012807  0.012807
4   10879947  0.021128  0.021128  0.021128  0.029644  0.029644  0.029644
5   10898347  0.159445  0.159445  0.159445  0.132184  0.132184  0.132184
6   10933547 -0.011254 -0.011254 -0.011254 -0.014799 -0.014799 -0.014799
7   10940747 -0.045428 -0.045428 -0.045428 -0.061039 -0.061039 -0.061039
8   10954547 -0.131553 -0.131553 -0.131553 -0.178623 -0.178623 -0.178623
9   10976347 -0.033353 -0.033353 -0.033353 -0.049597 -0.049597 -0.049597
10  11073947  0.110666  0.110666  0.110666  0.113915  0.113915  0.113915
11  11114347 -0.107274 -0.107274 -0.107274 -0.068423 -0.068423 -0.068423
12  1111694

In [None]:
print(output.head(50))