In [1]:
#Importing Libraries or Packages that are needed throughout the Program 
import numpy as np 
import pandas as pd 
import xgboost as  xgb 
import random 
import datetime as dt 
import gc

# Load the Datasets #

DirBase = "/Volumes/wms_ssd/Qualifying Exam/"

# We need to load the datasets that will be needed to train our machine learning algorithms, handle our data and 
# make predictions. Note that these datasets are the ones that are already provided once you enter the competition
# by accepting terms and conditions

train = pd.read_csv(DirBase+'Input/data.csv')
properties_2016_df = pd.read_csv(DirBase+'Input/properties_2016_mod.csv')
properties_2017_df = pd.read_csv(DirBase+'Input/properties_2017_mod.csv')
test = pd.read_csv(DirBase+'Output/sample_submission.csv') 
test = test.rename(columns={'ParcelId': 'parcelid'}) # To make it easier for merging datasets on same column_id later
output = test.copy()

# Analyse the Dimensions of our Datasets.

print("Training Size:" + str(train.shape))
print("Sample Size:" + str(test.shape))


Training Size:(167888, 87)
Sample Size:(2985217, 7)


In [2]:

# Type Converting the DataSet 

# The processing of some of the algorithms can be made quick if data representation is made in int/float32 instead
# of int/float64. Therefore, in order to make sure that all of our columns types are in float32, we are
# implementing the following lines of code #

for c, dtype in zip(train.columns, train.dtypes):
    if dtype == np.float64:        
        train[c] = train[c].astype(np.float32)
    if dtype == np.int64:
        train[c] = train[c].astype(np.int32)

for column in test.columns:
    if test[column].dtype == int:
        test[column] = test[column].astype(np.int32)
    if test[column].dtype == float:
        test[column] = test[column].astype(np.float32)

df_test_2016 = test.merge(properties_2016_df, how='left', on='parcelid')
df_test_2017 = test.merge(properties_2017_df, how='left', on='parcelid')

### Remove previous variables to keep some memory
del properties_2016_df, properties_2017_df

print('Memory usage reduction...')
train[['latitude', 'longitude']] /= 1e6
df_test_2016[['latitude', 'longitude']] /= 1e6
df_test_2017[['latitude', 'longitude']] /= 1e6

train['censustractandblock'] /= 1e12
df_test_2016['censustractandblock'] /= 1e12
df_test_2017['censustractandblock'] /= 1e12

### Rearranging the DataSets ###

# We will now drop the features that serve no useful purpose. We will also split our data and divide it into the
# representation to make it clear which features are to be treated as determinants in predicting the outcome for
# our target feature. Make sure to include the same features in the test set as were included in the training set

x_train = train.drop(['parcelid', 'logerror', 'propertyzoningdesc', 
                         'propertycountylandusecode', 'month', 'year'], axis=1)

df_test_2016 = df_test_2016.drop(['parcelid', 'propertyzoningdesc',
                       'propertycountylandusecode', '201610', '201611', 
                       '201612', '201710', '201711', '201712'], axis = 1) 

df_test_2017 = df_test_2017.drop(['parcelid', 'propertyzoningdesc',
                       'propertycountylandusecode', '201610', '201611', 
                       '201612', '201710', '201711', '201712'], axis = 1) 

x_train = x_train.values
y_train = train['logerror'].values


Memory usage reduction...


In [3]:
### Cross Validation ###

# We are dividing our datasets into the training and validation sets so that we could monitor and the test the
# progress of our machine learning algorithm. This would let us know when our model might be over or under fitting
# on the dataset that we have employed.

from sklearn.model_selection import train_test_split

X = x_train
y = y_train 

Xtrain, Xvalid, ytrain, yvalid = train_test_split(X, y, test_size=0.2, random_state=42)

# Implement the Xgboost #

# We can now select the parameters for Xgboost and monitor the progress of results on our validation set. The
# explanation of the xgboost parameters and what they do can be found on the following link
# http://xgboost.readthedocs.io/en/latest/parameter.html 

dtrain = xgb.DMatrix(Xtrain, label=ytrain)
dvalid = xgb.DMatrix(Xvalid, label=yvalid)

# Try different parameters! 
xgb_params = {'min_child_weight': 5, 'eta': 0.035, 'colsample_bytree': 0.5, 'max_depth': 4,
            'subsample': 0.85, 'lambda': 0.8, 'nthread': -1, 'booster' : 'gbtree', 'silent': 1, 'gamma' : 0,
            'eval_metric': 'mae', 'objective': 'reg:linear' }           

watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

model_xgb = xgb.train(xgb_params, dtrain, 1000, watchlist, early_stopping_rounds=100,
                  maximize=False, verbose_eval=10)


[0]	train-mae:0.479375	valid-mae:0.478985
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 100 rounds.
[10]	train-mae:0.342542	valid-mae:0.342362
[20]	train-mae:0.248636	valid-mae:0.248603
[30]	train-mae:0.184948	valid-mae:0.184869
[40]	train-mae:0.142229	valid-mae:0.142081
[50]	train-mae:0.11412	valid-mae:0.113922
[60]	train-mae:0.096142	valid-mae:0.095919
[70]	train-mae:0.084992	valid-mae:0.084782
[80]	train-mae:0.078334	valid-mae:0.078181
[90]	train-mae:0.074446	valid-mae:0.074323
[100]	train-mae:0.072198	valid-mae:0.07208
[110]	train-mae:0.07087	valid-mae:0.070754
[120]	train-mae:0.070111	valid-mae:0.070008
[130]	train-mae:0.069644	valid-mae:0.069559
[140]	train-mae:0.069334	valid-mae:0.069276
[150]	train-mae:0.069128	valid-mae:0.069089
[160]	train-mae:0.068988	valid-mae:0.068975
[170]	train-mae:0.068897	valid-mae:0.068912
[180]	train-mae:0.068829	valid-mae:0.068876
[190]	train-mae:0.068771	valid-mae

In [4]:
# Predicting the results #

# Let us now predict the target variable for our test dataset. All we have to do now is just fit the already
# trained model on the test set that we had made merging the sample file with properties dataset #

months = [10, 11, 12]
years = [2016, 2017]


for y in years:
    if y == 2016:
        x_test = df_test_2016
    else:
        x_test = df_test_2017

    print("PREDICTING " + str(y) + " ...")
    
    dtest = xgb.DMatrix(x_test.values)

    Predicted_test_xgb = model_xgb.predict(dtest)

    
    for m in months:        
        name = str(y) + str(m)
        output[name] = Predicted_test_xgb

print(output.head(50))
# Submitting the Results 
print('Preparing the csv file ...')
output.to_csv(DirBase+'xgb_predicted_results_mod4.csv', index=False, float_format='%.4f')
print("Finished writing the file")

PREDICTING 2016 ...
PREDICTING 2017 ...
    parcelid    201610    201611    201612    201710    201711    201712
0   10754147 -0.055875 -0.055875 -0.055875 -0.065116 -0.065116 -0.065116
1   10759547  0.028946  0.028946  0.028946  0.000703  0.000703  0.000703
2   10843547  0.023374  0.023374  0.023374  0.134059  0.134059  0.134059
3   10859147  0.009479  0.009479  0.009479 -0.015481 -0.015481 -0.015481
4   10879947  0.018441  0.018441  0.018441  0.018218  0.018218  0.018218
5   10898347  0.050160  0.050160  0.050160  0.031975  0.031975  0.031975
6   10933547 -0.004305 -0.004305 -0.004305 -0.003058 -0.003058 -0.003058
7   10940747 -0.062723 -0.062723 -0.062723 -0.061476 -0.061476 -0.061476
8   10954547 -0.057813 -0.057813 -0.057813 -0.049105 -0.049105 -0.049105
9   10976347 -0.060091 -0.060091 -0.060091 -0.057998 -0.057998 -0.057998
10  11073947  0.022292  0.022292  0.022292  0.023338  0.023338  0.023338
11  11114347 -0.027423 -0.027423 -0.027423 -0.013595 -0.013595 -0.013595
12  1111694

In [5]:
print(output.head(50))

    parcelid    201610    201611    201612    201710    201711    201712
0   10754147 -0.055875 -0.055875 -0.055875 -0.065116 -0.065116 -0.065116
1   10759547  0.028946  0.028946  0.028946  0.000703  0.000703  0.000703
2   10843547  0.023374  0.023374  0.023374  0.134059  0.134059  0.134059
3   10859147  0.009479  0.009479  0.009479 -0.015481 -0.015481 -0.015481
4   10879947  0.018441  0.018441  0.018441  0.018218  0.018218  0.018218
5   10898347  0.050160  0.050160  0.050160  0.031975  0.031975  0.031975
6   10933547 -0.004305 -0.004305 -0.004305 -0.003058 -0.003058 -0.003058
7   10940747 -0.062723 -0.062723 -0.062723 -0.061476 -0.061476 -0.061476
8   10954547 -0.057813 -0.057813 -0.057813 -0.049105 -0.049105 -0.049105
9   10976347 -0.060091 -0.060091 -0.060091 -0.057998 -0.057998 -0.057998
10  11073947  0.022292  0.022292  0.022292  0.023338  0.023338  0.023338
11  11114347 -0.027423 -0.027423 -0.027423 -0.013595 -0.013595 -0.013595
12  11116947  0.012339  0.012339  0.012339  0.01343