In [2]:
#Importing Libraries or Packages that are needed throughout the Program 
import numpy as np 
import pandas as pd 
import xgboost as  xgb 
import random 
import datetime as dt 
import gc

# ---- Load the datasets ----

DirBase = "/Volumes/wms_ssd/Qualifying Exam/"

# These are the files made available once the competition's terms and conditions are accepted.
# The training table and the two yearly property tables are read, in this order, from the
# Input folder under DirBase.
train, properties_2016_df, properties_2017_df = (
    pd.read_csv(DirBase + csv_name)
    for csv_name in ('Input/data.csv',
                     'Input/properties_2016_mod.csv',
                     'Input/properties_2017_mod.csv')
)

# The sample submission is used as the test table. Its 'ParcelId' column is renamed to
# 'parcelid' so the later merges can join on a single, identically named key.
test = pd.read_csv(DirBase + 'Output/sample_submission.csv').rename(columns={'ParcelId': 'parcelid'})

# Untouched copy that will later receive the predictions.
output = test.copy()

# Report the (rows, columns) shape of the training and sample tables.
print("Training Size:", train.shape, sep="")
print("Sample Size:", test.shape, sep="")


Training Size:(167888, 87)
Sample Size:(2985217, 7)


In [3]:

# Type Converting the DataSet

# Some algorithms run faster (and the frames use less memory) when 64-bit numeric columns are
# stored as 32-bit ones, so every float64 / int64 column of `train` and `test` is downcast.


def _downcast_64bit_columns(frame):
    """Return a copy of `frame` with float64 columns cast to float32 and int64 to int32.

    Columns of any other dtype are left unchanged. The dtypes are compared against the
    explicit NumPy types (not the builtin `int` / `float`) so both frames are treated
    identically regardless of the platform's default integer width.
    """
    narrower_dtypes = {}
    for column, dtype in zip(frame.columns, frame.dtypes):
        if dtype == np.float64:
            narrower_dtypes[column] = np.float32
        elif dtype == np.int64:
            narrower_dtypes[column] = np.int32
    return frame.astype(narrower_dtypes)


train = _downcast_64bit_columns(train)
test = _downcast_64bit_columns(test)

# Attach the per-year property features to every parcel of the sample submission.
df_test_2016 = test.merge(properties_2016_df, how='left', on='parcelid')
df_test_2017 = test.merge(properties_2017_df, how='left', on='parcelid')

### Remove previous variables to keep some memory
del properties_2016_df, properties_2017_df

print('Memory usage reduction...')
# Rescale the same three columns in the training frame and in both test frames:
# latitude / longitude are divided by 1e6 and censustractandblock by 1e12.
for frame in (train, df_test_2016, df_test_2017):
    frame[['latitude', 'longitude']] /= 1e6
    frame['censustractandblock'] /= 1e12

### Rearranging the DataSets ###

# Columns that are not used as predictors are dropped, and the target is split into two parts:
# the magnitude |logerror| (regression target) and its sign (classification target).
# The test frames must end up with the same feature columns, in the same order, as x_train,
# because only the raw `.values` arrays are passed on.

# Sign label: 0 when logerror is negative, 1 otherwise. Computed on the whole column at once.
train['sign'] = np.where(train['logerror'] < 0, 0, 1)

x_train = train.drop(['parcelid', 'sign', 'logerror', 'propertyzoningdesc',
                      'propertycountylandusecode', 'month', 'year'], axis=1)

# Identifier, free-text columns and the submission's month columns are not features.
test_columns_to_drop = ['parcelid', 'propertyzoningdesc', 'propertycountylandusecode',
                        '201610', '201611', '201612', '201710', '201711', '201712']
df_test_2016 = df_test_2016.drop(test_columns_to_drop, axis=1)
df_test_2017 = df_test_2017.drop(test_columns_to_drop, axis=1)

x_train = x_train.values
y_train_1 = np.abs(train['logerror'].values)  # magnitude target
y_train_2 = train['sign'].values              # sign target (0 / 1)




Memory usage reduction...


In [34]:
# Label each training row with the sign of its logerror: 0 when negative, 1 otherwise.
# np.where evaluates the comparison on the whole column at once instead of calling a
# Python lambda for every row.
train['sign'] = np.where(train['logerror'] < 0, 0, 1)
y_train_2 = train['sign'].values

In [8]:
### Cross Validation ###

# The data is divided into a training part and a validation part so the progress of the model
# can be monitored while it trains; the validation score shows when the model starts to over-
# or under-fit.

from sklearn.model_selection import train_test_split

X = x_train
y1 = y_train_1
y2 = y_train_2

# A single call splits the features and both targets with one row partition, so the two models
# are guaranteed to see the same training / validation rows. (Comparing two separately split
# feature arrays with `==` is unreliable because NaN never compares equal to NaN.)
Xtrain_1, Xvalid_1, ytrain_1, yvalid_1, ytrain_2, yvalid_2 = train_test_split(
    X, y1, y2, test_size=0.2, random_state=42)
Xtrain_2, Xvalid_2 = Xtrain_1, Xvalid_1

# Implement the Xgboost #

# Parameters for the magnitude regressor; results are monitored on the validation set.
# The xgboost parameters are explained at
# http://xgboost.readthedocs.io/en/latest/parameter.html

dtrain_1 = xgb.DMatrix(Xtrain_1, label=ytrain_1)
dvalid_1 = xgb.DMatrix(Xvalid_1, label=yvalid_1)

# Try different parameters!
xgb_params = {'min_child_weight': 5, 'eta': 0.035, 'colsample_bytree': 0.5, 'max_depth': 4,
            'subsample': 0.85, 'lambda': 0.8, 'nthread': -1, 'booster' : 'gbtree', 'silent': 1, 'gamma' : 0,
            'eval_metric': 'mae', 'objective': 'reg:linear' }

watchlist = [(dtrain_1, 'train_1'), (dvalid_1, 'valid_1')]

# Up to 1000 boosting rounds, stopping once the validation MAE has not improved for 100 rounds;
# the metrics are printed every 10 rounds.
model_xgb_1 = xgb.train(xgb_params, dtrain_1, 1000, watchlist, early_stopping_rounds=100,
                  maximize=False, verbose_eval=10)


[0]	train_1-mae:0.430384	valid_1-mae:0.430026
Multiple eval metrics have been passed: 'valid_1-mae' will be used for early stopping.

Will train until valid_1-mae hasn't improved in 100 rounds.
[10]	train_1-mae:0.310414	valid_1-mae:0.310104
[20]	train_1-mae:0.228337	valid_1-mae:0.227938
[30]	train_1-mae:0.172681	valid_1-mae:0.172298
[40]	train_1-mae:0.135277	valid_1-mae:0.134962
[50]	train_1-mae:0.110359	valid_1-mae:0.110085
[60]	train_1-mae:0.09389	valid_1-mae:0.093628
[70]	train_1-mae:0.083034	valid_1-mae:0.082778
[80]	train_1-mae:0.075919	valid_1-mae:0.075689
[90]	train_1-mae:0.071187	valid_1-mae:0.070972
[100]	train_1-mae:0.068055	valid_1-mae:0.067869
[110]	train_1-mae:0.065952	valid_1-mae:0.065797
[120]	train_1-mae:0.064521	valid_1-mae:0.064408
[130]	train_1-mae:0.063544	valid_1-mae:0.063472
[140]	train_1-mae:0.062841	valid_1-mae:0.062804
[150]	train_1-mae:0.062373	valid_1-mae:0.062367
[160]	train_1-mae:0.062005	valid_1-mae:0.06203
[170]	train_1-mae:0.061742	valid_1-mae:0.061791
[

In [35]:
# Redo the train / validation split for the sign target using the current y_train_2.
# The same test_size and random_state as the earlier split are used.
y2 = y_train_2
(Xtrain_2, Xvalid_2,
 ytrain_2, yvalid_2) = train_test_split(X, y2, random_state=42, test_size=0.2)

In [42]:
### Sign classifier (Xgboost) ###

# Trains a second booster on the sign labels, using the Xtrain_2 / Xvalid_2 split and
# reporting the classification error on both parts while it trains.
# The xgboost parameters are explained at
# http://xgboost.readthedocs.io/en/latest/parameter.html

dvalid_2 = xgb.DMatrix(Xvalid_2, label=yvalid_2)
dtrain_2 = xgb.DMatrix(Xtrain_2, label=ytrain_2)

# Try different parameters!
xgb_params = {
    'min_child_weight': 5,
    'eta': 0.3,
    'colsample_bytree': 0.5,
    'max_depth': 6,
    'subsample': 0.85,
    'lambda': 0.8,
    'nthread': -1,
    'booster': 'gbtree',
    'silent': 1,
    'gamma': 0.1,
    'eval_metric': 'merror',
    'objective': 'multi:softmax',
    'num_class': 2,
}

watchlist = [(dtrain_2, 'train_2'), (dvalid_2, 'valid_2')]

# Up to 1000 rounds; training stops once the validation error has not improved for 100 rounds.
# Metrics are printed every 10 rounds.
model_xgb_2 = xgb.train(
    xgb_params,
    dtrain_2,
    num_boost_round=1000,
    evals=watchlist,
    early_stopping_rounds=100,
    maximize=False,
    verbose_eval=10,
)



[0]	train_2-merror:0.42259	valid_2-merror:0.431741
Multiple eval metrics have been passed: 'valid_2-merror' will be used for early stopping.

Will train until valid_2-merror hasn't improved in 100 rounds.
[10]	train_2-merror:0.398228	valid_2-merror:0.420335
[20]	train_2-merror:0.38292	valid_2-merror:0.416821
[30]	train_2-merror:0.372325	valid_2-merror:0.416016
[40]	train_2-merror:0.364381	valid_2-merror:0.415153


KeyboardInterrupt: 

In [26]:
# Predicting the sign #

# The already trained sign classifier (model_xgb_2) is applied to the two test frames that were
# built by merging the sample file with the properties datasets. Each year's prediction is
# written to that year's three month columns of `output` as -1 or +1.

months = [10, 11, 12]
years = [2016, 2017]


for y in years:
    if y == 2016:
        x_test = df_test_2016
    else:
        x_test = df_test_2017

    print("PREDICTING " + str(y) + " ...")

    dtest = xgb.DMatrix(x_test.values)

    Predicted_test_xgb = model_xgb_2.predict(dtest)

    # Map the prediction to a sign once per year, on the whole array at once:
    # values below 0.5 become -1, everything else becomes +1.
    predicted_sign = np.where(Predicted_test_xgb < 0.5, -1, 1)

    for m in months:
        name = str(y) + str(m)
        output[name] = predicted_sign

print(output.head(50))

PREDICTING 2016 ...
PREDICTING 2017 ...
    parcelid  201610  201611  201612  201710  201711  201712
0   10754147       1       1       1       1       1       1
1   10759547      -1      -1      -1       1       1       1
2   10843547       1       1       1       1       1       1
3   10859147       1       1       1       1       1       1
4   10879947       1       1       1       1       1       1
5   10898347       1       1       1       1       1       1
6   10933547       1       1       1       1       1       1
7   10940747       1       1       1       1       1       1
8   10954547      -1      -1      -1      -1      -1      -1
9   10976347       1       1       1       1       1       1
10  11073947       1       1       1       1       1       1
11  11114347      -1      -1      -1      -1      -1      -1
12  11116947      -1      -1      -1       1       1       1
13  11142747       1       1       1      -1      -1      -1
14  11193347      -1      -1      -1      -1 

In [27]:
# Predicting the magnitude #

# The already trained regressor (model_xgb_1) is applied to each year's test frame, and every
# month column of `output` for that year is multiplied by the prediction.
# NOTE: the columns of `output` are overwritten with the product, so running this cell a
# second time multiplies by the prediction again.

months = [10, 11, 12]
years = [2016, 2017]

for y in years:
    x_test = df_test_2016 if y == 2016 else df_test_2017

    print("PREDICTING " + str(y) + " ...")

    dtest = xgb.DMatrix(x_test.values)
    Predicted_test_xgb = model_xgb_1.predict(dtest)

    for m in months:
        name = str(y) + str(m)
        output[name] = output[name].mul(Predicted_test_xgb)

print(output.head(50))


PREDICTING 2016 ...
PREDICTING 2017 ...
    parcelid    201610    201611    201612    201710    201711    201712
0   10754147  0.253454  0.253454  0.253454  0.226232  0.226232  0.226232
1   10759547 -0.092452 -0.092452 -0.092452  0.135281  0.135281  0.135281
2   10843547  0.427953  0.427953  0.427953  0.612129  0.612129  0.612129
3   10859147  0.536065  0.536065  0.536065  0.512513  0.512513  0.512513
4   10879947  0.391063  0.391063  0.391063  0.425663  0.425663  0.425663
5   10898347  0.439333  0.439333  0.439333  0.431588  0.431588  0.431588
6   10933547  0.118800  0.118800  0.118800  0.121055  0.121055  0.121055
7   10940747  0.274317  0.274317  0.274317  0.268063  0.268063  0.268063
8   10954547 -0.306277 -0.306277 -0.306277 -0.261130 -0.261130 -0.261130
9   10976347  0.303563  0.303563  0.303563  0.296585  0.296585  0.296585
10  11073947  0.404283  0.404283  0.404283  0.443952  0.443952  0.443952
11  11114347 -0.039725 -0.039725 -0.039725 -0.021680 -0.021680 -0.021680
12  1111694

In [None]:
# Show the first 50 rows of the `output` frame as plain text.
print(output.head(50))

In [28]:
# Submitting the Results
# The `output` frame is written as a CSV file under DirBase, without the index column and
# with floating-point values formatted to four decimal places.
print('Preparing the csv file ...')
output.to_csv(
    DirBase + 'xgb_predicted_results_mod12.csv',
    float_format='%.4f',
    index=False,
)
print("Finished writing the file")

Preparing the csv file ...
Finished writing the file
