In [6]:
#Importing Libraries or Packages that are needed throughout the Program 
import numpy as np 
import pandas as pd 
import xgboost as  xgb 
import random 
import datetime as dt 
import gc

# Load the Datasets #

# Load the competition files: both years of transactions (the training
# target), the 2016 property table (the features) and the sample submission
# (the parcels to predict).

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
train = pd.concat(
    [pd.read_csv('Input/train_2016_v2.csv', parse_dates=["transactiondate"]),
     pd.read_csv('Input/train_2017.csv', parse_dates=["transactiondate"])],
    ignore_index=True)
properties = pd.read_csv('Input/properties_2016.csv')
test = pd.read_csv('Output/sample_submission.csv')
# Use the same key name as the other tables so the merges below line up.
test = test.rename(columns={'ParcelId': 'parcelid'})

# Analyse the Dimensions of our Datasets.

print("Training Size:" + str(train.shape))
print("Property Size:" + str(properties.shape))
print("Sample Size:" + str(test.shape))


# Type Converting the DataSet

# Store 64-bit float/int columns as 32-bit to reduce memory usage. The same
# rule is applied to the property table and to the sample submission.

for frame in (properties, test):
    for c, dtype in zip(frame.columns, frame.dtypes):
        if dtype == np.float64:
            frame[c] = frame[c].astype(np.float32)
        elif dtype == np.int64:
            frame[c] = frame[c].astype(np.int32)

# Engineered ratio features:
#   living_area_prop - finished living area / lot size
#   value_ratio      - assessed value / tax amount
#   value_prop       - structure value / land value
# A zero denominator yields +/-inf; it is converted to NaN so that it is
# treated like any other missing value by the fillna(0) step further down.
for new_col, numerator, denominator in (
        ('living_area_prop', 'calculatedfinishedsquarefeet', 'lotsizesquarefeet'),
        ('value_ratio', 'taxvaluedollarcnt', 'taxamount'),
        ('value_prop', 'structuretaxvaluedollarcnt', 'landtaxvaluedollarcnt')):
    ratio = properties[numerator] / properties[denominator]
    properties[new_col] = ratio.replace([np.inf, -np.inf], np.nan)


### Merging the Datasets ###

# Attach the property features to every training transaction and to every
# parcel of the sample submission.

df_train = train.merge(properties, how='left', on='parcelid')
df_test = test.merge(properties, how='left', on='parcelid')


### Remove previous variables to keep some memory
del properties, train
gc.collect()


print('Memory usage reduction...')
# Rescale the large-magnitude coordinate / census identifiers.
df_train[['latitude', 'longitude']] /= 1e6
df_test[['latitude', 'longitude']] /= 1e6

df_train['censustractandblock'] /= 1e12
df_test['censustractandblock'] /= 1e12


# Label Encoding For Machine Learning & Filling Missing Values

# Missing values are replaced by 0, then every object (categorical) column is
# converted to integer codes. The encoder is fitted ONCE per column on the
# values of both frames: fitting it separately on train and test could give
# the same category different codes in the two sets.

from sklearn.preprocessing import LabelEncoder

for c in df_train.columns:
    df_train[c] = df_train[c].fillna(0)
for c in df_test.columns:
    df_test[c] = df_test[c].fillna(0)

lbl = LabelEncoder()
encode_candidates = list(df_train.columns) + [
    c for c in df_test.columns if c not in df_train.columns]
for c in encode_candidates:
    frames = [frame for frame in (df_train, df_test)
              if c in frame.columns and frame[c].dtype == 'object']
    if not frames:
        continue
    # Cast to str so that the 0 fill value and the labels sort together.
    as_text = [frame[c].astype(str) for frame in frames]
    lbl.fit(pd.concat(as_text).unique())
    for frame, text in zip(frames, as_text):
        frame[c] = lbl.transform(text)

### Rearranging the DataSets ###

# Drop the identifier, the target, the date and the two free-text columns,
# leaving only the predictors. The test set must contain the same features
# as the training set.

x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc',
                         'propertycountylandusecode'], axis=1)

x_test = df_test.drop(['parcelid', 'propertyzoningdesc',
                       'propertycountylandusecode', '201610', '201611',
                       '201612', '201710', '201711', '201712'], axis=1)

# The model receives plain arrays, so features are matched by position only.
# Align the test columns to the training order by name (raises KeyError if a
# training feature is missing from the test set).
if list(x_test.columns) != list(x_train.columns):
    x_test = x_test[x_train.columns]

x_train = x_train.values
y_train = df_train['logerror'].values

### Hold-out Validation ###

# Keep 20% of the training rows aside as a validation set so that over- or
# under-fitting can be monitored during boosting (single split, fixed seed).

from sklearn.model_selection import train_test_split

X = x_train
y = y_train

Xtrain, Xvalid, ytrain, yvalid = train_test_split(X, y, test_size=0.2, random_state=42)

# Implement the Xgboost #

# Parameter reference: http://xgboost.readthedocs.io/en/latest/parameter.html

dtrain = xgb.DMatrix(Xtrain, label=ytrain)
dvalid = xgb.DMatrix(Xvalid, label=yvalid)
dtest = xgb.DMatrix(x_test.values)

# Try different parameters!
# NOTE(review): 'silent' and 'reg:linear' appear to be deprecated in newer
# XGBoost releases ('verbosity' / 'reg:squarederror') - confirm against the
# installed version before changing.
xgb_params = {'min_child_weight': 5, 'eta': 0.035, 'colsample_bytree': 0.5, 'max_depth': 4,
              'subsample': 0.85, 'lambda': 0.8, 'nthread': -1, 'booster': 'gbtree', 'silent': 1, 'gamma': 0,
              'eval_metric': 'mae', 'objective': 'reg:linear'}

# The last entry of the watchlist ('valid') drives early stopping.
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

model_xgb = xgb.train(xgb_params, dtrain, num_boost_round=1000, evals=watchlist,
                      early_stopping_rounds=100, maximize=False, verbose_eval=10)

# Predicting the results #

# Apply the trained model to the test matrix built from the sample
# submission merged with the property table.
# NOTE(review): depending on the XGBoost version, predict() may use every
# boosted round rather than only those up to model_xgb.best_iteration -
# confirm and restrict the prediction if needed.

Predicted_test_xgb = model_xgb.predict(dtest)

# Submitting the Results

# Reload the submission template and write the same prediction into every
# month column.

sample_file = pd.read_csv('Output/sample_submission.csv')
for c in [name for name in sample_file.columns if name != 'ParcelId']:
    sample_file[c] = Predicted_test_xgb

print('Preparing the csv file ...')
sample_file.to_csv('xgb_predicted_results.csv', index=False, float_format='%.4f')
print("Finished writing the file")

  interactivity=interactivity, compiler=compiler, result=result)


Training Size:(167888, 3)
Property Size:(2985217, 58)
Sample Size:(2985217, 7)
Memory usage reduction...
[0]	train-mae:0.479326	valid-mae:0.479116
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 100 rounds.
[10]	train-mae:0.342448	valid-mae:0.34235
[20]	train-mae:0.248458	valid-mae:0.248453
[30]	train-mae:0.184682	valid-mae:0.18493
[40]	train-mae:0.141902	valid-mae:0.142406
[50]	train-mae:0.113752	valid-mae:0.114481
[60]	train-mae:0.095711	valid-mae:0.096593
[70]	train-mae:0.08455	valid-mae:0.085607
[80]	train-mae:0.077897	valid-mae:0.079062
[90]	train-mae:0.07405	valid-mae:0.075263
[100]	train-mae:0.071825	valid-mae:0.073076
[110]	train-mae:0.070538	valid-mae:0.071806
[120]	train-mae:0.069809	valid-mae:0.071093
[130]	train-mae:0.069362	valid-mae:0.070662
[140]	train-mae:0.06908	valid-mae:0.070402
[150]	train-mae:0.068896	valid-mae:0.070241
[160]	train-mae:0.068771	valid-mae:0.070129
[170]	train-mae:0.0

In [8]:
#Importing Libraries or Packages that are needed throughout the Program 
import numpy as np 
import pandas as pd 
import xgboost as  xgb 
import random 
import datetime as dt 
import gc

# Load the Datasets #

DirBase = "/Volumes/wms_ssd/Qualifying Exam/"

# Load the competition files from DirBase: both years of transactions (the
# training target), the modified 2016 property table (the features) and the
# sample submission (the parcels to predict).

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
train = pd.concat(
    [pd.read_csv(DirBase+'Input/train_2016_v2.csv', parse_dates=["transactiondate"]),
     pd.read_csv(DirBase+'Input/train_2017.csv', parse_dates=["transactiondate"])],
    ignore_index=True)
properties = pd.read_csv(DirBase+'Input/properties_2016_mod.csv')
test = pd.read_csv(DirBase+'Output/sample_submission.csv')
# Use the same key name as the other tables so the merges below line up.
test = test.rename(columns={'ParcelId': 'parcelid'})

# Analyse the Dimensions of our Datasets.

print("Training Size:" + str(train.shape))
print("Property Size:" + str(properties.shape))
print("Sample Size:" + str(test.shape))


# Type Converting the DataSet

# Store 64-bit float/int columns as 32-bit to reduce memory usage. The same
# rule is applied to the property table and to the sample submission.

for frame in (properties, test):
    for c, dtype in zip(frame.columns, frame.dtypes):
        if dtype == np.float64:
            frame[c] = frame[c].astype(np.float32)
        elif dtype == np.int64:
            frame[c] = frame[c].astype(np.int32)

# Engineered ratio features:
#   living_area_prop - finished living area / lot size
#   value_ratio      - assessed value / tax amount
#   value_prop       - structure value / land value
# A zero denominator yields +/-inf; it is converted to NaN so that it is
# treated like any other missing value by the fillna(0) step further down.
for new_col, numerator, denominator in (
        ('living_area_prop', 'calculatedfinishedsquarefeet', 'lotsizesquarefeet'),
        ('value_ratio', 'taxvaluedollarcnt', 'taxamount'),
        ('value_prop', 'structuretaxvaluedollarcnt', 'landtaxvaluedollarcnt')):
    ratio = properties[numerator] / properties[denominator]
    properties[new_col] = ratio.replace([np.inf, -np.inf], np.nan)


### Merging the Datasets ###

# Attach the property features to every training transaction and to every
# parcel of the sample submission.

df_train = train.merge(properties, how='left', on='parcelid')
df_test = test.merge(properties, how='left', on='parcelid')


### Remove previous variables to keep some memory
del properties, train
gc.collect()


print('Memory usage reduction...')
# Rescale the large-magnitude coordinate / census identifiers.
df_train[['latitude', 'longitude']] /= 1e6
df_test[['latitude', 'longitude']] /= 1e6

df_train['censustractandblock'] /= 1e12
df_test['censustractandblock'] /= 1e12


# Label Encoding For Machine Learning & Filling Missing Values

# Missing values are replaced by 0, then every object (categorical) column is
# converted to integer codes. The encoder is fitted ONCE per column on the
# values of both frames: fitting it separately on train and test could give
# the same category different codes in the two sets.

from sklearn.preprocessing import LabelEncoder

for c in df_train.columns:
    df_train[c] = df_train[c].fillna(0)
for c in df_test.columns:
    df_test[c] = df_test[c].fillna(0)

lbl = LabelEncoder()
encode_candidates = list(df_train.columns) + [
    c for c in df_test.columns if c not in df_train.columns]
for c in encode_candidates:
    frames = [frame for frame in (df_train, df_test)
              if c in frame.columns and frame[c].dtype == 'object']
    if not frames:
        continue
    # Cast to str so that the 0 fill value and the labels sort together.
    as_text = [frame[c].astype(str) for frame in frames]
    lbl.fit(pd.concat(as_text).unique())
    for frame, text in zip(frames, as_text):
        frame[c] = lbl.transform(text)

### Rearranging the DataSets ###

# Drop the identifier, the target, the date and the two free-text columns,
# leaving only the predictors. The test set must contain the same features
# as the training set.

x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc',
                         'propertycountylandusecode'], axis=1)

x_test = df_test.drop(['parcelid', 'propertyzoningdesc',
                       'propertycountylandusecode', '201610', '201611',
                       '201612', '201710', '201711', '201712'], axis=1)

# The model receives plain arrays, so features are matched by position only.
# Align the test columns to the training order by name (raises KeyError if a
# training feature is missing from the test set).
if list(x_test.columns) != list(x_train.columns):
    x_test = x_test[x_train.columns]

x_train = x_train.values
y_train = df_train['logerror'].values

### Hold-out Validation ###

# Keep 20% of the training rows aside as a validation set so that over- or
# under-fitting can be monitored during boosting (single split, fixed seed).

from sklearn.model_selection import train_test_split

X = x_train
y = y_train

Xtrain, Xvalid, ytrain, yvalid = train_test_split(X, y, test_size=0.2, random_state=42)

# Implement the Xgboost #

# Parameter reference: http://xgboost.readthedocs.io/en/latest/parameter.html

dtrain = xgb.DMatrix(Xtrain, label=ytrain)
dvalid = xgb.DMatrix(Xvalid, label=yvalid)
dtest = xgb.DMatrix(x_test.values)

# Try different parameters!
# NOTE(review): 'silent' and 'reg:linear' appear to be deprecated in newer
# XGBoost releases ('verbosity' / 'reg:squarederror') - confirm against the
# installed version before changing.
xgb_params = {'min_child_weight': 5, 'eta': 0.035, 'colsample_bytree': 0.5, 'max_depth': 4,
              'subsample': 0.85, 'lambda': 0.8, 'nthread': -1, 'booster': 'gbtree', 'silent': 1, 'gamma': 0,
              'eval_metric': 'mae', 'objective': 'reg:linear'}

# The last entry of the watchlist ('valid') drives early stopping.
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

model_xgb = xgb.train(xgb_params, dtrain, num_boost_round=1000, evals=watchlist,
                      early_stopping_rounds=100, maximize=False, verbose_eval=10)

# Predicting the results #

# Apply the trained model to the test matrix built from the sample
# submission merged with the property table.
# NOTE(review): depending on the XGBoost version, predict() may use every
# boosted round rather than only those up to model_xgb.best_iteration -
# confirm and restrict the prediction if needed.

Predicted_test_xgb = model_xgb.predict(dtest)

# Submitting the Results

# Reload the submission template and write the same prediction into every
# month column. The template is read from DirBase, i.e. the very file the
# test set was built from, so its rows match the predictions one-to-one.

sample_file = pd.read_csv(DirBase+'Output/sample_submission.csv')
for c in [name for name in sample_file.columns if name != 'ParcelId']:
    sample_file[c] = Predicted_test_xgb

print('Preparing the csv file ...')
# The result is written relative to the current working directory.
sample_file.to_csv('xgb_predicted_results_mod.csv', index=False, float_format='%.4f')
print("Finished writing the file")

Training Size:(167888, 3)
Property Size:(2985217, 45)
Sample Size:(2985217, 7)
Memory usage reduction...
[0]	train-mae:0.479329	valid-mae:0.479117
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 100 rounds.
[10]	train-mae:0.342412	valid-mae:0.342304
[20]	train-mae:0.248477	valid-mae:0.248444
[30]	train-mae:0.184624	valid-mae:0.18485
[40]	train-mae:0.141882	valid-mae:0.14238
[50]	train-mae:0.113725	valid-mae:0.114438
[60]	train-mae:0.095728	valid-mae:0.096602
[70]	train-mae:0.084561	valid-mae:0.085589
[80]	train-mae:0.077943	valid-mae:0.079076
[90]	train-mae:0.074074	valid-mae:0.075253
[100]	train-mae:0.07186	valid-mae:0.073066
[110]	train-mae:0.070579	valid-mae:0.071813
[120]	train-mae:0.069831	valid-mae:0.071085
[130]	train-mae:0.069377	valid-mae:0.070654
[140]	train-mae:0.069104	valid-mae:0.070397
[150]	train-mae:0.068931	valid-mae:0.070235
[160]	train-mae:0.068816	valid-mae:0.070136
[170]	train-mae:0

In [4]:
#Importing Libraries or Packages that are needed throughout the Program 
import numpy as np 
import pandas as pd 
import xgboost as  xgb 
import random 
import datetime as dt 
import gc

# Load the Datasets #

DirBase = "/Volumes/wms_ssd/Qualifying Exam/"

# Load the competition files from DirBase. Each year of transactions is kept
# separate so that it can be joined with the property table of the same year.

train_2016 = pd.read_csv(DirBase+'Input/train_2016_v2.csv', parse_dates=["transactiondate"])
train_2017 = pd.read_csv(DirBase+'Input/train_2017.csv', parse_dates=["transactiondate"])
properties_2016_df = pd.read_csv(DirBase+'Input/properties_2016_mod.csv')
properties_2017_df = pd.read_csv(DirBase+'Input/properties_2017_mod.csv')
test = pd.read_csv(DirBase+'Output/sample_submission.csv')
# Use the same key name as the other tables so the merges below line up.
test = test.rename(columns={'ParcelId': 'parcelid'})


# Type Converting the DataSet

# Store 64-bit float/int columns as 32-bit to reduce memory usage. The same
# rule is applied to both property tables and to the sample submission.

for frame in (properties_2016_df, properties_2017_df, test):
    for c, dtype in zip(frame.columns, frame.dtypes):
        if dtype == np.float64:
            frame[c] = frame[c].astype(np.float32)
        elif dtype == np.int64:
            frame[c] = frame[c].astype(np.int32)

### Merging the Datasets ###

# Join every transaction with the property table of its own year, stack the
# two years into one training frame, and join the parcels to predict with
# the 2017 property table.

df_train_2016 = train_2016.merge(properties_2016_df, how='left', on='parcelid')
df_train_2017 = train_2017.merge(properties_2017_df, how='left', on='parcelid')
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_train = pd.concat([df_train_2016, df_train_2017], ignore_index=True)
df_test = test.merge(properties_2017_df, how='left', on='parcelid')


### Remove previous variables to keep some memory
del properties_2016_df, properties_2017_df, train_2016, train_2017
gc.collect()


print('Memory usage reduction...')
# Rescale the large-magnitude coordinate / census identifiers.
df_train[['latitude', 'longitude']] /= 1e6
df_test[['latitude', 'longitude']] /= 1e6

df_train['censustractandblock'] /= 1e12
df_test['censustractandblock'] /= 1e12


# Label Encoding For Machine Learning & Filling Missing Values

# Missing values are replaced by 0, then every object (categorical) column is
# converted to integer codes. The encoder is fitted ONCE per column on the
# values of both frames: fitting it separately on train and test could give
# the same category different codes in the two sets.

from sklearn.preprocessing import LabelEncoder

for c in df_train.columns:
    df_train[c] = df_train[c].fillna(0)
for c in df_test.columns:
    df_test[c] = df_test[c].fillna(0)

lbl = LabelEncoder()
encode_candidates = list(df_train.columns) + [
    c for c in df_test.columns if c not in df_train.columns]
for c in encode_candidates:
    frames = [frame for frame in (df_train, df_test)
              if c in frame.columns and frame[c].dtype == 'object']
    if not frames:
        continue
    # Cast to str so that the 0 fill value and the labels sort together.
    as_text = [frame[c].astype(str) for frame in frames]
    lbl.fit(pd.concat(as_text).unique())
    for frame, text in zip(frames, as_text):
        frame[c] = lbl.transform(text)

### Rearranging the DataSets ###

# Drop the identifier, the target, the date and the two free-text columns,
# leaving only the predictors. The test set must contain the same features
# as the training set.

x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc',
                         'propertycountylandusecode'], axis=1)

x_test = df_test.drop(['parcelid', 'propertyzoningdesc',
                       'propertycountylandusecode', '201610', '201611',
                       '201612', '201710', '201711', '201712'], axis=1)

# The model receives plain arrays, so features are matched by position only.
# The training frame takes its column order from the 2016 table and the test
# frame from the 2017 table, so align the test columns to the training order
# by name (raises KeyError if a training feature is missing from the test set).
if list(x_test.columns) != list(x_train.columns):
    x_test = x_test[x_train.columns]

x_train = x_train.values
y_train = df_train['logerror'].values

### Hold-out Validation ###

# Keep 20% of the training rows aside as a validation set so that over- or
# under-fitting can be monitored during boosting (single split, fixed seed).

from sklearn.model_selection import train_test_split

X = x_train
y = y_train

Xtrain, Xvalid, ytrain, yvalid = train_test_split(X, y, test_size=0.2, random_state=42)

# Implement the Xgboost #

# Parameter reference: http://xgboost.readthedocs.io/en/latest/parameter.html

dtrain = xgb.DMatrix(Xtrain, label=ytrain)
dvalid = xgb.DMatrix(Xvalid, label=yvalid)
dtest = xgb.DMatrix(x_test.values)

# Try different parameters!
# NOTE(review): 'silent' and 'reg:linear' appear to be deprecated in newer
# XGBoost releases ('verbosity' / 'reg:squarederror') - confirm against the
# installed version before changing.
xgb_params = {'min_child_weight': 5, 'eta': 0.035, 'colsample_bytree': 0.5, 'max_depth': 4,
              'subsample': 0.85, 'lambda': 0.8, 'nthread': -1, 'booster': 'gbtree', 'silent': 1, 'gamma': 0,
              'eval_metric': 'mae', 'objective': 'reg:linear'}

# The last entry of the watchlist ('valid') drives early stopping.
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

model_xgb = xgb.train(xgb_params, dtrain, num_boost_round=1000, evals=watchlist,
                      early_stopping_rounds=100, maximize=False, verbose_eval=10)

# Predicting the results #

# Apply the trained model to the test matrix built from the sample
# submission merged with the 2017 property table.
# NOTE(review): depending on the XGBoost version, predict() may use every
# boosted round rather than only those up to model_xgb.best_iteration -
# confirm and restrict the prediction if needed.

Predicted_test_xgb = model_xgb.predict(dtest)

# Submitting the Results

# Reload the submission template and write the same prediction into every
# month column.

sample_file = pd.read_csv(DirBase+'Output/sample_submission.csv')
for c in [name for name in sample_file.columns if name != 'ParcelId']:
    sample_file[c] = Predicted_test_xgb

print('Preparing the csv file ...')
sample_file.to_csv(DirBase+'xgb_predicted_results_mod3.csv', index=False, float_format='%.4f')
print("Finished writing the file")

Memory usage reduction...
[0]	train-mae:0.479328	valid-mae:0.479121
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 100 rounds.
[10]	train-mae:0.342432	valid-mae:0.34234
[20]	train-mae:0.248471	valid-mae:0.248471
[30]	train-mae:0.184645	valid-mae:0.1849
[40]	train-mae:0.141844	valid-mae:0.142393
[50]	train-mae:0.113689	valid-mae:0.114498
[60]	train-mae:0.095678	valid-mae:0.096651
[70]	train-mae:0.084513	valid-mae:0.085664
[80]	train-mae:0.077857	valid-mae:0.079108
[90]	train-mae:0.073982	valid-mae:0.075273
[100]	train-mae:0.071746	valid-mae:0.073068
[110]	train-mae:0.070444	valid-mae:0.071798
[120]	train-mae:0.069693	valid-mae:0.071078
[130]	train-mae:0.069231	valid-mae:0.070647
[140]	train-mae:0.068942	valid-mae:0.070383
[150]	train-mae:0.068753	valid-mae:0.070226
[160]	train-mae:0.068609	valid-mae:0.070109
[170]	train-mae:0.068524	valid-mae:0.070058
[180]	train-mae:0.068464	valid-mae:0.070021
[190]	tr

_I will now make some considerable tweaks to the script (but no major changes to the model)._

In [None]:
#Importing Libraries or Packages that are needed throughout the Program 
import numpy as np 
import pandas as pd 
import xgboost as  xgb 
import random 
import datetime as dt 
import gc

# Load the Datasets #

DirBase = "/Volumes/wms_ssd/Qualifying Exam/"

# Load the modified competition files from DirBase. Each year of transactions
# is kept separate so that it can be joined with the property table of the
# same year.

train_2016 = pd.read_csv(DirBase+'Input/train_2016_mod.csv', parse_dates=["transactiondate"])
train_2017 = pd.read_csv(DirBase+'Input/train_2017_mod.csv', parse_dates=["transactiondate"])
properties_2016_df = pd.read_csv(DirBase+'Input/properties_2016_mod.csv')
properties_2017_df = pd.read_csv(DirBase+'Input/properties_2017_mod.csv')
test = pd.read_csv(DirBase+'Output/sample_submission.csv')
# Use the same key name as the other tables so the merges below line up.
test = test.rename(columns={'ParcelId': 'parcelid'})

# Analyse the Dimensions of our Datasets.

print("Training Size:" + str(train_2016.shape) + ', ' + str(train_2017.shape))
print("Property Size:" + str(properties_2016_df.shape))
print("Sample Size:" + str(test.shape))


# Type Converting the DataSet

# Store 64-bit float/int columns as 32-bit to reduce memory usage. The same
# rule is applied to both property tables and to the sample submission.

for frame in (properties_2016_df, properties_2017_df, test):
    for c, dtype in zip(frame.columns, frame.dtypes):
        if dtype == np.float64:
            frame[c] = frame[c].astype(np.float32)
        elif dtype == np.int64:
            frame[c] = frame[c].astype(np.int32)

### Merging the Datasets ###

# Join every transaction with the property table of its own year, stack the
# two years into one training frame, and join the parcels to predict with
# the 2017 property table.

df_train_2016 = train_2016.merge(properties_2016_df, how='left', on='parcelid')
df_train_2017 = train_2017.merge(properties_2017_df, how='left', on='parcelid')
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_train = pd.concat([df_train_2016, df_train_2017], ignore_index=True)
df_test = test.merge(properties_2017_df, how='left', on='parcelid')


### Remove previous variables to keep some memory
del properties_2016_df, properties_2017_df, train_2016, train_2017
gc.collect()


print('Memory usage reduction...')
# Rescale the large-magnitude coordinate / census identifiers.
df_train[['latitude', 'longitude']] /= 1e6
df_test[['latitude', 'longitude']] /= 1e6

df_train['censustractandblock'] /= 1e12
df_test['censustractandblock'] /= 1e12


# Label Encoding For Machine Learning & Filling Missing Values

# Missing values are replaced by 0, then every object (categorical) column is
# converted to integer codes. The encoder is fitted ONCE per column on the
# values of both frames: fitting it separately on train and test could give
# the same category different codes in the two sets.

from sklearn.preprocessing import LabelEncoder

for c in df_train.columns:
    df_train[c] = df_train[c].fillna(0)
for c in df_test.columns:
    df_test[c] = df_test[c].fillna(0)

lbl = LabelEncoder()
encode_candidates = list(df_train.columns) + [
    c for c in df_test.columns if c not in df_train.columns]
for c in encode_candidates:
    frames = [frame for frame in (df_train, df_test)
              if c in frame.columns and frame[c].dtype == 'object']
    if not frames:
        continue
    # Cast to str so that the 0 fill value and the labels sort together.
    as_text = [frame[c].astype(str) for frame in frames]
    lbl.fit(pd.concat(as_text).unique())
    for frame, text in zip(frames, as_text):
        frame[c] = lbl.transform(text)

### Rearranging the DataSets ###

# Drop the identifier, the target, the date and the two free-text columns,
# leaving only the predictors. The test set must contain the same features
# as the training set.

x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc',
                         'propertycountylandusecode'], axis=1)

x_test = df_test.drop(['parcelid', 'propertyzoningdesc',
                       'propertycountylandusecode', '201610', '201611',
                       '201612', '201710', '201711', '201712'], axis=1)

# The model receives plain arrays, so features are matched by position only.
# The training frame takes its column order from the 2016 table and the test
# frame from the 2017 table, so align the test columns to the training order
# by name (raises KeyError if a training feature is missing from the test set).
if list(x_test.columns) != list(x_train.columns):
    x_test = x_test[x_train.columns]

x_train = x_train.values
y_train = df_train['logerror'].values

### Hold-out Validation ###

# Keep 20% of the training rows aside as a validation set so that over- or
# under-fitting can be monitored during boosting (single split, fixed seed).

from sklearn.model_selection import train_test_split

X = x_train
y = y_train

Xtrain, Xvalid, ytrain, yvalid = train_test_split(X, y, test_size=0.2, random_state=42)

# Implement the Xgboost #

# Parameter reference: http://xgboost.readthedocs.io/en/latest/parameter.html

dtrain = xgb.DMatrix(Xtrain, label=ytrain)
dvalid = xgb.DMatrix(Xvalid, label=yvalid)
dtest = xgb.DMatrix(x_test.values)

# Try different parameters!
# NOTE(review): 'silent' and 'reg:linear' appear to be deprecated in newer
# XGBoost releases ('verbosity' / 'reg:squarederror') - confirm against the
# installed version before changing.
xgb_params = {'min_child_weight': 5, 'eta': 0.035, 'colsample_bytree': 0.5, 'max_depth': 4,
              'subsample': 0.85, 'lambda': 0.8, 'nthread': -1, 'booster': 'gbtree', 'silent': 1, 'gamma': 0,
              'eval_metric': 'mae', 'objective': 'reg:linear'}

# The last entry of the watchlist ('valid') drives early stopping.
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

model_xgb = xgb.train(xgb_params, dtrain, num_boost_round=1000, evals=watchlist,
                      early_stopping_rounds=100, maximize=False, verbose_eval=10)

# Predicting the results #

# Apply the trained model to the test matrix built from the sample
# submission merged with the 2017 property table.
# NOTE(review): depending on the XGBoost version, predict() may use every
# boosted round rather than only those up to model_xgb.best_iteration -
# confirm and restrict the prediction if needed.

Predicted_test_xgb = model_xgb.predict(dtest)

# Submitting the Results

# Reload the submission template and write the same prediction into every
# month column.

sample_file = pd.read_csv(DirBase+'Output/sample_submission.csv')
for c in [name for name in sample_file.columns if name != 'ParcelId']:
    sample_file[c] = Predicted_test_xgb

print('Preparing the csv file ...')
# NOTE(review): a file named 'xgb_predicted_results_mod3.csv' under DirBase
# will be overwritten by this run - pick a new name if earlier results
# written under that name must be kept.
sample_file.to_csv(DirBase+'xgb_predicted_results_mod3.csv', index=False, float_format='%.4f')
print("Finished writing the file")