In [1]:
# general
import pandas as pd
import numpy as np
import sklearn
import sys
import csv

# data preprocessing
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import OneHotEncoder

# feature selection
from sklearn.feature_selection import VarianceThreshold

# models
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# model selection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

# metrics
from sklearn.metrics import mean_absolute_error

# dump data to xgboost files
from sklearn.datasets import dump_svmlight_file



In [2]:
# define some constants

n_jobs = 2
random_state = 1

# data
data_dir = '../data/'
xgboost_data_dir = '../xgboost_data/'
hyperParamFile = 'hyperparams_cat.txt'

# size of data
train_size = 188318
test_size = 125546

# features to be selected
num_features_cat = 50
num_features_cont = 14

# training parameters
criterion = 'mae'
n_folds = 3  # cross validation

# random forest params
verbose_tree = 0

# xgboost params
early_stopping_rounds = 20
num_boost_rounds = 1000

In [3]:
def write_hyperparams(hyperparams, fileName):
    f = open(fileName, 'w')
    f.write(' '.join(str(x) for x in hyperparams) + '\n')
    f.close()

def read_hyperparams(fileName):
    f = open(fileName, 'r')
    n_folds, n_estimators, max_depth, max_features, \
        score = f.read().split()
    return int(n_folds), int(n_estimators), int(max_depth), \
        max_features, float(score)

In [4]:
# function to take the data and build features that can then be used to train a model

def build_features(df):
    ''' build the features to use for the model'''
    
    # remove the labels and the ids for use
    
    y = df.pop('loss').values if 'loss' in list(df) else None
    ids = df.pop('id').values

    # do not always impute data for tree models
    # compute using another category
    
    # create an imputer for imputer values later
    imputer = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
    
    # determine whether data is categorical or continuous
    leave_one_out_cols = []
    leave_one_out_counts = []
    one_hot_cols = []
    col_count = 0
    for column in df:

        # determine if the column is categorical or not
        if 'cat' in column:

            # create an encoding for categorical vars
            mapping = {label:idx for idx, label in \
                enumerate(np.unique(df[column]))}

            # convert everything into integers for categorical
            df[column] = df[column].map(mapping)
            df[column] = df[column].astype(int)
            
            unique_elems = len(mapping)

            # perform leave-one-out counting
            if unique_elems > 10:
                #df[column] = [df[column].values.tolist().count(x)-1 for \
                #	x in df[column].values.tolist()]
                leave_one_out_cols.append(col_count)
                # initialize counts to -1 for Leave One Out counting
                leave_one_out_counts.append([-1 for x in range(unique_elems)])
            else:
                one_hot_cols.append(col_count)

        col_count += 1

    imputer = imputer.fit(df)
    X = imputer.transform(df.values)
    X_cat = X[:, :116]
    X_cont = X[:, 116:]
    
    # transform data to leave-one-out counting
    for num, col in enumerate(leave_one_out_cols):
        # count the data
        for idx, value in enumerate(X_cat[:, col]):
            leave_one_out_counts[num][int(value)] = leave_one_out_counts[num][int(value)] + 1
        # apply the counted data to form LOO data
        for idx, value in enumerate(X_cat[:, col]):
            X_cat[idx][col] = leave_one_out_counts[num][int(value)] 
            
    # transform data to one-hot encoded
    one_hot_encoder = OneHotEncoder(categorical_features=one_hot_cols, sparse=False)
    X_cat = one_hot_encoder.fit_transform(X_cat)
    
    print('X cat size')
    print(X_cat.shape)
    print('X cont size')
    print(X_cont.shape)
    
    return X_cat, X_cont, y, ids

In [5]:
# read in the train dataset
print('loading data...')

df_train = pd.read_csv(data_dir + 'train.csv', header=0)
df_test = pd.read_csv(data_dir + 'test.csv', header=0)
df_all = df_train.append(df_test)  # we want to process all data at the same time for counts and such

print('processing cross validation data...')
# process the training data alone for cross validation
X_cv_cat, X_cv_cont, y_cv, ids_cv = build_features(df_train)

print('processing all data...')
# process the training and test data together
X_all_cat, X_all_cont, y_all, ids_all = build_features(df_all)

loading data...
processing cross validation data...
X cat size
(188318, 297)
X cont size
(188318, 14)
processing all data...
X cat size
(313864, 301)
X cont size
(313864, 14)


In [6]:
# separate the data into training and test sets

# test
X_test_cat = X_all_cat[train_size:, :]
X_test_cont = X_all_cont[train_size:, :]
y_test = y_all[train_size:]
ids_test = ids_all[train_size:]

# train
X_train_cat = X_all_cat[:train_size, :]
X_train_cont = X_all_cont[:train_size, :]
y_train = y_all[:train_size]
ids_train = ids_all[:train_size]

In [7]:
# convert the data into datarames and then write to .csv files
print('converting data to dataframes...')

# cross validation data
df_cv_cont = pd.DataFrame(X_cv_cont)
df_cv_cat = pd.DataFrame(X_cv_cat)
df_cv_y = pd.DataFrame(y_cv)
df_cv_ids = pd.DataFrame(ids_cv)

# test data
df_test_cont = pd.DataFrame(X_test_cont)
df_test_cat = pd.DataFrame(X_test_cat)
df_test_ids = pd.DataFrame(ids_test)

# train data
df_train_cont = pd.DataFrame(X_train_cont)
df_train_cat = pd.DataFrame(X_train_cat)
df_train_y = pd.DataFrame(y_train)
df_train_ids = pd.DataFrame(ids_train)

print('writing the data to .csv files...')

# cross validation data
df_cv_cont.to_csv(path_or_buf=data_dir+'X_cv_cont.csv')
df_cv_cat.to_csv(path_or_buf=data_dir+'X_cv_cat.csv')
df_cv_y.to_csv(path_or_buf=data_dir+'y_cv.csv')
df_cv_ids.to_csv(path_or_buf=data_dir+'ids_cv.csv')

# test data
df_test_cont.to_csv(path_or_buf=data_dir+'X_test_cont.csv')
df_test_cat.to_csv(path_or_buf=data_dir+'X_test_cat.csv')
df_test_ids.to_csv(path_or_buf=data_dir+'ids_test.csv')

# train data
df_train_cont.to_csv(path_or_buf=data_dir+'X_train_cont.csv')
df_train_cat.to_csv(path_or_buf=data_dir+'X_train_cat.csv')
df_train_y.to_csv(path_or_buf=data_dir+'y_train.csv')
df_train_ids.to_csv(path_or_buf=data_dir+'ids_train.csv')

print('data has been processed and written to .csv files in ' + data_dir)

converting data to dataframes...
writing the data to .csv files...
data has been processed and written to .csv files in ../data/


In [8]:
# read in the train dataset from .csv files
# (only needed if starting from here and not above)
print('loading data...')

# cross validation data
df_cv_cont = pd.read_csv(data_dir+'X_cv_cont.csv', header=0, index_col=0)
df_cv_cat = pd.read_csv(data_dir+'X_cv_cat.csv', header=0, index_col=0)
df_cv_y = pd.read_csv(data_dir+'y_cv.csv', header=0, index_col=0)
df_cv_ids = pd.read_csv(data_dir+'ids_cv.csv', header=0, index_col=0)

# test data
df_test_cont = pd.read_csv(data_dir+'X_test_cont.csv', header=0, index_col=0)
df_test_cat = pd.read_csv(data_dir+'X_test_cat.csv', header=0, index_col=0)
df_test_ids = pd.read_csv(data_dir+'ids_test.csv', header=0, index_col=0)

# train data
df_train_cont = pd.read_csv(data_dir+'X_train_cont.csv', header=0, index_col=0)
df_train_cat = pd.read_csv(data_dir+'X_train_cat.csv', header=0, index_col=0)
df_train_y = pd.read_csv(data_dir+'y_train.csv', header=0, index_col=0)
df_train_ids = pd.read_csv(data_dir+'ids_train.csv', header=0, index_col=0)
print('data has been loaded!')

loading data...
data has been loaded!


In [9]:
# get values from the dataframes
X_cv_cont = df_cv_cont.values
X_cv_cat = df_cv_cat.values
y_cv = np.ravel(df_cv_y.values)

# create a basic Random Forest Classifier to use for feature selection
print('creating a model...')
# create a tree to select features
tree_cat = RandomForestRegressor(n_jobs=n_jobs,
    random_state=1, n_estimators=100,
    max_features='sqrt', max_depth=10)
tree_cont = RandomForestRegressor(n_jobs=n_jobs,
    random_state=1, n_estimators=100,
    max_features='sqrt', max_depth=10)
print('done.')

creating a model...
done.


In [10]:
# feature selection
print('selecting features...')

# use variance threshold to select features
# many of the features are in categories with few vars
selector_variance_cat = VarianceThreshold(threshold=0.01)
X_cv_cat = selector_variance_cat.fit_transform(X_cv_cat)
print('shape of X_cv_cat after variance threshold')
print(X_cv_cat.shape)


selecting features...
shape of X_cv_cat after variance threshold
(188318, 175)


In [11]:
# fit the model for continuous data
print('fitting tree to continuous data...')
tree_cont.fit(X_cv_cont, y_cv)
feature_importances_cont = tree_cont.feature_importances_
feature_mapping_cont = {importance:idx for idx, importance in \
    enumerate(feature_importances_cont)}
sorted_features_cont = feature_importances_cont.argsort()
sorted_indices_cont = []
print(sorted_features_cont)
for x in sorted_features_cont[:num_features_cont]:
    sorted_indices_cont.insert(0, x)

# create a basic tree for categorical features
print('fitting tree to categorical data...')
tree_cat.fit(X_cv_cat, y_cv)
feature_importances_cat = tree_cat.feature_importances_
feature_mapping_cat = {importance:idx for idx, importance in \
    enumerate(feature_importances_cat)}
sorted_features_cat = feature_importances_cat.argsort()
sorted_indices_cat = []
print(sorted_features_cat)
for x in sorted_features_cat[:num_features_cat]:
    sorted_indices_cat.insert(0, x)

print('finished features selection!')

fitting tree to continuous data...
[ 4  9  8  7  3  0 12  5  2 10 11  1 13  6]
fitting tree to categorical data...
[116 148  89 154 141 109 105 157  74 110  79 143  88 145  75  78 136 156
  61  44  46  45  26 113  60  80  27  58  54  32  14 155  64  33 111  55
  15  81 130  43 134  29 121 122 106 144  65 153  59 108 140 152 107 112
  47  98  42 138 104 151  66 133 150 129  67 147 127 142  56 139  92  41
 131  70 114 117  40  50  34  71  93 115 149 132 125  49  51 103  30  86
 126 146  31  39  57  37  35  48  28  10  36   6  94 128  69  11  68  38
  87   8   5   7  20   9  83  82  52 158 101  53  63  62 165  73  91   4
  72 170  17  90 162  24 159   2  25  16 166 172 169 164  21 119 168   3
 161 135 173 167  84 137   0  85 174  12 120 124   1  18 123 100 171 163
  22  19  23  13 102  77  76 118 160  95  97  96  99]
finished features selection!


In [12]:
print('writing output data with selected features...')

# cross validation
df_cv_cont_sel = df_cv_cont.iloc[:, sorted_indices_cont[:num_features_cont]]
df_cv_cont_sel.to_csv(path_or_buf=xgboost_data_dir+'X_cv_cont_sel.csv')
df_cv_cat_sel = df_cv_cat.iloc[:, sorted_indices_cat[:num_features_cat]]
df_cv_cat_sel.to_csv(path_or_buf=xgboost_data_dir+'X_cv_cat_sel.csv')

# test
df_test_cont_sel = df_test_cont.iloc[:, sorted_indices_cont[:num_features_cont]]
df_test_cont_sel.to_csv(path_or_buf=xgboost_data_dir+'X_test_cont_sel.csv')
df_test_cat_sel = df_test_cat.iloc[:, sorted_indices_cat[:num_features_cat]]
df_test_cat_sel.to_csv(path_or_buf=xgboost_data_dir+'X_test_cat_sel.csv')

#train
df_train_cont_sel = df_train_cont.iloc[:, sorted_indices_cont[:num_features_cont]]
df_train_cont_sel.to_csv(path_or_buf=xgboost_data_dir+'X_test_cont_sel.csv')
print(sorted_indices_cat[:num_features_cat])
print(df_train_cat.shape)
df_train_cat_sel = df_train_cat.iloc[:, sorted_indices_cat[:num_features_cat]]
df_train_cat_sel.to_csv(path_or_buf=xgboost_data_dir+'X_test_cat_sel.csv')

# write the data to svmlight files that can be used by xgboost

# cross validation
X_cv_cont_sel = df_cv_cont_sel.values
dump_svmlight_file(X_cv_cont_sel, y_cv, xgboost_data_dir+'cv_cont_sel.dat',
                   zero_based=True, multilabel=False)

# train
X_train_cont_sel = df_train_cont_sel.values
dump_svmlight_file(X_train_cont_sel, y_train, xgboost_data_dir+'X_train_cont_sel.dat',
                    zero_based=True, multilabel=False)

# test
X_test_cont_sel = df_test_cont_sel.values
# make up y_test because it does not exist
y_test = [ 0 for x in range(X_test_cont_sel.shape[0])]
dump_svmlight_file(X_test_cont_sel, y_test, xgboost_data_dir+'X_test_cont_sel.dat',
                   zero_based=True, multilabel=False)


# cross validation
X_cv_cat_sel = df_cv_cat_sel.values
# some checks to see status
print('df_cv_cat_sel head')
print(df_cv_cat_sel.head())
print('df_cv_cat_sel column sum')
print(df_cv_cat_sel.sum(axis=0))
print('df_cv_y head')
print(df_cv_y.head())
dump_svmlight_file(X_cv_cat_sel, y_cv, xgboost_data_dir+'cv_cat_sel.dat',
                   zero_based=True, multilabel=False)

# train
X_train_cat_sel = df_train_cat_sel.values
dump_svmlight_file(X_train_cat_sel, y_train, xgboost_data_dir+'X_train_cat_sel.dat',
                   zero_based=True, multilabel=False)

# test
X_test_cat_sel = df_test_cat_sel.values
print('df_test_cat_sel head')
print(df_test_cat_sel.head())
dump_svmlight_file(X_test_cat_sel, y_test, xgboost_data_dir+'X_test_cat_sel.dat',
                   zero_based=True, multilabel=False)

print('successfully wrote selected features to file!')

writing output data with selected features...
[108, 59, 153, 65, 144, 106, 122, 121, 29, 134, 43, 130, 81, 15, 55, 111, 33, 64, 155, 14, 32, 54, 58, 27, 80, 60, 113, 26, 45, 46, 44, 61, 156, 136, 78, 75, 145, 88, 143, 79, 110, 74, 157, 105, 109, 141, 154, 89, 148, 116]
(188318, 301)
df_cv_cat_sel head
   108   59  153   65  144  106  122  121   29  134 ...   110   74  157  105  \
0  1.0  0.0  1.0  0.0  1.0  1.0  1.0  0.0  0.0  1.0 ...   1.0  1.0  0.0  0.0   
1  1.0  0.0  1.0  0.0  1.0  1.0  1.0  0.0  0.0  1.0 ...   1.0  1.0  0.0  0.0   
2  1.0  0.0  1.0  0.0  1.0  1.0  1.0  0.0  0.0  1.0 ...   1.0  1.0  0.0  0.0   
3  1.0  0.0  1.0  0.0  0.0  1.0  1.0  0.0  0.0  1.0 ...   1.0  1.0  0.0  0.0   
4  1.0  0.0  1.0  0.0  1.0  1.0  1.0  0.0  0.0  1.0 ...   1.0  1.0  0.0  0.0   

   109  141  154   89  148  116  
0  0.0  0.0  0.0  0.0  0.0  1.0  
1  0.0  0.0  0.0  0.0  0.0  1.0  
2  0.0  0.0  0.0  0.0  0.0  1.0  
3  0.0  0.0  0.0  0.0  0.0  1.0  
4  0.0  0.0  0.0  0.0  0.0  1.0  

[5 rows x 5

In [13]:
# paramlist for xgboost
param = {}
param['objective'] = 'reg:linear'
param['nthread'] = 2
param['eval_metric'] = 'mae'
# maximum depth of tree
param['max_depth'] = 10  #3-10
# analogous to learning rate
param['eta'] = 0.05  #0.01-0.02
# larger values prevent overfitting
param['min_child_weight'] = 0.3  #0-1
param['silent'] = 1  # prints messages to screen (1 silences these)
param['tree_method'] = 'auto'
param['lambda'] = 0  #0-1
param['alpha'] = 0  #0-1

In [14]:
# train the categorical model for the data

print('cross validation using categorical xgboost model...')
data_cv_cat = xgb.DMatrix(xgboost_data_dir+'cv_cat_sel.dat')
evaluations_cat = xgb.cv(params=param, dtrain=data_cv_cat, num_boost_round=num_boost_rounds,
            nfold=n_folds, early_stopping_rounds=early_stopping_rounds)
print(evaluations_cat)

print('training with all cat train data...')
data_train_cat = xgb.DMatrix(xgboost_data_dir+'X_train_cat_sel.dat')
tree_cat = xgb.train( param, data_train_cat, num_boost_rounds)
tree_cat.save_model('categorical.model')

print('predicting outputs...')
data_test_cat = xgb.DMatrix(xgboost_data_dir+'X_test_cat_sel.dat')
y_pred_cat = tree_cat.predict(data_test_cat)
print(y_pred_cat)

cross validation using categorical xgboost model...
    test-mae-mean  test-mae-std  train-mae-mean  train-mae-std
0     2885.043050     16.779644     2885.056966       8.111193
1     2741.287923     16.463348     2741.263835       7.689561
2     2606.070800     16.099239     2605.879232       7.290617
3     2480.235596     15.731681     2479.781087       6.894811
4     2365.227213     15.301017     2364.294271       6.526983
5     2261.316569     14.896306     2259.838542       6.176213
6     2168.536296     14.447310     2166.397949       5.875583
7     2086.390950     14.078405     2083.548421       5.619960
8     2014.153442     13.600379     2010.526042       5.404990
9     1950.998210     13.132238     1946.467814       5.211249
10    1896.078125     12.680925     1890.621216       4.983472
11    1848.434814     12.307592     1842.058390       4.800816
12    1807.278157     11.978721     1799.890259       4.694602
13    1771.799520     11.726861     1763.399821       4.506312
14 

In [15]:
# train the continuous model for the data

print('cross validation using continuous xgboost model...')
data_cv_cont = xgb.DMatrix(xgboost_data_dir+'cv_cont_sel.dat')
evaluations_cont = xgb.cv(params=param, dtrain=data_cv_cont, num_boost_round=num_boost_rounds,
            nfold=n_folds, early_stopping_rounds=early_stopping_rounds)
print(evaluations_cont)

print('training with all continous train data...')
data_train_cont = xgb.DMatrix(xgboost_data_dir+'X_train_cont_sel.dat')
tree_cont = xgb.train( param, data_train_cont, num_boost_rounds)
tree_cont.save_model('continuous.model')

print('predicting outputs...')
data_test_cont = xgb.DMatrix(xgboost_data_dir+'X_test_cont_sel.dat')
y_pred_cont = tree_cont.predict(data_test_cont)
print(y_pred_cont)

cross validation using continuous xgboost model...
    test-mae-mean  test-mae-std  train-mae-mean  train-mae-std
0     2885.000733     17.566056     2885.046387       8.116454
1     2741.687826     18.068665     2741.249105       7.719913
2     2607.395589     18.369722     2606.254639       7.323146
3     2484.220378     18.568338     2481.954508       7.019232
4     2374.279541     18.497003     2370.353027       6.772639
5     2278.070231     18.172985     2272.374186       6.669026
6     2195.375325     17.988749     2187.642659       6.516154
7     2124.711914     17.628771     2114.935872       6.451837
8     2064.680054     17.190060     2052.587443       6.537306
9     2013.584839     16.457641     1999.315877       6.571406
10    1970.574056     15.895278     1953.770223       6.502162
11    1934.475179     15.548049     1915.096232       6.522559
12    1904.349650     14.972430     1882.288656       6.541310
13    1879.015503     14.537666     1854.568237       6.619546
14  

In [20]:
# Average the outputs together and write the to .csv files

test_pred = (y_pred_cat + y_pred_cont) / 2
print(test_pred)
print(ids_test)

[ 2359.90478516  3104.21313477  5786.97460938 ...,  2947.47460938
  2374.19750977  2965.62744141]
[     4      6      9 ..., 587627 587629 587634]


In [21]:
# write output to a file
outputFile = 'allstateClaims_combined.csv'
print('writing output to %s...' % outputFile)
prediction_file = open(outputFile, 'w')
open_file_object = csv.writer(prediction_file)
open_file_object.writerow(['id', 'loss'])
open_file_object.writerows(zip(ids_test, test_pred))
prediction_file.close()
print('completed')

writing output to allstateClaims_combined.csv...
completed


In [19]:
# average the outputs together
print('loading all prediction data...')

df_out_cont = pd.read_csv(xgboost_data_dir+'allstateClaims_cont.csv', header=0, index_col=0)
df_out_cat = pd.read_csv(xgboost_data_dir+'allstateClaims_cat.csv', header=0, index_col=0)
df_out_ids = pd.read_csv(data_dir+'ids_test.csv', header=0, index_col=0)

# average the output of the two columns
out_cont = df_out_cont.values
print(out_cont[:,0])
out_cat = df_out_cat.values
print(out_cat[:, 0])
out_ids = df_out_ids.values
print(out_ids[:, 0])

test_pred = (out_cont + out_cat) / 2.0
print(test_pred[:, 0])

# write the output to csv
outputFile = '../allStateClaims_combination.csv'
print('writing output to %s...' % outputFile)
prediction_file = open(outputFile, 'w')
open_file_object = csv.writer(prediction_file)
open_file_object.writerow(['id', 'loss'])
open_file_object.writerows(zip(out_ids[:, 0], test_pred[:, 0]))
prediction_file.close()
print('completed!')

loading all prediction data...
[ 1994.87121582  2295.13256836  2442.53393555 ...,  2353.89086914
  3014.96923828  1895.92468262]
[ 20294.671875    16088.92382812   5645.06201172 ...,  16088.92382812
  16088.92382812  16088.92382812]
[     4      6      9 ..., 587627 587629 587634]
[ 11144.77154541   9192.02819824   4043.79797363 ...,   9221.40734863
   9551.9465332    8992.42425537]
writing output to ../allStateClaims_combination.csv...
completed!
