In [32]:
import pandas as pd
import numpy as np
import csv
import xgboost as xgb
from sklearn.ensemble import ExtraTreesClassifier
import boruta_py2 as brt
from sklearn.ensemble import RandomForestClassifier
import operator

from rpy2.robjects.packages import importr
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri

In [33]:
def ceate_feature_map(features, path='xgb.fmap'):
    """Write a feature-map file with one line per entry of ``features``.

    Every line has three tab-separated fields: the zero-based position of
    the feature, the feature name, and the literal type marker ``q``.

    Parameters
    ----------
    features : iterable
        Feature names, in the order they should be numbered.
    path : str, optional
        File to (over)write. Defaults to ``'xgb.fmap'``, the location that
        was previously hard-coded, so existing callers are unaffected.
    """
    # The context manager closes the file even when a write fails part-way,
    # which the explicit open()/close() pair did not guarantee.
    with open(path, 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))

In [34]:
print('Load data...')
train = pd.read_csv("./input/train.csv")
test = pd.read_csv("./input/test.csv")
# The test rows get the sentinel label 5 so that they can be told apart from
# the training rows after both have gone through the same preprocessing.
test['target'] = 5

# ignore_index=True gives the stacked frame a unique 0..n-1 row index.
# Without it the row labels of train and test overlap, which makes any later
# label-based alignment on the merged frame ambiguous.
df_merged = pd.concat([train, test], ignore_index=True)
target = df_merged['target'].values
id_merged = df_merged['ID'].values

# Label and ID are kept aside as arrays and removed from the feature frame.
df_merged = df_merged.drop(["target", "ID"], axis=1)

Load data...


In [35]:
# Partition the column names by dtype: object-dtype columns count as
# categorical, every other column as numeric.
cat_columns = [name for name in df_merged.columns
               if df_merged[name].dtype == 'O']
num_columns = [name for name in df_merged.columns
               if not df_merged[name].dtype == 'O']
# Note: v22 has high cardinality

In [36]:
# Extra feature: how many values are missing in each row.
df_merged['na_count'] = pd.isnull(df_merged).sum(axis=1)

In [37]:
# For every column, build a 0/1 indicator column that is 1 where the value
# is NA and 0 otherwise.
# A single vectorised isnull() over the whole frame replaces the earlier
# per-column pd.concat loop, which re-copied the growing frame on each pass.
df_naList = df_merged.isnull().astype(int)

In [38]:
# Name each indicator column after its source column plus an "_isNA" suffix,
# then attach the indicators to the right of the feature frame.
cols = df_merged.columns.tolist()
colNames = [name + "_isNA" for name in cols]
df_naList.columns = colNames

df_merged = pd.concat([df_merged, df_naList], axis=1)

In [39]:
# Fill missing values: object-dtype columns get the placeholder string
# "NOT_CAPTURED", all other columns get the sentinel -9999.
# The loop walks the column labels instead of calling DataFrame.iteritems(),
# which newer pandas releases no longer provide; the labels are snapshotted
# first because the frame is written to inside the loop.
for df_merged_name in list(df_merged.columns):
    df_merged_series = df_merged[df_merged_name]
    fill_value = "NOT_CAPTURED" if df_merged_series.dtype == 'O' else -9999
    df_merged.loc[df_merged_series.isnull(), df_merged_name] = fill_value

In [40]:
# v22 is excluded from one-hot encoding, so take it out of cat_columns
# before selecting the columns to encode.
del cat_columns[cat_columns.index('v22')]
df_categ = df_merged.loc[:, cat_columns]

In [41]:
# Swap the raw categorical columns for their one-hot encoded counterparts.
df_cat_Ohe = pd.get_dummies(df_categ)
df_merged = pd.concat(
    [df_merged.drop(cat_columns, axis=1), df_cat_Ohe],
    axis=1,
)

In [42]:
# Put v22 back so cat_columns lists every categorical column again.
cat_columns += ['v22']

In [43]:
# Categorical probabilities: for now, reuse the existing file and concatenate it.
# NOTE(review): the rows are paired purely by position after both indexes are
# reset - this assumes the file has the same row order as df_merged; confirm.
cat_Probs = (
    pd.read_csv("./input/Transformed_Merged_Train_Test.csv")[cat_columns]
    .reset_index(drop=True)
)
df_merged = df_merged.drop('v22', axis=1).reset_index(drop=True)
df_merged = pd.concat([df_merged, cat_Probs], axis=1)

In [44]:
#v22_freqs = dict( train['v22'].value_counts() )
# Re-attach label and ID, then split the merged frame back into train and
# test using the sentinel label 5 that marks the test rows.
df_merged['target'] = target
df_merged['id_merged'] = id_merged

is_test_row = df_merged['target'] == 5
train = df_merged[~is_test_row]
test = df_merged[is_test_row]

# Keep the label / ID aside and strip both helper columns from the features.
helper_cols = ['target', 'id_merged']
train_target = train['target']
train = train.drop(helper_cols, axis=1)
testID = test['id_merged']
test = test.drop(helper_cols, axis=1)

In [47]:
# Collect the columns that hold a single value across all training rows.
# Counting distinct values is exact, whereas testing std == 0.0 can miss a
# constant float column whose computed standard deviation is a tiny non-zero
# rounding residue.
rm_list = [col for col in list(train)
           if train[col].nunique(dropna=False) <= 1]

In [48]:
# Drop the constant columns from both frames so their columns stay identical.
train, test = (frame.drop(rm_list, axis=1) for frame in (train, test))

In [18]:
#remove highly correlating parameters
# Turn on rpy2's pandas conversion so that the `train` DataFrame can be passed
# directly to the R function called below.
pandas2ri.activate()
# Correlation matrix of the training columns, computed by R's cor().
r_corrMat = ro.r.cor(train)

In [51]:
# Use the R package caret to pick the columns to remove for a pairwise
# correlation cutoff of 0.99; names=True asks for column names, not positions.
caret = importr('caret')
rm_params = list(caret.findCorrelation(r_corrMat, cutoff=0.99, names=True))
len(rm_params)

242

In [22]:
# Remove the highly correlated columns from train and test alike.
train, test = (frame.drop(rm_params, axis=1) for frame in (train, test))

In [56]:
#Feature Selection module
#low varying params - do not do it for now

#var_threshold = 0.1
#low_var_params = list()

#for col in list(train):
#    if np.var(train[col]) < var_threshold:
#        #print np.var(train[col])
#        low_var_params.append(col)
        
#len(low_var_params)

621

In [None]:
#recursive feature elimination
# NOTE(review): this cell carries no execution count and `rfe` is not read by
# any later cell in view - it looks like an experiment that was never run to
# completion; confirm before relying on it.
from sklearn.svm import SVC
from sklearn.feature_selection import RFE

# Linear-kernel SVC is the estimator that RFE refits while eliminating features.
svc = SVC(kernel="linear", C=1)
# step=1 removes one feature per iteration until n_features_to_select=1 is left.
rfe = RFE(estimator=svc, n_features_to_select=1, step=1)
rfe.fit(train, train_target)


In [74]:
# Wrap the raw feature arrays for XGBoost; only the training matrix has labels.
xgtrain = xgb.DMatrix(train.values, label=train_target)
xgtest = xgb.DMatrix(data=test.values)

In [106]:
# XGBoost params shared by every training / cross-validation call below.
xgboost_params = dict(
    objective="binary:logistic",
    booster="gbtree",
    eval_metric="auc",
    eta=0.01,  # 0.06, #0.01,
    # min_child_weight=240,
    subsample=0.9,
    colsample_bytree=0.7,
    max_depth=7,
)

In [58]:
print('Fit the model...')
boost_round = 2000 #CHANGE THIS BEFORE START
# Train on the full feature set; this booster is what the feature scores
# further down are read from.
clf = xgb.train(
    params=xgboost_params,
    dtrain=xgtrain,
    num_boost_round=boost_round,
    verbose_eval=True,
    maximize=False,
)

Fit the model...


In [61]:
#Run xgboost parameter importance to see if that is matching with current top
features = list(train)
ceate_feature_map(features)

In [64]:
# Per-feature scores from the fitted booster, with feature names resolved
# through the feature map file, as (feature, score) pairs in ascending order.
importance = clf.get_fscore(fmap='xgb.fmap')
importance = sorted(importance.items(), key=operator.itemgetter(1))

df_PI = pd.DataFrame(importance, columns=['feature', 'fscore'])
# Normalise so that the scores sum to 1.
df_PI['fscore'] = df_PI['fscore'] / df_PI['fscore'].sum()
# DataFrame.sort() no longer exists in current pandas; sort_values() is its
# replacement. The feature name acts as a secondary key so that equal scores
# are ordered deterministically, which keeps the top-N cuts reproducible.
df_PI = df_PI.sort_values(['fscore', 'feature'], ascending=[False, True])



In [68]:
# Feature names in the row order of df_PI.
params_prep = df_PI['feature'].tolist()

In [84]:
# Leading slices of the ranked feature list, one per candidate feature count.
params_200, params_150, params_125, params_100, params_75, params_50 = (
    params_prep[:count] for count in (200, 150, 125, 100, 75, 50)
)

In [75]:
# 5-fold cross-validation on the full feature set for 10 boosting rounds.
xgb.cv(
    params=xgboost_params,
    dtrain=xgtrain,
    num_boost_round=10,
    nfold=5,
)

Unnamed: 0,test-auc-mean,test-auc-std,train-auc-mean,train-auc-std
0,0.722229,0.017961,0.732297,0.018799
1,0.729386,0.015625,0.739713,0.01672
2,0.732184,0.016959,0.742613,0.017479
3,0.73953,0.008364,0.75053,0.007829
4,0.742531,0.005147,0.753725,0.004083
5,0.743455,0.0039,0.754965,0.002073
6,0.743379,0.004181,0.755139,0.002959
7,0.744059,0.003541,0.756039,0.001935
8,0.744737,0.003259,0.757141,0.001357
9,0.745048,0.003004,0.757556,0.001083


In [117]:
# Column subsets of the training frame, one per feature-count cut.
train_200, train_150, train_125, train_100, train_75, train_50 = (
    train[subset]
    for subset in (params_200, params_150, params_125,
                   params_100, params_75, params_50)
)
# Only the 150-feature cut is also taken from the test frame.
test_150 = test[params_150]

# One labelled DMatrix per training subset, plus the unlabelled test matrix.
xgtrain_200, xgtrain_150, xgtrain_125, xgtrain_100, xgtrain_75, xgtrain_50 = (
    xgb.DMatrix(frame.values, train_target)
    for frame in (train_200, train_150, train_125,
                  train_100, train_75, train_50)
)
xgtest_150 = xgb.DMatrix(test_150.values)

In [107]:
# 5-fold cross-validation of every feature subset, one boosting round each.
cv_200, cv_150, cv_125, cv_100, cv_75, cv_50 = (
    xgb.cv(xgboost_params, dmatrix, num_boost_round=1, nfold=5)
    for dmatrix in (xgtrain_200, xgtrain_150, xgtrain_125,
                    xgtrain_100, xgtrain_75, xgtrain_50)
)

In [91]:
# Stack the per-subset CV results, largest feature count first.
cv_frames = [cv_200, cv_150, cv_125, cv_100, cv_75, cv_50]
merged = pd.concat(cv_frames)

In [108]:
# Display the 5-fold CV result for the first 150 ranked features.
cv_150

Unnamed: 0,test-auc-mean,test-auc-std,train-auc-mean,train-auc-std
0,0.734236,0.003747,0.744638,0.001068


In [109]:
# Display the 5-fold CV result for the first 125 ranked features.
cv_125

Unnamed: 0,test-auc-mean,test-auc-std,train-auc-mean,train-auc-std
0,0.718324,0.014959,0.729029,0.016208


In [110]:
# Display the 5-fold CV result for the first 100 ranked features.
cv_100

Unnamed: 0,test-auc-mean,test-auc-std,train-auc-mean,train-auc-std
0,0.727097,0.017007,0.735668,0.014856


In [111]:
# Display the 5-fold CV result for the first 75 ranked features.
cv_75

Unnamed: 0,test-auc-mean,test-auc-std,train-auc-mean,train-auc-std
0,0.727164,0.015951,0.736994,0.014056


In [112]:
# Display the 5-fold CV result for the first 50 ranked features.
cv_50

Unnamed: 0,test-auc-mean,test-auc-std,train-auc-mean,train-auc-std
0,0.721482,0.020765,0.732258,0.021164


In [114]:
print('Fit the model...')
boost_round = 2000 #CHANGE THIS BEFORE START
# Refit on the 150-feature subset; this replaces the booster held in `clf`.
clf = xgb.train(
    params=xgboost_params,
    dtrain=xgtrain_150,
    num_boost_round=boost_round,
    verbose_eval=True,
    maximize=False,
)

Fit the model...


In [118]:
# Predict with every tree of the fitted booster.
# The earlier ntree_limit=clf.best_iteration was dropped for two reasons:
# the booster is trained without early stopping, so there is no "best" round
# to cut at, and best_iteration is a zero-based round index while ntree_limit
# is a tree count, so passing one for the other leaves out the last round.
test_preds = clf.predict(xgtest_150)

In [119]:
# Pair each test ID with its predicted probability and write the result to CSV.
submission = pd.DataFrame({"ID": testID, "PredictedProb": test_preds})
submission.to_csv('cgb_param_imp.csv', index=False)