### XGB training code framework:

Use ***shell scripting*** together with ***Python code*** to run an XGB grid-search training and output the full training log and scoring results of every run for visualization or comparison

In [None]:
#!/bin/bash
#############################
#### 1. shell scripting: ####
#############################
### semi-manual grid search using shell scripting #####
### output all scoring results for each combination ###
#
# Runs ONE training round for one hyperparameter combination:
#   1. fills the placeholders of trainXGB_template.py with the chosen values,
#   2. runs the generated trainXGB.py, capturing its stdout in a per-run log,
#   3. renames the training score file so each combination keeps its own copy.
# Call it from an outer loop to sweep a grid.

# Abort on any error, on use of an unset variable, and on a failure anywhere
# in a pipeline.
set -euo pipefail

# Print the option summary.
usage() {
    echo "Set tuning parameters:"
    echo "Options: -n number of trees"
    echo "         -d depth"
    echo "         -l regularization lambda"
    echo "         -s scale_pos_weight"
}

#default parameters if not specified
n_estimators=200
depth=4
reg_lambda=100
scale_pos_weight=1

#define/check options and parameters
while [ $# -gt 0 ]; do
    case "$1" in
        # ${2:?...} stops with a readable message when the value is missing,
        # instead of the generic "unbound variable" error raised by `set -u`.
        -n) n_estimators=${2:?"option -n requires a value"}; shift;;
        -d) depth=${2:?"option -d requires a value"}; shift;;
        -l) reg_lambda=${2:?"option -l requires a value"}; shift;;
        -s) scale_pos_weight=${2:?"option -s requires a value"}; shift;;
        -h) usage
            exit 0;;
        # Reject typos rather than silently training with the defaults.
        *)  echo "Unknown option: $1" >&2
            usage >&2
            exit 1;;
    esac
    shift
done

# Work relative to the directory the script is launched from.
dir=$(pwd)
train_param="n${n_estimators}_d${depth}_l${reg_lambda}_s${scale_pos_weight}"
score_file="${dir}/XGBTrainScores_${train_param}.dat"

#### Train XGB one by one ###
train_template="${dir}/trainXGB_template.py"
train_script="${dir}/trainXGB.py"
# Fixed name the python script writes to; renamed per combination afterwards.
raw_score_file="${dir}/XGBtrainScores.dat"

# modify template with hyperparameters for this round
# (the values are inserted verbatim into sed expressions, so they must not contain '/')
sed "s/_NTREE_/$n_estimators/g;s/_DEPTH_/$depth/g;s/_LAMBDA_/$reg_lambda/g;s/_SCALE_/$scale_pos_weight/g" \
    "$train_template" > "$train_script"

# run the python script and save the outputs
python "$train_script" -t train.dat -m "$raw_score_file" -f featureMap.txt > "training_${train_param}.log"

# keep one score file per hyperparameter combination
mv "$raw_score_file" "$score_file"


In [None]:
###################################
#### 2. trainXGB_template.py ######
###################################

# trainXGB_template.py
import sys
import os
import argparse
import numpy as np
import pandas as pd
import csv
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import pickle
import pdb

#set display formats
# Show every row/column and never truncate cell contents when a DataFrame is
# printed to the training log.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
try:
    # Current pandas spells "no column-width limit" as None.
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # Older pandas releases only accept an int here and use -1 for "no limit".
    pd.set_option('display.max_colwidth', -1)

#define functions
def printVarImport(varImportMap, maxRows=200):
    """Print feature importances as ``name|score`` lines, highest score first.

    Entries are ordered by descending score; ties are broken by descending
    feature name. At most ``maxRows`` entries are printed; if more exist, a
    single ``...`` line marks the truncation.

    Args:
        varImportMap: mapping of feature name (str) to importance score.
        maxRows: maximum number of entries to print (default 200).
    """
    ranked = sorted(varImportMap.items(),
                    key=lambda kv: (kv[1], kv[0]),
                    reverse=True)
    for idx, (key, value) in enumerate(ranked):
        if idx >= maxRows:
            print("...")
            break
        print('|'.join([key, str(value)]))

        
        
def splitColumns(data):
    """Split a loaded data matrix into its four column groups.

    Column layout used by this script: column 0 is the record sequence
    number, column 1 the target, column 2 the sample weight, and columns 3
    onward are the model features.

    Args:
        data: numeric array as returned by ``np.genfromtxt``.

    Returns:
        Tuple ``(seqnum, target, weight, features)``.
    """
    # genfromtxt returns a 1-D array for a one-row file; normalise to 2-D.
    data = np.atleast_2d(data)
    return data[:, 0], data[:, 1], data[:, 2], data[:, 3:]


def loadData(path):
    """Read a pipe-delimited data file and return its column groups.

    The fields '.', '' and ' ' are treated as missing and become NaN.
    Returns the same tuple as ``splitColumns``.
    """
    raw = np.genfromtxt(path, delimiter='|', comments=None,
                        missing_values=('.', '', ' '),
                        filling_values=(np.nan))
    return splitColumns(raw)


def toIntScore(prob):
    """Convert a predicted probability to an integer score in [1, 999].

    The scale is inverted: a probability near 1 gives a score near 1.
    """
    return int(max(1, min(999, 1000 * (1 - prob))))


def writeScores(path, seqnum, target, weight, scores):
    """Write one CSV row per record: seqnum, target, weight, score, int score."""
    with open(path, 'w') as fh:
        writer = csv.writer(fh, lineterminator='\n')
        for seq, y, w, score in zip(seqnum, target, weight, scores):
            writer.writerow(["%.0f" % seq, y, w, score, toIntScore(score)])


def scoreFile(booster, dataFile, outFile):
    """Score every record of ``dataFile`` with ``booster`` and write ``outFile``."""
    seqnum, target, weight, features = loadData(dataFile)
    scores = booster.predict(xgb.DMatrix(features))
    writeScores(outFile, seqnum, target, weight, scores)


def main():
    """Train the XGBoost classifier and write the requested score files.

    Reads the training data, runs a cross-validation report, fits the final
    model, prints variable importances, dumps the model (text and pickle) and
    scores the train / test / validation data sets whose output file was
    requested on the command line.
    """
    parser = argparse.ArgumentParser(description =
             """
             read in pipe delimited data and train model
             """)
    parser.add_argument('-t','--trainDat',required=True, dest='trainDat',help='train data file')
    parser.add_argument('-s','--testDat',required=False, dest='testDat',help='test data file to be scored')
    parser.add_argument('-v','--valDat',required=False, dest='valDat',help='validation data file to be scored')
    parser.add_argument('-o','--outTestScore',required=False, dest='outTestScore',help='output test score file')
    parser.add_argument('-m','--outTrainScore',required=False, dest='outTrainScore',help='output train score file')
    parser.add_argument('-z','--outValScore',required=False, dest='outValScore',help='output validation score file')
    parser.add_argument('-f','--featureMap',required=True, dest='featureMap',help='feature map as model inputs')

    args = parser.parse_args()

    trainFile = args.trainDat
    testFile = args.testDat
    valFile = args.valDat
    outTestScore = args.outTestScore
    outTrainScore = args.outTrainScore
    outValScore = args.outValScore
    train_fmap = args.featureMap

    # A score output cannot be produced without the data file to score.
    if outTestScore and not testFile:
        parser.error('-o/--outTestScore requires -s/--testDat')
    if outValScore and not valFile:
        parser.error('-z/--outValScore requires -v/--valDat')

    seqnumTrain, trainY, wgt, trainX = loadData(trainFile)

    # Fix a few hyperparameters here. Could add more into tweaking list
    learning_rate = 0.1
    gamma = 0
    subsample = 0.7
    colsample_bytree = 0.7
    min_child_weight = 50

    # The upper-case tokens below are placeholders; the driver shell script
    # substitutes the actual values before this file is executed.
    n_estimators = _NTREE_
    max_depth = _DEPTH_
    reg_lambda = _LAMBDA_
    scale_pos_weight = _SCALE_ # for unbalanced data

    seed = 27
    nthread = 4
    cv_folds = 3
    early_stopping_rounds = 50

    # train classification trees, using AUC as the criteria
    xgb_class = xgb.XGBClassifier(learning_rate = learning_rate,
                                  n_estimators = n_estimators,
                                  max_depth = max_depth,
                                  min_child_weight = min_child_weight,
                                  gamma = gamma,
                                  reg_lambda = reg_lambda,
                                  subsample = subsample,
                                  colsample_bytree = colsample_bytree,
                                  objective = 'binary:logistic',
                                  scale_pos_weight = scale_pos_weight,
                                  nthread = nthread,
                                  seed = seed)
    xgb_param = xgb_class.get_xgb_params()
    xgtrain = xgb.DMatrix(trainX, label=trainY, weight=wgt)
    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=n_estimators,
                      nfold=cv_folds, metrics='auc',
                      early_stopping_rounds=early_stopping_rounds)
    # Gap between train and held-out AUC, as an over-fitting indicator.
    cvresult['diff-auc'] = cvresult['train-auc-mean'] - cvresult['test-auc-mean']

    # Report the CV table in the training log so runs can be compared.
    print("Cross-validation AUC by boosting round")
    print(cvresult)
    print("")

    # Fit with the same sample weights the cross-validation used.
    # NOTE(review): eval_metric is no longer passed to fit(); no eval_set is
    # supplied, so it presumably had nothing to evaluate - confirm.
    xgb_class.fit(trainX, trainY, sample_weight=wgt)

    clf = xgb_class.get_booster()

    # variable importance
    gainVarImport = clf.get_score(fmap=train_fmap, importance_type='gain')
    coverVarImport = clf.get_score(fmap=train_fmap, importance_type='cover')

    print("Top variables by Gain Importance")
    printVarImport(gainVarImport)
    print("")

    print("Top variables by Cover Importance")
    printVarImport(coverVarImport)
    print("")

    # Persist the model both as a readable text dump and as a pickle.
    clf.dump_model('xgbModel.txt', fmap=train_fmap)
    with open("xgb.pickle", "wb") as modelFile:
        pickle.dump(clf, modelFile)

    if outTrainScore:
        print("score training data...")
        scoreTrain = clf.predict(xgb.DMatrix(trainX))
        writeScores(outTrainScore, seqnumTrain, trainY, wgt, scoreTrain)

    if outTestScore:
        print("score test data...")
        scoreFile(clf, testFile, outTestScore)

    if outValScore:
        print("score validation data...")
        scoreFile(clf, valFile, outValScore)


if __name__ == '__main__':
    main()
            
              