In [1]:
from base_layer_utils import BaseLayerDataRepo, BaseLayerResultsRepo, ModelName
from base_layer_utils import SklearnBLE

#from fast_text_data import fasttext_data_process
#from tfidf_data import tfidf_data_process

import pandas as pd
import numpy as np
#from sklearn.cross_validation import KFold # replace with model_selection?
#from sklearn.model_selection import KFold
import time, re, gc
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Directory prefix that every CSV path in this notebook is built from.
PATH = '~/data/toxic/data/'

# Read the train and test tables; their shapes are printed as a quick sanity check.
train = pd.read_csv(PATH + 'train.csv')
test = pd.read_csv(PATH + 'test.csv')

print(train.shape)
print(test.shape)

# Target columns; the stacking cells below loop over these and fit one model per label.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

(159571, 8)
(153164, 2)


# Layer 2 feature engineering (add some features and combine them with the layer 1 model data)

In [4]:
from sklearn.preprocessing import StandardScaler

#######################
# FEATURE ENGINEERING #
#######################
def engineer_feature(series, func, normalize=True):
    """
    Main function: apply one feature function to every element of a Series.

    Input: pandas Series, a feature engineering function, and a flag that
           selects z-normalization of the computed values
    Output: pandas Series named after ``func.__name__`` that carries the
            same index as ``series``
    """
    feature = series.apply(func)

    if normalize:
        # z_normalize works on 2-D column vectors, so reshape on the way in and
        # flatten on the way out. Re-attach the source index: rebuilding the
        # Series without it would silently re-label the rows 0..n-1.
        scaled = z_normalize(feature.values.reshape(-1, 1)).reshape(-1,)
        feature = pd.Series(scaled, index=series.index)
    feature.name = func.__name__
    return feature

def engineer_features(series, funclist, normalize=True):
    """
    Engineer features: build a table with one column per feature function.

    Input: pandas Series and a list of feature engineering functions
    Output: pandas DataFrame whose columns are named after the functions
    """
    table = pd.DataFrame()
    # Features are produced lazily, in list order, and stored under their own name.
    for column in (engineer_feature(series, fn, normalize) for fn in funclist):
        table[column.name] = column
    return table

# Shared scaler instance; z_normalize refits it on every call.
scaler = StandardScaler()
def z_normalize(data):
    """
    Normalizer: standardize ``data`` using statistics fitted on ``data`` itself.

    Input: NumPy array
    Output: NumPy array
    """
    return scaler.fit(data).transform(data)
    
"""
Feature functions
"""
def asterix_freq(x):
    """
    Return the fraction of characters in ``x`` that are '!'.

    Despite its name, this counts exclamation marks, not asterisks.
    An empty string yields 0.0 instead of raising ZeroDivisionError.
    """
    if not x:
        return 0.0
    return x.count('!') / len(x)

def uppercase_freq(x):
    """
    Return the fraction of characters in ``x`` that are ASCII uppercase letters.

    An empty string yields 0.0 instead of raising ZeroDivisionError.
    """
    if not x:
        return 0.0
    return len(re.findall(r'[A-Z]', x)) / len(x)

# Column of `train` / `test` that the feature functions are applied to.
INPUT_COLUMN = "comment_text"
# Engineer features
feature_functions = [len, asterix_freq, uppercase_freq]
# Column names of the engineered frames (each feature is named after its function).
features = [f.__name__ for f in feature_functions]
# NOTE(review): normalization is on by default and z_normalize refits its scaler on
# whatever it is given, so train and test are each standardized with their own
# statistics rather than with the train statistics — confirm this is intended.
F_train = engineer_features(train[INPUT_COLUMN], feature_functions)
F_test = engineer_features(test[INPUT_COLUMN], feature_functions)



## Ensembling:

In [5]:
def combine_layer_oof_per_label(layer1_oof_dict, label):
    """
    Util method for stacking: join all prediction arrays stored for one label.

    Input: dict mapping a label to a list of 2-D NumPy arrays, and the label to combine
    Output: the arrays concatenated column-wise (axis=1) in list order;
            the single array itself when the list has one entry;
            None when the list is empty
    """
    data_list = layer1_oof_dict[label]
    if len(data_list) == 0:
        return None
    if len(data_list) == 1:
        # Nothing to join; hand back the stored array unchanged.
        return data_list[0]
    # One concatenate call instead of growing the result pairwise, which would
    # re-copy the accumulated columns on every iteration.
    return np.concatenate(data_list, axis=1)

### 1. simple blend of two models.
### (Ignore this method for now and check out: 2. stack)

# NOTE(review): `layer1_oof_train` and `layer1_oof_test` are not defined in any cell
# visible here (later cells use the `*_loaded` variants) — confirm where they come
# from before running this cell on a fresh kernel.
# One column of blended test predictions per label, filled in by the loop below.
result = np.empty((test.shape[0],len(label_cols)))

# mix the first two models
for i, label in enumerate(label_cols):
    x_train = combine_layer_oof_per_label(layer1_oof_train, label)
    x_test = combine_layer_oof_per_label(layer1_oof_test, label)
    for j in range(x_train.shape[1]):
        roc = roc_auc_score(train[label], x_train[:,j])
        print(label, j, roc) # print out roc for meta feature on meta label (which is just the original train label)
    
    # NOTE(review): `roc_scores_of_a_label` is never read afterwards.
    roc_scores_of_a_label = []
    # Grid of 1001 blend weights in [0, 1]; alpha weights column 0, (1 - alpha) column 1.
    alphas = np.linspace(0,1,1001)
    best_roc = 0
    best_alpha = 0
    for alpha in alphas:
        roc = roc_auc_score(train[label], alpha*x_train[:,0] + (1-alpha)*x_train[:,1])
        # Strict '>' keeps the first alpha that reaches the best score.
        if roc > best_roc:
            best_roc = roc
            best_alpha = alpha
    
    print(label, best_roc, best_alpha)
    # Apply the weight chosen on the train columns to the matching test columns.
    result[:,i] = best_alpha*x_test[:,0] + (1-best_alpha)*x_test[:,1]

# Fill the sample submission with the blended columns and save it under a timestamped name.
submission = pd.read_csv(PATH + 'sample_submission.csv')#.head(1000)
submission[label_cols] = result
sub_id = int(time.time())
print(sub_id)
submission.to_csv('./StackPreds/mixtwo_' + str(sub_id) + '.csv', index=False)

### 2. stacking

In [6]:
# load the saved repo. IMPORTANT: set load_from_file to True! or you will overwrite the saved repo
base_layer_results_repo = BaseLayerResultsRepo(load_from_file=True, filepath='obj/WithPreprocessedFile/')

load from file


In [7]:
scores = base_layer_results_repo.show_scores()

0.9888	ModelName.NBLOGREG_tfidf_word_(1, 1)_30000_1_1.0
0.9777	ModelName.LOGREG_tfidf_word_(1, 1)_30000_1_1.0
0.9666	ModelName.LOGREG_PERLABEL_tfidf_word_(1, 1)_30000_1_1.0


In [15]:
# Two ways to choose model_data from the repo: set a threshold, or give a list of the ones you want.
# A list can be something like:
# chosen = ['ModelName.NBLOGREG_tfidf_word_(1, 1)_30000_1_1.0',
#           'ModelName.LOGREG_tfidf_word_(1, 1)_30000_1_1.0']
layer1_oof_train_loaded, layer1_oof_test_loaded, base_layer_est_preds_loaded = base_layer_results_repo.get_results(threshold=0.95)

import os

# Column names of the search log; the search cells append one row per trial in this order.
header = (
    'time,id,threshold,num_models,colsample_bytree,lr,max_depth,subsample,'
    'gamma,alpha,cv_num_round,cv_nfolds,toxic_pos_scale,severe_toxic_pos_scale,obscene_pos_scale,'
    'threat_pos_scale,insult_pos_scale,identity_hate_pos_scale,sth1,sth2,sth3,sth4,sth5,'
    'toxic_best_round,severe_toxic_best_round,obscene_best_round,threat_best_round,'
    'insult_best_round,identity_hate_best_round,toxic_auc,severe_toxic_auc,obscene_auc,'
    'threat_auc,insult_auc,identity_hate_auc,avg_auc\n'
)

xgb_search_log = './xgb_search.csv'
# Write the header only when the log is new or empty: the file is opened in append
# mode, so re-running this cell would otherwise insert a second header row in the
# middle of the recorded trials.
needs_header = (not os.path.exists(xgb_search_log)) or os.path.getsize(xgb_search_log) == 0
# The context manager closes the handle even if the write fails.
with open(xgb_search_log, 'a') as f:
    if needs_header:
        f.write(header)

In [9]:
def get_time():
    """
    Return the current time in the America/New_York zone.

    Output: string formatted as 'YYYY-MM-DD HH:MM:SS'
    """
    from datetime import datetime, timezone
    from dateutil import tz

    to_zone = tz.gettz('America/New_York')

    # Start from a timezone-aware UTC timestamp. datetime.utcnow() returns a naive
    # object that has to be patched with a tzinfo afterwards, and it is deprecated
    # in recent Python versions.
    utc_now = datetime.now(timezone.utc)

    # Convert time zone
    local_now = utc_now.astimezone(to_zone)

    return local_now.strftime('%Y-%m-%d %H:%M:%S')

In [None]:
# Random search over the hyper-parameters of the layer-2 XGBoost stacker.
# Every trial draws one configuration, cross-validates one booster per label on the
# combined layer-1 predictions, and appends a summary row to ./xgb_search.csv.
# The test predictions collected in `result` are not written out by this cell.
# NOTE(review): `ths` (the candidate thresholds) is not defined in any visible cell.
# NOTE(review): the assertion below expects `train` to have 27 columns, while the
# load cell prints (159571, 8) — confirm which cell adds the extra columns.

def xg_eval_auc(yhat, dtrain):
    """Custom xgboost metric: ROC AUC of `yhat` against the labels stored in `dtrain`."""
    y = dtrain.get_label()
    return 'auc', roc_auc_score(y, yhat)

for trial in range(130):
    now = get_time()
    search_id = int(time.time())
    # Re-seed from the clock so that every trial draws a different configuration.
    np.random.seed(int(time.time()* 1000000) % 45234634)

    # Candidates were e.g. [0.9803, 0.9794, 0.9793, 0.9786, 0.9774, 0.9768, 0.9765]
    model_threshold = np.random.choice(ths)
    layer1_oof_train_loaded, layer1_oof_test_loaded, base_layer_est_preds_loaded = base_layer_results_repo.get_results(threshold=model_threshold)
    gc.collect()

    xgb_colsample_bytree = np.random.randint(5, 10)/10
    xgb_learning_rate = 1e-2 * (0.1 ** (np.random.rand() * 2 - 1.0)) # log-uniform, roughly 0.001 to 0.1
    xgb_max_depth = np.random.randint(2, 8)
    xgb_subsample = np.random.randint(50, 100)/100
    xgb_gamma = np.random.randint(0, 3)
    xgb_alpha = np.random.randint(0, 2)

    xgb_cv_seed = 0
    xgb_cv_num_round = np.random.randint(400, 1000)
    xgb_cv_nfolds = np.random.randint(3,5)

    # Per-label positive-class weights; the trailing comments are the hand-picked values.
    scale_pos_weights = {}
    scale_pos_weights['toxic'] = np.random.randint(2, 15) #10
    scale_pos_weights['severe_toxic'] = np.random.randint(20, 130) # 100
    scale_pos_weights['obscene'] = np.random.randint(3, 25) # 17
    scale_pos_weights['threat'] = np.random.randint(60, 380) # 333
    scale_pos_weights['insult'] = np.random.randint(4, 25) # 20
    scale_pos_weights['identity_hate'] = np.random.randint(30, 140) #112

    xgb_params = {
        'seed': 0,
        'colsample_bytree': xgb_colsample_bytree,
        'silent': 1,
        'subsample': xgb_subsample,
        'learning_rate': xgb_learning_rate,
        'max_depth': xgb_max_depth,
        'gamma': xgb_gamma,
        'alpha': xgb_alpha,
        'nthread': 5,
        'min_child_weight': 1,
        'objective':'binary:logistic',
        'eval_metric':'auc'
    }

    # Number of layer-1 models that will be stacked
    num_models = len(layer1_oof_train_loaded['toxic'])

    print('time: %s, id: %d, th: %f, num_models: %d, colsample_bytree: %f, lr: %.7f, \
    max_depth: %d, subsample: %f, gamma: %d, alpha: %d, cv_num_round: %d, cv_nfolds: %d,\
    to_pw: %d, s_pw: %d, ob_pw: %d, th_pw: %d, in_pw: %d, ih_pw: %d\
            '%(now,search_id,model_threshold,num_models,\
              xgb_colsample_bytree,xgb_learning_rate,\
              xgb_max_depth,xgb_subsample,xgb_gamma,\
              xgb_alpha,xgb_cv_num_round,xgb_cv_nfolds,\
              scale_pos_weights['toxic'],scale_pos_weights['severe_toxic'],\
              scale_pos_weights['obscene'],scale_pos_weights['threat'],\
              scale_pos_weights['insult'],scale_pos_weights['identity_hate']))

    result = np.empty((test.shape[0],len(label_cols)))
    metric_dict = {}   # label -> CV test AUC at the last recorded round
    best_nrounds = {}  # label -> number of boosting rounds used for the final fit

    for label_idx, label in enumerate(label_cols):
        assert train.shape == (159571, 27)
        x_train = combine_layer_oof_per_label(layer1_oof_train_loaded, label)
        x_test = combine_layer_oof_per_label(layer1_oof_test_loaded, label)

        dtrain = xgb.DMatrix(x_train, train[label])
        dtest = xgb.DMatrix(x_test)

        xgb_params['scale_pos_weight'] = scale_pos_weights[label]

        cv_history = xgb.cv(xgb_params, dtrain, num_boost_round=xgb_cv_num_round, nfold=xgb_cv_nfolds, seed=xgb_cv_seed, stratified=False,
                            early_stopping_rounds=25, verbose_eval=None, show_stdv=False, feval=xg_eval_auc, maximize=True)
        best_nrounds_for_current_label = cv_history.shape[0] - 1
        # Early stopping is driven by the held-out folds, so record the test AUC.
        # Select it by column name: the position of the test columns in the
        # xgb.cv result is not the same in every xgboost version.
        cv_mean = cv_history['test-auc-mean'].iloc[-1]

        metric_dict[label] = cv_mean
        best_nrounds[label] = best_nrounds_for_current_label

        # Refit on the full training data with the round count found by CV.
        gbdt = xgb.train(xgb_params, dtrain, best_nrounds_for_current_label)
        result[:, label_idx] = gbdt.predict(dtest)

    avg_auc = sum(metric_dict[label] for label in label_cols) / len(label_cols)

    # One CSV row per trial, in the column order of the log header.
    # The row ends right after the newline, so no continuation-line whitespace
    # leaks into the first field of the following row.
    row_format = ','.join(
        ['%s', '%d', '%f', '%d', '%f', '%.7f', '%d', '%f', '%d', '%d', '%d', '%d']
        + ['%d'] * len(label_cols)    # *_pos_scale
        + ['%d'] * 5                  # sth1..sth5
        + ['%d'] * len(label_cols)    # *_best_round
        + ['%.8f'] * len(label_cols)  # *_auc
        + ['%.8f']                    # avg_auc
    ) + '\n'
    row_values = (
        [now, search_id, model_threshold, num_models, xgb_colsample_bytree,
         xgb_learning_rate, xgb_max_depth, xgb_subsample, xgb_gamma, xgb_alpha,
         xgb_cv_num_round, xgb_cv_nfolds]
        + [scale_pos_weights[label] for label in label_cols]
        + [-99] * 5  # placeholders for the unused sth1..sth5 columns
        + [best_nrounds[label] for label in label_cols]
        + [metric_dict[label] for label in label_cols]
        + [avg_auc]
    )

    # The context manager closes the log even if the write fails.
    with open('./xgb_search.csv', 'a') as f:
        f.write(row_format % tuple(row_values))

time: 2018-03-02 00:25:50, id: 1519968350, th: 0.980300, num_models: 5, colsample_bytree: 0.700000, lr: 0.0152982,     max_depth: 4, subsample: 0.690000, gamma: 2, alpha: 0, cv_num_round: 633, cv_nfolds: 4,    to_pw: 6, s_pw: 74, ob_pw: 14, th_pw: 139, in_pw: 10, ih_pw: 128            
time: 2018-03-02 00:34:34, id: 1519968874, th: 0.981500, num_models: 4, colsample_bytree: 0.500000, lr: 0.0105918,     max_depth: 4, subsample: 0.670000, gamma: 1, alpha: 0, cv_num_round: 729, cv_nfolds: 4,    to_pw: 7, s_pw: 121, ob_pw: 23, th_pw: 292, in_pw: 21, ih_pw: 42            
time: 2018-03-02 00:40:43, id: 1519969243, th: 0.979300, num_models: 7, colsample_bytree: 0.700000, lr: 0.0027763,     max_depth: 4, subsample: 0.810000, gamma: 0, alpha: 1, cv_num_round: 471, cv_nfolds: 3,    to_pw: 7, s_pw: 52, ob_pw: 24, th_pw: 251, in_pw: 13, ih_pw: 118            
time: 2018-03-02 00:44:33, id: 1519969473, th: 0.979300, num_models: 7, colsample_bytree: 0.500000, lr: 0.0010538,     max_depth: 7, subsam

time: 2018-03-02 03:22:59, id: 1519978979, th: 0.979300, num_models: 7, colsample_bytree: 0.500000, lr: 0.0685323,     max_depth: 2, subsample: 0.880000, gamma: 2, alpha: 1, cv_num_round: 984, cv_nfolds: 4,    to_pw: 8, s_pw: 87, ob_pw: 11, th_pw: 93, in_pw: 10, ih_pw: 109            
time: 2018-03-02 03:31:54, id: 1519979514, th: 0.976800, num_models: 10, colsample_bytree: 0.900000, lr: 0.0392708,     max_depth: 5, subsample: 0.630000, gamma: 0, alpha: 0, cv_num_round: 498, cv_nfolds: 4,    to_pw: 11, s_pw: 121, ob_pw: 23, th_pw: 351, in_pw: 23, ih_pw: 57            
time: 2018-03-02 03:37:12, id: 1519979832, th: 0.977400, num_models: 9, colsample_bytree: 0.700000, lr: 0.0030536,     max_depth: 6, subsample: 0.700000, gamma: 0, alpha: 1, cv_num_round: 873, cv_nfolds: 4,    to_pw: 4, s_pw: 114, ob_pw: 7, th_pw: 341, in_pw: 4, ih_pw: 118            
time: 2018-03-02 03:45:08, id: 1519980308, th: 0.978600, num_models: 8, colsample_bytree: 0.600000, lr: 0.0649149,     max_depth: 7, subsam

time: 2018-03-02 07:15:18, id: 1519992918, th: 0.980300, num_models: 5, colsample_bytree: 0.600000, lr: 0.0010395,     max_depth: 7, subsample: 0.840000, gamma: 1, alpha: 1, cv_num_round: 695, cv_nfolds: 4,    to_pw: 4, s_pw: 85, ob_pw: 23, th_pw: 341, in_pw: 9, ih_pw: 105            
time: 2018-03-02 07:21:05, id: 1519993265, th: 0.979400, num_models: 6, colsample_bytree: 0.800000, lr: 0.0032795,     max_depth: 2, subsample: 0.860000, gamma: 1, alpha: 1, cv_num_round: 544, cv_nfolds: 3,    to_pw: 9, s_pw: 43, ob_pw: 9, th_pw: 69, in_pw: 19, ih_pw: 61            
time: 2018-03-02 07:28:58, id: 1519993738, th: 0.981500, num_models: 4, colsample_bytree: 0.500000, lr: 0.0188473,     max_depth: 3, subsample: 0.940000, gamma: 2, alpha: 0, cv_num_round: 405, cv_nfolds: 3,    to_pw: 14, s_pw: 114, ob_pw: 17, th_pw: 344, in_pw: 24, ih_pw: 30            
time: 2018-03-02 07:35:41, id: 1519994141, th: 0.977400, num_models: 9, colsample_bytree: 0.800000, lr: 0.0688231,     max_depth: 2, subsample

# Only to get the best rounds

In [19]:
# Single run with hand-picked XGBoost parameters: cross-validate one booster per
# label to find its round count, then append one summary row to ./xgb_search.csv.
# NOTE(review): the assertion below expects `train` to have 27 columns, while the
# load cell prints (159571, 8) — confirm which cell adds the extra columns.

def xg_eval_auc(yhat, dtrain):
    """Custom xgboost metric: ROC AUC of `yhat` against the labels stored in `dtrain`."""
    y = dtrain.get_label()
    return 'auc', roc_auc_score(y, yhat)

for trial in range(1):
    now = get_time()
    search_id = int(time.time())
    np.random.seed(int(time.time()* 1000000) % 45234634)

    model_threshold = 0.9793
    layer1_oof_train_loaded, layer1_oof_test_loaded, base_layer_est_preds_loaded = base_layer_results_repo.get_results(threshold=model_threshold)
    gc.collect()

    xgb_colsample_bytree = 0.8
    xgb_learning_rate = 0.08
    xgb_max_depth = 3
    xgb_subsample = 0.9
    xgb_gamma = 2
    xgb_alpha = 0

    xgb_cv_seed = 0
    xgb_cv_num_round = 500
    xgb_cv_nfolds = 4

    # Per-label positive-class weights
    scale_pos_weights = {}
    scale_pos_weights['toxic'] = 10 #1
    scale_pos_weights['severe_toxic'] = 100 #1
    scale_pos_weights['obscene'] = 17
    scale_pos_weights['threat'] = 333
    scale_pos_weights['insult'] = 20
    scale_pos_weights['identity_hate'] = 112

    xgb_params = {
        'seed': 0,
        'colsample_bytree': xgb_colsample_bytree,
        'silent': 1,
        'subsample': xgb_subsample,
        'learning_rate': xgb_learning_rate,
        'max_depth': xgb_max_depth,
        'gamma': xgb_gamma,
        'alpha': xgb_alpha,
        'nthread': 5,
        'min_child_weight': 1,
        'objective':'binary:logistic',
        'eval_metric':'auc'
    }

    # Number of layer-1 models that will be stacked
    num_models = len(layer1_oof_train_loaded['toxic'])

    print('time: %s, id: %d, th: %f, num_models: %d, colsample_bytree: %f, lr: %.7f, \
    max_depth: %d, subsample: %f, gamma: %d, alpha: %d, cv_num_round: %d, cv_nfolds: %d,\
    to_pw: %d, s_pw: %d, ob_pw: %d, th_pw: %d, in_pw: %d, ih_pw: %d\
            '%(now,search_id,model_threshold,num_models,\
              xgb_colsample_bytree,xgb_learning_rate,\
              xgb_max_depth,xgb_subsample,xgb_gamma,\
              xgb_alpha,xgb_cv_num_round,xgb_cv_nfolds,\
              scale_pos_weights['toxic'],scale_pos_weights['severe_toxic'],\
              scale_pos_weights['obscene'],scale_pos_weights['threat'],\
              scale_pos_weights['insult'],scale_pos_weights['identity_hate']))

    result = np.empty((test.shape[0],len(label_cols)))
    metric_dict = {}   # label -> CV test AUC at the last recorded round
    best_nrounds = {}  # label -> number of boosting rounds used for the final fit

    for label_idx, label in enumerate(label_cols):
        assert train.shape == (159571, 27)
        x_train = combine_layer_oof_per_label(layer1_oof_train_loaded, label)
        x_test = combine_layer_oof_per_label(layer1_oof_test_loaded, label)

        dtrain = xgb.DMatrix(x_train, train[label])
        dtest = xgb.DMatrix(x_test)

        xgb_params['scale_pos_weight'] = scale_pos_weights[label]

        cv_history = xgb.cv(xgb_params, dtrain, num_boost_round=xgb_cv_num_round, nfold=xgb_cv_nfolds, seed=xgb_cv_seed, stratified=False,
                            early_stopping_rounds=25, verbose_eval=None, show_stdv=False, feval=xg_eval_auc, maximize=True)
        best_nrounds_for_current_label = cv_history.shape[0] - 1
        # Early stopping is driven by the held-out folds, so record the test AUC.
        # Select it by column name: the position of the test columns in the
        # xgb.cv result is not the same in every xgboost version.
        cv_mean = cv_history['test-auc-mean'].iloc[-1]

        metric_dict[label] = cv_mean
        best_nrounds[label] = best_nrounds_for_current_label

        # Refit on the full training data with the round count found by CV.
        gbdt = xgb.train(xgb_params, dtrain, best_nrounds_for_current_label)
        result[:, label_idx] = gbdt.predict(dtest)

    avg_auc = sum(metric_dict[label] for label in label_cols) / len(label_cols)

    # One CSV row, in the column order of the log header.
    # The row ends right after the newline, so no continuation-line whitespace
    # leaks into the first field of the following row.
    row_format = ','.join(
        ['%s', '%d', '%f', '%d', '%f', '%.7f', '%d', '%f', '%d', '%d', '%d', '%d']
        + ['%d'] * len(label_cols)    # *_pos_scale
        + ['%d'] * 5                  # sth1..sth5
        + ['%d'] * len(label_cols)    # *_best_round
        + ['%.8f'] * len(label_cols)  # *_auc
        + ['%.8f']                    # avg_auc
    ) + '\n'
    row_values = (
        [now, search_id, model_threshold, num_models, xgb_colsample_bytree,
         xgb_learning_rate, xgb_max_depth, xgb_subsample, xgb_gamma, xgb_alpha,
         xgb_cv_num_round, xgb_cv_nfolds]
        + [scale_pos_weights[label] for label in label_cols]
        + [-99] * 5  # placeholders for the unused sth1..sth5 columns
        + [best_nrounds[label] for label in label_cols]
        + [metric_dict[label] for label in label_cols]
        + [avg_auc]
    )

    # The context manager closes the log even if the write fails.
    with open('./xgb_search.csv', 'a') as f:
        f.write(row_format % tuple(row_values))

time: 2018-03-07 09:27:15, id: 1520432835, th: 0.979300, num_models: 8, colsample_bytree: 0.800000, lr: 0.0800000,     max_depth: 3, subsample: 0.900000, gamma: 2, alpha: 0, cv_num_round: 500, cv_nfolds: 4,    to_pw: 10, s_pw: 100, ob_pw: 17, th_pw: 333, in_pw: 20, ih_pw: 112            


# xgb random search top N training

In [59]:
# Load the recorded search trials, best average AUC first.
# NOTE(review): this path differs from the './xgb_search.csv' the search cells append
# to — presumably a copied/re-saved version of that log; confirm.
xgb_search = pd.read_csv('~/data/kaggle/toxic/sc/stacking/xgb_search.csv').sort_values(by='avg_auc', ascending=False)

xgb_search.head(3)

Unnamed: 0,time,id,threshold,num_models,colsample_bytree,lr,max_depth,subsample,gamma,alpha,...,threat_best_round,insult_best_round,identity_hate_best_round,toxic_auc,severe_toxic_auc,obscene_auc,threat_auc,insult_auc,identity_hate_auc,avg_auc
9,2018-03-02 00:54:48,1519970088,0.9794,6,0.9,0.096663,3,0.9,2,0,...,119,74,98,0.987579,0.991802,0.995448,0.994074,0.99013,0.991359,0.991732
73,2018-03-02 08:36:45,1519997805,0.9768,10,0.7,0.031006,4,0.9,1,0,...,273,163,195,0.98764,0.991029,0.995461,0.993971,0.990103,0.991239,0.991574
65,2018-03-02 07:35:41,1519994141,0.9774,9,0.8,0.068823,2,0.65,2,1,...,100,96,104,0.987659,0.99181,0.99546,0.992785,0.990203,0.991231,0.991525


In [19]:
# Retrain the top-ranked configurations from `xgb_search` (sorted by avg_auc) on the
# layer-1 predictions plus the engineered features, and write one submission per
# configuration.
# NOTE(review): the assertion below expects `train` to have 27 columns, while the
# load cell prints (159571, 8) — confirm which cell adds the extra columns.
for rank in range(1):

    now = get_time()

    # Shared parameters of the rank-th best trial
    search_id = xgb_search['id'].values[rank]
    model_threshold = xgb_search['threshold'].values[rank]
    num_models = xgb_search['num_models'].values[rank]
    xgb_colsample_bytree = xgb_search['colsample_bytree'].values[rank]
    xgb_learning_rate = xgb_search['lr'].values[rank]
    xgb_max_depth = xgb_search['max_depth'].values[rank]
    xgb_subsample = xgb_search['subsample'].values[rank]
    xgb_gamma = xgb_search['gamma'].values[rank]
    xgb_alpha = xgb_search['alpha'].values[rank]
    xgb_cv_num_round = xgb_search['cv_num_round'].values[rank]
    xgb_cv_nfolds = xgb_search['cv_nfolds'].values[rank]

    # per label based params
    best_nrounds = {
        label: xgb_search[label + '_best_round'].values[rank] for label in label_cols
    }

    # Fixed positive-class weights are used here instead of the `*_pos_scale`
    # values recorded for the trial.
    scale_pos_weights = {}
    scale_pos_weights['toxic'] = 10 #1
    scale_pos_weights['severe_toxic'] = 100 #1
    scale_pos_weights['obscene'] = 17
    scale_pos_weights['threat'] = 333
    scale_pos_weights['insult'] = 20
    scale_pos_weights['identity_hate'] = 112

    metric_dict_fromcsv = {}
    metric_dict_fromcsv['avg_auc'] = xgb_search['avg_auc'].values[rank]

    layer1_oof_train_loaded, layer1_oof_test_loaded, base_layer_est_preds_loaded = base_layer_results_repo.get_results(threshold=model_threshold)
    gc.collect()

    xgb_params = {
        'seed': 0,
        'colsample_bytree': xgb_colsample_bytree,
        'silent': 1,
        'subsample': xgb_subsample,
        'learning_rate': xgb_learning_rate,
        'max_depth': xgb_max_depth,
        'gamma': xgb_gamma,
        'alpha': xgb_alpha,
        'nthread': 7,
        'min_child_weight': 1,
        'objective':'binary:logistic',
        'eval_metric':'auc'
    }

    print('time: %s, id: %d, th: %f, num_models: %d, colsample_bytree: %f, lr: %.7f, \
    max_depth: %d, subsample: %f, gamma: %d, alpha: %d, cv_num_round: %d, cv_nfolds: %d,\
    to_pw: %d, s_pw: %d, ob_pw: %d, th_pw: %d, in_pw: %d, ih_pw: %d, to_br: %d, s_br: %d,\
    ob_br: %d, th_br: %d, in_br: %d, ih_br: %d\
            '%(now,search_id,model_threshold,num_models,\
              xgb_colsample_bytree,xgb_learning_rate,\
              xgb_max_depth,xgb_subsample,xgb_gamma,\
              xgb_alpha,xgb_cv_num_round,xgb_cv_nfolds,\
              scale_pos_weights['toxic'],scale_pos_weights['severe_toxic'],\
              scale_pos_weights['obscene'],scale_pos_weights['threat'],\
              scale_pos_weights['insult'],scale_pos_weights['identity_hate'],\
              best_nrounds['toxic'],best_nrounds['severe_toxic'],\
              best_nrounds['obscene'],best_nrounds['threat'],\
              best_nrounds['insult'],best_nrounds['identity_hate']))

    result = np.empty((test.shape[0],len(label_cols)))

    # A separate index name keeps the row selector `rank` intact inside the label loop.
    for label_idx, label in enumerate(label_cols):
        assert train.shape == (159571, 27)
        x_train = combine_layer_oof_per_label(layer1_oof_train_loaded, label)
        x_test = combine_layer_oof_per_label(layer1_oof_test_loaded, label)

        # add engineered features to layer 2
        # (.values instead of DataFrame.as_matrix(), which newer pandas no longer provides)
        x_train = np.hstack([F_train[features].values, x_train])
        x_test = np.hstack([F_test[features].values, x_test])

        dtrain = xgb.DMatrix(x_train, train[label])
        dtest = xgb.DMatrix(x_test)

        xgb_params['scale_pos_weight'] = scale_pos_weights[label]

        gbdt = xgb.train(xgb_params, dtrain, best_nrounds[label])

        result[:, label_idx] = gbdt.predict(dtest)

    sub_title = 'xgb_topn_w_addedfeatures_posweighted_'
    submission = pd.read_csv(PATH + 'sample_submission.csv')
    submission[label_cols] = result
    submission.to_csv('./StackPreds/TopN_XGB/{}_{}_{}.csv'.format(sub_title,metric_dict_fromcsv['avg_auc'],search_id), index=False)
        

time: 2018-03-12 12:28:44, id: 1520431835, th: 0.979300, num_models: 8, colsample_bytree: 0.800000, lr: 0.0800000,     max_depth: 3, subsample: 0.900000, gamma: 2, alpha: 0, cv_num_round: 500, cv_nfolds: 4,    to_pw: 10, s_pw: 100, ob_pw: 17, th_pw: 333, in_pw: 20, ih_pw: 112, to_br: 231, s_br: 114,    ob_br: 103, th_br: 98, in_br: 101, ih_br: 116            


In [None]:
submission.head()

In [42]:
# Parameters handed to the LightGBM layer-2 stacker.
# NOTE(review): both 'feature_fraction' and 'colsample_bytree' are set, with different
# values; these look like two names for the same LightGBM setting, so presumably only
# one takes effect — confirm how LightgbmBLE forwards `params`.
lgb_stacker_params = {
    'max_depth':3, 
    'metric':"auc", 
    'n_estimators':125, 
    'num_leaves':10, 
    'boosting_type':"gbdt", 
    'learning_rate':0.1, 
    'feature_fraction':0.85,  #0.45 for only two added features
    'colsample_bytree':0.45, 
    'bagging_fraction':0.8, 
    'bagging_freq':5, 
    'reg_lambda':0.2
}

# NOTE(review): LightgbmBLE is not among the names imported from base_layer_utils in
# the import cell — confirm it is imported before this cell runs on a fresh kernel.
lgb_stacker = LightgbmBLE(None, None, params=lgb_stacker_params, nb=False, seed=1001)

In [10]:
xgb_stacker = XGBoostBLE(None, None, params={}, nb=False, seed=1001)

Naive Bayes is disabled
XGBoostBase is initialized


In [114]:
# rf_stacker = SklearnBLE(RandomForestClassifier, params={}, seed=1001)

In [7]:
et_stacker = SklearnBLE(ExtraTreesClassifier, params={}, seed=1001)

In [44]:
logreg_stacker = SklearnBLE(LogisticRegression, params={}, seed=1001)

In [15]:
from random import shuffle
import random

# Keep the first field of every entry returned by show_scores() whose second field is
# at least 0.976 (presumably (model name, score) pairs — verify against the repo class).
selected = [entry[0] for entry in base_layer_results_repo.show_scores() if entry[1] >= 0.976]
print(len(selected))

# Seed the generator so the shuffled order is the same on every run.
random.seed(1001)
shuffle(selected)
#print(selected)

0.9827	ModelName.NBLSVC_wordtfidf_ng13_mf10w_chartfidf_ng25_mf20w_real
0.9826	ModelName.NBSVM_wordtfidf_ng13_mf10w_chartfidf_ng25_mf20w_real
0.9819	ModelName.ONESVC_wordtfidf_ng13_mf10w_chartfidf_ng25_mf20w_real
0.9818	ModelName.ONELOGREG_wordtfidf_ng13_mf10w_chartfidf_ng25_mf20w_real
0.9815	ModelName.NBLSVC_tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000
0.9803	ModelName.NBSVM_tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000
0.9796	ModelName.LOGREG_wordtfidf_word_(1, 1)_100000_1_1.0_char_(2, 5)_200000_1_1.0
0.9794	ModelName.LGB_tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000
0.9793	ModelName.LOGREG_tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000
0.9786	ModelName.ONESVC_wordtfidf_ng13_mf10w_chartfidf_ng25_mf20w
0.9774	ModelName.NBLSVC_tfidf_word_df2_ng(1, 1)_wmf200000
0.9768	ModelName.NBSVM_tfidf_word_df2_ng(1, 1)_wmf200000
0.9765	ModelName.NBLSVC_tfidf_word_df2_ng(1, 2)_wmf200000
0.9765	ModelName.NBLGB_wordtfidf_word_(1, 1)_100000

In [59]:
# Name suffixes for the three layer-2 XGBoost variants: first half of the selected
# models, second half, and all of them.
n1 = '_w_features_1stHalf16_seed1001'
n2 = '_w_features_2ndHalf16_seed1001'
n3 = '_w_features_All16_seed1001'
xgbmodel1, xgbmodel2, xgbmodel3 = (str(ModelName.XGB) + suffix for suffix in (n1, n2, n3))

# All three entries point at the same xgb_stacker instance.
model_pool = {name: xgb_stacker for name in (xgbmodel1, xgbmodel2, xgbmodel3)}

# Split point between the two halves of the shuffled selection.
half = len(selected) // 2
layer2_inputs = {
    xgbmodel1: base_layer_results_repo.get_results(chosen_ones=selected[:half]),
    xgbmodel2: base_layer_results_repo.get_results(chosen_ones=selected[half:]),
    xgbmodel3: base_layer_results_repo.get_results(chosen_ones=selected),
}

# logregmodel1 = str(ModelName.LOGREG)+n1
# logregmodel2 = str(ModelName.LOGREG)+n2
# logregmodel3 = str(ModelName.LOGREG)+n3
# model_pool[logregmodel1] = logreg_stacker
# model_pool[logregmodel2] = logreg_stacker
# model_pool[logregmodel3] = logreg_stacker
# layer2_inputs[logregmodel1] = base_layer_results_repo.get_results(chosen_ones=selected[:int(len(selected)/2)])
# layer2_inputs[logregmodel2] = base_layer_results_repo.get_results(chosen_ones=selected[int(len(selected)/2):])
# layer2_inputs[logregmodel3] = base_layer_results_repo.get_results(chosen_ones=selected)

# lgbmodel1 = str(ModelName.LGB)+n1
# lgbmodel2 = str(ModelName.LGB)+n2
# lgbmodel3 = str(ModelName.LGB)+n3
# model_pool[lgbmodel1] = lgb_stacker
# model_pool[lgbmodel2] = lgb_stacker
# model_pool[lgbmodel3] = lgb_stacker
# layer2_inputs[lgbmodel1] = base_layer_results_repo.get_results(chosen_ones=selected[:int(len(selected)/2)])
# layer2_inputs[lgbmodel2] = base_layer_results_repo.get_results(chosen_ones=selected[int(len(selected)/2):])
# layer2_inputs[lgbmodel3] = base_layer_results_repo.get_results(chosen_ones=selected)



In [60]:
model_pool

{'ModelName.XGB_w_features_1stHalf16_seed1001': <base_layer_utils.XGBoostBLE at 0x7f627e5ab0b8>,
 'ModelName.XGB_w_features_2ndHalf16_seed1001': <base_layer_utils.XGBoostBLE at 0x7f627e5ab0b8>,
 'ModelName.XGB_w_features_All16_seed1001': <base_layer_utils.XGBoostBLE at 0x7f627e5ab0b8>}

In [51]:
# selected[int(len(selected)/2):] # the good half (xgb 9854) # 1stHalf14
# selected = ['ModelName.NBSVM_tfidf_word_df2_ng(1, 1)_wmf200000',
#  'ModelName.LGB_tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000',
#  'ModelName.ONESVC_wordtfidf_ng13_mf10w_chartfidf_ng25_mf20w_real',
#  'ModelName.NBSVM_tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000',
#  'ModelName.LOGREG_tfidf_word_df2_ng(1, 1)_wmf200000',
#  'ModelName.ONELOGREG_wordtfidf_ng13_mf10w_chartfidf_ng25_mf20w_real',
#  'ModelName.LOGREG_tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000']

In [168]:
# selected[:int(len(selected)/2)] # the bad half (xgb 9853)
# ['ModelName.NBLSVC_tfidf_word_df2_ng(1, 1)_wmf200000',
#  'ModelName.ONESVC_wordtfidf_ng13_mf10w_chartfidf_ng25_mf20w',
#  'ModelName.NBLSVC_wordtfidf_ng13_mf10w_chartfidf_ng25_mf20w_real',
#  'ModelName.NBLSVC_tfidf_word_df2_ng(1, 2)_wmf200000',
#  'ModelName.NBSVM_wordtfidf_ng13_mf10w_chartfidf_ng25_mf20w_real',
#  'ModelName.NBLSVC_tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000',
#  'ModelName.NBSVM_tfidf_word_df2_ng(1, 2)_wmf200000']

# more features

In [37]:
# Extra per-comment features for the training set, read from a precomputed CSV.
# NOTE(review): absolute, user-specific path — the file must exist at this location.
train_more_features = pd.read_csv('/home/kai/data/wei/Toxic/dataset/shiyi_0313_features_train.csv')

# Column selection on a frame read from CSV is already dense, so no .to_dense()
# call is needed (DataFrame.to_dense() is not available in newer pandas).
global_train_features = train_more_features[['cleaned_word_count', 'word_count', 'unique_word_count', 
                    'cleaned_unique_word_count', 'ellipsis', 'exclamation_marks',
                    'question_marks', 'polarity_cleaned', 'polarity_ori']]

global_train_features.shape

(159571, 9)

In [38]:
# Extra per-comment features for the test set, read from a precomputed CSV.
# NOTE(review): absolute, user-specific path — the file must exist at this location.
test_more_features = pd.read_csv('/home/kai/data/wei/Toxic/dataset/shiyi_0313_features_test.csv')

# Column selection on a frame read from CSV is already dense, so no .to_dense()
# call is needed (DataFrame.to_dense() is not available in newer pandas).
global_test_features = test_more_features[['cleaned_word_count', 'word_count', 'unique_word_count', 
                    'cleaned_unique_word_count', 'ellipsis', 'exclamation_marks',
                    'question_marks', 'polarity_cleaned', 'polarity_ori']]

global_test_features.shape

(153164, 9)

In [39]:
# gc is already imported in the notebook's first cell; the re-import is harmless.
import gc
# Run a full garbage-collection pass; the cell output is the value gc.collect() returns.
gc.collect()

4056

In [63]:
# Alternative stacker kept for reference (not used by the loop below):
# import lightgbm as lgb
# stacker = lgb.LGBMClassifier(max_depth=3, metric="auc", n_estimators=125, num_leaves=10, 
#                              boosting_type="gbdt", learning_rate=0.1, feature_fraction=0.45, 
#                              colsample_bytree=0.45, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)

# Out-of-fold settings shared by every (model, label) pair.
SEED = 1001
NFOLDS = 4 # set folds for out-of-fold prediction

# model_id -> array of shape (n_test, len(label_cols)) holding the predictions of
# the layer-2 model after it has been refit on the full train matrix.
layer2_est_preds = {}

# label -> list with one get_oof() result per layer-2 model.
layer2_oof_train = {}
layer2_oof_test = {}

layer2_model_list = []

for model_name in model_pool.keys():
    print('Generating Layer2 model {} OOF'.format(model_name))

    # These depend only on the model, not on the label, so look them up once.
    model = model_pool[model_name]
    layer1_oof_train_loaded, layer1_oof_test_loaded, _ = layer2_inputs[model_name]
    model_id = '{}_{}'.format(model_name, 'layer2')

    for i, label in enumerate(label_cols):
        assert train.shape == (159571, 27)

        if str(ModelName.XGB) in model_name: # Done some grid search on xgb, so it has per label based best params
            print('XGB model chosen, setting params for {}'.format(label))
            model.set_params(get_best_xgb_params(label=label))

        x_train = combine_layer_oof_per_label(layer1_oof_train_loaded, label)
        x_test = combine_layer_oof_per_label(layer1_oof_test_loaded, label)

        # Add engineered features to layer 2. `.values` replaces DataFrame.as_matrix(),
        # which no longer exists in pandas >= 1.0.
        # Variant that also stacks the global features:
#         x_train = np.hstack([F_train[features].values, global_train_features, x_train])
#         x_test = np.hstack([F_test[features].values, global_test_features, x_test])
        x_train = np.hstack([F_train[features].values, x_train])
        x_test = np.hstack([F_test[features].values, x_test])

        # Pass the seed by keyword: the sixth positional parameter of get_oof is
        # `stratified`, so a positional SEED would be read as stratified=1001 and
        # the seed itself would silently fall back to its default.
        oof_train, oof_test = get_oof(model, x_train, train[label], x_test, NFOLDS, seed=SEED)

        if label not in layer2_oof_train:
            layer2_oof_train[label] = []
            layer2_oof_test[label] = []
        layer2_oof_train[label].append(oof_train)
        layer2_oof_test[label].append(oof_test)

        # Refit on the full train matrix and predict the test set directly.
        model.train(x_train, train[label])
        est_preds = model.predict(x_test)

        if model_id not in layer2_est_preds:
            layer2_est_preds[model_id] = np.empty((x_test.shape[0],len(label_cols)))
            layer2_model_list.append(model_id)
        layer2_est_preds[model_id][:,i] = est_preds

Generating Layer2 model ModelName.XGB_w_features_1stHalf16_seed1001 OOF
XGB model chosen, setting params for toxic
xgb grid search file loaded. shape:(81, 36)
starting predicting
predicting done
starting predicting
predicting done
starting predicting
predicting done
starting predicting
predicting done
starting predicting
predicting done
starting predicting
predicting done
starting predicting
predicting done
starting predicting
predicting done
starting predicting
predicting done
XGB model chosen, setting params for severe_toxic
xgb grid search file loaded. shape:(81, 36)
starting predicting
predicting done
starting predicting
predicting done
starting predicting
predicting done
starting predicting
predicting done
starting predicting
predicting done
starting predicting
predicting done
starting predicting
predicting done
starting predicting
predicting done
starting predicting
predicting done
XGB model chosen, setting params for obscene
xgb grid search file loaded. shape:(81, 36)
starting p

In [19]:
# Inspect the coef_ attribute of the estimator wrapped by the last-used `model`.
model.clf.coef_ # w features

array([[ 0.00873907, -0.02068911,  0.03144957, -0.09154736,  1.03971318,
         0.23963355,  0.56386026, -1.4400043 ,  1.48109455,  5.55383302,
         1.05722585]])

In [67]:
# Number of layer-2 OOF entries collected for the 'toxic' label.
len(layer2_oof_train['toxic'])

1

In [96]:
# NOTE(review): the recorded output of this cell is
# "AttributeError: 'int' object has no attribute 'shape'", so when it was run the
# first entry was an int rather than an array. Presumably stale kernel state from
# a different run; re-run after the layer-2 loop to confirm.
layer2_oof_train['toxic'][0].shape

AttributeError: 'int' object has no attribute 'shape'

In [65]:
# List the model ids that currently have direct layer-2 test predictions.
list(layer2_est_preds)

['ModelName.XGB_w_features_2ndHalf16_seed1001_layer2',
 'ModelName.XGB_w_features_1stHalf16_seed1001_layer2',
 'ModelName.XGB_w_features_All16_seed1001_layer2']

In [29]:
# Shape of the prediction array stored under the first key of layer2_est_preds.
layer2_est_preds[list(layer2_est_preds.keys())[0]].shape

(153164, 6)

In [67]:
# Hand the layer-2 OOF predictions, direct test predictions and model ids to the results repo.
base_layer_results_repo.add(layer2_oof_train, layer2_oof_test, layer2_est_preds, layer2_model_list)

In [68]:
# Print the scores held by the repo; the return value is bound to `_` so it is not echoed.
_ = base_layer_results_repo.show_scores()

0.9827	ModelName.NBLSVC_wordtfidf_ng13_mf10w_chartfidf_ng25_mf20w_real
0.9826	ModelName.NBSVM_wordtfidf_ng13_mf10w_chartfidf_ng25_mf20w_real
0.9819	ModelName.ONESVC_wordtfidf_ng13_mf10w_chartfidf_ng25_mf20w_real
0.9818	ModelName.ONELOGREG_wordtfidf_ng13_mf10w_chartfidf_ng25_mf20w_real
0.9815	ModelName.NBLSVC_tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000
0.9803	ModelName.NBSVM_tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000
0.9796	ModelName.LOGREG_wordtfidf_word_(1, 1)_100000_1_1.0_char_(2, 5)_200000_1_1.0
0.9794	ModelName.LGB_tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000
0.9793	ModelName.LOGREG_tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000
0.9786	ModelName.ONESVC_wordtfidf_ng13_mf10w_chartfidf_ng25_mf20w
0.9774	ModelName.NBLSVC_tfidf_word_df2_ng(1, 1)_wmf200000
0.9768	ModelName.NBSVM_tfidf_word_df2_ng(1, 1)_wmf200000
0.9765	ModelName.NBLSVC_tfidf_word_df2_ng(1, 2)_wmf200000
0.9765	ModelName.NBLGB_wordtfidf_word_(1, 1)_100000

In [56]:
# Record a score for the layer-2 LOGREG model.
# NOTE(review): 0.009833 is about 100x smaller than the other scores this repo
# prints (~0.98); possibly 0.9833 was intended. Confirm before saving.
base_layer_results_repo.add_score('ModelName.LOGREG_w_features_All16_seed1001_layer2', 0.009833)

ModelName.LOGREG_w_features_All16_seed1001_layer2 already existed in the repo. score: 0 update to 0.009833


In [69]:
# Persist the results repo.
base_layer_results_repo.save()

# Build a submission file from one layer-2 model's direct test predictions.
sub_title = 'xgb_topn_w_addedfeatures_'
submission = pd.read_csv(PATH + 'sample_submission.csv')
# NOTE(review): 'ModelName.XGB_layer2' does not match the ids listed from
# layer2_est_preds in this notebook (e.g. 'ModelName.XGB_w_features_All16_seed1001_layer2').
# Confirm the key exists before running.
submission[label_cols] = layer2_est_preds['ModelName.XGB_layer2']
# The current Unix time (whole seconds) is used as a suffix for the file name.
tempid = int(time.time())
print(tempid)
submission.to_csv('./StackPreds/TopN_XGB/{}_{}.csv'.format(sub_title, tempid), index=False)

In [31]:
def generate_base_layer_est_preds(base_layer_est_preds):
    """
    Write one submission CSV per entry of ``base_layer_est_preds``.

    Params:
        base_layer_est_preds: mapping of model id -> predictions assignable to the
            ``label_cols`` columns of the sample submission.
    Side effects:
        Prints the integer timestamp used for each file and writes
        ``./BaseEstPreds/<model id>_<timestamp>.csv``.
    """
    for model_key in base_layer_est_preds:
        # Start every file from a freshly read sample submission.
        frame = pd.read_csv(PATH + 'sample_submission.csv')#.head(1000)
        frame[label_cols] = base_layer_est_preds[model_key]

        stamp = int(time.time())
        print(stamp)

        out_path = './BaseEstPreds/' + model_key + '_' + str(stamp) + '.csv'
        frame.to_csv(out_path, index=False)

In [66]:
# Write one submission CSV per layer-2 model; each printed number is that file's timestamp suffix.
generate_base_layer_est_preds(layer2_est_preds)

1521040515
1521040516
1521040518


In [62]:
def get_best_xgb_params(xgb_grid_search_res_path='~/data/kaggle/toxic/sc/stacking/xgb_search.csv', idx = 0, label='toxic'):
    """
    Build an xgboost params dict from one row of a grid-search results CSV.

    The CSV is sorted by ``avg_auc`` (descending) and the ``idx``-th row of the
    sorted table supplies the values.

    Params: 
        xgb_grid_search_res_path: (str) path of the grid-search results CSV.
        idx: (int) rank of the row to load. idx=0 loads the best, idx=1 the 2nd best.
        label: (str) label whose ``<label>_pos_scale`` and ``<label>_best_round``
            columns fill ``scale_pos_weight`` and ``num_boost_round``.
    Returns:
        (dict) xgb params: fixed settings plus the searched values for that row.
    Raises:
        KeyError: a required column is missing (this includes an unknown label).
        IndexError: idx is beyond the number of rows in the CSV.
    """       
    xgb_search = pd.read_csv(xgb_grid_search_res_path).sort_values(by='avg_auc', ascending=False)
    
    print('xgb grid search file loaded. shape:{}'.format(xgb_search.shape))

    def pick(column):
        # Index column-wise rather than taking a whole row: a row spanning int and
        # float columns would be upcast, turning e.g. max_depth into a float.
        return xgb_search[column].values[idx]

    xgb_params = {
        'seed': 0,
        'colsample_bytree': pick('colsample_bytree'),
        'silent': 1,
        'subsample': pick('subsample'),
        'learning_rate': pick('lr'),
        'max_depth': pick('max_depth'),
        'gamma': pick('gamma'),
        'alpha': pick('alpha'),
        'nthread': 7,
        'min_child_weight': 1,
        'objective':'binary:logistic',
        'eval_metric':'auc'
    }

    # Only the two per-label columns of the requested label are read.
    xgb_params['scale_pos_weight'] = pick('{}_pos_scale'.format(label))
    xgb_params['num_boost_round'] = pick('{}_best_round'.format(label))
    
    return xgb_params

In [36]:
# Show the params dict built for the 'threat' label from the best grid-search row.
get_best_xgb_params(label='threat')

xgb grid search file loaded. shape:(81, 36)


{'alpha': 0,
 'colsample_bytree': 0.9,
 'eval_metric': 'auc',
 'gamma': 2,
 'learning_rate': 0.0966631,
 'max_depth': 3,
 'min_child_weight': 1,
 'nthread': 7,
 'num_boost_round': 119,
 'objective': 'binary:logistic',
 'scale_pos_weight': 120,
 'seed': 0,
 'silent': 1,
 'subsample': 0.9}

In [74]:
# Scratch check: an empty dict literal is an instance of dict.
isinstance({}, dict)

True

In [22]:
from sklearn.model_selection import KFold, StratifiedKFold

def get_oof(clf, x_train, y_train, x_test, nfolds, stratified=False, shuffle=True, seed=1001):
    """
    Produce out-of-fold (OOF) predictions for one estimator.

    For each fold, ``clf`` is trained on the other folds, predicts the held-out
    rows of ``x_train`` and predicts all of ``x_test``. The per-fold test
    predictions are averaged.

    Params:
        clf: object exposing ``train(x, y)`` and ``predict(x)``.
        x_train: train features, indexable by an integer index array and with ``.shape``.
        y_train: train targets, indexed through ``.iloc`` (e.g. a pandas Series).
        x_test: test features, with ``.shape``.
        nfolds: (int) number of folds.
        stratified: use StratifiedKFold instead of KFold when truthy.
        shuffle: whether the splitter shuffles rows before splitting.
        seed: random state of the splitter; only applied when ``shuffle`` is truthy.
    Returns:
        (oof_train, oof_test): arrays of shape (n_train, 1) and (n_test, 1).
    """
    n_train = x_train.shape[0]
    n_test = x_test.shape[0]
    oof_train = np.zeros((n_train,))
    fold_test_preds = np.empty((nfolds, n_test))

    # scikit-learn >= 0.24 raises ValueError when random_state is set while
    # shuffle=False, so only forward the seed when shuffling is on.
    random_state = seed if shuffle else None
    splitter_cls = StratifiedKFold if stratified else KFold
    kf = splitter_cls(n_splits=nfolds, shuffle=shuffle, random_state=random_state)

    for fold, (tr_index, te_index) in enumerate(kf.split(x_train, y_train)):
        x_tr, x_te = x_train[tr_index], x_train[te_index]
        y_tr = y_train.iloc[tr_index]

        clf.train(x_tr, y_tr)

        oof_train[te_index] = clf.predict(x_te)
        fold_test_preds[fold, :] = clf.predict(x_test)

    # Average the test predictions of the per-fold models.
    oof_test = fold_test_preds.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

# Put in our parameters for said classifiers
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    # warm_start must stay off for out-of-fold use: with warm_start=True and an
    # unchanged n_estimators, a second fit() on the same forest grows no new trees,
    # so later folds would reuse the forest fitted on the first fold.
    'warm_start': False,
    #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees parameters (a fractional max_features such as 0.5 was left disabled)
et_params = dict(
    n_jobs=-1,
    n_estimators=500,
    max_depth=8,
    min_samples_leaf=2,
    verbose=0,
)

# AdaBoost parameters
ada_params = dict(
    n_estimators=500,
    learning_rate=0.75,
)

# Gradient Boosting parameters (a fractional max_features such as 0.2 was left disabled)
gb_params = dict(
    n_estimators=500,
    max_depth=5,
    min_samples_leaf=2,
    verbose=0,
)

# Support Vector Classifier parameters
svc_params = dict(
    kernel='linear',
    C=0.025,
)