In [1]:
# Load in our libraries
from tqdm import tqdm_notebook
import tqdm
import pandas as pd
import numpy as np
import re
import sklearn
import xgboost as xgb
import lightgbm as lgb
#import seaborn as sns
#import matplotlib.pyplot as plt
%matplotlib inline

# import plotly.offline as py
# py.init_notebook_mode(connected=True)
# import plotly.graph_objs as go
# import plotly.tools as tls

import warnings
warnings.filterwarnings('ignore')

# Going to use these 5 base models for the stacking
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC, LinearSVC
from sklearn.cross_validation import KFold
from fast_text_data import fasttext_data_process
from onevsone_data import onevsone_data_process

In [2]:
import pandas as pd
import numpy as np
import re, time
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, auc
from scipy.sparse import csr_matrix, hstack

In [3]:
PATH = '~/data/toxic/data/'

train = pd.read_csv(PATH + 'cleaned_train.csv')
#train = pd.read_csv('/home/kai/data/wei/Toxic/data/Shiyi_training.csv').fillna('na')

test = pd.read_csv(PATH + 'cleaned_test.csv')

#train = train.head(1000)
#test = test.head(1000)

train_sentence = train['comment_text_cleaned']
test_sentence = test['comment_text_cleaned']

text = pd.concat([train_sentence, test_sentence])

print(train.shape)
print(test.shape)

(159571, 27)
(153164, 21)


In [5]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [4]:
from enum import Enum
class ModelName(Enum):
    XGB = 1
    LGB = 2
    LOGREG = 3
    NBSVM = 4
    RF = 5 # random forest
    RNN = 6
    NBLSVC = 7
    ONESVC = 8
    ONELOGREG = 9

class BaseLayerDataRepo():
    def __init__(self):
        self._data_repo = {}
    
    def add_data(self, data_id, x_train, x_test, y_train, label_cols, compatible_model=[ModelName.LOGREG], rnn_data=False):
        """
        x_train, x_test: ndarray
        y_train: pd df
        """
        temp = {}
        
        temp['data_id'] = data_id
        temp['x_train'] = x_train
        temp['x_test'] = x_test
        temp['labes_cols'] = label_cols
        temp['compatible_model'] = set(compatible_model)
        
        if rnn_data: 
            temp['y_train'] = y_train # here y_train is a df
        else:
            label_dict = {}
            for col in label_cols:
                label_dict[col] = y_train[col]
            temp['y_train'] = label_dict # hence y_train is a dict with labels as keys
        
        self._data_repo[data_id] = temp
    
    def get_data(self, data_id):
        return self._data_repo[data_id]
    
    def remove_data(self, data_id):
        self._data_repo.pop(data_id, None)
        
    def get_compatible_model(self, data_id):
        return self._data_repo[data_id]['compatible_model']
    
    def remove_compatible_model(self, data_id, model_name):
        return self._data_repo[data_id]['compatible_model'].discard(model_name)
    
    def add_compatible_model(self, data_id, model_name):
        return self._data_repo[data_id]['compatible_model'].add(model_name)
                  
    def get_data_by_compatible_model(self, model_name):
        data_to_return = []
        for data_id in self._data_repo.keys():
            data = self._data_repo[data_id]
            if model_name in data['compatible_model']:
                data_to_return.append(data)
        return data_to_return
    
    def __len__(self):
        return len(self._data_repo)
    
    def __str__(self):
        output = ''
        for data_id in self._data_repo.keys():
            output+='data_id: {:20} \n\tx_train: {}\tx_test: {}\n\ty_train type: {}\n\tcompatible_model: {}\n '\
            .format(data_id, self._data_repo[data_id]['x_train'].shape, \
                    self._data_repo[data_id]['x_test'].shape, \
                    type(self._data_repo[data_id]['y_train']), \
                    self._data_repo[data_id]['compatible_model'])
        return output

In [7]:
bldr = BaseLayerDataRepo()

In [8]:
x_train_1v1, y_train_1v1, x_test_1v1, data_id_1v1 = onevsone_data_process()

loading data done!
fitting char
fitting phrase
transforming train char
transforming train phrase
transforming test char
transforming test phrase


In [9]:
compatible_models= [ModelName.ONESVC, ModelName.ONELOGREG]
bldr.add_data(data_id_1v1, x_train_1v1, x_test_1v1, y_train_1v1, label_cols, compatible_models)

In [10]:
print(bldr)

data_id: wordtfidf_ng13_mf10w_chartfidf_ng25_mf20w_real 
	x_train: (159571, 300000)	x_test: (153164, 300000)
	y_train type: <class 'dict'>
	compatible_model: {<ModelName.ONESVC: 8>, <ModelName.ONELOGREG: 9>}
 


In [8]:
x_train_rnn, y_train_rnn, x_test_rnn, _, _ = fasttext_data_process()#first_n_entries=100)

rnn_data_id = 'rnn_data_001'
compatible_models= [ModelName.RNN]
bldr.add_data(rnn_data_id, x_train_rnn, x_test_rnn, y_train_rnn, label_cols, compatible_models, True)


Loading data
train shape: (159571, 27). test shape: (153164, 21)

Loading FT model
300
window: 200. dimension(n_features): 300


In [9]:
print(bldr)

data_id: rnn_data_001         
	x_train: (159571, 200, 300)	x_test: (153164, 200, 300)
	y_train type: <class 'pandas.core.frame.DataFrame'>
	compatible_model: {<ModelName.RNN: 6>} 


In [8]:
#bldr = BaseLayerDataRepo()

for min_df in [2]:
    for word_ngram_range in [(1,1),(1,2)]:#,(4,4),(5,6)]:#,(1,3),(4,4),(5,6)]:
        #min_df = i
        #word_ngram_range = (1,1)
        #char_ngram_range = (1,5)
        word_max_features = 200000
        #char_max_features = 100000
        token_pattern = r'\w{%d,}'%3

        data_id = 'tfidf_word_df%d_ng%s_wmf%s'%(min_df,str(word_ngram_range),str(word_max_features))

        word_vec = TfidfVectorizer(analyzer='word',
                                  min_df=1,
                                  ngram_range=word_ngram_range,
                                  max_features=word_max_features,
                                  token_pattern=token_pattern,
                                  stop_words='english',
                                  strip_accents='unicode',
                                  sublinear_tf=True)
        
        train_term_doc = word_vec.fit_transform(train.comment_text)
        test_term_doc = word_vec.transform(test.comment_text)
        #pdb.set_trace()
        #np.save(DATA_PATH + data_id+'_x_train.npy', train_term_doc)
        #np.save(DATA_PATH + data_id+'_x_test.npy', test_term_doc)
    
        compatible_models = [ModelName.LGB, ModelName.LOGREG, ModelName.NBSVM, ModelName.NBLSVC, ModelName.RF, ModelName.XGB]
        bldr.add_data(data_id, train_term_doc, test_term_doc, train[label_cols], label_cols, compatible_models)

In [53]:
for min_df in [2]:
    for word_ngram_range in [(1,2)]:#,(4,4),(5,6)]:#,(1,3),(4,4),(5,6)]:
        for char_max_df in [0.3]:
            #min_df = i
            #word_ngram_range = (1,1)
            #char_ngram_range = (1,5)
            word_max_features = 100000
            char_max_features = 100000
            token_pattern = r'\w{%d,}'%3

            data_id = 'tfidf_wordchar_charmaxdf%f_ng%s_wmf%s_cmf%s'%(char_max_df,str(word_ngram_range),str(word_max_features),str(char_max_features))

            word_vec = TfidfVectorizer(analyzer='word',
                                      min_df=1,
                                      ngram_range=word_ngram_range,
                                      max_features=word_max_features,
                                      token_pattern=token_pattern,
                                      stop_words='english',
                                      strip_accents='unicode',
                                      sublinear_tf=True)


            char_vec = TfidfVectorizer(analyzer='char',
                                      min_df = 1,
                                      max_df = char_max_df,
                                      ngram_range=(2,7), 
                                      max_features=char_max_features, 
                                      #stop_words='english',
                                      strip_accents='unicode',
                                      sublinear_tf=True)

            train_word_doc = word_vec.fit_transform(train.comment_text)
            test_word_doc = word_vec.transform(test.comment_text)

            train_char_doc = char_vec.fit_transform(train.comment_text)
            test_char_doc = char_vec.transform(test.comment_text)

            train_term_tfidf = hstack((train_word_doc, train_char_doc), format='csr')
            test_term_tfidf = hstack((test_word_doc, test_char_doc), format='csr')

            #np.save(DATA_PATH + data_id+'_x_train.npy', train_term_tfidf)
            #np.save(DATA_PATH + data_id+'_x_test.npy', test_term_tfidf)

            compatible_models = [ModelName.LGB, ModelName.LOGREG, ModelName.NBSVM, ModelName.NBLSVC, ModelName.RF, ModelName.XGB]
            bldr.add_data(data_id, train_term_tfidf, test_term_tfidf, train[label_cols], label_cols, compatible_models)

import pickle
with open(data_id_list_file, 'wb') as f:
    pickle.dump(data_ids, f)

with open(data_id_list_file, 'rb') as f:
    data_ids = pickle.load(f)

for data_id in data_ids:
    bldr.add_data(data_id, np.load(DATA_PATH+data_id+'_x_train.npy'), np.load(DATA_PATH+data_id+'_x_test.npy'), train[label_cols], label_cols, ['logreg','gbm','rf','nbsvm'])

In [129]:
print(bldr)

data_id: rnn_data_001         
	x_train: (159571, 200, 300)	x_test: (153164, 200, 300)
	y_train type: <class 'pandas.core.frame.DataFrame'>
	compatible_model: {<ModelName.RNN: 6>} data_id: tfidf_word_df2_ng(1, 2)_wmf200000 
	x_train: (159571, 200000)	x_test: (153164, 200000)
	y_train type: <class 'dict'>
	compatible_model: {<ModelName.NBLSVC: 7>, <ModelName.XGB: 1>, <ModelName.LOGREG: 3>, <ModelName.LGB: 2>, <ModelName.RF: 5>, <ModelName.NBSVM: 4>} data_id: tfidf_word_df2_ng(1, 1)_wmf200000 
	x_train: (159571, 184719)	x_test: (153164, 184719)
	y_train type: <class 'dict'>
	compatible_model: {<ModelName.NBLSVC: 7>, <ModelName.XGB: 1>, <ModelName.LOGREG: 3>, <ModelName.LGB: 2>, <ModelName.RF: 5>, <ModelName.NBSVM: 4>} data_id: tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000 
	x_train: (159571, 200000)	x_test: (153164, 200000)
	y_train type: <class 'dict'>
	compatible_model: {<ModelName.NBLSVC: 7>, <ModelName.XGB: 1>, <ModelName.LOGREG: 3>, <ModelName.LGB: 2>, <ModelName.RF:

In [130]:
for data in bldr.get_data_by_compatible_model(ModelName.LGB):
    print(data['compatible_model'])
    print(data['x_train'].shape)

{<ModelName.NBLSVC: 7>, <ModelName.XGB: 1>, <ModelName.LOGREG: 3>, <ModelName.LGB: 2>, <ModelName.RF: 5>, <ModelName.NBSVM: 4>}
(159571, 200000)
{<ModelName.NBLSVC: 7>, <ModelName.XGB: 1>, <ModelName.LOGREG: 3>, <ModelName.LGB: 2>, <ModelName.RF: 5>, <ModelName.NBSVM: 4>}
(159571, 184719)
{<ModelName.NBLSVC: 7>, <ModelName.XGB: 1>, <ModelName.LOGREG: 3>, <ModelName.LGB: 2>, <ModelName.RF: 5>, <ModelName.NBSVM: 4>}
(159571, 200000)


In [28]:
def get_oof(clf, x_train, y_train, x_test, nfolds):
    #pdb.set_trace()
    ntrain = x_train.shape[0]
    ntest = x_test.shape[0]
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((nfolds, ntest))
    kf = KFold(ntrain, n_folds=nfolds)#, random_state=SEED)

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]
        #pdb.set_trace()
        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [11]:
def get_oof_1v1(clf, x_train, y_train, x_test, nfolds, label):
    #pdb.set_trace()
    ntrain = x_train.shape[0]
    ntest = x_test.shape[0]
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((nfolds, ntest))
    kf = KFold(ntrain, n_folds=nfolds)#, random_state=SEED)

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]
        #pdb.set_trace()
        clf.train(x_tr, y_tr, label)

        oof_train[test_index] = clf.predict(x_te, label)
        oof_test_skf[i, :] = clf.predict(x_test, label)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [29]:
def get_oof_rnn(clf, x_train, y_train, x_test, nfolds, number_labels):
    ntrain = x_train.shape[0]
    ntest = x_test.shape[0]
    oof_train = np.zeros((ntrain,number_labels))
    oof_test = np.zeros((ntest,number_labels))
    oof_test_skf = np.empty((nfolds, ntest, number_labels))
    kf = KFold(ntrain, n_folds=nfolds)#, random_state=SEED)

    for i, (train_index, test_index) in enumerate(kf):
        ################################################################ maybe shuffle train_index
        x_tr = x_train[train_index]
        #pdb.set_trace()
        y_tr = y_train.iloc[train_index]
        x_te = x_train[test_index]
        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train, oof_test

In [12]:
def generate_base_layer_est_preds(base_layer_est_preds):
    for key in base_layer_est_preds:
        submission = pd.read_csv(PATH + 'sample_submission.csv')#.head(1000)
        submission[label_cols] = base_layer_est_preds[key]
        sub_id = int(time.time())
        print(sub_id)
        submission.to_csv('./BaseEstPreds/' + key + '_' + str(sub_id) + '.csv', index=False)

In [141]:
import gc
gc.collect()

1313

In [132]:
rnn_model_pool = {}

In [133]:
x_train_rnn.shape[1], x_train_rnn.shape[2] 

(200, 300)

In [134]:
rnn_ble = RnnBLE(x_train_rnn.shape[1], x_train_rnn.shape[2], label_cols, epochs=2)
rnn_model_pool[ModelName.RNN] = rnn_ble

In [142]:
base_layer_est_preds = {} # directly preditions from the base layer estimators

layer1_oof_train = {}
layer1_oof_test = {}

model_data_id_list = []

for model_name in rnn_model_pool.keys():
    for data in bldr.get_data_by_compatible_model(model_name):
        x_train = data['x_train']
        y_train = data['y_train']
        x_test = data['x_test']

        SEED = 0 # for reproducibility
        NFOLDS = 4 # set folds for out-of-fold prediction
        #print(x_train.shape,y_train.shape,x_test.shape,label)

        current_run = '{} {}'.format(model_name,data['data_id'])
        print('Generating: '+current_run)

        oof_train, oof_test = get_oof_rnn(rnn_model_pool[model_name], \
                                          x_train, y_train, x_test, NFOLDS, len(label_cols))
        for i, label in enumerate(label_cols):
            if label not in layer1_oof_train:
                layer1_oof_train[label] = []
                layer1_oof_test[label] = []
            layer1_oof_train[label].append(oof_train[:, i].reshape(-1,1)) # before reshape: (159571,) after: (159571, 1) => good for np.concatenate
            layer1_oof_test[label].append(oof_test[:, i].reshape(-1,1))

        model_data_id = '{}_{}'.format(model_name, data['data_id'])
        model_data_id_list.append(model_data_id)
        model = rnn_model_pool[model_name]
        model.train(x_train, y_train) ################################ maybe shuffle x_train along with y_train?
        est_preds = model.predict(x_test)

        base_layer_est_preds[model_data_id] = est_preds

Generating: ModelName.RNN rnn_data_001
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2


In [143]:
generate_base_layer_est_preds(base_layer_est_preds)

1519810384


In [None]:
base_layer_est_preds['ModelName.RNN_rnn_data_001'].shape

In [13]:
from base_layer_estimator import OneVSOneReg

In [14]:
onevsone_svc = OneVSOneReg(x_train_1v1, y_train_1v1, model='svc')
onevsone_logreg = OneVSOneReg(x_train_1v1, y_train_1v1)

OneVsOne is using svc kernel
calculating naive bayes for toxic
calculating naive bayes for severe_toxic
calculating naive bayes for obscene
calculating naive bayes for threat
calculating naive bayes for insult
calculating naive bayes for identity_hate
initializing done
OneVsOne is using svc kernel
OneVsOne is using logistic kernel
calculating naive bayes for toxic
calculating naive bayes for severe_toxic
calculating naive bayes for obscene
calculating naive bayes for threat
calculating naive bayes for insult
calculating naive bayes for identity_hate
initializing done
OneVsOne is using logistic kernel


In [15]:
model_pool = {}
model_pool[ModelName.ONESVC] = onevsone_svc
model_pool[ModelName.ONELOGREG] = onevsone_logreg

In [144]:
model_pool = {}

SEED = 0

logreg_params = {
    'n_jobs': 3
}
logreg_ble = SklearnBLE(LogisticRegression, seed=SEED, params=logreg_params)
model_pool[ModelName.LOGREG] = logreg_ble


#rf_ble = SklearnBLE(RandomForestClassifier, seed=SEED, params={'n_jobs': 3})
#model_pool[ModelName.RF] = rf_ble


nblsvc_params = {
    'C':0.02
}
nblsvc_ble = NbSvmBLE(mode=ModelName.NBLSVC, seed=SEED, params=nblsvc_params)
model_pool[ModelName.NBLSVC] = nblsvc_ble


nbsvm_params = {
    'C':1.0,
    'dual':True,
    'n_jobs':3
}
nbsvm_ble = NbSvmBLE(mode=ModelName.NBSVM, seed=SEED, params=nbsvm_params)
model_pool[ModelName.NBSVM] = nbsvm_ble

#rf_ble = SklearnBLE(RandomForestClassifier, seed=SEED, params={})
#xgb_ble = XgbBLE(params=xgb_params)

lgb_params = {
    'learning_rate': 0.2,
    'application': 'binary',
    'num_leaves': 31,
    'verbosity': -1,
    'metric': 'auc',
    'data_random_seed': 2,
    'bagging_fraction': 0.8,
    'feature_fraction': 0.6,
    'nthread': 8,
    'lambda_l1': 1,
    'lambda_l2': 1
}
lgb_ble = LightgbmBLE(params=lgb_params)
model_pool[ModelName.LGB] = lgb_ble


# lg = SklearnBLE(clf=LogisticRegression, seed=SEED, params={'n_jobs': 1})

# et = SklearnBLE(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
# ada = SklearnBLE(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
# gb = SklearnBLE(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
# svc = SklearnBLE(clf=SVC, seed=SEED, params=svc_params)

#model_pool['rf'] = rf_ble

In [19]:
########################## ONE VS ONE ######################
base_layer_est_preds = {} # directly preditions from the base layer estimators

layer1_oof_train = {}
layer1_oof_test = {}

model_data_id_list = []

for i, label in enumerate(label_cols):
#     layer1_oof_train[label] = []
#     layer1_oof_test[label] = []
    for model_name in model_pool.keys():
        for data in bldr.get_data_by_compatible_model(model_name):
            x_train = data['x_train']
            y_train = data['y_train'][label]
            x_test = data['x_test']
            
            SEED = 0 # for reproducibility
            NFOLDS = 4 # set folds for out-of-fold prediction
            #print(x_train.shape,y_train.shape,x_test.shape,label)
            
            current_run = '{} {} {}'.format(model_name,data['data_id'],label)
            print('Generating: '+current_run)
            
            if model_name == ModelName.LGB:
                model = LogisticRegression(solver='sag')
                sfm = SelectFromModel(model, threshold='5*mean')
                print('dimension before selecting: train:{} test:{}'.format(x_train.shape, x_test.shape))
                x_train = sfm.fit_transform(x_train, y_train)
                x_test = sfm.transform(x_test)
                print('dimension after selecting: train:{} test:{}'.format(x_train.shape, x_test.shape))
            
            oof_train, oof_test = get_oof_1v1(model_pool[model_name],  x_train, y_train, x_test, NFOLDS, label) # Logreg
            if label not in layer1_oof_train:
                layer1_oof_train[label] = []
                layer1_oof_test[label] = []
            layer1_oof_train[label].append(oof_train)
            layer1_oof_test[label].append(oof_test)
            
            model_data_id = '{}_{}'.format(model_name, data['data_id'])
            model = model_pool[model_name]
            model.train(x_train, y_train, label)
            est_preds = model.predict(x_test, label)
            
            if model_data_id not in base_layer_est_preds:
                base_layer_est_preds[model_data_id] = np.empty((x_test.shape[0],len(label_cols)))
                model_data_id_list.append(model_data_id)
            base_layer_est_preds[model_data_id][:,i] = est_preds

Generating: ModelName.ONESVC wordtfidf_ng13_mf10w_chartfidf_ng25_mf20w_real toxic
Starting One vs One dataset splitting
splitting done!
start training linear svc regression
training done
applying naive bayes to dataset
predicting
predicting done
applying naive bayes to dataset
predicting
predicting done
Starting One vs One dataset splitting
splitting done!
start training linear svc regression
training done
applying naive bayes to dataset
predicting
predicting done
applying naive bayes to dataset
predicting
predicting done
Starting One vs One dataset splitting
splitting done!
start training linear svc regression
training done
applying naive bayes to dataset
predicting
predicting done
applying naive bayes to dataset
predicting
predicting done
Starting One vs One dataset splitting
splitting done!
start training linear svc regression
training done
applying naive bayes to dataset
predicting
predicting done
applying naive bayes to dataset
predicting
predicting done
Starting One vs One datase

predicting
predicting done
Starting One vs One dataset splitting
splitting done!
start training logistic regression
training done
applying naive bayes to dataset
predicting
predicting done
applying naive bayes to dataset
predicting
predicting done
Starting One vs One dataset splitting
splitting done!
start training logistic regression
training done
applying naive bayes to dataset
predicting
predicting done
applying naive bayes to dataset
predicting
predicting done
Starting One vs One dataset splitting
splitting done!
start training logistic regression
training done
applying naive bayes to dataset
predicting
predicting done
applying naive bayes to dataset
predicting
predicting done
Starting One vs One dataset splitting
splitting done!
start training logistic regression
training done
applying naive bayes to dataset
predicting
predicting done
Generating: ModelName.ONESVC wordtfidf_ng13_mf10w_chartfidf_ng25_mf20w_real insult
Starting One vs One dataset splitting
splitting done!
start train

In [145]:
# base_layer_est_preds = {} # directly preditions from the base layer estimators

# layer1_oof_train = {}
# layer1_oof_test = {}

for i, label in enumerate(label_cols):
#     layer1_oof_train[label] = []
#     layer1_oof_test[label] = []
    for model_name in model_pool.keys():
        for data in bldr.get_data_by_compatible_model(model_name):
            x_train = data['x_train']
            y_train = data['y_train'][label]
            x_test = data['x_test']
            
            SEED = 0 # for reproducibility
            NFOLDS = 4 # set folds for out-of-fold prediction
            #print(x_train.shape,y_train.shape,x_test.shape,label)
            
            current_run = '{} {} {}'.format(model_name,data['data_id'],label)
            print('Generating: '+current_run)
            
            if model_name == ModelName.LGB:
                model = LogisticRegression(solver='sag')
                sfm = SelectFromModel(model, threshold='5*mean')
                print('dimension before selecting: train:{} test:{}'.format(x_train.shape, x_test.shape))
                x_train = sfm.fit_transform(x_train, y_train)
                x_test = sfm.transform(x_test)
                print('dimension after selecting: train:{} test:{}'.format(x_train.shape, x_test.shape))
            
            oof_train, oof_test = get_oof(model_pool[model_name],  x_train, y_train, x_test, NFOLDS) # Logreg
            if label not in layer1_oof_train:
                layer1_oof_train[label] = []
                layer1_oof_test[label] = []
            layer1_oof_train[label].append(oof_train)
            layer1_oof_test[label].append(oof_test)
            
            model_data_id = '{}_{}'.format(model_name, data['data_id'])
            model = model_pool[model_name]
            model.train(x_train, y_train)
            est_preds = model.predict(x_test)
            
            if model_data_id not in base_layer_est_preds:
                base_layer_est_preds[model_data_id] = np.empty((x_test.shape[0],len(label_cols)))
                model_data_id_list.append(model_data_id)
            base_layer_est_preds[model_data_id][:,i] = est_preds

Generating: ModelName.NBSVM tfidf_word_df2_ng(1, 2)_wmf200000 toxic
Generating: ModelName.NBSVM tfidf_word_df2_ng(1, 1)_wmf200000 toxic
Generating: ModelName.NBSVM tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000 toxic
Generating: ModelName.LGB tfidf_word_df2_ng(1, 2)_wmf200000 toxic
dimension before selecting: train:(159571, 200000) test:(153164, 200000)
dimension after selecting: train:(159571, 5688) test:(153164, 5688)
Generating: ModelName.LGB tfidf_word_df2_ng(1, 1)_wmf200000 toxic
dimension before selecting: train:(159571, 184719) test:(153164, 184719)
dimension after selecting: train:(159571, 8238) test:(153164, 8238)
Generating: ModelName.LGB tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000 toxic
dimension before selecting: train:(159571, 200000) test:(153164, 200000)
dimension after selecting: train:(159571, 3477) test:(153164, 3477)
Generating: ModelName.NBLSVC tfidf_word_df2_ng(1, 2)_wmf200000 toxic
Generating: ModelName.NBLSVC tfidf_word_df2_ng(1, 

Generating: ModelName.LOGREG tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000 identity_hate


In [20]:
generate_base_layer_est_preds(base_layer_est_preds)

1519871930
1519871931


In [None]:
list(base_layer_est_preds)

In [None]:
list(layer1_oof_train) # list keys

In [None]:
len(layer1_oof_train['toxic']) # number of models to stack (each model will predict one set of toxic, servere_toxic, etc..)

In [None]:
len(layer1_oof_train['toxic'][0]) # examples in oof_train (meta features, x_train) (meta labels are in train[label])

In [None]:
list(layer1_oof_test)

In [None]:
len(layer1_oof_test['toxic'][0]) # examples in oof_test (will be used by meta model (after validation) to predict the final prediction)

### before we choose which models to assemble, we can do:
#### 1. scatter plot analysis to check the diversity
#### 2. submit to check if the models have similar performance

## Ensembling:

In [21]:
def combine_layer_oof_per_label(layer1_oof_dict, label):
    x = None
    data_list = layer1_oof_dict[label]
    for i in range(len(data_list)):
        if i == 0:
            x = data_list[0]
        else:
            x = np.concatenate((x, data_list[i]), axis=1)
    return x

### 1. simple blend of two models

result = np.empty((test.shape[0],len(label_cols)))

# mix the first two models
for i, label in enumerate(label_cols):
    x_train = combine_layer_oof_per_label(layer1_oof_train, label)
    x_test = combine_layer_oof_per_label(layer1_oof_test, label)
    for j in range(x_train.shape[1]):
        roc = roc_auc_score(train[label], x_train[:,j])
        print(label, j, roc) # print out roc for meta feature on meta label (which is just the original train label)
    
    roc_scores_of_a_label = []
    alphas = np.linspace(0,1,1001)
    best_roc = 0
    best_alpha = 0
    for alpha in alphas:
        roc = roc_auc_score(train[label], alpha*x_train[:,0] + (1-alpha)*x_train[:,1])
        if roc > best_roc:
            best_roc = roc
            best_alpha = alpha
    
    print(label, best_roc, best_alpha)
    result[:,i] = best_alpha*x_test[:,0] + (1-best_alpha)*x_test[:,1]

submission = pd.read_csv(PATH + 'sample_submission.csv')#.head(1000)
submission[label_cols] = result
sub_id = int(time.time())
print(sub_id)
submission.to_csv('./StackPreds/mixtwo_' + str(sub_id) + '.csv', index=False)

### 2. stacking

In [151]:
list(layer1_oof_train)

['insult', 'toxic', 'obscene', 'threat', 'severe_toxic', 'identity_hate']

In [24]:
base_layer_results_repo = BaseLayerResultsRepo()#load_from_file=False)

load from file


In [25]:
model_data_id_list

['ModelName.ONESVC_wordtfidf_ng13_mf10w_chartfidf_ng25_mf20w_real',
 'ModelName.ONELOGREG_wordtfidf_ng13_mf10w_chartfidf_ng25_mf20w_real']

In [26]:
base_layer_results_repo.add(layer1_oof_train, layer1_oof_test, base_layer_est_preds, model_data_id_list)

In [38]:
base_layer_results_repo.show_scores()

0.9825	ModelName.RNN_rnn_data_001
0.9819	ModelName.ONESVC_wordtfidf_ng13_mf10w_chartfidf_ng25_mf20w_real
0.9818	ModelName.ONELOGREG_wordtfidf_ng13_mf10w_chartfidf_ng25_mf20w_real
0.9815	ModelName.NBLSVC_tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000
0.9803	ModelName.NBSVM_tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000
0.9794	ModelName.LGB_tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000
0.9793	ModelName.LOGREG_tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000
0.9786	ModelName.ONESVC_wordtfidf_ng13_mf10w_chartfidf_ng25_mf20w
0.9774	ModelName.NBLSVC_tfidf_word_df2_ng(1, 1)_wmf200000
0.9768	ModelName.NBSVM_tfidf_word_df2_ng(1, 1)_wmf200000
0.9765	ModelName.NBLSVC_tfidf_word_df2_ng(1, 2)_wmf200000
0.9761	ModelName.NBSVM_tfidf_word_df2_ng(1, 2)_wmf200000
0.976	ModelName.LOGREG_tfidf_word_df2_ng(1, 1)_wmf200000
0.9752	ModelName.LOGREG_tfidf_word_df2_ng(1, 2)_wmf200000
0.9726	ModelName.LGB_tfidf_word_df2_ng(1, 2)_wmf200000
0.9723	ModelNam

In [161]:
model_threshold = np.random.choice([0.9803, 0.9794, 0.9793, 0.9786, 0.9774, 0.9768, 0.9765])

0.9803

In [29]:
#base_layer_results_repo.add_score('ModelName.ONELOGREG_wordtfidf_ng13_mf10w_chartfidf_ng25_mf20w_real',0.9818)

ModelName.ONELOGREG_wordtfidf_ng13_mf10w_chartfidf_ng25_mf20w_real already existed in the repo. score: 0 update to 0.9818


In [39]:
# base_layer_results_repo.save()

In [48]:
layer1_oof_train_loaded, layer1_oof_test_loaded, base_layer_est_preds_loaded = base_layer_results_repo.get_results(threshold=0.9774)

In [49]:
len(layer1_oof_train_loaded['toxic']) # number of models that will be stacked

9

len(layer1_oof_train_temp[label_cols[5]]) == len(layer1_oof_test_temp[label_cols[5]]) == len(list(base_layer_est_preds_temp)) == 1
                                                                                             

In [509]:
train.shape

(159571, 27)

In [206]:
int(time.time()* 1000000 % 45234634)

40881912

f = open('./xgb_search.csv', 'a')
header = 'time,id,threshold,num_models,colsample_bytree,lr,max_depth,subsample,\
gamma,alpha,cv_num_round,cv_nfolds,toxic_best_round,severe_toxic_best_round,\
obscene_best_round,threat_best_round,insult_best_round,identity_hate_best_round,\
toxic_auc,severe_toxic_auc,obscene_auc,threat_auc,insult_auc,identity_hate_auc,avg_auc\n'
f.write(header)
f.close()

In [212]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
import datetime, time, gc

for i in range(100):
    np.random.seed(int(time.time()* 1000000) % 45234634)
    
    model_threshold = np.random.choice([0.9803, 0.9794, 0.9793, 0.9786, 0.9774, 0.9768, 0.9765])
    layer1_oof_train_loaded, layer1_oof_test_loaded, base_layer_est_preds_loaded = base_layer_results_repo.get_results(threshold=model_threshold)
    gc.collect() 
        
    xgb_colsample_bytree = np.random.randint(5, 10)/10
    xgb_learning_rate = 1e-2 * (0.1 ** (np.random.rand() * 2 - 1.0)) # 0.001 to 0.0997
    xgb_max_depth = np.random.randint(2, 8)
    xgb_subsample = np.random.randint(50, 100)/100
    xgb_gamma = np.random.randint(0, 3)
    xgb_alpha = np.random.randint(0, 2)

    xgb_cv_seed = 0
    xgb_cv_num_round = np.random.randint(400, 1000)
    xgb_cv_nfolds = np.random.randint(3,5)

    xgb_params = {
        'seed': 0,
        'colsample_bytree': xgb_colsample_bytree,
        'silent': 1,
        'subsample': xgb_subsample,
        'learning_rate': xgb_learning_rate,
        'max_depth': xgb_max_depth,
        'gamma': xgb_gamma,
        'alpha': xgb_alpha,
        'nthread': 7,
        'min_child_weight': 1,
        'objective':'binary:logistic',
        'eval_metric':'auc'
    }

    num_models = len(layer1_oof_train_loaded['toxic'])
    #print('Stacking {} models'.format(num_models)) # number of models that will be stacked
    
    search_id = int(time.time())

    now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('time: %s, id: %d, th: %f, num_models: %d, colsample_bytree: %f, lr: %.7f,\
    max_depth: %d, subsample: %f, gamma: %d, alpha: %d, cv_num_round: %d, cv_nfolds: %d\
            '%(now,search_id,model_threshold,num_models,\
              xgb_colsample_bytree,xgb_learning_rate,\
              xgb_max_depth,xgb_subsample,xgb_gamma,\
              xgb_alpha,xgb_cv_num_round,xgb_cv_nfolds))
    

    result = np.empty((test.shape[0],len(label_cols)))
    metric_dict = {} # all labels
    best_rounds = {}  # all labels

    for i, label in enumerate(label_cols):
        assert train.shape == (159571, 27)
        x_train = combine_layer_oof_per_label(layer1_oof_train_loaded, label)
        x_test = combine_layer_oof_per_label(layer1_oof_test_loaded, label)

    #     clf = XGBClassifier()

    #     #scores = cross_val_score(clf, x_train, train[label], cv=3, scoring='roc_auc')

    #     #print(scores)
    #     #print("Stacking-CV: ROC: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    #     clf.fit(x_train, train[label])

    #     result[:, i] = clf.predict_proba(x_test)[:,1]

        dtrain = xgb.DMatrix(x_train, train[label]) # check if train is still in right shape
        dtest = xgb.DMatrix(x_test)

        def xg_eval_auc(yhat, dtrain):
            y = dtrain.get_label()
            return 'auc', roc_auc_score(y, yhat)

        res = xgb.cv(xgb_params, dtrain, num_boost_round=xgb_cv_num_round, nfold=xgb_cv_nfolds, seed=xgb_cv_seed, stratified=False,
                 early_stopping_rounds=25, verbose_eval=None, show_stdv=False, feval=xg_eval_auc, maximize=True)
        # early stopping is based on eavl on test fold. so check out the test-auc
        #pdb.set_trace()
        best_nrounds_for_current_label = res.shape[0] - 1
        #print(res[-3:])
        cv_mean = res.iloc[-1, 0]
        cv_std = res.iloc[-1, 1]

        #print('Ensemble-CV: {}: {}+{}'.format(label, cv_mean, cv_std))
        metric_dict[label] = cv_mean
        best_rounds[label] = best_nrounds_for_current_label
        #metric_dict[label]['cv_mean'] = cv_mean
        #metric_dict[label]['cv_std'] = cv_std
        gbdt = xgb.train(xgb_params, dtrain, best_nrounds_for_current_label)

        result[:,i] = gbdt.predict(dtest)#_proba(x_test)[:,1]

    #print('Stacking done')

    avg_auc = 0
    for label in label_cols:
        avg_auc += metric_dict[label]
    avg_auc/=6
          
    res = '%s,%d,%f,%d,%f,%.7f,%d,%f,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%.8f,%.8f,%.8f,%.8f,%.8f,%.8f,%.8f\n\
            '%(now,search_id,model_threshold,num_models,xgb_colsample_bytree,\
               xgb_learning_rate,xgb_max_depth,xgb_subsample,xgb_gamma,xgb_alpha,\
               xgb_cv_num_round,xgb_cv_nfolds,best_rounds['toxic'],best_rounds['severe_toxic'],\
               best_rounds['obscene'],best_rounds['threat'],best_rounds['insult'],\
               best_rounds['identity_hate'],metric_dict['toxic'],metric_dict['severe_toxic'],\
               metric_dict['obscene'],metric_dict['threat'],metric_dict['insult'],\
               metric_dict['identity_hate'],avg_auc)

    f = open('./xgb_search.csv', 'a')
    f.write(res)
    f.close()

#     sub_tile = 'stacking_test_'
#     submission = pd.read_csv(PATH + 'sample_submission.csv')#.head(1000)
#     submission[label_cols] = result
#     submission.to_csv('./StackPreds/' + sub_tile + str(search_id) + '.csv', index=False)

> <ipython-input-212-a449875801c1>(9)<module>()
-> model_threshold = np.random.choice([0.9803, 0.9794, 0.9793, 0.9786, 0.9774, 0.9768, 0.9765])
(Pdb) n
> <ipython-input-212-a449875801c1>(10)<module>()
-> layer1_oof_train_loaded, layer1_oof_test_loaded, base_layer_est_preds_loaded = base_layer_results_repo.get_results(threshold=model_threshold)
(Pdb) unt 19
> <ipython-input-212-a449875801c1>(20)<module>()
-> xgb_cv_seed = 0
(Pdb) model_threshold
0.9768
(Pdb) q


BdbQuit: 

# xgb random search top N training

In [207]:
xgb_search = pd.read_csv('xgb_search.csv').sort_values(by='avg_auc', ascending=False)

In [215]:
xgb_search.head(3)

Unnamed: 0,time,id,threshold,num_models,colsample_bytree,lr,max_depth,subsample,gamma,alpha,cv_num_round,cv_nfolds,toxic_auc,severe_toxic_auc,obscene_auc,threat_auc,insult_auc,identity_hate_auc,avg_auc
0,2018-03-01 10:23:13,1519899793,0.9768,10,0.7,0.092125,2,0.85,2,0,962,4,0.987677,0.991848,0.995462,0.994002,0.990178,0.991174,0.991723
1,2018-03-01 06:45:20,1519886720,0.9794,6,0.8,0.097664,4,0.68,1,0,995,3,0.987663,0.991693,0.995412,0.993793,0.990062,0.991191,0.991636
2,2018-03-01 08:14:47,1519892087,0.9794,6,0.5,0.098587,3,0.77,1,0,809,4,0.987629,0.991796,0.995422,0.99361,0.990133,0.991218,0.991635


In [219]:
for i in range(20):
    
    now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    search_id = xgb_search['id'].values[i]
    model_threshold = xgb_search['threshold'].values[i]
    num_models = xgb_search['num_models'].values[i]
    xgb_colsample_bytree = xgb_search['colsample_bytree'].values[i]
    xgb_learning_rate = xgb_search['lr'].values[i]
    xgb_max_depth = xgb_search['max_depth'].values[i]
    xgb_subsample = xgb_search['subsample'].values[i]
    xgb_gamma = xgb_search['gamma'].values[i]
    xgb_alpha = xgb_search['alpha'].values[i]
    xgb_cv_num_round = xgb_search['cv_num_round'].values[i]
    xgb_cv_nfolds = xgb_search['cv_nfolds'].values[i]
    
            
#     best_rounds = {}
#     best_rounds['toxic'] = xgb_search['toxic_best_round'].values[i]
#     best_rounds['severe_toxic'] = xgb_search['severe_toxic_best_round'].values[i]
#     best_rounds['obscene'] = xgb_search['obscene_best_round'].values[i]
#     best_rounds['threat'] = xgb_search['threat_best_round'].values[i]
#     best_rounds['insult'] = xgb_search['insult_best_round'].values[i]
#     best_rounds['identity_hate'] = xgb_search['identity_hate_best_round'].values[i]
    
    
    metric_dict_fromcsv = {}
    metric_dict_fromcsv['toxic'] = xgb_search['toxic_auc'].values[i]
    metric_dict_fromcsv['severe_toxic'] = xgb_search['severe_toxic_auc'].values[i]
    metric_dict_fromcsv['obscene'] = xgb_search['obscene_auc'].values[i]
    metric_dict_fromcsv['threat'] = xgb_search['threat_auc'].values[i]
    metric_dict_fromcsv['insult'] = xgb_search['insult_auc'].values[i]
    metric_dict_fromcsv['identity_hate'] = xgb_search['identity_hate_auc'].values[i]
    metric_dict_fromcsv['avg_auc'] = xgb_search['avg_auc'].values[i]

    
    layer1_oof_train_loaded, layer1_oof_test_loaded, base_layer_est_preds_loaded = base_layer_results_repo.get_results(threshold=model_threshold)
    gc.collect() 

    xgb_params = {
        'seed': 0,
        'colsample_bytree': xgb_colsample_bytree,
        'silent': 1,
        'subsample': xgb_subsample,
        'learning_rate': xgb_learning_rate,
        'max_depth': xgb_max_depth,
        'gamma': xgb_gamma,
        'alpha': xgb_alpha,
        'nthread': 7,
        'min_child_weight': 1,
        'objective':'binary:logistic',
        'eval_metric':'auc'
    }


    print('time: %s, id: %d, th: %f, num_models: %d, colsample_bytree: %f, lr: %.7f,\
    max_depth: %d, subsample: %f, gamma: %d, alpha: %d, cv_num_round: %d, cv_nfolds: %d\
        '%(now,search_id,model_threshold,num_models,\
          xgb_colsample_bytree,xgb_learning_rate,\
          xgb_max_depth,xgb_subsample,xgb_gamma,\
          xgb_alpha,xgb_cv_num_round,xgb_cv_nfolds))
        

    result = np.empty((test.shape[0],len(label_cols)))
    metric_dict = {}

    for i, label in enumerate(label_cols):
        assert train.shape == (159571, 27)
        x_train = combine_layer_oof_per_label(layer1_oof_train_loaded, label)
        x_test = combine_layer_oof_per_label(layer1_oof_test_loaded, label)

        dtrain = xgb.DMatrix(x_train, train[label]) # check if train is still in right shape
        dtest = xgb.DMatrix(x_test)

        def xg_eval_auc(yhat, dtrain):
            y = dtrain.get_label()
            return 'auc', roc_auc_score(y, yhat)

        res = xgb.cv(xgb_params, dtrain, num_boost_round=xgb_cv_num_round, nfold=xgb_cv_nfolds, seed=xgb_cv_seed, stratified=False,
                 early_stopping_rounds=25, verbose_eval=None, show_stdv=False, feval=xg_eval_auc, maximize=True)
        # early stopping is based on eavl on test fold. so check out the test-auc
        #pdb.set_trace()
        best_nrounds = res.shape[0] - 1
        #print(res[-3:])
        cv_mean = res.iloc[-1, 0]
        cv_std = res.iloc[-1, 1]
        
        #######################################################################
        #######################################################################
        #######################################################################
        #########################     best_nrounds    #########################
        #######################################################################
        #######################################################################
        #######################################################################

        
        metric_dict[label] = cv_mean
        print('XGB top N training (id: {}). {}: \t cv_mean:{} \t cv_mean_fromcsv {} \t best nrounds: {}\
        '.format(search_id, label, cv_mean, metric_dict_fromcsv[label], best_nrounds))
        
        gbdt = xgb.train(xgb_params, dtrain, best_nrounds)

        result[:,i] = gbdt.predict(dtest)#_proba(x_test)[:,1]

    #print('Stacking done')

    avg_auc = 0
    for label in label_cols:
        avg_auc += metric_dict[label]
    avg_auc/=6
    
    print('XGB top N training. avg_auc:{} \t avg_auc_fromcsv {}'.format(avg_auc, metric_dict_fromcsv['avg_auc']))


    sub_tile = 'xgb_topn_'
    submission = pd.read_csv(PATH + 'sample_submission.csv')#.head(1000)
    submission[label_cols] = result
    submission.to_csv('./StackPreds/TopN_XGB/{}_{}_{}.csv'.format(sub_title,search_id,metric_dict_fromcsv['avg_auc']), index=False)
        
        
#     val_auc = para['val_auc'].values[i]
#     print('Model training done. Validation AUC: %.5f'%val_auc)

    
#     test_flow = dataGenerator.flow(test_embeddings + test_genre, [test_context], \
#             batch_size=16384, shuffle=False)
#     test_pred = model.predict_generator(test_flow, test_flow.__len__(), workers=1)
    
#     test_sub = pd.DataFrame({'id': test_id, 'target': test_pred.ravel()})
#     test_sub.to_csv('./temp_nn/nn_%.5f_%.5f_%d.csv'%(val_auc, train_loss, flag), index=False)

time: 2018-03-01 16:22:48, id: 1519899793, th: 0.976800, num_models: 10, colsample_bytree: 0.700000, lr: 0.0921252,    max_depth: 2, subsample: 0.850000, gamma: 2, alpha: 0, cv_num_round: 962, cv_nfolds: 4        
XGB top N training (id: 1519899793). toxic: 	 cv_mean:0.9876772500000001 	 cv_mean_fromcsv 0.98767725 	 best nrounds: 165        
XGB top N training (id: 1519899793). severe_toxic: 	 cv_mean:0.9918480000000001 	 cv_mean_fromcsv 0.991848 	 best nrounds: 73        
XGB top N training (id: 1519899793). obscene: 	 cv_mean:0.995462 	 cv_mean_fromcsv 0.9954620000000001 	 best nrounds: 117        
XGB top N training (id: 1519899793). threat: 	 cv_mean:0.994002 	 cv_mean_fromcsv 0.994002 	 best nrounds: 136        
XGB top N training (id: 1519899793). insult: 	 cv_mean:0.99017775 	 cv_mean_fromcsv 0.99017775 	 best nrounds: 100        
XGB top N training (id: 1519899793). identity_hate: 	 cv_mean:0.9911735 	 cv_mean_fromcsv 0.9911735 	 best nrounds: 107        
XGB top N training. av

XGB top N training (id: 1519889074). insult: 	 cv_mean:0.9901372500000001 	 cv_mean_fromcsv 0.99013725 	 best nrounds: 156        
XGB top N training (id: 1519889074). identity_hate: 	 cv_mean:0.9912777500000001 	 cv_mean_fromcsv 0.99127775 	 best nrounds: 106        
XGB top N training. avg_auc:0.9915545833333334 	 avg_auc_fromcsv 0.99155458
time: 2018-03-01 16:38:22, id: 1519890027, th: 0.980300, num_models: 5, colsample_bytree: 0.600000, lr: 0.0465127,    max_depth: 2, subsample: 0.620000, gamma: 0, alpha: 0, cv_num_round: 798, cv_nfolds: 3        
XGB top N training (id: 1519890027). toxic: 	 cv_mean:0.9874896666666667 	 cv_mean_fromcsv 0.98748967 	 best nrounds: 158        
XGB top N training (id: 1519890027). severe_toxic: 	 cv_mean:0.9917513333333333 	 cv_mean_fromcsv 0.99175133 	 best nrounds: 177        
XGB top N training (id: 1519890027). obscene: 	 cv_mean:0.9953436666666667 	 cv_mean_fromcsv 0.99534367 	 best nrounds: 118        
XGB top N training (id: 1519890027). threat

XGB top N training (id: 1519891515). obscene: 	 cv_mean:0.9954445 	 cv_mean_fromcsv 0.9954445 	 best nrounds: 367        
XGB top N training (id: 1519891515). threat: 	 cv_mean:0.9925642499999999 	 cv_mean_fromcsv 0.99256425 	 best nrounds: 202        
XGB top N training (id: 1519891515). insult: 	 cv_mean:0.9901135 	 cv_mean_fromcsv 0.9901135 	 best nrounds: 229        
XGB top N training (id: 1519891515). identity_hate: 	 cv_mean:0.99126925 	 cv_mean_fromcsv 0.99126925 	 best nrounds: 214        
XGB top N training. avg_auc:0.9913044583333334 	 avg_auc_fromcsv 0.99130446
time: 2018-03-01 17:04:04, id: 1519884412, th: 0.980300, num_models: 5, colsample_bytree: 0.800000, lr: 0.0515072,    max_depth: 4, subsample: 0.520000, gamma: 2, alpha: 1, cv_num_round: 634, cv_nfolds: 4        
XGB top N training (id: 1519884412). toxic: 	 cv_mean:0.9876025 	 cv_mean_fromcsv 0.9876024999999999 	 best nrounds: 177        
XGB top N training (id: 1519884412). severe_toxic: 	 cv_mean:0.99180575 	 cv_m

In [None]:
submission.head()

import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
%matplotlib inline

preds_mats = []
for label_idx, label in enumerate(label_cols):
    preds_list = []
    print(label)
    for i, (key, value) in enumerate(base_layer_est_preds_loaded.items()):
        preds_list.append(value[:,label_idx].reshape(-1,1))
    preds_mats.append(np.hstack(preds_list))
    
    
assert len(preds_mats) == len(label_cols)
assert preds_mats[0].shape[1] == (len(base_layer_est_preds_loaded))

# analyze the first label in scatter matrix
temp = pd.DataFrame(preds_mats[0], columns=[key.split('_')[0] + re.sub('\D','',key) for key in base_layer_est_preds_loaded.keys()])
print(temp.head(3))

scatter_matrix(temp, figsize=(14,14))

In [None]:
def hyperparamstune(clf, x, y, label_cols, cv=5, scoring='roc_auc'):
    

In [None]:
list(base_layer_est_preds)

model_data_id_list = ['ModelName.RNN_rnn_data_001',
'ModelName.NBSVM_tfidf_word_df2_ng(1, 2)_wmf200000',
'ModelName.NBSVM_tfidf_word_df2_ng(1, 1)_wmf200000',
'ModelName.NBSVM_tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000',
'ModelName.LGB_tfidf_word_df2_ng(1, 2)_wmf200000',
'ModelName.LGB_tfidf_word_df2_ng(1, 1)_wmf200000',
'ModelName.LGB_tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000',
'ModelName.NBLSVC_tfidf_word_df2_ng(1, 2)_wmf200000',
'ModelName.NBLSVC_tfidf_word_df2_ng(1, 1)_wmf200000',
'ModelName.NBLSVC_tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000',
'ModelName.LOGREG_tfidf_word_df2_ng(1, 2)_wmf200000',
'ModelName.LOGREG_tfidf_word_df2_ng(1, 1)_wmf200000',
'ModelName.LOGREG_tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000']

base_layer_results_repo.add_score('ModelName.NBLSVC_tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000', 0.9815)

base_layer_results_repo.add_score('ModelName.NBLSVC_tfidf_word_df2_ng(1, 2)_wmf200000', 0.9765)

base_layer_results_repo.add_score('ModelName.NBLSVC_tfidf_word_df2_ng(1, 1)_wmf200000', 0.9774)

base_layer_results_repo.add_score('ModelName.LOGREG_tfidf_word_df2_ng(1, 1)_wmf200000', 0.9760)

base_layer_results_repo.add_score('ModelName.LGB_tfidf_word_df2_ng(1, 1)_wmf200000', 0.9723)

base_layer_results_repo.add_score('ModelName.NBLSVC_tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000', 0.9815)

base_layer_results_repo.add_score('ModelName.NBLSVC_tfidf_word_df2_ng(1, 2)_wmf200000', 0.9765)

base_layer_results_repo.add_score('ModelName.LOGREG_tfidf_word_df2_ng(1, 2)_wmf200000', 0.9752)

base_layer_results_repo.add_score('ModelName.LGB_tfidf_word_df2_ng(1, 2)_wmf200000', 0.9726)

base_layer_results_repo.add_score('ModelName.NBSVM_tfidf_word_df2_ng(1, 2)_wmf200000', 0.9761)

base_layer_results_repo.add_score('ModelName.NBSVM_tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000', 0.9803)

base_layer_results_repo.add_score('ModelName.LOGREG_tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000', 0.9793)

base_layer_results_repo.add_score('ModelName.LGB_tfidf_wordchar_charmaxdf0.300000_ng(1, 2)_wmf100000_cmf100000', 0.9794)

base_layer_results_repo.add_score('ModelName.NBSVM_tfidf_word_df2_ng(1, 1)_wmf200000', 0.9768)

# Put in our parameters for said classifiers
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters 
svc_params = {
    'kernel' : 'linear',
    'C' : 0.025
}