In [32]:
from abc import ABC, abstractmethod

import lightgbm as lgb
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack, vstack
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

from enum import Enum
class ModelName(Enum):
    """Integer identifiers for model types.

    The NB* members presumably denote the naive-Bayes-feature variant of the
    model without the prefix (TODO confirm); nothing in this notebook reads
    these values.
    """
    XGB = 1
    NBXGB = 2
    LGB = 3
    NBLGB = 4
    LOGREG = 5
    NBSVM = 6 # NBLOGREG
    LSVC = 7
    NBLSVC = 8
    RF = 9 # random forest
    RNN = 10
    ONESVC = 11
    ONELOGREG = 12


class BaseLayerEstimator(ABC):
    """Abstract base for layer estimators; also hosts the naive-Bayes helpers."""

    def _pr(self, y_i, y, train_features):
        """Smoothed per-feature total over the rows whose label equals ``y_i``.

        Returns ``(column sums of the selected rows + 1) / (row count + 1)``.
        """
        row_mask = np.array(y == y_i)
        feature_totals = train_features[row_mask].sum(0)
        return (feature_totals + 1) / (row_mask.sum() + 1)

    def _nb(self, x_train, y_train):
        """Compute the log ratio ``log(_pr(1) / _pr(0))`` for every label column.

        Params:
            x_train: 2-d feature matrix, row-aligned with ``y_train``
            y_train: pd dataframe with one 0/1 column per label
        Returns:
            dict mapping each column name to its per-feature log ratio
        """
        assert isinstance(y_train, pd.DataFrame)
        log_ratios = {}
        for col in y_train.columns:
            print('calculating naive bayes for {}'.format(col))
            labels = y_train[col]
            positive = self._pr(1, labels.values, x_train)
            negative = self._pr(0, labels, x_train)
            log_ratios[col] = np.log(positive / negative)
        return log_ratios

    @abstractmethod
    def train(self, x_train, y_train):
        """
        Fit the estimator; must be implemented by subclasses.

        Params:
            x_train: np array
            y_train: pd series
        """

    @abstractmethod
    def predict(self, x_train):
        """Return predictions for the given feature matrix; must be implemented by subclasses."""
    
    
class LightgbmBLE(BaseLayerEstimator):
    """LightGBM layer estimator with optional naive-Bayes feature scaling.

    When ``nb`` is enabled, the per-label log ratios are computed once in the
    constructor, and the feature matrix is multiplied by the ratio of the
    requested label before every ``train``/``predict`` call.
    """

    def __init__(self, x_train, y_train, label_cols= None, params=None, nb=True, seed=0):
        """
        constructor:

            x_train: should be a np/scipy/ 2-d array or matrix. only be used when nb is true
            y_train: should be a dataframe. only be used when nb is true
            label_cols: (list) if y_train contains multiple labels, provide the list of label names
                e.g.: label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
                if None, labels are validated against the columns of y_train instead
            params: (dict) parameters handed to lgb.train
            nb: (boolean) compute naive bayes or not. (helpful for unbalanced data)
            seed: (int) random state of the train/validation split made in train()

        Example:
            ll = LightgbmBLE(train_tfidf, train[label_cols], label_cols=label_cols, params=params, nb=True)
            result = pd.DataFrame()
            for col in label_cols:
                    print(col)
                    ll.train(train_tfidf, train[col], col)
                    result[col] = ll.predict(test_tfidf, col)
        """
        #### check naive bayes
        if nb:
            print('Naive Bayes is enabled')
            self.r = self._nb(x_train, y_train)
        else:
            print('Naive Bayes is disabled')
            self.r = None
        ##### set values
        self.nb = nb
        self.seed = seed
        # No model exists until train() has been called; predict() checks this.
        self.model = None
        self.set_params(params)
        self.label_cols = label_cols
        print('LightgbmBLE is initialized')

    def set_params(self, params):
        """Replace the parameter dict that the next train() call passes to lgb.train."""
        self.params = params

    def _pre_process(self, x, y, label=None):
        """Apply the naive-Bayes scaling (if enabled) and unwrap ``y``.

        Params:
            x: feature matrix. When nb is enabled it must provide ``.multiply``
               (i.e. a scipy sparse matrix).
            y: pd series, array or None
            label: (str) label whose log ratio is applied; required when nb is enabled
        Returns:
            tuple (x, y) where a pd series ``y`` has been replaced by its values
        Raises:
            ValueError: nb is enabled and label is None or unknown
        """
        if self.nb:
            if label is None:
                raise ValueError('Naive Bayes is enabled. label cannot be None.')
            # label_cols defaults to None; fall back to the labels for which a
            # ratio was actually computed instead of failing on `in None`.
            known_labels = self.label_cols if self.label_cols is not None else self.r
            if label not in known_labels:
                raise ValueError('Label not in label_cols')
            print('apply naive bayes to feature set')
            x = x.multiply(self.r[label])
            # The product is not necessarily CSR; the original check was
            # inverted and only "converted" matrices that were CSR already.
            if not isinstance(x, csr_matrix) and hasattr(x, 'tocsr'):
                x = x.tocsr()
        if isinstance(y, pd.Series):
            y = y.values
        return (x, y)

    def train(self, x_train, y_train, label=None, valid_set_percent=0):
        """
        Fit a LightGBM model and store it in self.model.

        Params:
            x_train: np/scipy/ 2-d array or matrix (scipy sparse when nb is enabled)
            y_train: pd series or 1-d array with the target of a single label
            label: (str) if not none, then it's one of the labels in the label_cols
                    if nb is set to True when initializing, then label can not be None
            valid_set_percent: (float, 0 to 1).
                    0: no validation set. (impossible to use early stopping)
                    1: use training set as validation set (to check underfitting, and early stopping)
                    >0 and <1: use a portion of training set as validation set. (to check overfitting, and early stopping)
                        The split uses the seed given to the constructor.
        Raises:
            ValueError: valid_set_percent is outside [0, 1], or the label is invalid
        """
        # Validate before the (potentially expensive) feature scaling.
        if valid_set_percent > 1 or valid_set_percent < 0:
            raise ValueError('valid_set_percent must >= 0 and <= 1')

        x, y = self._pre_process(x_train, y_train, label)

        if valid_set_percent == 0:
            print('No evaluation set, thus not possible to use early stopping. Please train with your best params.')
            self.model = lgb.train(self.params, lgb.Dataset(x, y))
            return

        if valid_set_percent == 1:
            lgb_train = lgb.Dataset(x, y)
            print('Evaluating using training set')
            self.model = lgb.train(self.params, lgb_train, valid_sets=[lgb_train])
            return

        x, x_val, y, y_val = train_test_split(
            x, y, test_size=valid_set_percent, random_state=self.seed)
        lgb_train = lgb.Dataset(x, y)
        lgb_val = lgb.Dataset(x_val, y_val)
        print('Evaluating using validation set ({}% of training set)'.format(valid_set_percent*100))
        self.model = lgb.train(self.params, lgb_train, valid_sets=[lgb_val])

    def predict(self, x_test, label=None):
        """
        Predict with the model fitted by the last train() call.

        Params:
            x_test: feature matrix with the same columns as the training matrix
            label: (str) required when nb is enabled (selects the log ratio to apply)
        Returns:
            whatever self.model.predict returns for the pre-processed matrix
        Raises:
            RuntimeError: train() has not been called yet
            ValueError: nb is enabled and the label is invalid
        """
        if getattr(self, 'model', None) is None:
            raise RuntimeError('train() must be called before predict()')
        x, _ = self._pre_process(x_test, y=None, label=label)
        print('starting predicting')
        # best_iteration may be None/0/negative when no early stopping took
        # place; only a positive value restricts the number of iterations.
        best_iteration = self.model.best_iteration or 0
        if best_iteration > 0:
            print('best_iteration {} is chosen.'.format(best_iteration))
            result = self.model.predict(x, num_iteration=best_iteration)
        else:
            result = self.model.predict(x)
        print('predicting done')
        return result
       

In [3]:
from tfidf_data import tfidf_data_process

In [56]:
# Build the train/test feature matrices with the project helper from tfidf_data.
# The arguments select word n-grams of length 1-3 and char n-grams of length 2-5;
# word_max/char_max are presumably the vocabulary caps (100000 + 200000 matches
# the 300000 columns reported below) — confirm in tfidf_data.
x_train, y_train, x_test, data_id = tfidf_data_process(word_ngram=(1,3), word_max=100000, char_ngram=(2, 5), char_max=200000)

loading data done!
fitting word
transforming train word
transforming test word
fitting char
transforming train char
transforming test char


In [57]:
# Sanity check: dimensions of the training feature matrix.
x_train.shape

(159571, 300000)

In [58]:
def _binary_auc_params(bagging_fraction, feature_fraction, lambda_l1, lambda_l2,
                       learning_rate, max_depth, num_iterations, num_leaves):
    """Build one LightGBM parameter dict.

    Only the arguments differ between labels; objective, metric, thread count,
    bagging frequency and is_unbalance are shared by every label.
    """
    return {
        'objective': 'binary',
        'metric': 'auc',
        'num_threads': 8,
        'bagging_freq': 1,
        'bagging_fraction': bagging_fraction,
        'feature_fraction': feature_fraction,
        'lambda_l1': lambda_l1,
        'lambda_l2': lambda_l2,
        'learning_rate': learning_rate,
        'max_depth': max_depth,
        'num_iterations': num_iterations,
        'num_leaves': num_leaves,
        'is_unbalance': False
    }


# LightGBM parameters for each label.
lgb_params_per_label = {
    'toxic': _binary_auc_params(
        bagging_fraction=0.9, feature_fraction=0.6, lambda_l1=0.0, lambda_l2=0.0,
        learning_rate=0.1, max_depth=-1, num_iterations=219, num_leaves=61),
    'severe_toxic': _binary_auc_params(
        bagging_fraction=0.7, feature_fraction=0.6, lambda_l1=0.5, lambda_l2=0.0,
        learning_rate=0.05, max_depth=5, num_iterations=322, num_leaves=11),
    'obscene': _binary_auc_params(
        bagging_fraction=0.7, feature_fraction=0.8, lambda_l1=0.0, lambda_l2=0.0,
        learning_rate=0.05, max_depth=-1, num_iterations=274, num_leaves=61),
    'threat': _binary_auc_params(
        bagging_fraction=0.7, feature_fraction=0.8, lambda_l1=0.5, lambda_l2=0.0,
        learning_rate=0.05, max_depth=-1, num_iterations=208, num_leaves=11),
    'insult': _binary_auc_params(
        bagging_fraction=0.8, feature_fraction=0.6, lambda_l1=0.0, lambda_l2=0.5,
        learning_rate=0.05, max_depth=-1, num_iterations=454, num_leaves=11),
    'identity_hate': _binary_auc_params(
        bagging_fraction=0.7, feature_fraction=0.6, lambda_l1=0.0, lambda_l2=0.0,
        learning_rate=0.05, max_depth=-1, num_iterations=191, num_leaves=61),
}

# A commented-out parameter set was left at the end of this cell (apparently an
# earlier experiment): is_unbalance=True, early_stopping_round=25, max_depth=-1,
# num_boost_round=3000, application='binary', num_leaves=63, verbosity=10,
# metric='auc', data_random_seed=2, bagging_fraction=1, feature_fraction=0.6,
# nthread=4 (learning_rate=0.05 and lambda_l1=lambda_l2=1 were disabled in it).


In [59]:
# Target columns, in the order used for the columns of the prediction matrix.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# nb=True makes the constructor compute the naive-Bayes log ratios for every
# column of y_train up front (see the "calculating naive bayes for ..." output).
# No params are passed here; they are set per label before training.
lgb_ble = LightgbmBLE(x_train, y_train, label_cols=label_cols, nb=True)

Naive Bayes is enabled
calculating naive bayes for toxic
calculating naive bayes for severe_toxic
calculating naive bayes for obscene
calculating naive bayes for threat
calculating naive bayes for insult
calculating naive bayes for identity_hate
LightgbmBLE is initialized


In [60]:
# Pre-allocate the prediction matrix: one row per test sample, one column per label.
preds = np.zeros((x_test.shape[0], len(label_cols)))

In [61]:
# Sanity check: (number of test rows, number of labels).
preds.shape

(153164, 6)

In [62]:
# Fit one model per label with that label's parameters and write its test-set
# predictions into the matching column of `preds`. The estimator is reused, so
# only the model of the last label is still held by `lgb_ble` afterwards.
for i in range(len(label_cols)):
    label = label_cols[i]
    label_params = lgb_params_per_label[label]
    label_target = y_train[label].values
    lgb_ble.set_params(label_params)
    lgb_ble.train(x_train, label_target, label)
    preds[:, i] = lgb_ble.predict(x_test, label)

apply naive bayes to feature set
No evaluation set, thus not possible to use early stopping. Please train with your best params.




apply naive bayes to feature set
starting predicting




predicting done
apply naive bayes to feature set
No evaluation set, thus not possible to use early stopping. Please train with your best params.
apply naive bayes to feature set
starting predicting
predicting done
apply naive bayes to feature set
No evaluation set, thus not possible to use early stopping. Please train with your best params.
apply naive bayes to feature set
starting predicting
predicting done
apply naive bayes to feature set
No evaluation set, thus not possible to use early stopping. Please train with your best params.
apply naive bayes to feature set
starting predicting
predicting done
apply naive bayes to feature set
No evaluation set, thus not possible to use early stopping. Please train with your best params.
apply naive bayes to feature set
starting predicting
predicting done
apply naive bayes to feature set
No evaluation set, thus not possible to use early stopping. Please train with your best params.
apply naive bayes to feature set
starting predicting
predicting

In [63]:
import os
import time

PATH = '~/data/toxic/data/'
OUTPUT_DIR = './BaseEstPreds/'

# Start from the sample submission so the id column and row order are kept.
submission = pd.read_csv(PATH + 'sample_submission.csv')
# The columns of `preds` were filled in label_cols order, so assign through the
# same list instead of a second hand-written copy that could drift out of sync.
submission[label_cols] = preds

# The Unix timestamp makes every submission file name unique.
sub_id = int(time.time())
print(sub_id)

# Create the output directory if needed; otherwise to_csv fails only after the
# whole training run has finished.
os.makedirs(OUTPUT_DIR, exist_ok=True)
submission.to_csv(OUTPUT_DIR + 'test123_' + str(sub_id) + '.csv', index=False)

1520650080


In [64]:
# Summary statistics of the predicted values for each label.
submission[label_cols].describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,153164.0,153164.0,153164.0,153164.0,153164.0,153164.0
mean,0.203953,0.017368,0.121976,0.003625,0.095111,0.013132
std,0.352209,0.074662,0.286152,0.037611,0.219306,0.081633
min,0.000102,5.4e-05,0.000103,8.1e-05,0.000374,8.6e-05
25%,0.002085,0.000337,0.000739,0.000234,0.002915,0.00026
50%,0.007873,0.000697,0.001715,0.0004,0.006668,0.000467
75%,0.191338,0.002289,0.014694,0.00078,0.032832,0.001296
max,1.0,0.967314,0.999962,0.994893,0.999849,0.991052


In [12]:
# Re-train the 'toxic' model while holding out 10% of the training rows for
# validation. The label is passed explicitly: an estimator built with nb=True
# raises ValueError when label is None, and the argument is ignored when
# naive Bayes is disabled.
lgb_ble.train(x_train, y_train['toxic'].values, label='toxic', valid_set_percent=0.1)



[1]	valid_0's auc: 0.870293
Training until validation scores don't improve for 25 rounds.
[2]	valid_0's auc: 0.879855
[3]	valid_0's auc: 0.890069
[4]	valid_0's auc: 0.895269
[5]	valid_0's auc: 0.900742
[6]	valid_0's auc: 0.904055
[7]	valid_0's auc: 0.90557
[8]	valid_0's auc: 0.908777
[9]	valid_0's auc: 0.911911
[10]	valid_0's auc: 0.912996
[11]	valid_0's auc: 0.916038
[12]	valid_0's auc: 0.918541
[13]	valid_0's auc: 0.921163
[14]	valid_0's auc: 0.922745
[15]	valid_0's auc: 0.925085
[16]	valid_0's auc: 0.926786
[17]	valid_0's auc: 0.927603
[18]	valid_0's auc: 0.931456
[19]	valid_0's auc: 0.932031
[20]	valid_0's auc: 0.934025
[21]	valid_0's auc: 0.93553
[22]	valid_0's auc: 0.936878
[23]	valid_0's auc: 0.938081
[24]	valid_0's auc: 0.93911
[25]	valid_0's auc: 0.940473
[26]	valid_0's auc: 0.941336
[27]	valid_0's auc: 0.942148
[28]	valid_0's auc: 0.942864
[29]	valid_0's auc: 0.943854
[30]	valid_0's auc: 0.944219
[31]	valid_0's auc: 0.945139
[32]	valid_0's auc: 0.947068
[33]	valid_0's auc: 0.

[277]	valid_0's auc: 0.976481
[278]	valid_0's auc: 0.976518
[279]	valid_0's auc: 0.976525
[280]	valid_0's auc: 0.976551
[281]	valid_0's auc: 0.976569
[282]	valid_0's auc: 0.976599
Early stopping, best iteration is:
[257]	valid_0's auc: 0.976635


In [13]:
# Sanity check: the test matrix has the same number of columns as x_train.
x_test.shape

(153164, 300000)

In [15]:
# Predict 'toxic' with the model trained in the validation run above. The label
# is passed explicitly because an estimator built with nb=True raises ValueError
# when it is None (it is ignored when naive Bayes is disabled).
# NOTE: this rebinds `preds` from the per-label matrix to a 1-d vector.
preds = lgb_ble.predict(x_test, label='toxic')

starting predicting
predicting done


In [17]:
# Persist the predictions; np.save appends the '.npy' extension to the name.
np.save('lgbtoxicpred', preds)

In [20]:
# Read the saved predictions back to verify the round trip.
preds_load = np.load('lgbtoxicpred.npy')

In [21]:
# Sanity check: one prediction per test row.
preds_load.shape

(153164,)

In [1]:
import lightgbm as lgb

In [None]:
# NOTE(review): scratch cell that was never executed (its prompt is `In [None]`).
# Every other lgb.train call in this notebook passes params and a training
# Dataset, so this bare call will presumably raise — remove before sharing.
lgb.train()

In [4]:
def _search_dim(name, kind, low, high):
    """Describe one tunable hyper-parameter: its name, value type and inclusive bounds."""
    return dict(name=name, type=kind, bounds=dict(min=low, max=high))


# Search space for hyper-parameter tuning: one entry per parameter.
parameters = [
    _search_dim("max_bin", "int", 20, 20000),
    _search_dim("learning_rate", "double", 0.001, 0.3),
    _search_dim("num_leaves", "int", 100, 4095),
    # alternative upper bound that was tried for num_leaves: max=45000
    _search_dim("scale_pos_weight", "double", 0.01, 2000.0),
    _search_dim("n_estimators", "int", 10, 10000),
    _search_dim("min_child_weight", "int", 1, 2000),
    _search_dim("subsample", "double", 0.4, 1),
    _search_dim("bagging_fraction", "double", 0.3, 1),
    _search_dim("max_depth", "int", 2, 50),
]
# static_parameters = {'boosting_type': 'dart', 'reg_alpha': 0, 'reg_lambda': 2, 'is_unbalance': True,
#                              'min_split_gain': 0, 'min_child_samples': 10, 'colsample_bytree': 0.8, 'subsample_freq': 3,
#                              'subsample_for_bin': 50000,
#                              'histogram_pool_size': detect_available_memory_for_histogram_cache()}

In [5]:
# Display the search-space definition.
parameters

[{'bounds': {'max': 20000, 'min': 20}, 'name': 'max_bin', 'type': 'int'},
 {'bounds': {'max': 0.3, 'min': 0.001},
  'name': 'learning_rate',
  'type': 'double'},
 {'bounds': {'max': 4095, 'min': 100}, 'name': 'num_leaves', 'type': 'int'},
 {'bounds': {'max': 2000.0, 'min': 0.01},
  'name': 'scale_pos_weight',
  'type': 'double'},
 {'bounds': {'max': 10000, 'min': 10}, 'name': 'n_estimators', 'type': 'int'},
 {'bounds': {'max': 2000, 'min': 1},
  'name': 'min_child_weight',
  'type': 'int'},
 {'bounds': {'max': 1, 'min': 0.4}, 'name': 'subsample', 'type': 'double'},
 {'bounds': {'max': 1, 'min': 0.3},
  'name': 'bagging_fraction',
  'type': 'double'},
 {'bounds': {'max': 50, 'min': 2}, 'name': 'max_depth', 'type': 'int'}]