In [88]:
import os
import time
from abc import ABC, abstractmethod

import lightgbm as lgb
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack, vstack
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

from enum import Enum
class ModelName(Enum):
    """Identifiers for the model families a base-layer estimator can wrap.

    Only LOGREG and LSVC are consumed by the code visible in this notebook
    (they select the estimator inside LogRegAndLsvcBLE); the remaining members
    are presumably used elsewhere -- verify before removing any of them.
    The "NB" prefix presumably marks the naive-Bayes-scaled variant of a model.
    """
    XGB = 1
    NBXGB = 2
    LGB = 3
    NBLGB = 4
    LOGREG = 5
    NBSVM = 6 # NBLOGREG
    LSVC = 7
    NBLSVC = 8
    RF = 9 # random forest
    RNN = 10
    ONESVC = 11
    ONELOGREG = 12


class BaseLayerEstimator(ABC):
    """Abstract base for first-layer estimators.

    Subclasses implement ``train`` and ``predict``. The base class provides the
    naive-Bayes log-count-ratio helpers used to rescale features per label.
    """

    def _pr(self, y_i, y, train_features):
        """Smoothed per-feature sum over the rows whose label equals ``y_i``.

        Params:
            y_i: label value selecting the rows (1 or 0 in this file)
            y: array-like of labels, one per row of ``train_features``
            train_features: 2-d feature matrix supporting boolean row indexing

        Returns:
            (column sums of the selected rows + 1) / (number of selected rows + 1)
        """
        row_mask = np.array(y == y_i)
        feature_totals = train_features[row_mask].sum(0)
        return (feature_totals + 1) / (row_mask.sum() + 1)

    def _nb(self, x_train, y_train):
        """Compute the naive-Bayes log-count ratio for every label column.

        Params:
            x_train: 2-d feature matrix
            y_train: pd.DataFrame with one column per label

        Returns:
            dict mapping each column name to log(_pr(1) / _pr(0)) for that column
        """
        assert isinstance(y_train, pd.DataFrame)
        ratios = {}
        for col in y_train.columns:
            print('calculating naive bayes for {}'.format(col))
            positive = self._pr(1, y_train[col].values, x_train)
            negative = self._pr(0, y_train[col], x_train)
            ratios[col] = np.log(positive / negative)
        return ratios

    @abstractmethod
    def train(self, x_train, y_train):
        """
        Fit the estimator.

        Params:
            x_train: np array
            y_train: pd series
        """

    @abstractmethod
    def predict(self, x_train):
        """Return predictions for the given feature matrix."""
    
    
    
class LogRegAndLsvcBLE(BaseLayerEstimator):
    """Base-layer estimator backed by LogisticRegression or a calibrated LinearSVC.

    In LSVC mode the LinearSVC is wrapped in CalibratedClassifierCV so that
    ``predict`` can return probabilities in both modes.
    """

    def __init__(self, mode=ModelName.LOGREG, seed=0, params=None):
        """
        Params:
            mode: ModelName.LOGREG or ModelName.LSVC
            seed: (int) stored as ``random_state`` in the estimator parameters
            params: (dict) keyword arguments for the underlying sklearn estimator.
                None means "no extra arguments". The dict is copied, so the
                caller's object is never modified.

        Raises:
            ValueError: if mode is neither ModelName.LOGREG nor ModelName.LSVC
        """
        if mode not in (ModelName.LOGREG, ModelName.LSVC):
            raise ValueError('Invalid mode. Valid modes: ModelName.LOGREG and ModelName.LSVC')
        self._mode = mode
        # Work on a private copy: writing random_state into the caller's dict
        # would silently alter shared configuration, and params=None must not crash.
        self._params = dict(params) if params is not None else {}
        self._params['random_state'] = seed
        self._clf = None  # set by train()

    def predict(self, x):
        """Return the predicted probability of the positive class for each row of x.

        Raises:
            RuntimeError: if train() has not been called yet
        """
        if self._clf is None:
            raise RuntimeError('train() must be called before predict()')
        return self._clf.predict_proba(x)[:, 1]  # chance of being 1 ([:,0] chance of being 0)

    def train(self, x_train, y_train):
        """Fit the estimator selected by ``mode`` on (x_train, y_train)."""
        if self._mode == ModelName.LOGREG:
            self._clf = LogisticRegression(**self._params).fit(x_train, y_train)
        elif self._mode == ModelName.LSVC:
            self._clf = CalibratedClassifierCV(LinearSVC(**self._params)).fit(x_train, y_train)

    def feature_importance(self):
        """Return the fitted linear coefficients as a feature-importance proxy.

        LOGREG mode returns ``coef_`` of the fitted model. LSVC mode returns the
        mean of the ``coef_`` arrays of the per-fold LinearSVC models held by
        CalibratedClassifierCV.

        Raises:
            RuntimeError: if train() has not been called yet
        """
        if self._clf is None:
            raise RuntimeError('train() must be called before feature_importance()')
        if hasattr(self._clf, 'coef_'):
            return self._clf.coef_
        fold_coefs = []
        for calibrated in self._clf.calibrated_classifiers_:
            # NOTE(review): the fitted estimator is exposed as `estimator` in
            # recent scikit-learn and as `base_estimator` in older releases --
            # confirm against the installed version.
            fitted = getattr(calibrated, 'estimator', None)
            if fitted is None:
                fitted = getattr(calibrated, 'base_estimator')
            fold_coefs.append(fitted.coef_)
        return np.mean(fold_coefs, axis=0)
    
    
    
class LightgbmBLE(BaseLayerEstimator):
    """LightGBM base-layer estimator with optional naive-Bayes feature scaling.

    When ``nb`` is enabled, the per-label log-count ratios are computed once in
    the constructor and every feature matrix passed to ``train``/``predict`` is
    multiplied by the ratio of the requested label before reaching LightGBM.
    """

    def __init__(self, x_train, y_train, label_cols=None, params=None, nb=True, seed=0):
        """
        constructor:

            x_train: should be a np/scipy/ 2-d array or matrix. only be used when nb is true
            y_train: should be a dataframe
            label_cols: (list) if y_train contains multiple labels, provide the list of label names
                e.g.: label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
                If omitted while nb is true, the columns of y_train are used.
            params: (dict) LightGBM parameters; may also be supplied later via set_params()
            nb: (boolean) compute naive bayes or not. (helpful for unbalanced data)
            seed: (int) random_state of the train/validation split made by train()

        Example:
            ll = LightgbmBLE(train_tfidf, train[label_cols], params=params, nb=True)
            result = pd.DataFrame()
            for col in label_cols:
                    print(col)
                    ll.train(train_tfidf, train[col], col)
                    result[col] = ll.predict(test_tfidf, col)
        """
        #### check naive bayes
        if nb:
            print('Naive Bayes is enabled')
            self.r = self._nb(x_train, y_train)
        else:
            print('Naive Bayes is disabled')
            self.r = None
        ##### set values
        self.nb = nb
        self.seed = seed
        self.model = None  # set by train()
        self.set_params(params)
        if label_cols is None and self.r is not None:
            # Without this fallback the membership test in _pre_process
            # would fail with "argument of type 'NoneType' is not iterable".
            label_cols = list(self.r)
        self.label_cols = label_cols
        print('LightgbmBLE is initialized')

    def set_params(self, params):
        """Replace the LightGBM parameter dict used by the next train() call."""
        self.params = params

    def _pre_process(self, x, y, label=None):
        """Apply the naive-Bayes scaling (if enabled) and unwrap a Series target.

        Params:
            x: feature matrix; when nb is enabled its ``multiply`` method is
               used, so a scipy sparse matrix is expected in that case
            y: target values (pd.Series, array, or None when predicting)
            label: label whose naive-Bayes ratio should be applied

        Returns:
            (x, y) tuple ready to be handed to LightGBM

        Raises:
            ValueError: if nb is enabled and label is None or unknown
        """
        if self.nb:
            if label is None:
                raise ValueError('Naive Bayes is enabled. label cannot be None.')
            if label not in self.label_cols or label not in self.r:
                raise ValueError('Label not in label_cols')
            print('apply naive bayes to feature set')
            x = x.multiply(self.r[label])
            # The original test was inverted: it only called tocsr() when the
            # result already was CSR, so other formats were never converted.
            if not isinstance(x, csr_matrix):
                x = csr_matrix(x)
        if isinstance(y, pd.Series):
            y = y.values
        return (x, y)

    def train(self, x_train, y_train, label=None, valid_set_percent=0):
        """
        Params:
            x_train: np/scipy/ 2-d array or matrix
            y_train: target of ONE label (pd.Series or 1-d array)
            label: (str) if not none, then it's one of the labels in the label_cols
                    if nb is set to True when initializing, then label can not be None
            valid_set_percent: (float, 0 to 1).
                    0: no validation set. (impossible to use early stopping)
                    1: use training set as validation set (to check underfitting, and early stopping)
                    >0 and <1: use a portion of training set as validation set. (to check overfitting, and early stopping)

        Raises:
            ValueError: if valid_set_percent is outside [0, 1], or no params were set
        """
        if valid_set_percent < 0 or valid_set_percent > 1:
            raise ValueError('valid_set_percent must >= 0 and <= 1')
        if self.params is None:
            raise ValueError('params is None. Call set_params() before train().')

        x, y = self._pre_process(x_train, y_train, label)

        if valid_set_percent == 0:
            print('No evaluation set, thus not possible to use early stopping. Please train with your best params.')
            self.model = lgb.train(self.params, lgb.Dataset(x, y))
        elif valid_set_percent == 1:
            lgb_train = lgb.Dataset(x, y)
            print('Evaluating using training set')
            self.model = lgb.train(self.params, lgb_train, valid_sets=lgb_train)
        else:
            # Seeded so that repeated runs evaluate on the same held-out rows.
            x, x_val, y, y_val = train_test_split(
                x, y, test_size=valid_set_percent, random_state=self.seed)
            lgb_train = lgb.Dataset(x, y)
            lgb_val = lgb.Dataset(x_val, y_val)
            print('Evaluating using validation set ({}% of training set)'.format(valid_set_percent*100))
            self.model = lgb.train(self.params, lgb_train, valid_sets=lgb_val)

    def predict(self, x_test, label=None):
        """Predict with the trained booster, using its best iteration when one was recorded.

        Params:
            x_test: feature matrix, same layout as the training features
            label: (str) label whose naive-Bayes ratio to apply (required when nb is enabled)

        Returns:
            whatever ``Booster.predict`` returns for x_test

        Raises:
            RuntimeError: if train() has not been called yet
        """
        if self.model is None:
            raise RuntimeError('train() must be called before predict()')
        x, _ = self._pre_process(x_test, y=None, label=label)
        print('starting predicting')
        # The original referenced the undefined names `best_iteration` and `bst`.
        best_iteration = self.model.best_iteration
        if best_iteration > 0:
            print('best_iteration {} is chosen.'.format(best_iteration))
            result = self.model.predict(x, num_iteration=best_iteration)
        else:
            result = self.model.predict(x)
        print('predicting done')
        return result
       

In [70]:
from tfidf_data import tfidf_data_process

In [None]:
# Build the train/test feature matrices and the label frame from word- and
# char-level n-gram settings (tfidf_data_process is a project helper).
(x_train, y_train, x_test, data_id) = tfidf_data_process(
    word_ngram=(1, 3),
    word_max=100000,
    char_ngram=(2, 5),
    char_max=200000,
)

In [73]:
# Sanity check: display the feature/label shapes and the data identifier.
x_train.shape, y_train.shape, x_test.shape, data_id

((159571, 300000),
 (159571, 6),
 (153164, 300000),
 'wordtfidf_word_(1, 3)_100000_1_1.0_char_(2, 5)_200000_1_1.0')

In [94]:
# LinearSVC keyword arguments, one entry per label.
lsvc_params = {
    'identity_hate': dict(C=0.01, class_weight='balanced', fit_intercept=True),
    'insult': dict(C=0.02, class_weight='balanced', fit_intercept=True),
    'obscene': dict(C=0.05, class_weight=None, fit_intercept=True),
    'severe_toxic': dict(C=0.02, class_weight=None, fit_intercept=True),
    'threat': dict(C=0.005, class_weight='balanced', fit_intercept=True),
    'toxic': dict(C=0.1, class_weight='balanced', fit_intercept=True),
}

In [90]:
# label_cols must exist before its first use; on a fresh kernel this cell runs
# before any other cell that assigns it. The order fixes the column order of preds.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# One column of calibrated LinearSVC probabilities per label.
preds = np.zeros((x_test.shape[0], len(label_cols)))

for i, label in enumerate(label_cols):
    # Pass a copy so the shared per-label config is never modified by the estimator.
    lsvc_ble = LogRegAndLsvcBLE(mode=ModelName.LSVC, seed=1001, params=dict(lsvc_params[label]))
    lsvc_ble.train(x_train, y_train[label].values)
    preds[:, i] = lsvc_ble.predict(x_test)

In [91]:
# Sanity check: one row per test sample, one column per label.
preds.shape

(153164, 6)

In [93]:
# LogisticRegression keyword arguments, one entry per label.
logreg_params = {
    'identity_hate': dict(C=0.25, class_weight='balanced', fit_intercept=True),
    'insult': dict(C=0.25, class_weight='balanced', fit_intercept=True),
    'obscene': dict(C=0.7, class_weight='balanced', fit_intercept=True),
    'severe_toxic': dict(C=0.3, class_weight=None, fit_intercept=True),
    'threat': dict(C=0.05, class_weight='balanced', fit_intercept=True),
    'toxic': dict(C=0.8, class_weight='balanced', fit_intercept=True),
}

In [98]:
# One column of logistic-regression probabilities per label.
# NOTE: this rebinds `preds`, replacing the predictions of any earlier model cell.
preds = np.zeros((x_test.shape[0], len(label_cols)))

for i, label in enumerate(label_cols):
    # Pass a copy so the shared per-label config is never modified by the estimator.
    logreg_ble = LogRegAndLsvcBLE(mode=ModelName.LOGREG, seed=1001, params=dict(logreg_params[label]))
    logreg_ble.train(x_train, y_train[label].values)
    preds[:, i] = logreg_ble.predict(x_test)

In [99]:
# Sanity check: one row per test sample, one column per label.
preds.shape

(153164, 6)

In [58]:
def _lgb_params(bagging_fraction, feature_fraction, lambda_l1, lambda_l2,
                learning_rate, max_depth, num_iterations, num_leaves):
    """Build one LightGBM parameter dict: shared settings plus the per-label values."""
    return {
        'objective': 'binary',
        'metric': 'auc',
        'num_threads': 8,
        'bagging_freq': 1,
        'bagging_fraction': bagging_fraction,
        'feature_fraction': feature_fraction,
        'lambda_l1': lambda_l1,
        'lambda_l2': lambda_l2,
        'learning_rate': learning_rate,
        'max_depth': max_depth,
        'num_iterations': num_iterations,
        'num_leaves': num_leaves,
        'is_unbalance': False,
    }


# LightGBM parameters per label; only the values passed below differ between labels.
lgb_params_per_label = {
    'toxic': _lgb_params(
        bagging_fraction=0.9, feature_fraction=0.6, lambda_l1=0.0, lambda_l2=0.0,
        learning_rate=0.1, max_depth=-1, num_iterations=219, num_leaves=61),
    'severe_toxic': _lgb_params(
        bagging_fraction=0.7, feature_fraction=0.6, lambda_l1=0.5, lambda_l2=0.0,
        learning_rate=0.05, max_depth=5, num_iterations=322, num_leaves=11),
    'obscene': _lgb_params(
        bagging_fraction=0.7, feature_fraction=0.8, lambda_l1=0.0, lambda_l2=0.0,
        learning_rate=0.05, max_depth=-1, num_iterations=274, num_leaves=61),
    'threat': _lgb_params(
        bagging_fraction=0.7, feature_fraction=0.8, lambda_l1=0.5, lambda_l2=0.0,
        learning_rate=0.05, max_depth=-1, num_iterations=208, num_leaves=11),
    'insult': _lgb_params(
        bagging_fraction=0.8, feature_fraction=0.6, lambda_l1=0.0, lambda_l2=0.5,
        learning_rate=0.05, max_depth=-1, num_iterations=454, num_leaves=11),
    'identity_hate': _lgb_params(
        bagging_fraction=0.7, feature_fraction=0.6, lambda_l1=0.0, lambda_l2=0.0,
        learning_rate=0.05, max_depth=-1, num_iterations=191, num_leaves=61),
}
    
    
#     #'learning_rate': 0.05,
#     'is_unbalance': True,
#     'early_stopping_round': 25,
#     'max_depth': -1,
#     'num_boost_round': 3000,
#     'application': 'binary',
#     'num_leaves': 63,
#     'verbosity': 10,
#     'metric': 'auc',
#     'data_random_seed': 2,
#     'bagging_fraction': 1,
#     'feature_fraction': 0.6,
#     'nthread': 4
# #     'lambda_l1': 1,
# #     'lambda_l2': 1
# }


In [59]:
# Label names in the column order used for every prediction matrix.
label_cols = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# The constructor computes the naive-Bayes ratios once; LightGBM parameters
# are supplied later through set_params().
lgb_ble = LightgbmBLE(
    x_train,
    y_train,
    label_cols=label_cols,
    nb=True,
)

Naive Bayes is enabled
calculating naive bayes for toxic
calculating naive bayes for severe_toxic
calculating naive bayes for obscene
calculating naive bayes for threat
calculating naive bayes for insult
calculating naive bayes for identity_hate
LightgbmBLE is initialized


In [None]:
# One column of LightGBM predictions per label.
# NOTE: this rebinds `preds`, replacing the predictions of any earlier model cell.
preds = np.zeros((x_test.shape[0], len(label_cols)))

for col_idx, label in enumerate(label_cols):
    # Swap in this label's parameters, then fit and predict.
    lgb_ble.set_params(lgb_params_per_label[label])
    lgb_ble.train(x_train, y_train[label].values, label)
    preds[:, col_idx] = lgb_ble.predict(x_test, label)

In [100]:
PATH = '~/data/toxic/data/'
OUTPUT_DIR = './BaseEstPreds/'

# Fill the sample submission with whichever predictions `preds` currently holds.
submission = pd.read_csv(PATH + 'sample_submission.csv')
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = preds

# Create the output folder up front so a missing directory cannot make
# to_csv fail after all the training work is done.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# The Unix timestamp gives each submission file a unique name.
sub_id = int(time.time())
print(sub_id)
submission.to_csv(OUTPUT_DIR + 'test123_' + str(sub_id) + '.csv', index=False)

1520733758


In [79]:
# Summary statistics of the submitted predictions, one column per label.
submission[label_cols].describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,153164.0,153164.0,153164.0,153164.0,153164.0,153164.0
mean,0.207959,0.016811,0.12614,0.004849,0.100053,0.017637
std,0.34924,0.083159,0.290001,0.043009,0.234656,0.083987
min,2.8e-05,0.000139,0.000401,1.4e-05,0.000123,5.2e-05
25%,0.002755,0.001807,0.003215,0.000175,0.002449,0.000689
50%,0.010663,0.002533,0.005905,0.000347,0.005874,0.001342
75%,0.226733,0.004101,0.025181,0.000862,0.035457,0.003944
max,1.0,0.998685,1.0,0.999605,1.0,0.999958
