In [17]:
import pandas as pd
import numpy as np
from abc import ABC, abstractmethod
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score
from scipy.sparse import csr_matrix, hstack, vstack
import lightgbm as lgb
from sklearn.model_selection import train_test_split

from enum import Enum
class ModelName(Enum):
    XGB = 1
    NBXGB = 2
    LGB = 3
    NBLGB = 4
    LOGREG = 5
    NBSVM = 6
    NBLSVC = 7
    RF = 8 # random forest
    RNN = 9
    ONESVC = 10
    ONELOGREG = 11


class BaseLayerEstimator(ABC):
    
    def _pr(self, y_i, y, train_features):
        p = train_features[np.array(y==y_i)].sum(0)
        return (p + 1) / (np.array(y == y_i).sum() + 1)
    
    def _nb(self, x_train, y_train):
        assert isinstance(y_train, pd.DataFrame)
        r = {}
        for col in y_train.columns:
            print('calculating naive bayes for {}'.format(col))
            r[col] = np.log(self._pr(1, y_train[col].values, x_train) / self._pr(0, y_train[col], x_train))
        return r
    
    @abstractmethod
    def train(self, x_train, y_train):
        """
        Params:
            x_train: np array
            y_train: pd series
        """
        pass
    
    @abstractmethod
    def predict(self, x_train):
        pass
    
    

class OneVSOneRegBLE(BaseLayerEstimator):
    def __init__(self, x_train, y_train, model='logistic'):
        """
        x_train: sparse matrix, raw tfidf
        y_train: dataframe, with only label columns. should be 6 columns in total
        model: only support logistic or svc
        """
        self.r = {}
        self.setModelName(model)
        assert self.model_name in ['logistic', 'svc']
        self.param = {}
        self.param['logistic'] = {'identity_hate': 9.0,
                                     'insult': 1.5,
                                     'obscene': 1.0,
                                     'severe_toxic': 4.0,
                                     'threat': 9.0,
                                     'toxic': 2.7}
        self.param['svc'] = {'identity_hate': 0.9,
                             'insult': 0.15,
                             'obscene': 0.15,
                             'severe_toxic': 0.15,
                             'threat': 1.0,
                             'toxic': 0.29}
        
        
        
        for col in y_train.columns:
            print('calculating naive bayes for {}'.format(col))
            self.r[col] = np.log(self.pr(1, y_train[col].values, x_train) / self.pr(0, y_train[col], x_train))
        print('initializing done')
        print('OneVsOne is using {} kernel'.format(self.model_name))
        
    def setModelName(self, name):
        self.model_name = name
        assert self.model_name in ['logistic', 'svc']
        print('OneVsOne is using {} kernel'.format(self.model_name))
        
    def pr(self, y_i, y, train_features):
        p = train_features[np.array(y==y_i)].sum(0)
        return (p + 1) / (np.array(y == y_i).sum() + 1)
    
    def oneVsOneSplit(self, x_train, y_train, label):
        print('Starting One vs One dataset splitting')
        if isinstance(y_train, pd.Series):
            y_train = y_train.values
        model_train = x_train[np.array(y_train == 1)]
        y_model_train = y_train[np.array(y_train == 1)]
        non_model_train = x_train[np.array(y_train == 0)]
        non_model_train = non_model_train[:model_train.shape[0]]
        y_non_model_train = y_train[np.array(y_train == 0)]
        y_non_model_train = y_non_model_train[:model_train.shape[0]]
        x_model_stack = vstack([model_train, non_model_train])
        y_model_stack = np.concatenate([y_model_train, y_non_model_train])
        x_nb = x_model_stack.multiply(self.r[label]).tocsr()
        y_nb = y_model_stack
        print('splitting done!')
        return (x_nb, y_nb)
    
    def train(self, x_train, y_train, label):
        ### construct one vs one
        x_nb, y_nb = self.oneVsOneSplit(x_train, y_train, label)
        ### start training
        if self.model_name is 'logistic':
            print('start training logistic regression')
            self.model = LogisticRegression(C=self.param['logistic'][label])
            self.model.fit(x_nb, y_nb)
            print('training done')
            
        else:
            print('start training linear svc regression')
            lsvc = LinearSVC(C=self.param['svc'][label])
            self.model = CalibratedClassifierCV(lsvc) 
            self.model.fit(x_nb, y_nb)
            print('training done')
        

    
    def predict(self, x_test, label):
        print('applying naive bayes to dataset')
        x_nb_test = x_test.multiply(self.r[label]).tocsr()
        print('predicting')
        pred = self.model.predict_proba(x_nb_test)[:,1]
        print('predicting done')
        return pred
    
##### example        
# aa = OneVSOneReg(train_tfidf, train[label_cols], model='logistic')
# aa.setModelName('svc')
# aa.train(train_tfidf,train['toxic'], 'toxic')
# aa.predict(test_tfidf, 'toxic')



class SklearnBLE(BaseLayerEstimator):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from scipy import sparse
from sklearn.calibration import CalibratedClassifierCV
class NbSvmBLE(BaseLayerEstimator, BaseEstimator, ClassifierMixin):
    def __init__(self, mode=ModelName.NBSVM, seed=0, params=None):
        self._mode = mode
        params['random_state'] = seed
        self._params = params


    def predict(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        #return self._clf.predict(x.multiply(self._r))
        return self._clf.predict_proba(x.multiply(self._r))[:,1] # chance of being 1 ([:,0] chance of being 0)

    def fit(self, x, y):
        # Check that X and y have correct shape
        y = y.values
        x, y = check_X_y(x, y, accept_sparse=True)

        def pr(x, y_i, y):
            p = x[y==y_i].sum(0)
            return (p+1) / ((y==y_i).sum()+1)

        self._r = sparse.csr_matrix(np.log(pr(x,1,y) / pr(x,0,y)))
        x_nb = x.multiply(self._r)
        #self._clf = LogisticRegression(C=self.C, dual=self.dual, n_jobs=self.n_jobs).fit(x_nb, y)
        self._clf = LogisticRegression(**self._params).fit(x_nb, y)
        if self._mode == ModelName.NBLSVC:
            self._clf = CalibratedClassifierCV(LinearSVC(**self._params)).fit(x_nb, y)

        return self
    
    def train(self, x_train, y_train):
        self.fit(x_train, y_train)
    
    def feature_importance(self):
        return self._clf.feature_importance

    
class XgbBLE(BaseLayerEstimator):
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)

    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        return self.gbdt.predict_proba(xgb.DMatrix(x))[:,1]

from sklearn.feature_selection import SelectFromModel

class LightgbmBLE(BaseLayerEstimator):
    def __init__(self, x_train, y_train, label_cols= None, params=None, nb=True, seed=0):
        """
        constructor:

            x_train: should be a np/scipy/ 2-d array or matrix. only be used when nb is true
            y_train: should be a dataframe
            label_cols: (list) if y_train contains multiple labels, provide the list of label names
                e.g.: label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
            params: (dict)
            nb: (boolean) compute naive bayes or not. (helpful for unbalanced data)
            seed: (int) training random seed (not used currently)
            
        Example:
            ll = LightgbmBLE(train_tfidf, train[label_cols], params=params, nb=True)
            result = pd.DataFrame()
            for col in label_cols:
                    print(col)
                    ll.train(train_tfidf, train[col], col)
                    result[col] = ll.predict(test_tfidf, col)
        """
        #### check naive bayes
        if nb:
            print('Naive Bayes is enabled')
            self.r = self._nb(x_train, y_train)
        else:
            print('Naive Bayes is disabled')
            self.r = None
        ##### set values    
        self.nb = nb
        self.set_params(params)
        self.label_cols = label_cols
        print('LightgbmBLE is initialized')
    
    
    def set_params(self, params):
        self.params = params
    
    
    
    def _pre_process(self, x_train, y_train, label=None):
        if self.nb:
            if label is None:
                raise ValueError('Naive Bayes is enabled. label cannot be None.')
            if label not in self.label_cols:
                raise ValueError('Label not in label_cols')
            print('apply naive bayes to feature set')
            x = x_train.multiply(self.r[label])
            if isinstance(x_train, csr_matrix):
                x = x.tocsr()
        else:
            x = x_train
        if isinstance(y_train, pd.Series):
            y = y_train.values
        else:
            y = y_train
        return (x, y)
    
    
    def train(self, x_train, y_train, label=None, valid_set_percent=0):
        """
        Params:
            x_train: np/scipy/ 2-d array or matrix
            y_train: should be a dataframe
            label: (str) if not none, then it's one of the labels in the label_cols
                    if nb is set to True when initializing, when label can not be None
            valid_set_percent: (float, 0 to 1). 
                    0: no validation set. (imposible to use early stopping)
                    1: use training set as validation set (to check underfitting, and early stopping)
                    >0 and <1: use a portion of training set as validation set. (to check overfitting, and early stopping)
        
        """
        x, y = self._pre_process(x_train, y_train, label)
        
        if valid_set_percent != 0:
            if valid_set_percent > 1 or valid_set_percent < 0:
                raise ValueError('valid_set_percent must >= 0 and <= 1')
            if valid_set_percent != 1:
                x, x_val, y, y_val = train_test_split(x, y, test_size=valid_set_percent)


        lgb_train = lgb.Dataset(x, y)
        if valid_set_percent != 0:
            if valid_set_percent == 1:
                print('Evaluating using training set')
                self.model = lgb.train(self.params, lgb_train, valid_sets=lgb_train)
            else:
                lgb_val = lgb.Dataset(x_val, y_val)
                print('Evaluating using validation set ({}% of training set)'.format(valid_set_percent*100))
                self.model = lgb.train(self.params, lgb_train, valid_sets=lgb_val)
        else:
            print('No evaluation set, thus not possible to use early stopping. Please train with your best params.')
            self.model = lgb.train(self.params, lgb_train)
        
        
    def predict(self, x_train, label=None):
        import pdb
        pdb.set_trace()
        x, _ = self._pre_process(x_train, y_train=None, label=label)
        print('starting predicting')
        if self.model.best_iteration > 0:
            print('best_iteration {} is chosen.'.format(best_iteration))
            result = self.model.predict(x, num_iteration=bst.best_iteration)
        else:
            result = self.model.predict(x)
        print('predicting done')
        return result
        
            



In [4]:
from tfidf_data import tfidf_data_process

In [5]:
x_train, y_train, x_test, data_id = tfidf_data_process()

loading data done!
fitting char
fitting phrase
transforming train char
transforming train phrase
transforming test char
transforming test phrase


In [26]:
lgb_params = {
    #'learning_rate': 0.05,
    'is_unbalance': True,
    'early_stopping_round': 25,
    'max_depth': -1,
    'num_boost_round': 3000,
    'application': 'binary',
    'num_leaves': 63,
    'verbosity': 10,
    'metric': 'auc',
    'data_random_seed': 2,
    'bagging_fraction': 1,
    'feature_fraction': 0.6,
    'nthread': 4
#     'lambda_l1': 1,
#     'lambda_l2': 1
}

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

lgb_ble = LightgbmBLE(x_train, y_train, label_cols=label_cols, params=lgb_params, nb=False)


Naive Bayes is disabled
LightgbmBLE is initialized


In [21]:
lgb_ble.train(x_train, y_train['threat'].values, valid_set_percent=0.1)

No evaluation set, thus not possible to use early stopping. Please train with your best params.




In [22]:
lgb_ble.predict(x_test)

> <ipython-input-17-a1ac4d0d46cd>(318)predict()
-> x, _ = self._pre_process(x_train, y_train=None, label=label)
(Pdb) self.model.best_iteration
0
(Pdb) q


BdbQuit: 

In [8]:
type(y_train)

pandas.core.frame.DataFrame

In [12]:
lgb_ble.train(x_train, y_train['toxic'].values, valid_set_percent=0.1)



[1]	valid_0's auc: 0.870293
Training until validation scores don't improve for 25 rounds.
[2]	valid_0's auc: 0.879855
[3]	valid_0's auc: 0.890069
[4]	valid_0's auc: 0.895269
[5]	valid_0's auc: 0.900742
[6]	valid_0's auc: 0.904055
[7]	valid_0's auc: 0.90557
[8]	valid_0's auc: 0.908777
[9]	valid_0's auc: 0.911911
[10]	valid_0's auc: 0.912996
[11]	valid_0's auc: 0.916038
[12]	valid_0's auc: 0.918541
[13]	valid_0's auc: 0.921163
[14]	valid_0's auc: 0.922745
[15]	valid_0's auc: 0.925085
[16]	valid_0's auc: 0.926786
[17]	valid_0's auc: 0.927603
[18]	valid_0's auc: 0.931456
[19]	valid_0's auc: 0.932031
[20]	valid_0's auc: 0.934025
[21]	valid_0's auc: 0.93553
[22]	valid_0's auc: 0.936878
[23]	valid_0's auc: 0.938081
[24]	valid_0's auc: 0.93911
[25]	valid_0's auc: 0.940473
[26]	valid_0's auc: 0.941336
[27]	valid_0's auc: 0.942148
[28]	valid_0's auc: 0.942864
[29]	valid_0's auc: 0.943854
[30]	valid_0's auc: 0.944219
[31]	valid_0's auc: 0.945139
[32]	valid_0's auc: 0.947068
[33]	valid_0's auc: 0.

[277]	valid_0's auc: 0.976481
[278]	valid_0's auc: 0.976518
[279]	valid_0's auc: 0.976525
[280]	valid_0's auc: 0.976551
[281]	valid_0's auc: 0.976569
[282]	valid_0's auc: 0.976599
Early stopping, best iteration is:
[257]	valid_0's auc: 0.976635


In [13]:
x_test.shape

(153164, 300000)

In [15]:
preds = lgb_ble.predict(x_test)

starting predicting
predicting done


In [17]:
np.save('lgbtoxicpred', preds)

In [20]:
preds_load = np.load('lgbtoxicpred.npy')

In [21]:
preds_load.shape

(153164,)

In [1]:
import lightgbm as lgb

In [None]:
lgb.train()

In [4]:
parameters = [
            dict(name="max_bin", type="int", bounds=dict(min=20, max=20000)),
            dict(name="learning_rate", type="double", bounds=dict(min=0.001, max=0.3)),
            dict(name="num_leaves", type="int", bounds=dict(min=100, max=4095)),
            # dict(name="num_leaves", type="int", bounds=dict(min=100, max=45000)),
            dict(name="scale_pos_weight", type="double", bounds=dict(min=0.01, max=2000.0)),
            dict(name="n_estimators", type="int", bounds=dict(min=10, max=10000)),
            dict(name="min_child_weight", type="int", bounds=dict(min=1, max=2000)),
            dict(name="subsample", type="double", bounds=dict(min=0.4, max=1)),
            dict(name="bagging_fraction", type="double", bounds=dict(min=0.3, max=1)),
            dict(name="max_depth", type="int", bounds=dict(min=2, max=50)),
        ]
# static_parameters = {'boosting_type': 'dart', 'reg_alpha': 0, 'reg_lambda': 2, 'is_unbalance': True,
#                              'min_split_gain': 0, 'min_child_samples': 10, 'colsample_bytree': 0.8, 'subsample_freq': 3,
#                              'subsample_for_bin': 50000,
#                              'histogram_pool_size': detect_available_memory_for_histogram_cache()}

In [5]:
parameters

[{'bounds': {'max': 20000, 'min': 20}, 'name': 'max_bin', 'type': 'int'},
 {'bounds': {'max': 0.3, 'min': 0.001},
  'name': 'learning_rate',
  'type': 'double'},
 {'bounds': {'max': 4095, 'min': 100}, 'name': 'num_leaves', 'type': 'int'},
 {'bounds': {'max': 2000.0, 'min': 0.01},
  'name': 'scale_pos_weight',
  'type': 'double'},
 {'bounds': {'max': 10000, 'min': 10}, 'name': 'n_estimators', 'type': 'int'},
 {'bounds': {'max': 2000, 'min': 1},
  'name': 'min_child_weight',
  'type': 'int'},
 {'bounds': {'max': 1, 'min': 0.4}, 'name': 'subsample', 'type': 'double'},
 {'bounds': {'max': 1, 'min': 0.3},
  'name': 'bagging_fraction',
  'type': 'double'},
 {'bounds': {'max': 50, 'min': 2}, 'name': 'max_depth', 'type': 'int'}]