In [1]:
import pandas as pd 
import numpy as np 
import re 
from nltk.corpus import stopwords 
from nltk.tokenize import TweetTokenizer 
from nltk.stem.wordnet import WordNetLemmatizer 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer 
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC 
from sklearn.svm import LinearSVC 
from sklearn.calibration import CalibratedClassifierCV 
from sklearn.metrics import roc_auc_score 
from scipy.sparse import csr_matrix, hstack 
import lightgbm as lgb

In [2]:
PATH = '../data/'

train = pd.read_csv(PATH + 'cleaned_train.csv')
test = pd.read_csv(PATH + 'cleaned_test.csv')


train_sentence = train['comment_text_cleaned_polarity']
test_sentence = test['comment_text_cleaned_polarity']


train_sentence_retain_punctuation = train['comment_text_cleaned_retain_punctuation']
test_sentence_retain_punctuation = test['comment_text_cleaned_retain_punctuation']

text = train_sentence

text_retain_punctuation = train_sentence_retain_punctuation


print(train.shape)
print(test.shape)
###########################


phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                    strip_accents='unicode', 
                                    max_features=100000, 
                                    analyzer='word',
                                    sublinear_tf=True,
                                    token_pattern=r'\w{1,}')
char_vectorizer = TfidfVectorizer(ngram_range=(2,5), 
                                  strip_accents='unicode', 
                                  max_features=200000, 
                                  analyzer='char', 
                                  sublinear_tf=True)

print('fitting char')
char_vectorizer.fit(text_retain_punctuation.values)
print('fitting phrase')
phrase_vectorizer.fit(text.values)

print('transforming train skip gram')

print('transforming train char')
train_char = char_vectorizer.transform(train_sentence_retain_punctuation.values)
print('transforming train phrase')
train_phrase = phrase_vectorizer.transform(train_sentence.values)


print('transforming test char')
test_char = char_vectorizer.transform(test_sentence_retain_punctuation.values)
print('transforming test phrase')
test_phrase = phrase_vectorizer.transform(test_sentence.values)


train_tfidf = hstack((train_char, train_phrase), format='csr')
test_tfidf = hstack((test_char, test_phrase), format='csr')

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

#######################

# from sklearn.model_selection import train_test_split
# x_train, x_val, y_train_df, y_val_df = train_test_split(train_tfidf, train, test_size=0.33)
# # Split the dataset



# Split the dataset
split_index = round(len(train) * 0.9) #################################
# shuffled_train = train#.sample(frac=1)
x_train = train_tfidf[:split_index]
y_train_df = train.iloc[:split_index]
#######
x_val = train_tfidf[split_index:]
y_val_df = train.iloc[split_index:]
# Get test data ready
x_test = test_tfidf


# train toxic
def pr(y_i, y, train_features):
    p = train_features[y==y_i].sum(0)
    return (p + 1) / ((y == y_i).sum() + 1)
r_dict = {label: np.log(pr(1, y_train_df[label].values, x_train) / pr(0,  y_train_df[label].values, x_train)) for label in label_cols}


train_set = {label: x_train.multiply(r_dict[label]).tocsr() for label in r_dict }
val_set = {label: x_val.multiply(r_dict[label]).tocsr() for label in r_dict }
test_set = {label: x_test.multiply(r_dict[label]).tocsr() for label in r_dict }

# del r_dict, x_train, x_val
import gc
gc.collect()


(159571, 30)
(153164, 24)
fitting char
fitting phrase
transforming train skip gram
transforming train char
transforming train phrase
transforming test char
transforming test phrase


0

In [10]:
from abc import ABC, abstractmethod

class BaseLayerEstimator(ABC):
    
    def _pr(self, y_i, y, train_features):
        p = train_features[np.array(y==y_i)].sum(0)
        return (p + 1) / (np.array(y == y_i).sum() + 1)
    
    def _nb(self, x_train, y_train):
        assert isinstance(y_train, pd.DataFrame)
        r = {}
        for col in y_train.columns:
            print('calculating naive bayes for {}'.format(col))
            r[col] = np.log(self._pr(1, y_train[col].values, x_train) / self._pr(0, y_train[col], x_train))
        return r
    
    @abstractmethod
    def train(self, x_train, y_train):
        """
        Params:
            x_train: np array
            y_train: pd series
        """
        pass
    
    @abstractmethod
    def predict(self, x_train):
        pass
    
class LightgbmBLE(BaseLayerEstimator):
    def __init__(self, seed=0, params=None):
        self.param = params
        #self.param['seed'] = seed
        self.nrounds = params.pop('num_iterations', 100)
    
    def train(self, x_train, y_train):
        dtrain = lgb.Dataset(x_train, label=y_train)
        self.gbdt = lgb.train(self.param, dtrain, self.nrounds, verbose_eval=10)
    
    def predict(self, x):
        return self.gbdt.predict(x)

In [17]:
class LightgbmBLE(BaseLayerEstimator):
    def __init__(self, x_train, y_train, params=None, nb=True, seed=0):
        """
        constructor:

            x_train: should be a np/scipy/ 2-d array or matrix. only be used when nb is true
            y_train: should be a dataframe
        """
        #### check naive bayes
        if nb:
            print('Naive Bayes is enabled')
            self.r = self._nb(x_train, y_train)
        else:
            print('Naive Bayes is disabled')
            self.r = None
        ##### set values    
        self.nb = nb
        self.set_params(params)
        print('LightgbmBLE is initialized')
    
    
    def set_params(self, params):
        self.params = params
    
    
    
    def _pre_process(self, x_train, y_train, label=None):
        if self.nb:
            assert label is not None
            print('apply naive bayes to feature set')
            x = x_train.multiply(self.r[label])
            if isinstance(x_train, csr_matrix):
                x = x.tocsr()
        else:
            x = x_train
        if isinstance(y_train, pd.Series):
            y = y_train.values
        else:
            y = y_train
        return (x, y)
    
    
    def train(self, x_train, y_train, label=None):
        x, y = self._pre_process(x_train, y_train, label)
        lgb_train = lgb.Dataset(x, y)
        lgb_eval = lgb.Dataset(x, y, reference=lgb_train)
        self.model = lgb.train(self.params, lgb_train, valid_sets=lgb_eval, verbose_eval=20)
        
        
    def predict(self, x_train, label=None):
        x, _ = self._pre_process(x_train, y_train=None, label=label)
        print('starting predicting')
        result = self.model.predict(x)
        print('predicting done')
        return result
        
            

In [18]:
params = {
    'learning_rate': 0.2,
    'application': 'binary',
    'num_leaves': 31,
    'verbosity': -1,
    'metric': 'auc',
    'data_random_seed': 2,
    'bagging_fraction': 0.8,
    'feature_fraction': 0.6,
    'nthread': 4,
    'lambda_l1': 1,
    'lambda_l2': 1
} 
ll = LightgbmBLE(train_tfidf, train[label_cols], params=params, nb=True)

Naive Bayes is enabled
calculating naive bayes for toxic
calculating naive bayes for severe_toxic
calculating naive bayes for obscene
calculating naive bayes for threat
calculating naive bayes for insult
calculating naive bayes for identity_hate
LightgbmBLE is initialized


In [19]:
ll = LightgbmBLE(train_tfidf, train[label_cols], params=params, nb=True)
result = pd.DataFrame()
for col in label_cols:
    print(col)
    ll.train(train_tfidf, train[col], col)
    result[col] = ll.predict(test_tfidf, col)

toxic
apply naive bayes to feature set
[20]	valid_0's auc: 0.961558
[40]	valid_0's auc: 0.980041
[60]	valid_0's auc: 0.986581
[80]	valid_0's auc: 0.990192
[100]	valid_0's auc: 0.992662
apply naive bayes to feature set
starting predicting
predicting done
severe_toxic
apply naive bayes to feature set
[20]	valid_0's auc: 0.990618
[40]	valid_0's auc: 0.997012
[60]	valid_0's auc: 0.998851
[80]	valid_0's auc: 0.999541
[100]	valid_0's auc: 0.999816
apply naive bayes to feature set
starting predicting
predicting done
obscene
apply naive bayes to feature set
[20]	valid_0's auc: 0.990877
[40]	valid_0's auc: 0.995789
[60]	valid_0's auc: 0.997536
[80]	valid_0's auc: 0.998457
[100]	valid_0's auc: 0.999016
apply naive bayes to feature set
starting predicting
predicting done
threat
apply naive bayes to feature set
[20]	valid_0's auc: 0.990854
[40]	valid_0's auc: 0.999744
[60]	valid_0's auc: 0.999992
[80]	valid_0's auc: 0.999999
[100]	valid_0's auc: 0.999999
apply naive bayes to feature set
starting p

In [20]:
result.to_csv(PATH + 'lightgbmtest.csv', index=False)