In [1]:
import pandas as pd 
import numpy as np 
import re 
from nltk.corpus import stopwords 
from nltk.tokenize import TweetTokenizer 
from nltk.stem.wordnet import WordNetLemmatizer 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer 
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC 
from sklearn.svm import LinearSVC 
from sklearn.calibration import CalibratedClassifierCV 
from sklearn.metrics import roc_auc_score 
from scipy.sparse import csr_matrix, hstack 
import lightgbm as lgb

In [2]:
# Directory holding the pre-cleaned CSVs, relative to this notebook.
PATH = '../data/'

train = pd.read_csv(PATH + 'cleaned_train.csv')
test = pd.read_csv(PATH + 'cleaned_test.csv')

# Two cleaned variants of every comment are used: one feeds the word n-gram
# vectorizer, the other feeds the character n-gram vectorizer.
# fillna(''): an empty CSV field is read back as NaN, which TfidfVectorizer
# rejects as an invalid document; an empty string simply yields an all-zero row.
train_sentence = train['comment_text_cleaned_polarity'].fillna('')
test_sentence = test['comment_text_cleaned_polarity'].fillna('')

train_sentence_retain_punctuation = train['comment_text_cleaned_retain_punctuation'].fillna('')
test_sentence_retain_punctuation = test['comment_text_cleaned_retain_punctuation'].fillna('')

# The vectorizers are fitted on the training text only.
text = train_sentence
text_retain_punctuation = train_sentence_retain_punctuation

print(train.shape)
print(test.shape)

# Word-level TF-IDF over 1- to 3-grams, capped at 100k features.
phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                    strip_accents='unicode', 
                                    max_features=100000, 
                                    analyzer='word',
                                    sublinear_tf=True,
                                    token_pattern=r'\w{1,}')
# Character-level TF-IDF over 2- to 5-grams, capped at 200k features.
char_vectorizer = TfidfVectorizer(ngram_range=(2,5), 
                                  strip_accents='unicode', 
                                  max_features=200000, 
                                  analyzer='char', 
                                  sublinear_tf=True)

print('fitting char')
char_vectorizer.fit(text_retain_punctuation.values)
print('fitting phrase')
phrase_vectorizer.fit(text.values)

print('transforming train char')
train_char = char_vectorizer.transform(train_sentence_retain_punctuation.values)
print('transforming train phrase')
train_phrase = phrase_vectorizer.transform(train_sentence.values)

print('transforming test char')
test_char = char_vectorizer.transform(test_sentence_retain_punctuation.values)
print('transforming test phrase')
test_phrase = phrase_vectorizer.transform(test_sentence.values)

# Final feature matrices: char features first, then phrase features (CSR).
train_tfidf = hstack((train_char, train_phrase), format='csr')
test_tfidf = hstack((test_char, test_phrase), format='csr')

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Positional hold-out split (no shuffling): the first 90% of the rows are used
# for training and the remaining 10% for validation.
split_index = round(len(train) * 0.9)
x_train = train_tfidf[:split_index]
y_train_df = train.iloc[:split_index]   # full rows, not only the label columns
x_val = train_tfidf[split_index:]
y_val_df = train.iloc[split_index:]
# Test features are used as-is.
x_test = test_tfidf


def pr(y_i, y, train_features):
    """Return the add-one smoothed per-feature average over rows with label ``y_i``.

    Params:
        y_i: label value to select (the callers in this notebook pass 1 or 0)
        y: array of labels, one per row of ``train_features``
        train_features: feature matrix that supports boolean row indexing

    Returns:
        (column sums of the selected rows + 1) / (number of selected rows + 1)
    """
    row_mask = (y == y_i)
    feature_totals = train_features[row_mask].sum(0)
    return (feature_totals + 1) / (row_mask.sum() + 1)
# Per-label log ratio of the smoothed feature averages of positive vs. negative
# training rows (Naive Bayes log-count ratio), computed on the training split.
r_dict = {
    label: np.log(
        pr(1, y_train_df[label].values, x_train)
        / pr(0, y_train_df[label].values, x_train)
    )
    for label in label_cols
}

# One feature matrix per label for each split: every column is scaled by that
# label's ratio, and the product is converted back to CSR.
train_set = {label: x_train.multiply(ratio).tocsr() for label, ratio in r_dict.items()}
val_set = {label: x_val.multiply(ratio).tocsr() for label, ratio in r_dict.items()}
test_set = {label: x_test.multiply(ratio).tocsr() for label, ratio in r_dict.items()}

# Force a garbage-collection pass; its return value is the cell's output.
import gc
gc.collect()


(159571, 30)
(153164, 24)
fitting char
fitting phrase
transforming train skip gram
transforming train char
transforming train phrase
transforming test char
transforming test phrase


0

In [3]:
from abc import ABC, abstractmethod

class BaseLayerEstimator(ABC):
    """Abstract estimator interface with shared Naive Bayes ratio helpers.

    Subclasses must implement ``train`` and ``predict``.
    """

    def _pr(self, y_i, y, train_features):
        """Add-one smoothed per-feature average over the rows whose label equals ``y_i``."""
        selected = np.array(y == y_i)
        column_totals = train_features[selected].sum(0)
        return (column_totals + 1) / (selected.sum() + 1)

    def _nb(self, x_train, y_train):
        """Compute one log ratio vector per label column.

        Params:
            x_train: feature matrix, one row per row of ``y_train``
            y_train: pd DataFrame with one column per label

        Returns:
            dict mapping column name -> log(_pr(1, ...) / _pr(0, ...))
        """
        assert isinstance(y_train, pd.DataFrame)
        ratios = {}
        for col in y_train.columns:
            print('calculating naive bayes for {}'.format(col))
            positive = self._pr(1, y_train[col].values, x_train)
            negative = self._pr(0, y_train[col], x_train)
            ratios[col] = np.log(positive / negative)
        return ratios

    @abstractmethod
    def train(self, x_train, y_train):
        """
        Fit the estimator.

        Params:
            x_train: np array
            y_train: pd series
        """

    @abstractmethod
    def predict(self, x_train):
        """Return predictions for the given feature matrix."""
    


In [24]:
import xgboost

class XGBoostBase(BaseLayerEstimator):
    """XGBoost binary classifier with optional Naive Bayes feature scaling.

    When ``nb`` is true, one log ratio vector per label column is computed at
    construction time and the feature matrix is multiplied by the vector of the
    requested label before every ``train``/``predict`` call.
    """

    def __init__(self, x_train, y_train, params=None, nb=True, seed=0):
        """
        constructor:

            x_train: should be a np/scipy/ 2-d array or matrix. only be used when nb is true
            y_train: should be a dataframe
            params: dict of XGBClassifier keyword arguments, or None for defaults
            nb: whether to apply Naive Bayes feature scaling
            seed: value stored under the 'seed' key of the model parameters
        """
        #### check naive bayes
        if nb:
            print('Naive Bayes is enabled')
            self.r = self._nb(x_train, y_train)
        else:
            print('Naive Bayes is disabled')
            self.r = None
        ##### set values
        self.seed = seed
        self.nb = nb
        self.set_params(params)
        print('XGBoostBase is initialized')

    def set_params(self, params):
        """Store a private copy of ``params`` with 'seed' set to ``self.seed``.

        ``None`` is treated as an empty dict, so the constructor default works.
        Copying keeps the caller's dict untouched.
        """
        self.params = dict(params) if params is not None else {}
        self.params['seed'] = self.seed

    def _pre_process(self, x_train, y_train, label=None):
        """Return ``(x, y)`` ready for the model.

        Params:
            x_train: feature matrix
            y_train: pd series / array of targets, or None when predicting
            label: key into the Naive Bayes ratios; required when nb is enabled
        """
        if self.nb:
            assert label is not None, 'label is required when Naive Bayes is enabled'
            print('apply naive bayes to feature set')
            x = x_train.multiply(self.r[label])
            # Convert back to CSR when the input was CSR.
            if isinstance(x_train, csr_matrix):
                x = x.tocsr()
        else:
            x = x_train
        if isinstance(y_train, pd.Series):
            y = y_train.values
        else:
            y = y_train
        return (x, y)

    def train(self, x_train, y_train, label=None):
        """Fit a fresh XGBClassifier for one label; the model is kept in ``self.model``.

        Params:
            x_train: feature matrix
            y_train: pd series / array with the binary target
            label: key into the Naive Bayes ratios; required when nb is enabled
        """
        x, y = self._pre_process(x_train, y_train, label)
        # Work on a copy so the stored parameters never hold a reference to the
        # training data and stay reusable across labels.
        model_kwargs = dict(self.params)
        # 'eval_set' and 'verbose_eval' are fit-time options, not constructor
        # parameters, so they are routed to fit() instead.
        model_kwargs.pop('eval_set', None)
        verbose = model_kwargs.pop('verbose_eval', None)
        fit_kwargs = {}
        if verbose:
            # Report the configured metric on the training data while fitting;
            # eval_set must be a list of (X, y) pairs.
            fit_kwargs['eval_set'] = [(x, y)]
            fit_kwargs['verbose'] = verbose
        self.model = xgboost.XGBClassifier(**model_kwargs)
        self.model.fit(x, y, **fit_kwargs)

    def predict(self, x_train, label=None):
        """Return the positive-class probability for every row of ``x_train``.

        Uses the model fitted by the most recent ``train`` call.
        """
        x, _ = self._pre_process(x_train, y_train=None, label=label)
        print('starting predicting')
        result = self.model.predict_proba(x)[:,1]
        print('predicting done')
        return result
        
            

In [26]:
# Settings handed to XGBoostBase, which forwards them to xgboost.
params = dict(
    learning_rate=0.2,
    max_depth=6,
    nthread=20,
    n_estimators=200,
    subsample=0.8,
    colsample_bytree=0.7,
    eval_metric='auc',
    verbose_eval=10,
    silent=False,
)
# Naive Bayes ratios are computed here from the full training matrix and all
# label columns.
xx = XGBoostBase(x_train=train_tfidf, y_train=train[label_cols], params=params, nb=True)

Naive Bayes is enabled
calculating naive bayes for toxic
calculating naive bayes for severe_toxic
calculating naive bayes for obscene
calculating naive bayes for threat
calculating naive bayes for insult
calculating naive bayes for identity_hate
XGBoostBase is initialized


In [28]:
# Submission frame: the test ids plus one probability column per label.
result = pd.DataFrame({'id': test['id']})
for col in label_cols:
    print(col)
    # The same wrapper is re-fitted for every label, so after the loop it
    # holds the model of the last label only.
    xx.train(train_tfidf, train[col], label=col)
    result[col] = xx.predict(test_tfidf, label=col)

toxic
apply naive bayes to feature set
apply naive bayes to feature set
starting predicting
predicting done
severe_toxic
apply naive bayes to feature set
apply naive bayes to feature set
starting predicting
predicting done
obscene
apply naive bayes to feature set
apply naive bayes to feature set
starting predicting
predicting done
threat
apply naive bayes to feature set
apply naive bayes to feature set
starting predicting
predicting done
insult
apply naive bayes to feature set
apply naive bayes to feature set
starting predicting
predicting done
identity_hate
apply naive bayes to feature set
apply naive bayes to feature set
starting predicting
predicting done


In [31]:
# Write the predictions next to the input data, without the index column.
result.to_csv(path_or_buf=PATH + 'xgbtest.csv', index=False)

In [13]:
# Build a bare classifier from the shared settings and display its configuration.
clf = xgboost.XGBClassifier().set_params(**params)
clf

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, gamma=0, learning_rate=0.2, max_delta_step=0,
       max_depth=6, min_child_weight=1, missing=None, n_estimators=600,
       n_jobs=1, nthread=20, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8)

In [29]:
# Spot check of the currently fitted model. `col` is the variable left behind
# by the per-label training loop, so that loop must have been run first.
xx.predict(test_tfidf, label=col)

apply naive bayes to feature set
starting predicting
predicting done


array([ 0.1148539 ,  0.00081353,  0.00049475, ...,  0.00089766,
        0.00043752,  0.00040467], dtype=float32)

In [30]:
# Preview only the first rows instead of rendering the whole prediction frame.
result.head(10)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999947,0.281409,0.997474,1.641622e-02,0.987579,0.114854
1,0000247867823ef7,0.011992,0.000180,0.002092,3.484975e-06,0.008555,0.000814
2,00013b17ad220c46,0.045801,0.000435,0.014519,1.816888e-05,0.007328,0.000495
3,00017563c3f7919a,0.001886,0.000060,0.000670,5.671510e-06,0.000987,0.000080
4,00017695ad8997eb,0.020567,0.000131,0.004855,7.531781e-06,0.003565,0.000179
5,0001ea8717f6de06,0.004622,0.000115,0.001488,5.061864e-05,0.002206,0.000040
6,00024115d4cbde0f,0.001517,0.000013,0.002438,1.354918e-06,0.003876,0.000061
7,000247e83dcc1211,0.178491,0.000530,0.003522,1.428693e-05,0.044514,0.000645
8,00025358d4737918,0.012535,0.000158,0.006464,4.769408e-07,0.001734,0.000846
9,00026d1092fe71cc,0.005762,0.000029,0.001152,1.275922e-06,0.003141,0.000030
