In [1]:
# The goal of this kernel is to demonstrate that LightGBM can have predictive
# performance in line with that of a logistic regression. The theory is that
# labeling is being driven by a few keywords that can be picked up by trees.
#
# With some careful tuning, patience with runtimes, and additional feature
# engineering, this kernel can be tuned to slightly exceed the best
# logistic regression. Best of all, the two approaches (LR and LGB) blend
# well together.
#
# Hopefully, with some work, this could be a good addition to your ensemble.

import gc
import pandas as pd

from scipy.sparse import csr_matrix, hstack

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LogisticRegression
import lightgbm as lgb


In [2]:


class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

PATH = '~/data/toxic/data/'

train = pd.read_csv(PATH + 'cleaned_train.csv').fillna(' ')
test = pd.read_csv(PATH + 'cleaned_test.csv').fillna(' ')

print('Loaded')

train_text = train['comment_text']
test_text = test['comment_text']

word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    max_features=200000)
train_word_features = word_vectorizer.fit_transform(train_text)
print('Word TFIDF 1/2')
test_word_features = word_vectorizer.transform(test_text)
print('Word TFIDF 2/2')

char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(1, 1), # 2, 6
    max_features=5)
train_char_features = char_vectorizer.fit_transform(train_text)
print('Char TFIDF 1/2')
test_char_features = char_vectorizer.transform(test_text)
print('Char TFIDF 2/2')

train_features = hstack([train_char_features, train_word_features])
print('HStack 1/2')
test_features = hstack([test_char_features, test_word_features])
print('HStack 2/2')

submission = pd.DataFrame.from_dict({'id': test['id']})

train.drop('comment_text', axis=1, inplace=True)
del test
del train_text
del test_text
del train_char_features
del test_char_features
del train_word_features
del test_word_features
gc.collect()



Loaded
Word TFIDF 1/2
Word TFIDF 2/2
Char TFIDF 1/2
Char TFIDF 2/2
HStack 1/2
HStack 2/2


16

In [3]:
import pdb

In [4]:
for class_name in class_names:
    print(class_name)
    train_target = train[class_name]
    model = LogisticRegression(solver='sag')
    sfm = SelectFromModel(model, threshold='5*mean')
    print(train_features.shape)
    train_sparse_matrix = sfm.fit_transform(train_features, train_target)
    print(train_sparse_matrix.shape)
    train_sparse_matrix, valid_sparse_matrix, y_train, y_valid = train_test_split(train_sparse_matrix, train_target, test_size=0.05, random_state=144)
    test_sparse_matrix = sfm.transform(test_features)
    #pdb.set_trace()
    d_train = lgb.Dataset(train_sparse_matrix, label=y_train)
    d_valid = lgb.Dataset(valid_sparse_matrix, label=y_valid)
    watchlist = [d_train, d_valid]
    params = {'learning_rate': 0.2,
              'application': 'binary',
              'num_leaves': 31,
              'verbosity': -1,
              'metric': 'auc',
              'data_random_seed': 2,
              'bagging_fraction': 0.8,
              'feature_fraction': 0.6,
              'nthread': 4,
              'lambda_l1': 1,
              'lambda_l2': 1}
    rounds_lookup = {'toxic': 140,
                 'severe_toxic': 50,
                 'obscene': 80,
                 'threat': 80,
                 'insult': 70,
                 'identity_hate': 80}
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=rounds_lookup[class_name],
                      valid_sets=watchlist,
                      verbose_eval=10)
    submission[class_name] = model.predict(test_sparse_matrix)

submission.to_csv('lgb_submission.csv', index=False)

toxic
(159571, 200005)
(159571, 4786)
[10]	training's auc: 0.923995	valid_1's auc: 0.916171
[20]	training's auc: 0.943793	valid_1's auc: 0.929762
[30]	training's auc: 0.959004	valid_1's auc: 0.942964
[40]	training's auc: 0.968107	valid_1's auc: 0.948907
[50]	training's auc: 0.973284	valid_1's auc: 0.953616
[60]	training's auc: 0.976657	valid_1's auc: 0.956679
[70]	training's auc: 0.979051	valid_1's auc: 0.958577
[80]	training's auc: 0.981189	valid_1's auc: 0.960543
[90]	training's auc: 0.982921	valid_1's auc: 0.96183
[100]	training's auc: 0.98421	valid_1's auc: 0.962022
[110]	training's auc: 0.985309	valid_1's auc: 0.962676
[120]	training's auc: 0.986306	valid_1's auc: 0.963425
[130]	training's auc: 0.98716	valid_1's auc: 0.964083
[140]	training's auc: 0.987926	valid_1's auc: 0.964623
severe_toxic
(159571, 200005)




(159571, 9046)
[10]	training's auc: 0.973585	valid_1's auc: 0.98574
[20]	training's auc: 0.985484	valid_1's auc: 0.984285
[30]	training's auc: 0.989867	valid_1's auc: 0.98525
[40]	training's auc: 0.992647	valid_1's auc: 0.985462
[50]	training's auc: 0.994289	valid_1's auc: 0.985664
obscene
(159571, 200005)




(159571, 5633)
[10]	training's auc: 0.970236	valid_1's auc: 0.978649
[20]	training's auc: 0.979576	valid_1's auc: 0.98272
[30]	training's auc: 0.985798	valid_1's auc: 0.98579
[40]	training's auc: 0.989262	valid_1's auc: 0.987172
[50]	training's auc: 0.991013	valid_1's auc: 0.988783
[60]	training's auc: 0.992275	valid_1's auc: 0.989363
[70]	training's auc: 0.99323	valid_1's auc: 0.989239
[80]	training's auc: 0.994104	valid_1's auc: 0.989222
threat
(159571, 200005)




(159571, 7907)
[10]	training's auc: 0.948873	valid_1's auc: 0.888877
[20]	training's auc: 0.990782	valid_1's auc: 0.962706
[30]	training's auc: 0.997098	valid_1's auc: 0.986311
[40]	training's auc: 0.998853	valid_1's auc: 0.989597
[50]	training's auc: 0.999472	valid_1's auc: 0.987354
[60]	training's auc: 0.999746	valid_1's auc: 0.986845
[70]	training's auc: 0.99987	valid_1's auc: 0.985915
[80]	training's auc: 0.999939	valid_1's auc: 0.984106
insult
(159571, 200005)




(159571, 5996)
[10]	training's auc: 0.95328	valid_1's auc: 0.949968
[20]	training's auc: 0.965642	valid_1's auc: 0.961717
[30]	training's auc: 0.975147	valid_1's auc: 0.970155
[40]	training's auc: 0.980828	valid_1's auc: 0.972785
[50]	training's auc: 0.984024	valid_1's auc: 0.974603
[60]	training's auc: 0.986179	valid_1's auc: 0.975801
[70]	training's auc: 0.987937	valid_1's auc: 0.976377
identity_hate
(159571, 200005)




(159571, 8516)
[10]	training's auc: 0.933711	valid_1's auc: 0.888707
[20]	training's auc: 0.971245	valid_1's auc: 0.968729
[30]	training's auc: 0.984338	valid_1's auc: 0.972178
[40]	training's auc: 0.988867	valid_1's auc: 0.972661
[50]	training's auc: 0.991815	valid_1's auc: 0.976516
[60]	training's auc: 0.993642	valid_1's auc: 0.977425
[70]	training's auc: 0.995029	valid_1's auc: 0.97824
[80]	training's auc: 0.995882	valid_1's auc: 0.979439
