In [1]:
# The goal of this kernel is to demonstrate that LightGBM can have predictive
# performance in line with that of a logistic regression. The theory is that
# labeling is being driven by a few keywords that can be picked up by trees.
#
# With some careful tuning, patience with runtimes, and additional feature
# engineering, this kernel can be tuned to slightly exceed the best
# logistic regression. Best of all, the two approaches (LR and LGB) blend
# well together.
#
# Hopefully, with some work, this could be a good addition to your ensemble.

In [2]:
import gc
import pandas as pd

from scipy.sparse import csr_matrix, hstack

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LogisticRegression
import lightgbm as lgb

# Data

In [3]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train = pd.read_csv('../input/train.csv').fillna(' ')
test = pd.read_csv('../input/test.csv').fillna(' ')
print('Loaded')

train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])

Loaded


In [4]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    max_features=50000)
word_vectorizer.fit(all_text)
print('Word TFIDF 1/3')
train_word_features = word_vectorizer.transform(train_text)
print('Word TFIDF 2/3')
test_word_features = word_vectorizer.transform(test_text)
print('Word TFIDF 3/3')


Word TFIDF 1/3


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Word TFIDF 2/3
Word TFIDF 3/3


In [5]:
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(2, 6),
    max_features=50000)
char_vectorizer.fit(all_text)
print('Char TFIDF 1/3')
train_char_features = char_vectorizer.transform(train_text)
print('Char TFIDF 2/3')
test_char_features = char_vectorizer.transform(test_text)
print('Char TFIDF 3/3')


Char TFIDF 1/3


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Char TFIDF 2/3
Char TFIDF 3/3


In [6]:
train_features = hstack([train_char_features, train_word_features])
print('HStack 1/2')
test_features = hstack([test_char_features, test_word_features])
print('HStack 2/2')

submission = pd.DataFrame.from_dict({'id': test['id']})

HStack 1/2
HStack 2/2


In [7]:
train.drop('comment_text', axis=1, inplace=True)
del test
del train_text
del test_text
del all_text
del train_char_features
del test_char_features
del train_word_features
del test_word_features
gc.collect()

14

In [8]:
for class_name in class_names:
    print(class_name)
    train_target = train[class_name]
    model = LogisticRegression(solver='sag')
    sfm = SelectFromModel(model, threshold=0.2)
    print(train_features.shape)
    train_sparse_matrix = sfm.fit_transform(train_features, train_target)
    print(train_sparse_matrix.shape)
    train_sparse_matrix, valid_sparse_matrix, y_train, y_valid = train_test_split(train_sparse_matrix, train_target, test_size=0.05, random_state=144)
    test_sparse_matrix = sfm.transform(test_features)
    d_train = lgb.Dataset(train_sparse_matrix, label=y_train)
    d_valid = lgb.Dataset(valid_sparse_matrix, label=y_valid)
    watchlist = [d_train, d_valid]
    params = {'learning_rate': 0.2,
              'application': 'binary',
              'num_leaves': 31,
              'verbosity': -1,
              'metric': 'auc',
              'data_random_seed': 2,
              'bagging_fraction': 0.8,
              'feature_fraction': 0.6,
              'nthread': 4,
              'lambda_l1': 1,
              'lambda_l2': 1}
    rounds_lookup = {'toxic': 140,
                 'severe_toxic': 50,
                 'obscene': 80,
                 'threat': 80,
                 'insult': 70,
                 'identity_hate': 80}
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=rounds_lookup[class_name],
                      valid_sets=watchlist,
                      verbose_eval=10)
    submission[class_name] = model.predict(test_sparse_matrix)

toxic
(159571, 100000)


  if np.issubdtype(mask.dtype, np.int):


(159571, 24954)
[10]	training's auc: 0.93934	valid_1's auc: 0.926078
[20]	training's auc: 0.961744	valid_1's auc: 0.944641
[30]	training's auc: 0.973841	valid_1's auc: 0.956544
[40]	training's auc: 0.980219	valid_1's auc: 0.962575
[50]	training's auc: 0.983948	valid_1's auc: 0.965518
[60]	training's auc: 0.98671	valid_1's auc: 0.96846
[70]	training's auc: 0.988697	valid_1's auc: 0.969916
[80]	training's auc: 0.990201	valid_1's auc: 0.970408
[90]	training's auc: 0.99154	valid_1's auc: 0.970967
[100]	training's auc: 0.992609	valid_1's auc: 0.971041
[110]	training's auc: 0.99352	valid_1's auc: 0.971244
[120]	training's auc: 0.994341	valid_1's auc: 0.971235
[130]	training's auc: 0.994972	valid_1's auc: 0.971586
[140]	training's auc: 0.995591	valid_1's auc: 0.971679
severe_toxic
(159571, 100000)


  if np.issubdtype(mask.dtype, np.int):


(159571, 6352)
[10]	training's auc: 0.972672	valid_1's auc: 0.96486
[20]	training's auc: 0.990887	valid_1's auc: 0.985212
[30]	training's auc: 0.994264	valid_1's auc: 0.985903
[40]	training's auc: 0.996728	valid_1's auc: 0.984439
[50]	training's auc: 0.997903	valid_1's auc: 0.983392
obscene
(159571, 100000)


  if np.issubdtype(mask.dtype, np.int):


(159571, 13720)
[10]	training's auc: 0.980456	valid_1's auc: 0.981755
[20]	training's auc: 0.990982	valid_1's auc: 0.989378
[30]	training's auc: 0.994175	valid_1's auc: 0.991522
[40]	training's auc: 0.995721	valid_1's auc: 0.992936
[50]	training's auc: 0.996787	valid_1's auc: 0.99359
[60]	training's auc: 0.997499	valid_1's auc: 0.993388
[70]	training's auc: 0.998026	valid_1's auc: 0.993501
[80]	training's auc: 0.998412	valid_1's auc: 0.993769
threat
(159571, 100000)


  if np.issubdtype(mask.dtype, np.int):


(159571, 3286)
[10]	training's auc: 0.954414	valid_1's auc: 0.83758
[20]	training's auc: 0.994329	valid_1's auc: 0.935639
[30]	training's auc: 0.998239	valid_1's auc: 0.960796
[40]	training's auc: 0.999624	valid_1's auc: 0.980023
[50]	training's auc: 0.999916	valid_1's auc: 0.977887
[60]	training's auc: 0.999981	valid_1's auc: 0.978615
[70]	training's auc: 0.999996	valid_1's auc: 0.973916
[80]	training's auc: 0.999999	valid_1's auc: 0.981116
insult
(159571, 100000)


  if np.issubdtype(mask.dtype, np.int):


(159571, 16821)
[10]	training's auc: 0.955145	valid_1's auc: 0.946272
[20]	training's auc: 0.975689	valid_1's auc: 0.967746
[30]	training's auc: 0.984063	valid_1's auc: 0.97636
[40]	training's auc: 0.988461	valid_1's auc: 0.978564
[50]	training's auc: 0.99099	valid_1's auc: 0.979352
[60]	training's auc: 0.992611	valid_1's auc: 0.980395
[70]	training's auc: 0.993882	valid_1's auc: 0.980878
identity_hate
(159571, 100000)


  if np.issubdtype(mask.dtype, np.int):


(159571, 6929)
[10]	training's auc: 0.934593	valid_1's auc: 0.877853
[20]	training's auc: 0.979445	valid_1's auc: 0.936626
[30]	training's auc: 0.993146	valid_1's auc: 0.972944
[40]	training's auc: 0.996837	valid_1's auc: 0.978698
[50]	training's auc: 0.998357	valid_1's auc: 0.979557
[60]	training's auc: 0.999036	valid_1's auc: 0.98002
[70]	training's auc: 0.999456	valid_1's auc: 0.980195
[80]	training's auc: 0.999689	valid_1's auc: 0.979861


In [9]:
submission.to_csv('lgb_submission.csv', index=False)