In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack
import xgboost as xgb
xgb.__version__

'1.6.2'

In [2]:
%%time
# The six label columns, in the order used for every target / prediction array below.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Read both splits with the same recipe; missing values are replaced by a single space.
train, test = (
    pd.read_csv(csv_path).fillna(' ')
    for csv_path in ('../input/train.csv.zip', '../input/test.csv.zip')
)

# Raw comment strings per split, plus their concatenation (train first, then test).
train_text, test_text = train['comment_text'], test['comment_text']
all_text = pd.concat([train_text, test_text])

CPU times: user 1.69 s, sys: 179 ms, total: 1.87 s
Wall time: 1.87 s


In [3]:
%%time
# Word-level TF-IDF: unigrams only, vocabulary capped at 10,000 terms and learned
# from the train and test comments together.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    stop_words='english',
    strip_accents='unicode',
    sublinear_tf=True,
    max_features=10000,
).fit(all_text)

# Project each split onto the shared vocabulary.
train_word_features, test_word_features = (
    word_vectorizer.transform(split_text) for split_text in (train_text, test_text)
)

CPU times: user 21.5 s, sys: 148 ms, total: 21.7 s
Wall time: 21.7 s


In [4]:
%%time
# Character-level TF-IDF over 2- to 6-character n-grams, vocabulary capped at
# 50,000 features and learned from the train and test comments together.
#
# NOTE: `stop_words` is deliberately not passed here. scikit-learn applies
# stop-word filtering only when analyzer='word'; with analyzer='char' the
# argument is ignored (recent versions emit an "unused parameter" warning),
# so dropping it leaves the extracted features unchanged.
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(2, 6),
    max_features=50000)
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)



CPU times: user 10min 37s, sys: 7 s, total: 10min 44s
Wall time: 10min 45s


In [5]:
# Join the char and word TF-IDF blocks column-wise (char columns first).
# format='csr' is requested explicitly because the CV cells select rows with
# `train_features[index_array]`, which needs a row-indexable sparse format;
# COO, for example, cannot be indexed that way.
train_features = hstack([train_char_features, train_word_features], format='csr')
test_features = hstack([test_char_features, test_word_features], format='csr')

In [8]:
%%time
# Out-of-fold (OOF) predictions from one independent XGBoost model per label.
# For every label a fresh classifier is fitted on each fold's training rows and
# its probabilities are written into the held-out rows of `train_oof`; once all
# folds of a label are done, the ROC AUC of that OOF column is printed.

# Label matrix built once (rows = comments, columns follow class_names) instead
# of re-deriving it from the DataFrame on every fold and every scoring call.
label_matrix = train[class_names].values
train_oof = np.zeros(label_matrix.shape)
kf = KFold(n_splits=3, random_state=137, shuffle=True)
# Number of targets is taken from class_names rather than a hard-coded 6.
for ii in range(len(class_names)):
    print("Fitting target", ii+1)
    for jj, (train_index, val_index) in enumerate(kf.split(train_features)):
        print("Fitting fold", jj+1)
        train_x = train_features[train_index]
        val_x = train_features[val_index]
        train_target = label_matrix[train_index, ii]
        classifier = xgb.XGBClassifier(tree_method='hist')
        classifier.fit(train_x, train_target)
        # Column 1 of predict_proba is the probability of the second class (label 1).
        train_oof[val_index, ii] = classifier.predict_proba(val_x)[:, 1]
    print(roc_auc_score(label_matrix[:, ii], train_oof[:, ii]))

Fitting target 1
Fitting fold 1
Fitting fold 2
Fitting fold 3
0.9673027369246964
Fitting target 2
Fitting fold 1
Fitting fold 2
Fitting fold 3
0.9816544531267241
Fitting target 3
Fitting fold 1
Fitting fold 2
Fitting fold 3
0.9887388266253295
Fitting target 4
Fitting fold 1
Fitting fold 2
Fitting fold 3
0.9756780178073787
Fitting target 5
Fitting fold 1
Fitting fold 2
Fitting fold 3
0.9752410557231539
Fitting target 6
Fitting fold 1
Fitting fold 2
Fitting fold 3
0.9736121939187006
CPU times: user 1d 4h 22min 38s, sys: 11min 4s, total: 1d 4h 33min 43s
Wall time: 1h 21min 15s


In [9]:
# Scratch: label rows for one set of fold training indices, all label columns at once.
# `train_index` is not defined in this cell; it is whatever the most recently
# executed CV loop left bound.
train_target = train[class_names].values[train_index, :]

In [11]:
# Display (n_selected_rows, n_label_columns) of the scratch target array.
np.shape(train_target)

(106381, 6)

In [12]:
# Zero-filled scratch array: one row per training comment, one column per label.
train_oof_2 = np.zeros((len(train), len(class_names)))

In [14]:
# Copy the selected label rows into the matching rows of the scratch array.
train_oof_2[train_index, :] = train_target

In [15]:
%%time
# Out-of-fold predictions from a single multi-output XGBoost model per fold:
# the classifier is fitted on the full 2-D label block and predict_proba fills
# every label column of the held-out rows at once.
# NOTE: this rebinds `train_oof`, replacing the array produced by the per-target loop.
train_oof = np.zeros((len(train), len(class_names)))
kf = KFold(n_splits=3, shuffle=True, random_state=137)

for jj, (train_index, val_index) in enumerate(kf.split(train_features)):
    print("Fitting fold", jj+1)
    train_x, val_x = train_features[train_index], train_features[val_index]
    train_target = train[class_names].values[train_index]
    classifier = xgb.XGBClassifier(tree_method='hist').fit(train_x, train_target)
    train_oof[val_index] = classifier.predict_proba(val_x)

Fitting fold 1
Fitting fold 2
Fitting fold 3
CPU times: user 1d 3h 58min 24s, sys: 6min 46s, total: 1d 4h 5min 10s
Wall time: 1h 17min 22s


In [16]:
# Per-label ROC AUC of the out-of-fold predictions, printed in class_names order.
# Iterating over class_names (instead of a hard-coded range(6)) keeps the report
# in step with the label list, and reading one column at a time avoids
# rebuilding the full label array on every iteration.
for ii, label_name in enumerate(class_names):
    print(roc_auc_score(train[label_name].values, train_oof[:, ii]))

0.9673027369246964
0.9816544531267241
0.9887388266253295
0.9756780178073787
0.9752410557231539
0.9736121939187006
