In [1]:
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [None]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack, vstack

In [3]:
%%time
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train = pd.read_csv('../input/train.csv.zip').fillna(' ')
test = pd.read_csv('../input/test.csv.zip').fillna(' ')

train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])

CPU times: user 1.68 s, sys: 170 ms, total: 1.85 s
Wall time: 1.85 s


In [4]:
%%time
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000)
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

CPU times: user 21.4 s, sys: 141 ms, total: 21.5 s
Wall time: 21.6 s


In [5]:
%%time
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(2, 6),
    max_features=50000)
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)



CPU times: user 10min 25s, sys: 7.04 s, total: 10min 32s
Wall time: 10min 33s


In [6]:
train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])

In [7]:
%%time
scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})
for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(solver='sag')

    cv_score = np.mean(cross_val_score(classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(train_features, train_target)
    submission[class_name] = classifier.predict_proba(test_features)[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))

CV score for class toxic is 0.9783285513093992
CV score for class severe_toxic is 0.9885276555375122
CV score for class obscene is 0.990128534885006
CV score for class threat is 0.989654551343866
CV score for class insult is 0.982625117960518
CV score for class identity_hate is 0.9823768625964755
Total CV score is 0.9852735456054629
CPU times: user 10min 38s, sys: 4.13 s, total: 10min 43s
Wall time: 10min 43s


In [10]:
train[class_names].shape

(159571, 6)

In [11]:
train_oof = np.zeros(train[class_names].shape)

In [26]:
train[class_names].values[list(range(67)),0]

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0])

In [22]:
kf = KFold(n_splits=3, random_state=137, shuffle=True)

In [32]:
%%time
kf = KFold(n_splits=3, random_state=137, shuffle=True)
for ii in range(6):
    print("Fitting target", ii+1)
    for jj, (train_index, val_index) in enumerate(kf.split(train_features)):
        print("Fitting fold", jj+1)
        train_x = train_features[train_index]
        val_x = train_features[val_index]
        train_target = train[class_names].values[train_index,ii]
        classifier = LogisticRegression(solver='sag')
        classifier.fit(train_x, train_target)
        train_oof[val_index, ii] = classifier.predict_proba(val_x)[:,1]

Fitting target 1
Fitting fold 1
Fitting fold 2
Fitting fold 3
Fitting target 2
Fitting fold 1
Fitting fold 2
Fitting fold 3
Fitting target 3
Fitting fold 1
Fitting fold 2
Fitting fold 3
Fitting target 4
Fitting fold 1
Fitting fold 2
Fitting fold 3
Fitting target 5
Fitting fold 1
Fitting fold 2
Fitting fold 3
Fitting target 6
Fitting fold 1
Fitting fold 2
Fitting fold 3
CPU times: user 6min 36s, sys: 4.07 s, total: 6min 40s
Wall time: 6min 41s


In [36]:
for ii in range(6):
    print(roc_auc_score(train[class_names].values[:,ii], train_oof[:,ii]))

0.9784132615908256
0.9881219309055794
0.9902337988079097
0.9898252126259562
0.9823175830650794
0.9827057819292789


In [37]:
%%time
for ii in range(6):
    print("Fitting target", ii+1)
    for jj, (train_index, val_index) in enumerate(kf.split(train_features)):
        print("Fitting fold", jj+1)
        train_x = train_features[train_index]
        val_x = train_features[val_index]
        train_target = train[class_names].values[train_index,ii]
        classifier = LogisticRegression()
        classifier.fit(train_x, train_target)
        train_oof[val_index, ii] = classifier.predict_proba(val_x)[:,1]
    print(roc_auc_score(train[class_names].values[:,ii], train_oof[:,ii]))

Fitting target 1
Fitting fold 1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fitting fold 2
Fitting fold 3


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9784124598949604
Fitting target 2
Fitting fold 1
Fitting fold 2
Fitting fold 3
0.9881218674063899
Fitting target 3
Fitting fold 1
Fitting fold 2
Fitting fold 3
0.99023357481564
Fitting target 4
Fitting fold 1
Fitting fold 2
Fitting fold 3
0.9898250153781003
Fitting target 5
Fitting fold 1
Fitting fold 2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fitting fold 3
0.9823175713485555
Fitting target 6
Fitting fold 1
Fitting fold 2
Fitting fold 3
0.9827052869315238
CPU times: user 38min 22s, sys: 1h 31min 30s, total: 2h 9min 53s
Wall time: 10min 21s
