In [1]:
# Enable Intel(R) Extension for Scikit-learn. This cell runs before any
# `sklearn` import on purpose: the sklearnex documentation says to patch first
# and import scikit-learn estimators afterwards.
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack

In [3]:
%%time
# Names of the label columns that are selected from `train` further down.
class_names = [
    'toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate',
]

# Read both zipped CSVs the same way (train first, then test); every missing
# value is replaced by a single space so no NaN is left in either frame.
train, test = (
    pd.read_csv(csv_path).fillna(' ')
    for csv_path in ('../input/train.csv.zip', '../input/test.csv.zip')
)

# Raw comment strings, plus the combined train+test corpus.
train_text, test_text = train['comment_text'], test['comment_text']
all_text = pd.concat([train_text, test_text])

CPU times: user 1.64 s, sys: 149 ms, total: 1.79 s
Wall time: 1.79 s


In [4]:
%%time
# Word-level TF-IDF over unigrams. The vocabulary and IDF weights are learned
# from `all_text` (train and test together) and capped at 10000 features.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    strip_accents='unicode',
    ngram_range=(1, 1),
    sublinear_tf=True,
    max_features=10000,
).fit(all_text)

# Project each split onto the fitted vocabulary.
train_word_features, test_word_features = (
    word_vectorizer.transform(texts) for texts in (train_text, test_text)
)

CPU times: user 21.3 s, sys: 190 ms, total: 21.5 s
Wall time: 21.5 s


In [5]:
%%time
# Character-level TF-IDF over 2- to 6-character n-grams. The vocabulary and IDF
# weights are learned from `all_text` (train and test together) and capped at
# 50000 features.
#
# `stop_words` is deliberately not passed: scikit-learn only applies stop words
# when analyzer == 'word', so with analyzer='char' the argument has no effect
# on the features (newer versions emit a warning about the unused parameter).
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(2, 6),
    max_features=50000)
char_vectorizer.fit(all_text)

# Project each split onto the fitted character n-gram vocabulary.
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)



CPU times: user 10min 28s, sys: 6.75 s, total: 10min 35s
Wall time: 10min 36s


In [6]:
# Concatenate the character and word TF-IDF blocks column-wise (character
# columns first). CSR output is requested explicitly: without `format`, SciPy
# picks the result format itself, and later cells select rows with
# `train_features[index_array]`, which needs a row-indexable format.
train_features = hstack([train_char_features, train_word_features], format='csr')
test_features = hstack([test_char_features, test_word_features], format='csr')

In [7]:
%%time
# Out-of-fold (OOF) evaluation of a linear-kernel SVC, one binary model per
# label column. For every label, each of the 3 folds is predicted by a model
# trained on the other two, and the ROC AUC of the assembled OOF probabilities
# is printed.

# Label matrix, extracted once instead of on every fold.
label_matrix = train[class_names].values
# Row selection below needs a row-indexable sparse format; convert up front so
# the loop does not depend on the format `train_features` happens to have.
features_csr = train_features.tocsr()

train_oof = np.zeros(label_matrix.shape)
# Integer random_state: every kf.split(...) call below yields the same folds.
kf = KFold(n_splits=3, random_state=137, shuffle=True)
for ii in range(len(class_names)):
    print("Fitting target", ii+1)
    for jj, (train_index, val_index) in enumerate(kf.split(features_csr)):
        print("Fitting fold", jj+1)
        train_x = features_csr[train_index]
        val_x = features_csr[val_index]
        train_target = label_matrix[train_index, ii]
        # probability=True is required for predict_proba below.
        classifier = SVC(kernel='linear', probability=True)
        classifier.fit(train_x, train_target)
        # Column 1 of predict_proba is stored as the score for this label.
        train_oof[val_index, ii] = classifier.predict_proba(val_x)[:, 1]
    print(roc_auc_score(label_matrix[:, ii], train_oof[:, ii]))

Fitting target 1
Fitting fold 1
Fitting fold 2




Fitting fold 3




0.9645452339779474
Fitting target 2
Fitting fold 1
Fitting fold 2
Fitting fold 3
0.9550483522515939
Fitting target 3
Fitting fold 1
Fitting fold 2
Fitting fold 3
0.9755925859993532
Fitting target 4
Fitting fold 1
Fitting fold 2
Fitting fold 3
0.9574675631818415
Fitting target 5
Fitting fold 1
Fitting fold 2
Fitting fold 3
0.9621246782468851
Fitting target 6
Fitting fold 1
Fitting fold 2
Fitting fold 3
0.9416648655498348
CPU times: user 10h 54min 47s, sys: 3min 22s, total: 10h 58min 10s
Wall time: 2h 37min 40s


In [8]:
%%time
# Out-of-fold (OOF) evaluation of a linear-kernel SVC with C=0.1, one binary
# model per label column. For every label, each of the 3 folds is predicted by
# a model trained on the other two, and the ROC AUC of the assembled OOF
# probabilities is printed.
#
# NOTE(review): the recorded run of this cell stopped during target 1, fold 1
# with "ValueError: Column indices are larger than the maximum acceptable
# value". The traceback was not captured, so the failing call is unknown;
# investigate before relying on results from this cell.

# Label matrix, extracted once instead of on every fold.
label_matrix = train[class_names].values
# Row selection below needs a row-indexable sparse format; convert up front so
# the loop does not depend on the format `train_features` happens to have.
features_csr = train_features.tocsr()

train_oof = np.zeros(label_matrix.shape)
# Integer random_state: every kf.split(...) call below yields the same folds.
kf = KFold(n_splits=3, random_state=137, shuffle=True)
for ii in range(len(class_names)):
    print("Fitting target", ii+1)
    for jj, (train_index, val_index) in enumerate(kf.split(features_csr)):
        print("Fitting fold", jj+1)
        train_x = features_csr[train_index]
        val_x = features_csr[val_index]
        train_target = label_matrix[train_index, ii]
        # probability=True is required for predict_proba below.
        classifier = SVC(kernel='linear', probability=True, C=0.1)
        classifier.fit(train_x, train_target)
        # Column 1 of predict_proba is stored as the score for this label.
        train_oof[val_index, ii] = classifier.predict_proba(val_x)[:, 1]
    print(roc_auc_score(label_matrix[:, ii], train_oof[:, ii]))

Fitting target 1
Fitting fold 1


ValueError: Column indices are larger than the maximum acceptable value