### Support Vector Classifier (SVC) is our baseline approach. In this notebook, we first implement SVC with scikit-learn, and then improve the model by balancing the class weights.

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import svm
from tqdm.notebook import tqdm
from sklearn.metrics import log_loss
from sklearn.metrics import precision_score, recall_score, accuracy_score

## Import data

In [2]:
# Read the train and test splits from the shared data folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

In [3]:
# Names of the target columns in `train`; one classifier is fit per label below.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# Preview the first rows. A notebook only renders the value of a cell's last
# expression, so the preview has to come after the assignment to be shown.
train.head()

In [4]:
# (rows, columns) of the training frame.
train.shape

(159571, 8)

In [5]:
# Number of training rows; the same count is used later to split the stacked
# train+test feature matrix back into its two parts.
len(train)

159571

## Train the model and Predict

In [6]:
# Fit the TF-IDF vocabulary on the training comments, then encode the test
# comments with that same fitted vectorizer.
# Alternative left disabled: CountVectorizer(max_features = 100)
vec = TfidfVectorizer()
train_tokens, test_tokens = (
    vec.fit_transform(train['comment_text']),
    vec.transform(test['comment_text']),
)

In [7]:
from scipy.sparse import vstack
# Stack the train rows on top of the test rows so that both splits share a
# single matrix for the dimensionality-reduction step.
global_tokens = vstack((train_tokens, test_tokens))
type(global_tokens)

scipy.sparse.csr.csr_matrix

In [8]:
from sklearn.decomposition import TruncatedSVD
# Reduce the sparse TF-IDF matrix to 20 dense components.
# TruncatedSVD's default solver is randomized, so the seed is pinned; without
# it the components (and every downstream score) change from run to run.
# NOTE(review): the projection is fit on train and test rows together
# (global_tokens), so the test features influence the learned components.
svd = TruncatedSVD(n_components=20, random_state=42)
svd.fit(global_tokens)
global_svd = svd.transform(global_tokens)
global_svd

array([[ 0.14655425, -0.00943179,  0.06249038, ...,  0.04152141,
         0.03615843, -0.07199262],
       [ 0.04475741,  0.00396638,  0.01562694, ..., -0.01601854,
        -0.00874285,  0.08677488],
       [ 0.29084021, -0.03439296, -0.02764026, ...,  0.01745265,
         0.0749654 ,  0.01187452],
       ...,
       [ 0.1502339 , -0.02335908, -0.02185975, ..., -0.0210672 ,
        -0.0533792 , -0.01538401],
       [ 0.20815742, -0.07998206, -0.00303758, ..., -0.02195973,
         0.02791193,  0.06944285],
       [ 0.20122179,  0.10389378, -0.07572752, ..., -0.0202595 ,
        -0.00221558, -0.01148191]])

In [9]:
# Undo the earlier stacking: the first len(train) rows belong to the training
# split and the remaining rows to the test split.
train_svd, test_svd = np.split(global_svd, [len(train)])

In [11]:
# Fit one independent SVC per label on the SVD features and store, for both
# splits, the probability from column 1 of predict_proba.
predict_on_test = np.zeros((len(test), len(labels)))
predict_on_train = np.zeros((len(train), len(labels)))
# Wrap `labels` itself rather than the enumerate object: enumerate has no
# length, so tqdm could only show an indeterminate bar without a total.
for idx, label in enumerate(tqdm(labels)):
    print("Training %s"%(label))
    # probability=True adds an internal, randomized calibration step; pinning
    # random_state makes the predicted probabilities reproducible.
    m = svm.SVC(probability=True, random_state=42).fit(train_svd, train[label])
    # Column 1 corresponds to m.classes_[1]
    # (assumed to be the positive label 1 - TODO confirm the labels are 0/1).
    predict_on_test[:,idx] = m.predict_proba(test_svd)[:,1]
    predict_on_train[:,idx] = m.predict_proba(train_svd)[:,1]

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Training toxic
Training severe_toxic
Training obscene
Training threat
Training insult
Training identity_hate



In [12]:
# Peek at the predicted probabilities: one row per test comment, one column per label.
predict_on_test

array([[1.        , 0.01036816, 0.99999932, 0.00208316, 0.92923518,
        0.01385652],
       [0.05755422, 0.01017601, 0.0165532 , 0.00276266, 0.03387279,
        0.00860415],
       [0.03701674, 0.01059011, 0.01638785, 0.00220871, 0.03300015,
        0.01107108],
       ...,
       [0.02440495, 0.01006069, 0.00790497, 0.00273203, 0.03440999,
        0.01002481],
       [0.0856967 , 0.01027075, 0.03652905, 0.00227341, 0.05930911,
        0.01080503],
       [0.10860994, 0.01014905, 0.00768946, 0.00261936, 0.01947976,
        0.01046815]])

## Evaluation

In [20]:
from sklearn.metrics import log_loss, accuracy_score, roc_auc_score
# Report log loss and ROC AUC for every label.
# NOTE: both metrics are computed on the same training rows the models were
# fit on, so they are optimistic and do not estimate held-out performance.
# tqdm wraps `labels` directly so that it knows the total number of iterations.
for idx, label in enumerate(tqdm(labels)):
    print("%s class"%(label))
    # Predicted probabilities for this label (one column of the train matrix).
    label_probs = predict_on_train[:,idx]
    loss_score = log_loss(train[label], label_probs)
    score = roc_auc_score(train[label], label_probs)
    print("Log loss: %.5f"%(loss_score))
    print("ROC_AUC_score: %.5f"%(score))
    print("")

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

toxic class
Log loss: 0.19116
ROC_AUC_score: 0.89167

severe_toxic class
Log loss: 0.05195
ROC_AUC_score: 0.94475

obscene class
Log loss: 0.10859
ROC_AUC_score: 0.92721

threat class
Log loss: 0.02164
ROC_AUC_score: 0.02533

insult class
Log loss: 0.13186
ROC_AUC_score: 0.90078

identity_hate class
Log loss: 0.04771
ROC_AUC_score: 0.95326




## Output csv file

In [22]:
# Assemble the submission frame: the test ids followed by one probability
# column per label.
# NOTE(review): concat along axis=1 aligns on the index, so this relies on
# `test` carrying the default 0..n-1 index - confirm if `test` is ever filtered.
out = pd.concat([test["id"], pd.DataFrame(predict_on_test)], axis=1, ignore_index = True)
output_columns = ['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
out.columns = output_columns
# Pass the path straight to to_csv, which opens, writes and closes the file
# itself. Holding a second open() handle on the same path adds nothing and
# can cause a sharing conflict on some platforms.
out.to_csv('../output/submission_svc_imbalanced.csv', index = False)

#### Kaggle score = 0.73786

# Improvement

## Balance the weight between classes

In [10]:
# Same per-label SVC as the baseline, but with class_weight='balanced' so the
# classes are re-weighted when fitting. The probability from column 1 of
# predict_proba is stored for both splits.
predict_on_test = np.zeros((len(test), len(labels)))
predict_on_train = np.zeros((len(train), len(labels)))
# Wrap `labels` itself rather than the enumerate object: enumerate has no
# length, so tqdm could only show an indeterminate bar without a total.
for idx, label in enumerate(tqdm(labels)):
    print("Training %s"%(label))
    # probability=True adds an internal, randomized calibration step; pinning
    # random_state makes the predicted probabilities reproducible.
    m = svm.SVC(probability=True, class_weight='balanced', random_state=42).fit(train_svd, train[label])
    # Column 1 corresponds to m.classes_[1]
    # (assumed to be the positive label 1 - TODO confirm the labels are 0/1).
    predict_on_test[:,idx] = m.predict_proba(test_svd)[:,1]
    predict_on_train[:,idx] = m.predict_proba(train_svd)[:,1]

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Training toxic
Training severe_toxic
Training obscene
Training threat
Training insult
Training identity_hate



In [11]:
from sklearn.metrics import log_loss, accuracy_score, roc_auc_score
# Report log loss and ROC AUC for every label of the class-weighted models.
# NOTE: both metrics are computed on the same training rows the models were
# fit on, so they are optimistic and do not estimate held-out performance.
# tqdm wraps `labels` directly so that it knows the total number of iterations.
for idx, label in enumerate(tqdm(labels)):
    print("%s class"%(label))
    # Predicted probabilities for this label (one column of the train matrix).
    label_probs = predict_on_train[:,idx]
    loss_score = log_loss(train[label], label_probs)
    score = roc_auc_score(train[label], label_probs)
    print("Log loss: %.5f"%(loss_score))
    print("ROC_AUC_score: %.5f"%(score))
    print("")

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

toxic class
Log loss: 0.17157
ROC_AUC_score: 0.93789

severe_toxic class
Log loss: 0.03519
ROC_AUC_score: 0.98279

obscene class
Log loss: 0.09680
ROC_AUC_score: 0.96701

threat class
Log loss: 0.01693
ROC_AUC_score: 0.98425

insult class
Log loss: 0.11012
ROC_AUC_score: 0.95741

identity_hate class
Log loss: 0.04230
ROC_AUC_score: 0.96166




In [13]:
# Assemble the submission frame: the test ids followed by one probability
# column per label.
# NOTE(review): concat along axis=1 aligns on the index, so this relies on
# `test` carrying the default 0..n-1 index - confirm if `test` is ever filtered.
out = pd.concat([test["id"], pd.DataFrame(predict_on_test)], axis=1, ignore_index = True)
output_columns = ['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
out.columns = output_columns
# Pass the path straight to to_csv, which opens, writes and closes the file
# itself. Holding a second open() handle on the same path adds nothing and
# can cause a sharing conflict on some platforms.
out.to_csv('../output/submission_svc_balanced.csv', index = False)

#### Kaggle score = 0.91595