In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import svm
from tqdm.notebook import tqdm
from sklearn.metrics import log_loss
from sklearn.metrics import precision_score, recall_score, accuracy_score

# Import data

In [7]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [8]:
train.head()
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Train the model and Predict

In [9]:
vec = TfidfVectorizer()
#vec = CountVectorizer(max_features = 100)
train_tokens = vec.fit_transform(train['comment_text'])
test_tokens = vec.transform(test['comment_text'])

In [10]:
from scipy.sparse import vstack
global_tokens = vstack([train_tokens, test_tokens])

In [11]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=100, n_iter=10)
svd.fit(global_tokens)
global_svd = svd.transform(global_tokens)

In [12]:
from sklearn.preprocessing import MaxAbsScaler
scaler = MaxAbsScaler()
scaler.fit(global_svd)
global_scaled = scaler.transform(global_svd)
train_scaled = global_scaled[:len(train)]
test_scaled = global_scaled[len(train):]

In [13]:
predict_on_test = np.zeros((len(test), len(labels)))
predict_on_train = np.zeros((len(train), len(labels)))
for idx, label in tqdm(enumerate(labels)):
    print("Training %s"%(label))
    m = LogisticRegression().fit(train_scaled, train[label])
    predict_on_test[:,idx] = m.predict_proba(test_scaled)[:,1]
    predict_on_train[:,idx] = m.predict_proba(train_scaled)[:,1]

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Training toxic
Training severe_toxic
Training obscene
Training threat
Training insult
Training identity_hate



# Evaluation

### Cross Validation

In [26]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import log_loss, roc_auc_score
def crossValidation(clf, X, y, n=5):
    cv = KFold(n_splits=n)
    scores = []
    i = 0
    y_pred  = []
    y_true = []
    # split the training data to training and validation data
    for train_index, valid_index in cv.split(X):
        i += 1
        X_tr, X_va, y_tr, y_va = X[train_index], X[valid_index], y.iloc[train_index], y.iloc[valid_index]
        clf.fit(X_tr, y_tr)
        y_pred_sub = clf.predict(X_va)
        newScore = clf.score(X_va, y_va)
        scores.append(newScore)
        newLogLoss = log_loss(y_va, y_pred_sub)
        newROCAUC = roc_auc_score(y_va, y_pred_sub)
        print("loop %d, accuracy %0.6f, logloss %0.6f, roc_auc_score %0.6f" % (i, newScore, newLogLoss, newROCAUC))
        # preserve one pair of y_true and y_pred
        if i == 1:
            y_pred = y_pred_sub
            y_true = y_va
    scores_array = np.asarray(scores)
    print("Accuracy: %0.6f (+/- %0.6f)" % (scores_array.mean(), scores_array.std() * 2))
    print()
    
    return y_true, y_pred

In [27]:
y_pred = []
y_true = []
for i, label in tqdm(enumerate(labels)):
    print("----- evaluating %s -----" % label)
    m = LogisticRegression()
    y_true_sub, y_pred_sub = crossValidation(m, train_scaled, train[label])
    y_true.append(y_true_sub)
    y_pred.append(y_pred_sub)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

----- evaluating toxic -----
loop 1, accuracy 0.945825, logloss 1.871149, roc_auc_score 0.756012
loop 2, accuracy 0.945134, logloss 1.895018, roc_auc_score 0.750223
loop 3, accuracy 0.945572, logloss 1.879866, roc_auc_score 0.749606
loop 4, accuracy 0.946763, logloss 1.838741, roc_auc_score 0.755740
loop 5, accuracy 0.946795, logloss 1.837658, roc_auc_score 0.754921
Accuracy: 0.946018 (+/- 0.001319)

----- evaluating severe_toxic -----
loop 1, accuracy 0.989660, logloss 0.357131, roc_auc_score 0.599018
loop 2, accuracy 0.991164, logloss 0.305195, roc_auc_score 0.623302
loop 3, accuracy 0.990130, logloss 0.340909, roc_auc_score 0.608910
loop 4, accuracy 0.990192, logloss 0.338744, roc_auc_score 0.610146
loop 5, accuracy 0.990098, logloss 0.341991, roc_auc_score 0.606578
Accuracy: 0.990249 (+/- 0.000989)

----- evaluating obscene -----
loop 1, accuracy 0.974526, logloss 0.879841, roc_auc_score 0.792743
loop 2, accuracy 0.974525, logloss 0.879868, roc_auc_score 0.786059
loop 3, accuracy 0

# Output submission.csv

In [49]:
out = pd.concat([test["id"], pd.DataFrame(predict_on_test)], axis=1, ignore_index = True)
output_columns = ['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
out.columns = output_columns
submission_file = open("../output/submission_logistic.csv", "w")
out.to_csv('../output/submission_logistic.csv', index = False)
submission_file.close()

Score on Kaggle = 0.94415