# Toxic Comment Classification

**Burak Suyunu**

**submitted to Prof. Ethem Alpaydın**

# Logistic Regression Notebook

In [38]:
import numpy as np
import pandas as pd
import gc
import time
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack


In [2]:
# Label columns of the multi-label task, in the order used for every
# label matrix built later in the notebook.
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Train/test comment tables (read from the working directory) and the
# test-set labels (read from the data/ sub-directory).
train, test = (pd.read_csv(csv_path) for csv_path in ("train_clean.csv", "test_clean.csv"))
test_labels = pd.read_csv("data/test_labels.csv")


In [3]:
# Join the test comments with their labels on `id`, keep only rows whose
# `toxic` value is greater than -1 (rows with a missing label fail the
# comparison and are dropped as well), then renumber the rows from 0.
# NOTE(review): -1 presumably marks test rows that were never scored -- confirm
# against the dataset description.
test_all = (
    pd.merge(test, test_labels, on='id', how='outer')
    .loc[lambda merged: merged.toxic > -1]
    .reset_index(drop=True)
)
test_all.head()

Unnamed: 0.1,Unnamed: 0,id,comment_text,comment_text_clean,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,5,0001ea8717f6de06,Thank you for understanding. I think very high...,thank understand think highly would revert wit...,0,0,0,0,0,0
1,7,000247e83dcc1211,:Dear god this site is horrible.,dear god site horrible,0,0,0,0,0,0
2,11,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",somebody invariably try add religion really me...,0,0,0,0,0,0
3,13,0003e1cccfd5a40a,""" \r\n\r\n It says it right there that it IS a...",say right type type institution need case thre...,0,0,0,0,0,0
4,14,00059ace3e3e9a53,""" \r\n\r\n == Before adding a new product to t...",add new product list make sure relevant add ne...,0,0,0,0,0,0


In [4]:
# Working aliases for the frames used below: the full training table and the
# labelled subset of the test table. No copy is made.
train_s, test_s = train, test_all

In [5]:
# Model inputs: the cleaned comment text as a NumPy array of strings. Missing
# comments are replaced by the literal placeholder token "fillna".
X_train, X_test = (
    frame["comment_text_clean"].fillna("fillna").values
    for frame in (train_s, test_s)
)

# Targets: one column per entry of `class_names`, in that order.
y_train, y_test = (frame[class_names].values for frame in (train_s, test_s))

In [30]:
# Word-level TF-IDF features: unigrams and bigrams, vocabulary capped at 30000
# terms. The vocabulary and IDF weights are learned on the training text only
# and then applied unchanged to the test text.
print("Extracting tf-idf features for Words (ngram=(1,2))")

word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',   # any run of one or more word characters is a token
    stop_words='english',
    ngram_range=(1, 2),
    strip_accents='unicode',
    sublinear_tf=True,         # use 1 + log(tf) instead of the raw term count
    max_features=30000,
)

t0 = time.time()
train_word_features = word_vectorizer.fit_transform(X_train)
t1 = time.time()
fit_seconds = t1 - t0
print("fit to train in %0.2fs." % fit_seconds)

test_word_features = word_vectorizer.transform(X_test)
t2 = time.time()
transform_seconds = t2 - t1
print("transform to test in %0.2fs." % transform_seconds)

total_seconds = t2 - t0
print("done in %0.2fs." % total_seconds)

Extracting tf-idf features for Words (ngram=(1,2))
fit to train in 19.88s.
transform to test in 3.37s.
done in 23.25s.


In [31]:
# Character-level TF-IDF features: character n-grams of length 2 to 4,
# vocabulary capped at 40000 n-grams. The vocabulary and IDF weights are
# learned on the training text only and then applied unchanged to the test text.
print("Extracting tf-idf features for Charactes (ngram=(2,4))")

# `stop_words` is intentionally not passed here: scikit-learn applies stop-word
# filtering only when analyzer == 'word', so with the character analyzer the
# argument has no effect on the features (and newer releases warn about it).
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,         # use 1 + log(tf) instead of the raw n-gram count
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(2, 4),
    max_features=40000)

t0 = time.time()
train_char_features = char_vectorizer.fit_transform(X_train)
t1 = time.time()
print("fit to train in %0.2fs." % (t1 - t0))

test_char_features = char_vectorizer.transform(X_test)
t2 = time.time()
print("transform to test in %0.2fs." % (t2 - t1))

print("done in %0.2fs." % (t2 - t0))

Extracting tf-idf features for Charactes (ngram=(2,4))
fit to train in 64.33s.
transform to test in 24.29s.
done in 88.62s.


In [32]:
# Concatenate the character and word TF-IDF blocks column-wise (character
# columns first). scipy's hstack returns a COO matrix by default, which cannot
# be row-sliced, so scikit-learn would have to convert it again on every
# cross-validation and fit call below. Requesting CSR once here avoids those
# repeated conversions without changing any value in the matrices.
train_features = hstack([train_char_features, train_word_features], format='csr')
test_features = hstack([test_char_features, test_word_features], format='csr')

In [33]:
# 3-fold cross-validated ROC AUC on the training set, one independent binary
# logistic-regression model per label. `scores` collects the mean AUC of each
# label in `class_names` order.
scores = []
for class_index, class_name in enumerate(class_names):
    # dual=True is only implemented by the liblinear solver (with an L2
    # penalty). The solver is named explicitly because the library default
    # changed to 'lbfgs' in scikit-learn 0.22, where dual=True raises a
    # ValueError. random_state pins liblinear's internal data shuffling so the
    # scores are repeatable across runs.
    classifier = LogisticRegression(C=5, dual=True, solver='liblinear', random_state=0)

    fold_scores = cross_val_score(
        classifier,
        train_features,
        y_train[:, class_index],
        cv=3,
        scoring='roc_auc',
    )
    cv_score = np.mean(fold_scores)
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

CV score for class toxic is 0.9717208624743511
CV score for class severe_toxic is 0.9842183750245631
CV score for class obscene is 0.9876476160265278
CV score for class threat is 0.9802703010386672
CV score for class insult is 0.9758391859759626
CV score for class identity_hate is 0.9777837975976467


In [34]:
# Fit one model per label on the full training set and store the predicted
# probability of the positive class for every labelled test comment.
# `submission` ends up with the test ids plus one probability column per label.
submission = pd.DataFrame.from_dict({'id': test_s['id']})
for i, class_name in enumerate(class_names):
    # Build the estimator in this cell instead of reusing whatever `classifier`
    # object an earlier cell happened to leave in the kernel, so the cell does
    # not depend on hidden state. Hyper-parameters match the ones used for
    # cross-validation; liblinear is named explicitly because dual=True is not
    # supported by the default solver of scikit-learn >= 0.22.
    classifier = LogisticRegression(C=5, dual=True, solver='liblinear', random_state=0)
    classifier.fit(train_features, y_train[:, i])
    # Column 1 of predict_proba is the probability of the positive class.
    submission[class_name] = classifier.predict_proba(test_features)[:, 1]
    print('{} done'.format(class_name))

toxic done
severe_toxic done
obscene done
threat done
insult done
identity_hate done


In [35]:
# Predicted probabilities as a plain array, columns ordered as in `class_names`
# (the `id` column is left out).
pred = submission.loc[:, class_names].values

In [36]:
# ROC AUC on the labelled test comments, combined over the label columns with
# roc_auc_score's default averaging.
roc_auc_score(y_true=y_test, y_score=pred)

0.9756365570562879

In [39]:
#pickle.dump( pred, open( "logistic_pred.p", "wb" ) )