In [1]:
import numpy as np
import pandas as pd

%matplotlib notebook

train = pd.read_csv('train.csv')

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [2]:
X.shape

(159571, 8)

In [3]:
X.isnull().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [4]:
y = X[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]
try:
    X.drop(['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], axis=1, inplace=True)
except:
    pass

In [5]:
import re
import nltk

stop_words = set(nltk.corpus.stopwords.words('english'))

def preprocess_input(t):
    t = t.strip()
    z = re.findall(r'[A-Za-z]+', t)
    z = [a for a in z if len(a) > 2]
    wnlemma = nltk.stem.WordNetLemmatizer()
    z = [wnlemma.lemmatize(a) for a in z]
    z = [a for a in z if not a in stop_words]
    t = ' '.join(z)
    return t

In [6]:
X.comment_text = X.comment_text.apply(lambda x: preprocess_input(x))

In [7]:
X.head()

Unnamed: 0,comment_text
0,Explanation Why edits made username Hardcore M...
1,aww match background colour seemingly stuck Th...
2,Hey man really trying edit war guy constantly ...
3,More make real suggestion improvement wondered...
4,You sir hero Any chance remember page


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize

vect = TfidfVectorizer(min_df=3, max_df=0.8, 
                       ngram_range=(1, 2),
                       strip_accents='unicode',
                       smooth_idf=True,
                       sublinear_tf=True,
                       )
vect = vect.fit(X['comment_text'])
X_vect = vect.transform(X['comment_text'])

In [9]:
X_vect.shape

(159571, 320116)

In [10]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [11]:
test.fillna(value="This is just a normal comment about nothing to fill up the null value", inplace=True)

In [12]:
t_id = test['id']
test.drop(['id'], axis=1, inplace=True)
test.comment_text = test.comment_text.apply(lambda z: preprocess_input(z))

In [13]:
test.head()

Unnamed: 0,comment_text
0,bitch Rule succesful ever whats hating sad mof...
1,From RfC The title fine IMO
2,Sources Zawe Ashton Lapland
3,look back source information updated wa correc...
4,anonymously edit article


In [14]:
len(test)

153164

In [15]:
X_test = vect.transform(test['comment_text'])

In [16]:
X_test.shape

(153164, 320116)

In [17]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss

cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

y_pred = pd.read_csv('sample_submission.csv')

for c in cols:
    svm = LinearSVC()
    clf = CalibratedClassifierCV(svm) 
    clf.fit(X_vect, y[c])
    y_pred[c] = clf.predict_proba(X_test)[:,1]
    pred_train = clf.predict_proba(X_vect)[:,1]
    print(c, '--> log loss:', log_loss(y[c], pred_train))

toxic --> log loss: 0.0433591898521
severe_toxic --> log loss: 0.0119479320078
obscene --> log loss: 0.0244105414996
threat --> log loss: 0.00283070815917
insult --> log loss: 0.0334258256421
identity_hate --> log loss: 0.00868426684587


In [18]:
y_pred.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999804,0.109774,0.999278,0.007537,0.96857,0.128082
1,0000247867823ef7,0.007592,0.004213,0.002417,0.000832,0.007292,0.004262
2,00013b17ad220c46,0.042869,0.004211,0.012184,0.001001,0.014913,0.003706
3,00017563c3f7919a,0.002679,0.00415,0.004459,0.000663,0.005148,0.000978
4,00017695ad8997eb,0.005419,0.005583,0.004495,0.001617,0.01674,0.002575


In [19]:
y_pred.to_csv('my_submission.csv', index=False)