In [1]:
import numpy as np
import pandas as pd

%matplotlib notebook

X = pd.read_csv('train.csv')
X.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [2]:
null_text = X.comment_text[2]

In [3]:
X.shape

(95851, 8)

In [4]:
X.isnull().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [5]:
y = X[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]
try:
    X.drop(['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], axis=1, inplace=True)
except:
    pass

In [6]:
import re
import nltk

stop_words = set(nltk.corpus.stopwords.words('english'))

def preprocess_input(t):
    t = t.strip()
    z = re.findall(r'[A-Za-z]+', t)
    z = [a for a in z if len(a) > 2]
    wnlemma = nltk.stem.WordNetLemmatizer()
    z = [wnlemma.lemmatize(a) for a in z]
    z = [a for a in z if not a in stop_words]
    t = ' '.join(z)
    return t

In [7]:
preprocess_input(null_text)

'Points interest removed point interest section added seemed kind spammy know probably mean disobey rule generally point interest tends rather touristy quite irrelevant area culture That opinion though want reply put reply add talkback Jamiegraham talkpage'

In [8]:
X.comment_text = X.comment_text.apply(lambda x: preprocess_input(x))

In [9]:
X.head()

Unnamed: 0,comment_text
0,Nonsense kiss geek said true account terminated
1,Please vandalize page edit Merwin continue blo...
2,Points interest removed point interest section...
3,Asking nationality Racial offence Wow aware Bl...
4,The reader going say ethereal vocal style dark...


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize

vect = TfidfVectorizer(min_df=3, max_df=0.8, 
                       ngram_range=(1, 2),
                       strip_accents='unicode',
                       smooth_idf=True,
                       sublinear_tf=True,
                       )
vect = vect.fit(X['comment_text'])
X_vect = vect.transform(X['comment_text'])

In [11]:
X_vect.shape

(95851, 191797)

In [12]:
test = pd.read_csv('test.csv')
test.fillna(value=null_text, inplace=True)
test.head()

Unnamed: 0,id,comment_text
0,6044863,==Orphaned non-free media (Image:41cD1jboEvL. ...
1,6102620,::Kentuckiana is colloquial. Even though the ...
2,14563293,"Hello fellow Wikipedians,\nI have just modifie..."
3,21086297,"AKC Suspensions \nThe Morning Call - Feb 24, 2..."
4,22982444,== [WIKI_LINK: Talk:Celts] ==


In [13]:
t_id = test['id']
test.drop(['id'], axis=1, inplace=True)
test.comment_text = test.comment_text.apply(lambda z: preprocess_input(z))

In [14]:
test.head()

Unnamed: 0,comment_text
0,Orphaned non free medium Image jboEvL jpg
1,Kentuckiana colloquial Even though area often ...
2,Hello fellow Wikipedians modified WIKI LINK Do...
3,AKC Suspensions The Morning Call Feb
4,WIKI LINK Talk Celts


In [15]:
len(test)

226998

In [16]:
X_test = vect.transform(test['comment_text'])

In [17]:
X_test.shape

(226998, 191797)

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
y_pred = pd.read_csv('sample_submission.csv')

for c in cols:
    clf = LogisticRegression(C=4, solver='sag')
    clf.fit(X_vect, y[c])
    y_pred[c] = clf.predict_proba(X_test)[:,1]
    pred_train = clf.predict_proba(X_vect)[:,1]
    print('log loss:', log_loss(y[c], pred_train))

log loss: 0.0619289767302
log loss: 0.0151965734064
log loss: 0.03591566509
log loss: 0.00579993744213
log loss: 0.0438860286451
log loss: 0.0133077122225


In [19]:
y_pred.to_csv("my_submission.csv", index=False)