In [1]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import word_tokenize, pos_tag, regexp_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer 
from sklearn.linear_model import LogisticRegression
import spacy
nlp = spacy.load("en_core_web_sm")

In [2]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [4]:
labels = list(train.columns[2:])
train['total'] = train[labels].sum(axis=1)

In [5]:
train['toxicity'] = train['total'].map(lambda total: 0 if total == 0 else 1)

In [6]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,total,toxicity
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0,0


In [7]:
train_label = train['toxicity']

In [8]:
test_labels_df = pd.read_csv("test_labels.csv")
test_labels_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1


In [9]:
test_labels_df['toxic'].value_counts()

-1    89186
 0    57888
 1     6090
Name: toxic, dtype: int64

In [10]:
test = pd.concat([test,test_labels_df],axis=1)
test.head()

Unnamed: 0,id,comment_text,id.1,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,":If you have a look back at the source, the in...",00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,I don't anonymously edit articles at all.,00017695ad8997eb,-1,-1,-1,-1,-1,-1


In [11]:
test[test['toxic']==-1].index

Int64Index([     0,      1,      2,      3,      4,      6,      8,      9,
                10,     12,
            ...
            153148, 153152, 153153, 153157, 153158, 153159, 153160, 153161,
            153162, 153163],
           dtype='int64', length=89186)

In [12]:
test = test.drop(test[test['toxic']==-1].index)

In [13]:
test.head()

Unnamed: 0,id,comment_text,id.1,toxic,severe_toxic,obscene,threat,insult,identity_hate
5,0001ea8717f6de06,Thank you for understanding. I think very high...,0001ea8717f6de06,0,0,0,0,0,0
7,000247e83dcc1211,:Dear god this site is horrible.,000247e83dcc1211,0,0,0,0,0,0
11,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0002f87b16116a7f,0,0,0,0,0,0
13,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0003e1cccfd5a40a,0,0,0,0,0,0
14,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",00059ace3e3e9a53,0,0,0,0,0,0


In [14]:
labels_test = list(test.columns[3:])
test['total'] = test[labels_test].sum(axis=1)
test['toxicity'] = test['total'].map(lambda total: 0 if total == 0 else 1)
test_label = test['toxicity']

In [15]:
def lemmatize(comment):
    lems = [token.lemma_ for token in comment if (token.lemma_ != '-PRON-' and token.is_stop==False and 
                                   token.is_punct==False and token.is_digit==False and token.is_space==False 
                                          and not token.lemma_.startswith("'"))]
    return lems

In [16]:
train_comments_with_spacy = map(nlp,train['comment_text'])
train_lems = map(lemmatize, train_comments_with_spacy)
train_lems_joined = map(' '.join, train_lems)

In [17]:
test_comments_with_spacy = map(nlp,test['comment_text'])
test_lems = map(lemmatize, test_comments_with_spacy)
test_lems_joined = map(' '.join, test_lems)

In [21]:
train_comments_with_spacy = [nlp(x) for x in train['comment_text']]

KeyboardInterrupt: 

In [20]:
tfidf_with_lems = TfidfVectorizer(stop_words='english', token_pattern="([a-zA-Z]+(?:'[a-z]+)?)")
tfidf_with_lems_train_data = tfidf_with_lems.fit_transform(train_lems)
tfidf_with_lems_test_data = tfidf_with_lems.transform(test_lems)

AttributeError: 'list' object has no attribute 'lower'

In [15]:
baseline_tfid = TfidfVectorizer(stop_words='english', token_pattern="([a-zA-Z]+(?:'[a-z]+)?)")
baseline_tfid_train_data = baseline_tfid.fit_transform(train['comment_text'])
baseline_tfid_test_data = baseline_tfid.transform(test['comment_text'])

In [16]:
count_vec = CountVectorizer(stop_words='english', token_pattern="([a-zA-Z]+(?:'[a-z]+)?)")

In [17]:
cv_train_data = count_vec.fit_transform(train['comment_text'])
cv_test_data = count_vec.transform(test['comment_text'])

# Modeling

In [18]:
logit = LogisticRegression()

In [19]:
logit.fit(baseline_tfid_train_data, train_label)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
lr_train_preds = logit.predict(baseline_tfid_train_data)
lr_test_preds = logit.predict(baseline_tfid_test_data)

In [21]:
from sklearn.metrics import accuracy_score

In [22]:
lr_train_score = accuracy_score(train_label, lr_train_preds)
lr_train_score

0.9604564739206999

In [23]:
lr_test_score = accuracy_score(test_label, lr_test_preds)
lr_test_score

0.9343368032761261

In [24]:
nb_classifier = MultinomialNB()
nb_classifier.fit(baseline_tfid_train_data, train_label)
nb_train_preds = nb_classifier.predict(baseline_tfid_train_data)
nb_test_preds = nb_classifier.predict(baseline_tfid_test_data)

In [25]:
nb_train_score = accuracy_score(train_label, nb_train_preds)
nb_train_score

0.9259201233306804

In [26]:
nb_test_score = accuracy_score(test_label, nb_test_preds)
nb_test_score

0.9247553846634781

# Models with count vectorization

In [27]:
logit = LogisticRegression()
logit.fit(cv_train_data, train_label)
lr_train_preds = logit.predict(cv_train_data)
lr_test_preds = logit.predict(cv_test_data)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [28]:
lr_train_score = accuracy_score(train_label, lr_train_preds)
print(lr_train_score)
lr_test_score = accuracy_score(test_label, lr_test_preds)
lr_test_score

0.9763302855782066


0.919112820031886

In [29]:
nb_classifier = MultinomialNB()
nb_classifier.fit(cv_train_data, train_label)
nb_train_preds = nb_classifier.predict(cv_train_data)
nb_test_preds = nb_classifier.predict(cv_test_data)

In [30]:
nb_train_score = accuracy_score(train_label, nb_train_preds)
print(nb_train_score)
nb_test_score = accuracy_score(test_label, nb_test_preds)
nb_test_score

0.9545280784102375


0.917659195348401