In [4]:
import numpy as np 
import pandas as pd 

In [5]:
train_data = pd.read_csv('/content/train.csv')
test_data = pd.read_csv('/content/test.csv')

In [6]:
train_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0,0,0,0,0,0
1,000103f0d9cfb60f,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It's listed in the relevant form eg Wikipedia:Good_article_nominations#Transport """,0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember what page that's on?",0,0,0,0,0,0


In [7]:
train_data['text'] = train_data.comment_text.apply(lambda x: x.replace('\n', ' '))
test_data['text'] = test_data.comment_text.apply(lambda x: x.replace('\n', ' '))

In [8]:
cats = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train_prepared_data = []

def format_text_spacy(text):
    return (text.text, {'cats': {cat: text[cat] for cat in cats}})
    
for i in range(0,len(train_data)):
    text = train_data.iloc[i]
    train_prepared_data.append(format_text_spacy(text))

In [9]:
train_prepared_data[0:5]

[("Explanation Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",
  {'cats': {'toxic': 0,
    'severe_toxic': 0,
    'obscene': 0,
    'threat': 0,
    'insult': 0,
    'identity_hate': 0}}),
 ("D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)",
  {'cats': {'toxic': 0,
    'severe_toxic': 0,
    'obscene': 0,
    'threat': 0,
    'insult': 0,
    'identity_hate': 0}}),
 ("Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",
  {'cats': {'toxic': 0,
    'severe_toxic': 0,
    'obscene': 0,
    'threat': 0,
    'insult': 0,
    'identity_hate': 0

In [10]:
import spacy
from spacy.util import minibatch, compounding
from spacy.training import Example
import random

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat_multilabel")
textcat.add_label("toxic")
textcat.add_label("severe_toxic")
textcat.add_label("obscene")
textcat.add_label("threat")
textcat.add_label("insult")
textcat.add_label("identity_hate")



1

In [11]:
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat_multilabel']
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    print("Training the model...")
    for epoch in range(10):
        losses = {}
        batches = minibatch(train_prepared_data[0:10000], size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            examples = []
            for text, annot in batch:
                examples.append(Example.from_dict(nlp.make_doc(text), annot))
            nlp.update(examples, sgd=optimizer, drop=0.2, losses=losses)
        print("Epoch: {} Loss: {}".format(epoch+1, losses))

Training the model...
Epoch: 1 Loss: {'textcat_multilabel': 38.45555296123277}
Epoch: 2 Loss: {'textcat_multilabel': 25.63382860703814}
Epoch: 3 Loss: {'textcat_multilabel': 20.5430282584648}
Epoch: 4 Loss: {'textcat_multilabel': 15.77784881794878}
Epoch: 5 Loss: {'textcat_multilabel': 12.646443012982672}
Epoch: 6 Loss: {'textcat_multilabel': 10.126887550931997}
Epoch: 7 Loss: {'textcat_multilabel': 8.171042074343859}
Epoch: 8 Loss: {'textcat_multilabel': 6.792340388942762}
Epoch: 9 Loss: {'textcat_multilabel': 6.084351639486101}
Epoch: 10 Loss: {'textcat_multilabel': 4.712001674993871}


In [15]:
test = nlp("you are ugly")

test.cats

{'toxic': 0.9958822727203369,
 'severe_toxic': 0.0980038195848465,
 'obscene': 0.08120159059762955,
 'threat': 0.021849799901247025,
 'insult': 0.8477211594581604,
 'identity_hate': 0.008134551346302032}