*In this module, we will be classifying toxic comments in online forums. To do so, we will leverage transfer learning: we first train a language model on the comment text and then reuse its encoder to classify the comments.*

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.learner import *
from fastai.text import *

import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *

import dill as pickle
import spacy

# 1 Language Model

In [2]:
# Root data directory: the CSVs, the generated language-model text files and the
# pickled TEXT field (under models/) are all addressed relative to this path.
PATH = './data/'

## 1.1 Text Preprocessing

In [3]:
# Load the labelled training comments and preview the first rows.
train_txt = pd.read_csv(f'{PATH}train.csv')
train_txt.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
# Choose the row positions that will form the validation set.
# NOTE(review): the size passed is len(train_txt) - 1, so the last row can presumably
# never be selected for validation — confirm whether the "- 1" is intentional.
val_idxs = get_cv_idxs(len(train_txt) - 1)
val_idxs

array([139361, 131630, 125325, ...,  34019,  83937,  78557])

In [5]:
# Build the validation frame in a single vectorised selection.
# val_idxs holds row *positions*, hence .iloc. The previous per-row DataFrame.append
# loop copied the whole frame on every iteration (quadratic) and produced object-dtype
# label columns; this keeps the original dtypes and the same column order.
val_txt = (
    train_txt
    .iloc[val_idxs][['id', 'comment_text', 'toxic', 'severe_toxic',
                     'obscene', 'threat', 'insult', 'identity_hate']]
    .reset_index(drop=True)   # 0..n-1 index, as the append loop produced
)

val_txt.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,e9da055864f188e6,"If this is denied again, can I take this any h...",0,0,0,0,0,0
1,c03e1e0c39d897cd,Hi\nI'm also from Northern Ireland ),0,0,0,0,0,0
2,9e5b37f5a2619a54,""", 1 January 2008 (UTC)\nWell, you're an admin...",0,0,0,0,0,0
3,d8657148a508d925,"""\n\nIn light of this. I do not see how my edi...",0,0,0,0,0,0
4,dfa7cbdbdfd4729f,"""\n\n Begin text copy from logical subpage to ...",0,0,0,0,0,0


In [6]:
# Remove the validation rows from the training frame.
# Filtering by comment id (instead of dropping by position) makes this cell safe to
# re-run: a second execution removes nothing more, whereas a positional drop would
# discard a fresh set of rows each time. Assumes ids are unique per comment.
# The index is reset so that positional and label-based access agree afterwards.
train_txt = train_txt[~train_txt['id'].isin(val_txt['id'])].reset_index(drop=True)

In [7]:
# Load the unlabelled test comments and preview the first rows.
test_txt = pd.read_csv(f'{PATH}test.csv')
test_txt.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [None]:
# Accumulator for the concatenated comment text that is later written out as the
# language-model training/validation files.
all_text = ""

In [None]:
# Show the first remaining training comment. Use positional access: after validation
# rows are dropped, index label 0 may no longer exist and [0] would raise KeyError.
train_txt.comment_text.iloc[0]

In [None]:
# Concatenate every training and test comment into one corpus string.
# - Iterates the Series values directly: comment_text[i] is a *label* lookup, which
#   fails (or skips rows) once validation rows have been dropped from train_txt.
# - str.join is linear, unlike repeated `+=` on an ever-growing string.
# - Assigns rather than appends, so re-running the cell does not duplicate the corpus.
# - astype(str) mirrors the str(...) applied to comments in TextMultiLabelDataset.
# As before, comments are joined without any separator between them.
train_corpus = "".join(train_txt.comment_text.astype(str))
test_corpus = "".join(test_txt.comment_text.astype(str))
all_text = train_corpus + test_corpus

In [None]:
# Total number of characters in the concatenated corpus.
len(all_text)

In [None]:
# Preview the first 20,000 characters of the corpus.
all_text[:20000]

In [None]:
!mkdir ./data/lang_model/

In [None]:
# Create an empty placeholder file.
# NOTE(review): the corpus is later written to PATH + 'train.txt' (./data/train.txt),
# not to this lang_model/ location — confirm which path is intended.
!touch ./data/lang_model/train.txt

In [None]:
# Create an empty placeholder file.
# NOTE(review): the corpus is later written to PATH + 'val.txt' (./data/val.txt),
# not to this lang_model/ location — confirm which path is intended.
!touch ./data/lang_model/val.txt

In [None]:
# Write the first part of the corpus as the language-model training file.
# 89076293 is the hard-coded character offset at which the corpus is split
# (NOTE(review): presumably chosen from len(all_text) — confirm before reuse).
# An explicit UTF-8 encoding avoids platform-dependent defaults, which can raise
# UnicodeEncodeError on comments containing non-ASCII characters.
with open(PATH + 'train.txt', 'w', encoding='utf-8') as file:
    file.write(all_text[:89076293])

In [None]:
# Write the remainder of the corpus as the language-model validation file.
# 89076293 is the hard-coded character offset at which the corpus is split
# (NOTE(review): presumably chosen from len(all_text) — confirm before reuse).
# An explicit UTF-8 encoding avoids platform-dependent defaults, which can raise
# UnicodeEncodeError on comments containing non-ASCII characters.
with open(PATH + 'val.txt', 'w', encoding='utf-8') as file:
    file.write(all_text[89076293:])

## 1.2 Tokenize Text, Prepare Model for Training

In [8]:
# File names (relative to PATH) of the language-model text files.
TRAIN_PATH = 'train.txt'
VAL_PATH = 'val.txt'
# There is no separate test file; the validation text doubles as the test set.
TEST_PATH = 'val.txt'

We will be using the *spacy* tokenizer to tokenize our texts

In [9]:
# Load spaCy's 'en' model; spacy_tok is the loaded pipeline object.
spacy_tok = spacy.load('en')

In [10]:
# torchtext Field describing how raw text is processed: lower-cased and tokenized
# with the "spacy" tokenizer.
TEXT = data.Field(lower=True, tokenize="spacy")

*bptt* stands for back-propagation through time and defines how many time steps (tokens) the gradients are propagated back through during training. The higher the number, the better the model will be at dealing with long-term dependencies in the sentences. However, increasing the number also greatly increases time and memory requirements.

In [11]:
bs = 32     # batch size
bptt = 70   # back-prop-through-time length

In [12]:
%%time
FILES = dict(train=TRAIN_PATH, validation=VAL_PATH, test=VAL_PATH)
md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=10)

CPU times: user 6min 43s, sys: 2.82 s, total: 6min 45s
Wall time: 6min 45s


The *TEXT* object now contains a *vocab* field. We must save this so that we can use it later during the classification task.

In [13]:
import os

# Persist the TEXT field (including its vocab) for the classification stage.
# Ensure the target directory exists, and use a context manager so the file handle
# is flushed and closed instead of being left open.
os.makedirs(f'{PATH}models', exist_ok=True)
with open(f'{PATH}models/TEXT.pkl', 'wb') as text_file:
    pickle.dump(TEXT, text_file)

(# batches; # unique tokens in the vocab; # items in the training dataset; # tokens in the first training item)

In [None]:
# Number of training batches, value of md.nt, number of items in the training dataset,
# and length of the first item's text.
len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

Let's explore the mapping from integers to words (`itos`)

In [None]:
# First 15 entries of the vocab's itos list (position -> token).
TEXT.vocab.itos[:15]

And here's the reverse mapping

In [None]:
# Look up the integer stored for the token 'the' in the vocab's stoi mapping.
TEXT.vocab.stoi['the']

Tell Torchtext to turn the words into numbers

In [None]:
# Numericalize the first 12 tokens of the first training item.
TEXT.numericalize([md.trn_ds[0].text[:12]])

## 1.3 Training

In [19]:
# Architecture sizes; the same values are passed to both the language model and the
# classifier further down.
em_sz = 200  # size of each embedding vector
nh = 500     # number of hidden activations per layer
nl = 3       # number of layers

In [17]:
# modify Adam optimizer since large amount of momentum doesn't work well with RNN
# opt_fn is optim.Adam with betas pre-set to (0.7, 0.99); it is handed to get_model below.
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))

In [None]:
# Create the language-model learner; the keyword arguments are the AWD LSTM dropouts.
learner = md.get_model(
    opt_fn, em_sz, nh, nl,
    dropouti=0.05,
    dropout=0.05,
    wdrop=0.1,
    dropoute=0.02,
    dropouth=0.05,
)
# gradient clipping threshold
learner.clip = 0.3
# seq2seq_reg with alpha=2, beta=1 as the regularisation function
learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)

In [None]:
# Train the language model: learning rate 2e-3, 2 cycles, weight decay 1e-6;
# cycle_len=1 and cycle_mult=2 are passed through to fit.
learner.fit(2e-3, 2, wds=1e-6, cycle_len=1, cycle_mult=2)

In [None]:
# Save the encoder under the name 'adam1_enc'; the classifier loads it later.
learner.save_encoder('adam1_enc')

In [None]:
# Reload the encoder saved as 'adam1_enc'.
learner.load_encoder('adam1_enc')

Language model quality is commonly reported as perplexity, the exponential of the loss, so let's go ahead and print that

In [None]:
# 4.004 is a hard-coded loss value.
# NOTE(review): presumably copied by hand from a training run — update it if the model is retrained.
print("Model Perplexity:", math.exp(4.004))

## 1.4 Test

In [None]:
m = learner.model
ss = """I used to love using this service but now"""
# Tokenize the prompt with the Field itself. spacy_tok is bound to the spaCy pipeline
# object in this notebook, so spacy_tok(ss) yields spaCy token objects that are neither
# lower-cased nor plain strings, while the vocab is looked up by string
# (e.g. TEXT.vocab.stoi['the']). TEXT.preprocess applies the Field's own tokenizer and
# lower-casing, producing the list of strings that numericalize expects.
s = [TEXT.preprocess(ss)]
t = TEXT.numericalize(s)
s

In [None]:
# Set batch size to 1
m[0].bs=1
# Turn off dropout
m.eval()
# Reset hidden state
m.reset()
# Get predictions from model (only the first returned value is kept)
res,*_ = m(t)
# Put the batch size back to what it was
m[0].bs=bs

In [None]:
# Indices of the 10 highest-scoring entries in the last row of the output,
# mapped back to tokens through the vocab.
nexts = torch.topk(res[-1], 10)[1]
[TEXT.vocab.itos[o] for o in to_np(nexts)]

In [None]:
print(ss,"\n")
# Generate 50 more tokens, feeding each chosen token back into the model.
for i in range(50):
    # indices of the two highest-scoring entries in the last output row
    n=res[-1].topk(2)[1]
    # if the best index is 0, take the runner-up instead
    # NOTE(review): presumably index 0 is the unknown-token entry — confirm against TEXT.vocab.itos
    n = n[1] if n.data[0]==0 else n[0]
    print(TEXT.vocab.itos[n.data[0]], end=' ')
    res,*_ = m(n[0].unsqueeze(0))
print('...')

# 2 Sentiment Analysis / Toxicity Classification

In [14]:
# Restore the TEXT field (and its vocab) saved during the language-model stage.
# The context manager closes the file handle, which the bare open(...) never did.
# Only load pickles produced by this notebook: unpickling untrusted files can run arbitrary code.
with open(f'{PATH}models/TEXT.pkl', 'rb') as text_file:
    TEXT = pickle.load(text_file)

The custom code to load this particular dataset was obtained from [here](https://gist.github.com/ohmeow/5b3543a5115040001fce59a105ac4269) with a few additions made by me

In [None]:
class TextMultiLabelDataset(torchtext.data.Dataset):
    """torchtext dataset built from a DataFrame with one text column and several label columns.

    Every row becomes one example whose ``text`` attribute holds the stringified
    text column and which carries one attribute per entry of ``lbl_cols``.
    """

    def __init__(self, df, tt_text_field, tt_label_field, txt_col, lbl_cols, **kwargs):
        """Create the examples from ``df``.

        df             -- source DataFrame
        tt_text_field  -- torchtext field used for the text
        tt_label_field -- torchtext field shared by all label columns
        txt_col        -- name of the text column
        lbl_cols       -- names of the label columns
        kwargs         -- forwarded to the parent Dataset constructor

        When the first label column is absent from ``df`` the frame is treated as
        unlabelled and every label is filled with 0.0.
        """
        # one ('text', field) entry followed by one entry per label column
        fields = [('text', tt_text_field)] + [(col, tt_label_field) for col in lbl_cols]

        has_labels = lbl_cols[0] in df.columns
        placeholder = [0.0] * len(lbl_cols)

        examples = []
        for _, row in df.iterrows():
            targets = [row[col] for col in lbl_cols] if has_labels else placeholder
            values = [str(row[txt_col])] + targets
            examples.append(data.Example.fromlist(values, fields))

        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(example):
        """Sort examples by the length of their text."""
        return len(example.text)

    @classmethod
    def splits(cls, text_field, label_field, train_df, txt_col, lbl_cols, val_df=None, test_df=None, **kwargs):
        """Build datasets for whichever of the train/val/test frames are provided.

        Returns a tuple holding, in train, val, test order, one dataset per frame
        that is not None. Each dataset is built from a copy of its frame.
        """
        datasets = []
        for frame in (train_df, val_df, test_df):
            if frame is None:
                continue
            datasets.append(cls(frame.copy(), text_field, label_field, txt_col, lbl_cols, **kwargs))
        return tuple(datasets)
    
    
class TextMultiLabelDataLoader():
    """Adapts a batch iterator into (inputs, targets) pairs.

    src     -- iterable of batches that also supports len()
    x_fld   -- name of the batch attribute holding the inputs
    y_flds  -- names of the batch attributes holding the labels
    y_dtype -- tensor type string the targets are converted to
    """

    def __init__(self, src, x_fld, y_flds, y_dtype='torch.cuda.FloatTensor'):
        self.src = src
        self.x_fld = x_fld
        self.y_flds = y_flds
        self.y_dtype = y_dtype

    def __len__(self):
        return len(self.src)

    def __iter__(self):
        batches = iter(self.src)
        # Pull exactly len(self) batches instead of iterating the source to exhaustion.
        for _ in range(len(self)):
            batch = next(batches)

            if len(self.y_flds) > 1:
                # several labels: stack them along dim 1
                columns = [getattr(batch, name) for name in self.y_flds]
                target = torch.stack(columns, dim=1)
            else:
                target = getattr(batch, self.y_flds[0])
            target = target.type(self.y_dtype)

            yield getattr(batch, self.x_fld), target

class TextMultiLabelModel(BasicModel):
    """BasicModel wrapper that exposes the classifier's layer groups."""

    def get_layer_groups(self):
        """Return the layer groups: (encoder, dropouti), one (rnn, dropouth) pair per
        RNN layer, and finally the second module of the wrapped model."""
        backbone = self.model[0]
        embedding_group = (backbone.encoder, backbone.dropouti)
        rnn_groups = zip(backbone.rnns, backbone.dropouths)
        return [embedding_group, *rnn_groups, self.model[1]]
            
class TextMultiLabelData(ModelData):
    """ModelData built from torchtext splits for single- or multi-label text classification."""

    @classmethod
    def from_splits(cls, path, splits, bs, text_name='text', label_names=['label'],
                    target_dtype='torch.cuda.FloatTensor'):
        """Create the model data from torchtext dataset splits.

        path         -- passed through to ``from_dls``
        splits       -- datasets in train, val[, test] order
        bs           -- batch size for the bucket iterators
        text_name    -- name of the text field
        label_names  -- names of the label fields
        target_dtype -- tensor type for the targets; replaced by
                        'torch.cuda.LongTensor' when there is a single label
                        field that uses a vocab

        The returned object additionally carries ``bs``, ``pad_idx``, ``nt``
        (vocab size of the text field) and ``c`` (number of classes).
        """
        first_split = splits[0]
        text_fld = first_split.fields[text_name]
        label_flds = [first_split.fields[name] for name in label_names]

        # A single vocab-backed label is treated as class indices.
        if len(label_names) == 1 and label_flds[0].use_vocab:
            label_flds[0].build_vocab(first_split)
            target_dtype = 'torch.cuda.LongTensor'

        iters = torchtext.data.BucketIterator.splits(splits, batch_size=bs)

        def as_loader(batch_iter):
            return TextMultiLabelDataLoader(batch_iter, text_name, label_names, target_dtype)

        trn_iter, val_iter = iters[0], iters[1]
        # the test loader exists only when a third split was supplied
        test_dl = as_loader(iters[2]) if len(iters) == 3 else None
        trn_dl = as_loader(trn_iter)
        val_dl = as_loader(val_iter)

        obj = cls.from_dls(path, trn_dl, val_dl, test_dl)
        obj.bs = bs
        obj.pad_idx = text_fld.vocab.stoi[text_fld.pad_token]
        obj.nt = len(text_fld.vocab)

        if len(label_names) > 1:
            # if multiple labels, assume the # of classes = the # of labels
            obj.c = len(label_names)
        elif hasattr(label_flds[0], 'vocab'):
            # if label has a vocab, assume the vocab represents the # of classes
            obj.c = len(label_flds[0].vocab)
        else:
            obj.c = 1

        return obj

    def to_model(self, m, opt_fn):
        """Wrap ``m`` (moved with ``to_gpu``) in a TextMultiLabelModel and return an RNN_Learner."""
        wrapped = TextMultiLabelModel(to_gpu(m))
        return RNN_Learner(self, wrapped, opt_fn=opt_fn)

    def get_model(self, opt_fn, max_sl, bptt, emb_sz, n_hid, n_layers, dropout, **kwargs):
        """Build the RNN classifier via ``get_rnn_classifier`` and return its learner.

        The head is given ``layers=[emb_sz*3, self.c]`` and ``drops=[dropout]``;
        any extra keyword arguments are forwarded to ``get_rnn_classifier``.
        """
        classifier = get_rnn_classifier(
            bptt, max_sl, self.c, self.nt,
            layers=[emb_sz*3, self.c], drops=[dropout],
            emb_sz=emb_sz, n_hid=n_hid, n_layers=n_layers,
            pad_token=self.pad_idx, **kwargs)
        return self.to_model(classifier, opt_fn)

In [None]:
# One non-sequential label field is shared by all six toxicity columns.
# NOTE(review): this Field is created without use_vocab=False, yet the multi-label
# branch of TextMultiLabelData.from_splits never builds a vocab for it and the labels
# are already numeric — confirm whether use_vocab=False (and a matching tensor type)
# is needed here.
TOXIC_LABEL = data.Field(sequential=False)
# Build the train/val/test datasets; test_txt has no label columns, so its labels are
# filled with 0.0 by TextMultiLabelDataset.
splits = TextMultiLabelDataset.splits(TEXT, TOXIC_LABEL, train_txt, 'comment_text', 
                                      ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], 
                                      val_df=val_txt, test_df=test_txt)

In [None]:
# Wrap the splits in batch iterators / data loaders; with six label names the number
# of classes (md2.c) is set to 6 by from_splits.
md2 = TextMultiLabelData.from_splits(PATH, splits, bs,
                                     label_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])

In [None]:
# Build the classifier learner. The second positional argument (1500) is max_sl in
# TextMultiLabelData.get_model; the dropout keywords other than `dropout` are forwarded
# to get_rnn_classifier.
m3 = md2.get_model(opt_fn, 1500, bptt, emb_sz=em_sz, n_hid=nh, n_layers=nl, 
           dropout=0.1, dropouti=0.65, wdrop=0.5, dropoute=0.1, dropouth=0.3)
m3.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)

In [None]:
# Initialise the classifier's encoder from the language-model encoder saved as 'adam1_enc'.
m3.load_encoder('adam1_enc')
# Five learning rates, increasing towards the end of the array.
# NOTE(review): presumably one per layer group returned by get_layer_groups — confirm the count matches.
lrs=np.array([1e-4,1e-3,1e-3,1e-2,3e-2])

In [None]:
# Freeze up to the last layer group, then fit once at half the learning rates.
m3.freeze_to(-1)
m3.fit(lrs/2, 1, metrics=[accuracy])
# Disabled follow-up step: unfreeze everything and fit at the full learning rates.
# m3.unfreeze()
# m3.fit(lrs, 1, metrics=[accuracy], cycle_len=1)

In [15]:
# Build language-model data directly from the DataFrames' 'comment_text' column,
# reusing the TEXT field; min_freq=10 is passed through to from_dataframes.
model2 = LanguageModelData.from_dataframes(PATH, TEXT, 'comment_text', train_txt, val_txt, test_txt, min_freq=10)

In [20]:
# Create a language-model learner on the DataFrame-based data; the keyword arguments
# are the AWD LSTM dropouts.
learner = model2.get_model(
    opt_fn, em_sz, nh, nl,
    dropouti=0.05,
    dropout=0.05,
    wdrop=0.1,
    dropoute=0.02,
    dropouth=0.05,
)
# gradient clipping threshold
learner.clip = 0.3
# seq2seq_reg with alpha=2, beta=1 as the regularisation function
learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)

In [21]:
# Start from the encoder saved earlier as 'adam1_enc'.
learner.load_encoder('adam1_enc')

In [24]:
# Continue training: learning rate 1e-3, 3 cycles, weight decay 1e-6; cycle_len=1 and
# cycle_mult=2 are passed through to fit (the recorded run shows 7 epochs).
learner.fit(1e-3, 3, wds=1e-6, cycle_len=1, cycle_mult=2)

HBox(children=(IntProgress(value=0, description='Epoch', max=7), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      3.829187   3.802736  
    1      3.849821   3.812442                                
    2      3.788953   3.799779                                
    3      3.840013   3.822724                                
    4      3.807636   3.81516                                 
    5      3.783741   3.802731                                
    6      3.74316    3.799709                                


[array([3.79971])]

In [25]:
# Save the learner's weights under the name 'model'.
learner.save('model')