In [1]:
import os
import torch
import time
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from sacremoses import MosesTokenizer
import re
import random
import unicodedata
import codecs
import itertools
from tqdm import tqdm
from tqdm.notebook import tqdm
import operator
from io import open
import math
import numpy as np
from ipywidgets import IntProgress
from sklearn.model_selection import KFold

# Shared English Moses tokenizer; Vocab.addSentence and the batching helpers call mt.tokenize.
mt = MosesTokenizer(lang='en')
# Debugging switches: request blocking CUDA launches and turn cuDNN off.
# NOTE(review): both typically slow training down — confirm they are still wanted.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
torch.backends.cudnn.enabled = False

In [2]:
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("device = ", device)

device =  cuda


In [3]:
# Both training corpora are resolved against the current working directory.
VIOLENT_FILE, NOT_VIOLENT_FILE = (
    os.path.join(os.getcwd(), relative_path)
    for relative_path in ('data/violent_train.txt', 'data/not_violent_train.txt')
)


In [4]:
# Reserved indices for the special tokens shared by the vocabulary and batching code.
PAD_token = 0
SOS_token = 1
EOS_token = 2
UNK_token = 3

class Vocab:
    """Word <-> index vocabulary with frequency-based admission.

    Indices 0-3 are reserved for the ``<PAD>``, ``<SOS>``, ``<EOS>`` and
    ``<UNK>`` special tokens. A word receives an index the moment its count
    reaches ``freq_threshold`` (see :meth:`addWord`), or when it survives
    :meth:`trimWords`.
    """

    def __init__(self, name, freq_threshold=5):
        """Create an empty vocabulary.

        Args:
            name: Free-form label stored on the instance.
            freq_threshold: Number of occurrences at which a word is indexed.
        """
        self.name = name
        self.freq_threshold = freq_threshold
        self.word2count = {}
        self._reset_index()

    def _reset_index(self):
        """(Re)build the index tables so they contain only the special tokens."""
        self.word2index = {"<PAD>":PAD_token, "<SOS>":SOS_token, "<EOS>":EOS_token, "<UNK>":UNK_token}
        self.index2word = {PAD_token:"<PAD>", SOS_token:"<SOS>", EOS_token:"<EOS>", UNK_token:"<UNK>"}
        self.n_words = 4

    def __len__(self):
        """Return the number of indexed entries, special tokens included."""
        return len(self.index2word)

    def remove_non_ascii(self, words):
        """Remove non-ASCII characters from list of tokenized words"""
        return [
            unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
            for word in words
        ]

    def to_lowercase(self, words):
        """Convert all characters to lowercase from list of tokenized words"""
        return [word.lower() for word in words]

    def remove_punctuation(self, words):
        """Remove punctuation from list of tokenized words"""
        stripped = (re.sub(r'[^\w\s]', '', word) for word in words)
        # Tokens that consisted only of punctuation become empty and are dropped.
        return [word for word in stripped if word != '']

    def normalize(self, words):
        """Apply ASCII folding, lowercasing and punctuation removal, in that order."""
        words = self.remove_non_ascii(words)
        words = self.to_lowercase(words)
        words = self.remove_punctuation(words)
        return words

    def addSentence(self, sentence):
        """Tokenize ``sentence`` with the module-level ``mt``, normalize, and count every word."""
        for word in self.normalize(list(mt.tokenize(sentence))):
            self.addWord(word)

    def addWord(self, word):
        """Count one occurrence of ``word`` and index it when it reaches the threshold.

        A word that already has an index (for example one kept by
        :meth:`trimWords`) is never indexed a second time, so ``index2word``
        cannot accumulate stale duplicate entries.
        """
        count = self.word2count.get(word, 0) + 1
        self.word2count[word] = count
        if count == self.freq_threshold and word not in self.word2index:
            self.addWord_trimmed(word)

    def addWord_trimmed(self, word):
        """Assign the next free index to ``word`` without touching its count."""
        self.word2index[word] = self.n_words
        self.index2word[self.n_words] = word
        self.n_words += 1

    def trimWords(self, MAX_VOCAB_SIZE=30000):
        """Rebuild the index from the ``MAX_VOCAB_SIZE`` most frequent words.

        ``word2count`` is re-ordered by descending count (ties keep their
        insertion order). The frequency threshold is not consulted here: the
        top-ranked words are indexed regardless of their count.

        Args:
            MAX_VOCAB_SIZE: Maximum number of non-special words to keep.
                ``None`` keeps every counted word; zero or a negative value
                keeps only the special tokens.
        """
        ranked = sorted(self.word2count.items(), key=operator.itemgetter(1), reverse=True)
        self.word2count = dict(ranked)
        limit = len(ranked) if MAX_VOCAB_SIZE is None else max(0, MAX_VOCAB_SIZE)
        self._reset_index()
        for word, _count in ranked[:limit]:
            self.addWord_trimmed(word)

In [5]:
class NewsMediaDataset(Dataset):
    """Labelled news articles read from two plain-text files, one article per line.

    Lines of ``v_filename`` are labelled with ``classes["violent"]`` (1) and
    lines of ``nv_filename`` with ``classes["not_violent"]`` (0). Every line is
    passed through :meth:`normalizeString`, and the combined list of
    ``[article, target]`` pairs is shuffled once at construction time.
    """

    def __init__(self, v_filename=VIOLENT_FILE, nv_filename=NOT_VIOLENT_FILE, min_length=10):
        # NOTE(review): min_length is stored but nothing in this class filters on it.
        self.min_length = min_length
        self.classes = {"violent": 1, "not_violent": 0}
        self.v_articles = self.read_utterances(filename=v_filename)
        self.nv_articles = self.read_utterances(filename=nv_filename)
        self.n_samples = len(self.v_articles) + len(self.nv_articles)
        # The label comes from the source file, not from a text lookup, so an
        # article appearing in both files keeps the label of each occurrence.
        v_inputs = self.input_generator(self.v_articles, target=self.classes["violent"])
        nv_inputs = self.input_generator(self.nv_articles, target=self.classes["not_violent"])
        self.inputs = v_inputs + nv_inputs
        random.shuffle(self.inputs)

    def __len__(self):
        """Return the total number of articles across both files."""
        return self.n_samples

    def __getitem__(self, index):
        """Return the ``(article, target)`` pair stored at ``index``."""
        article, target = self.inputs[index][0], self.inputs[index][1]
        return article, target

    def unicodeToAscii(self, s):
        """Strip combining marks (Unicode category ``Mn``) after NFD decomposition."""
        return ''.join(
            c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn'
        )

    def normalizeString(self, s):
        """Lowercase ``s``, space out ``.!?``, and drop everything but letters and ``.!?``."""
        s = self.unicodeToAscii(s.lower().strip())
        s = re.sub(r"([.!?])", r" \1", s)
        s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
        s = re.sub(r"\s+", r" ", s).strip()
        return s

    def read_utterances(self, filename):
        """Read ``filename`` as UTF-8 and return its normalized lines."""
        # The context manager closes the handle even if reading fails.
        with open(filename, encoding='utf-8') as handle:
            lines = handle.read().strip().split('\n')
        return [self.normalizeString(line) for line in lines]

    def input_generator(self, articles, target=None):
        """Pair every article with its class label.

        Args:
            articles: Iterable of normalized article strings.
            target: Label applied to every article. When omitted, each article
                is looked up in the violent set first and the not-violent set
                second.

        Returns:
            A list of ``[article, target]`` lists.

        Raises:
            ValueError: If ``target`` is omitted and an article belongs to
                neither ``v_articles`` nor ``nv_articles``.
        """
        if target is not None:
            return [[article, target] for article in articles]

        # Sets make each membership test O(1) instead of a scan over a list.
        violent = set(self.v_articles)
        not_violent = set(self.nv_articles)
        inputs = []
        for article in articles:
            if article in violent:
                label = self.classes["violent"]
            elif article in not_violent:
                label = self.classes["not_violent"]
            else:
                raise ValueError("article does not belong to either class list")
            inputs.append([article, label])
        return inputs

In [6]:
# Load both corpora, then count every article's words into the vocabulary.
mediaDataset = NewsMediaDataset()
print('Dataset length : ', len(mediaDataset))

vocab = Vocab('news_media')
# Unpack each (article, target) pair directly; the loop no longer binds the
# name `input`, which would hide the builtin for the rest of the session.
for article, _target in mediaDataset:
    vocab.addSentence(article)

print('Vocab length : ', len(vocab))

Dataset length :  6244
Vocab length :  13301


In [7]:
def indexesFromSentence(vocab, sentence):
    """Convert ``sentence`` into vocabulary indices terminated by ``EOS_token``.

    The sentence is tokenized with the module-level ``mt`` and each token is
    looked up as-is in ``vocab.word2index``; tokens without an entry map to
    the ``<UNK>`` index.
    """
    lookup = vocab.word2index
    indices = [
        lookup[token] if token in lookup else lookup['<UNK>']
        for token in mt.tokenize(sentence)
    ]
    indices.append(EOS_token)
    return indices

def zeroPadding(l, fillvalue=PAD_token):
    """Transpose the sequences in ``l``, filling short ones with ``fillvalue``.

    Returns a list of tuples: element ``t`` holds position ``t`` of every sequence.
    """
    columns = itertools.zip_longest(*l, fillvalue=fillvalue)
    return list(columns)

def inputVar(l, vocab):
    """Encode the sentences in ``l`` and return ``(padded LongTensor, lengths tensor)``.

    The padded tensor is built from the transposed output of ``zeroPadding``;
    each length counts the trailing EOS index added by ``indexesFromSentence``.
    """
    encoded = []
    sequence_lengths = []
    for sentence in l:
        ids = indexesFromSentence(vocab, sentence)
        encoded.append(ids)
        sequence_lengths.append(len(ids))
    padVar = torch.LongTensor(zeroPadding(encoded))
    return padVar, torch.tensor(sequence_lengths)

def batch2TrainData(vocab, input_batches):
    """Turn ``(sentence, target)`` pairs into padded model inputs.

    The pairs are ordered by descending token count, the order
    ``pack_padded_sequence`` expects by default. The caller's list is left
    untouched: a sorted copy is used rather than sorting in place.

    Args:
        vocab: Vocabulary used to map tokens to indices.
        input_batches: Sequence of ``(sentence, target)`` pairs.

    Returns:
        ``(inp, lengths, output_batch)``: the padded index tensor, the
        per-sentence lengths, and a tensor of targets in the sorted order.
    """
    ordered = sorted(input_batches, key=lambda pair: len(mt.tokenize(pair[0])), reverse=True)
    sentences = [pair[0] for pair in ordered]
    targets = [pair[1] for pair in ordered]
    inp, lengths = inputVar(sentences, vocab)
    output_batch = torch.tensor(targets)
    return inp, lengths, output_batch

In [8]:
# Smoke test: draw a handful of random samples and inspect the batched tensors.
small_batch_size = 5
batches = batch2TrainData(
    vocab,
    [random.choice(mediaDataset) for _ in range(small_batch_size)],
)
input_variable, lengths, outputs = batches
print("\n\n")
print("input_variable:", input_variable)
print("lengths:", lengths)
print("target_variable:", outputs)




input_variable: tensor([[  34,  253,  953,    4,  149],
        [ 903,  271, 1359,  393,  480],
        [  88,   24,    8,   23, 1388],
        ...,
        [  77,    0,    0,    0,    0],
        [   3,    0,    0,    0,    0],
        [   2,    0,    0,    0,    0]])
lengths: tensor([727, 642, 415, 217,  32])
target_variable: tensor([0, 0, 1, 1, 1])


In [9]:
class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder with a single-logit binary classification head.

    :meth:`forward` returns raw logits, which is what ``BCEWithLogitsLoss``
    and a sigmoid-then-round accuracy function expect. Use
    :meth:`predict_proba` when probabilities are needed.
    """

    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        """Build the encoder.

        Args:
            hidden_size: GRU hidden size per direction.
            embedding: Embedding module applied to the input indices.
            n_layers: Number of stacked GRU layers.
            dropout: Dropout probability for the classifier input and, when
                ``n_layers > 1``, between GRU layers.
        """
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding
        # Size the GRU input from the embedding when it exposes its width, so
        # the embedding dimension no longer has to equal hidden_size.
        input_size = getattr(embedding, 'embedding_dim', hidden_size)
        self.gru = nn.GRU(input_size, hidden_size,
                          n_layers, dropout=(0 if n_layers == 1 else dropout),
                          batch_first=False, bidirectional=True)
        self.decoder = nn.Linear(hidden_size * 2, 1)
        self.dropout = nn.Dropout(dropout)
        self.act = nn.Sigmoid()

    def forward(self, input_seq, input_lengths, hidden=None):
        """Return one raw logit per batch element.

        Args:
            input_seq: Index tensor laid out ``(seq_len, batch)`` (the GRU is
                built with ``batch_first=False``).
            input_lengths: Per-sequence lengths, in descending order as
                required by ``pack_padded_sequence``'s default settings.
            hidden: Optional initial GRU hidden state.

        Returns:
            Tensor of shape ``(batch,)`` holding pre-sigmoid scores.
        """
        embedded = self.embedding(input_seq)
        # pack_padded_sequence requires the lengths to live on the CPU.
        cpu_lengths = torch.as_tensor(input_lengths).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(embedded, cpu_lengths)
        # Only the final hidden state feeds the classifier; the per-step
        # outputs are not needed and therefore not unpacked.
        _, hidden = self.gru(packed, hidden)
        # The last two slices are the forward and backward states of the top layer.
        final_state = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        # squeeze(-1) keeps the batch dimension even when the batch holds one sample.
        return self.decoder(final_state).squeeze(-1)

    def predict_proba(self, input_seq, input_lengths, hidden=None):
        """Return the sigmoid of :meth:`forward`'s logits."""
        return self.act(self.forward(input_seq, input_lengths, hidden))

In [10]:
# Model hyper-parameters.
HIDDEN_SIZE = 128
BATCH_SIZE = 8
N_LAYERS = 1
DROPOUT = 0.5

# One embedding row per vocabulary entry; the width equals HIDDEN_SIZE because
# EncoderRNN builds its GRU with hidden_size as the input size.
embedding = nn.Embedding(vocab.n_words, HIDDEN_SIZE)

model = EncoderRNN(hidden_size=HIDDEN_SIZE, 
                   embedding=embedding, 
                   n_layers=N_LAYERS, 
                   dropout=DROPOUT)

In [11]:
# Place the model on the target device, then create the loss and the optimiser.
model = model.to(device)
# BCEWithLogitsLoss combines a sigmoid with binary cross-entropy in one module.
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.Adam(model.parameters())

In [12]:
def count_parameters(model):
    """Return the total element count of the model's parameters that require gradients."""
    total = 0
    for parameter in model.parameters():
        if parameter.requires_grad:
            total += parameter.numel()
    return total

# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,900,929 trainable parameters


In [13]:
def binary_accuracy(preds, y):
    """Return the fraction of predictions matching ``y``.

    ``preds`` is passed through a sigmoid and rounded before the comparison.
    """
    probabilities = torch.sigmoid(preds)
    hits = (probabilities.round() == y).float()
    return hits.sum() / len(hits)

In [14]:
# Show the module hierarchy of the classifier.
print(model)

EncoderRNN(
  (embedding): Embedding(13301, 128)
  (gru): GRU(128, 128, bidirectional=True)
  (decoder): Linear(in_features=256, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (act): Sigmoid()
)


In [15]:
def train(model, train_dataset, vocab, optimizer, criterion, batch_size, n_iters, device, clip):
    """Run ``n_iters`` optimisation steps on randomly drawn mini-batches.

    Each step samples ``batch_size`` items from ``train_dataset`` with
    replacement, so one call is not a full pass over the data.

    Args:
        model: Module called as ``model(inputs, lengths)``.
        train_dataset: Indexable collection of ``(sentence, target)`` pairs.
        vocab: Vocabulary handed to ``batch2TrainData``.
        optimizer: Optimiser stepping the model's parameters.
        criterion: Loss called as ``criterion(predictions, targets)``.
        batch_size: Number of samples per step.
        n_iters: Number of steps to run.
        device: Device the inputs and targets are moved to.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        ``(mean_loss, mean_accuracy)`` over the ``n_iters`` steps.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    # Batches are built one at a time rather than all up front, so only a
    # single batch is tokenised and held in memory at any moment.
    for _step in range(n_iters):
        samples = [random.choice(train_dataset) for _ in range(batch_size)]
        inputs, lengths, outputs = batch2TrainData(vocab, samples)
        # The lengths stay on the CPU: pack_padded_sequence requires that.
        inputs, outputs = inputs.to(device), outputs.to(device)

        optimizer.zero_grad()

        predictions = model(inputs, lengths)

        # Targets arrive as integers; the loss needs the predictions' dtype.
        outputs = outputs.type_as(predictions)

        loss = criterion(predictions, outputs)
        acc = binary_accuracy(predictions, outputs)

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / n_iters, epoch_acc / n_iters

In [16]:
def evaluate(model, test_dataset, vocab, criterion, batch_size, n_iters, device):
    """Score the model on ``n_iters`` randomly drawn mini-batches without gradients.

    Each batch samples ``batch_size`` items from ``test_dataset`` with
    replacement, so the result is an estimate rather than a full pass.

    Args:
        model: Module called as ``model(inputs, lengths)``.
        test_dataset: Indexable collection of ``(sentence, target)`` pairs.
        vocab: Vocabulary handed to ``batch2TrainData``.
        criterion: Loss called as ``criterion(predictions, targets)``.
        batch_size: Number of samples per batch.
        n_iters: Number of batches to score.
        device: Device the inputs and targets are moved to.

    Returns:
        ``(mean_loss, mean_accuracy)`` over the ``n_iters`` batches.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        # Batches are built one at a time so only one is held in memory.
        for _step in range(n_iters):
            samples = [random.choice(test_dataset) for _ in range(batch_size)]
            inputs, lengths, outputs = batch2TrainData(vocab, samples)
            # The lengths stay on the CPU: pack_padded_sequence requires that.
            inputs, outputs = inputs.to(device), outputs.to(device)

            predictions = model(inputs, lengths)

            # Targets arrive as integers; the loss needs the predictions' dtype.
            outputs = outputs.type_as(predictions)

            loss = criterion(predictions, outputs)
            acc = binary_accuracy(predictions, outputs)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / n_iters, epoch_acc / n_iters

In [17]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and seconds."""
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [18]:
# Training schedule: epochs per fold, sampled batches per epoch, CV folds, gradient-norm cap.
N_EPOCHS = 10
N_ITERS = 100
N_SPLITS = 5
CLIP = 3

# Shuffled K-fold splitter; no random_state is passed here.
folds = KFold(n_splits=N_SPLITS, shuffle=True)

In [19]:
# K-fold cross-validation. Every fold starts from a freshly initialised model
# and optimiser; reusing one model across folds would validate on articles it
# had already been trained on in an earlier fold.
for fold, (train_index, test_index) in enumerate(folds.split(mediaDataset), start=1):

    train_dataset = [mediaDataset[tr_idx] for tr_idx in train_index]
    test_dataset = [mediaDataset[te_idx] for te_idx in test_index]

    embedding = nn.Embedding(vocab.n_words, HIDDEN_SIZE)
    model = EncoderRNN(hidden_size=HIDDEN_SIZE,
                       embedding=embedding,
                       n_layers=N_LAYERS,
                       dropout=DROPOUT).to(device)
    optimizer = optim.Adam(model.parameters())

    best_valid_loss = float('inf')

    print(f'Fold {fold}/{N_SPLITS}')

    for epoch in tqdm(range(N_EPOCHS)):

        start_time = time.time()

        train_loss, train_acc = train(model=model,
                                      train_dataset=train_dataset,
                                      vocab=vocab,
                                      optimizer=optimizer,
                                      criterion=criterion,
                                      batch_size=BATCH_SIZE,
                                      n_iters=N_ITERS,
                                      device=device,
                                      clip=CLIP)

        valid_loss, valid_acc = evaluate(model=model,
                                         test_dataset=test_dataset,
                                         vocab=vocab,
                                         criterion=criterion,
                                         batch_size=BATCH_SIZE,
                                         n_iters=N_ITERS,
                                         device=device)

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        # Track the lowest validation loss seen in this fold.
        best_valid_loss = min(best_valid_loss, valid_loss)

        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

    print(f'Fold {fold} best Val. Loss: {best_valid_loss:.3f}')

    

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




KeyboardInterrupt: 