<a href="https://colab.research.google.com/github/tmgrgg/nlp-mafiascum/blob/master/CNN_Mafiascum.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Clone the thesis repository to obtain its source files
!git clone https://bitbucket.org/bopjesvla/thesis.git
# Copy the top-level files of thesis/src into the working directory.
# NOTE(review): cp is called without -r, so sub-directories are skipped (the
# recorded output shows 'thesis/src/output' being omitted) - confirm nothing
# in there is needed.
!cp thesis/src/* .

Cloning into 'thesis'...
remote: Counting objects: 220, done.[K
remote: Compressing objects: 100% (59/59), done.[K
remote: Total 220 (delta 19), reused 0 (delta 0)[K
Receiving objects: 100% (220/220), 734.99 MiB | 22.62 MiB/s, done.
Resolving deltas: 100% (96/96), done.
Checking out files: 100% (58/58), done.
cp: -r not specified; omitting directory 'thesis/src/output'


In [0]:
from google.colab import drive
# Mount Google Drive at /content/drive; per the recorded output this prompts
# for an authorization code before the mount completes.
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import pandas as pd
import torch
from torchtext import data
from torchtext import datasets
import random
import numpy as np

# Single seed shared by every random number generator used in this notebook.
SEED = 1234

# Seed Python, NumPy and PyTorch up front so that shuffling, weight
# initialisation and dropout are repeatable under "Restart & Run All".
# Previously SEED was defined but never applied to any of these generators.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

# Build Dataset

In [0]:
# Location of the pickled DataFrame, relative to the working directory.
DOCS_PATH = 'drive/My Drive/24h_of_deception_basic.pkl'

# NOTE: unpickling can execute arbitrary code embedded in the file; only load
# pickles that come from a trusted source.
docs = pd.read_pickle(DOCS_PATH)
# Shuffle all rows. random_state pins the permutation so that every run (and
# therefore every train/valid/test split derived from it) sees the same order.
docs = docs.sample(frac=1, random_state=SEED)

print(len(docs))

112523


In [0]:
from torchtext import data

class DataFrameDataset(data.Dataset):
    """torchtext dataset built from the rows of a pandas DataFrame.

    Every row becomes one example with three fields:
    ``text`` (taken from the ``words`` column), ``label`` (taken from the
    ``scum`` column) and ``index`` (the row's DataFrame index label, kept as a
    raw value without a vocabulary).
    """

    def __init__(self, df, text_field, label_field, **kwargs):
        """Create the examples from ``df``.

        Args:
            df: DataFrame providing the ``words`` and ``scum`` columns.
            text_field: torchtext field applied to the ``words`` column.
            label_field: torchtext field applied to the ``scum`` column.
            **kwargs: forwarded unchanged to ``data.Dataset.__init__``.
        """
        # Reached through the already-imported ``data`` module so the class does
        # not depend on a bare ``Field`` name being imported somewhere later.
        index_field = data.Field(sequential=False, use_vocab=False)
        fields = [('text', text_field), ('label', label_field), ('index', index_field)]

        # Walk the index and the two columns in lockstep; this avoids building
        # a Series object per row the way iterrows() does.
        examples = [
            data.Example.fromlist([text, label, idx], fields)
            for idx, text, label in zip(df.index, df['words'], df['scum'])
        ]

        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        """Sort/bucket examples by the length of their text."""
        return len(ex.text)

from torchtext.data import Field, LabelField

# define Fields
TEXT = Field(sequential=True, batch_first=True)
LABEL = LabelField(dtype=torch.float)
#LABEL = LabelField(dtype=torch.long)

# define Dataset
dataset = DataFrameDataset(docs, TEXT, LABEL)

# Seed once and hand the resulting generator state to BOTH splits.
# random.seed() returns None, so passing its return value as random_state only
# seeded the second split as a side effect and left the train/test split
# unseeded (a different test set on every run).
random.seed(SEED)
split_state = random.getstate()

# 80/20 train/test, then 80/20 train/valid inside the training portion.
train_data, test_data = dataset.split(split_ratio=0.8, random_state=split_state)
train_data, valid_data = train_data.split(split_ratio=0.8, random_state=split_state)

# Specify Model

In [0]:
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    """Convolutional text classifier.

    Token ids are embedded, passed through one convolution per entry of
    ``filter_sizes`` (each spanning the full embedding width), max-pooled over
    the sequence dimension, concatenated, passed through dropout and projected
    by a linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        n_filters: output channels of every convolution.
        filter_sizes: sequence of kernel heights (in tokens).
        output_dim: size of the final linear projection.
        dropout: dropout probability applied to the pooled features.
        pad_idx: index of the padding token in the vocabulary.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()

        # Plain attributes (not parameters/buffers), so the state_dict layout is
        # unchanged. They let forward() right-pad batches that are shorter than
        # the widest kernel instead of crashing inside the convolution.
        self.pad_idx = pad_idx
        self.min_len = max(filter_sizes, default=0)
                
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        """Return the raw (un-activated) scores for a batch of token ids.

        Args:
            text: integer tensor of shape [batch size, sent len].

        Returns:
            Tensor of shape [batch size, output_dim].
        """
                
        #text = [batch size, sent len]

        # A convolution cannot slide a kernel taller than the sequence, so pad
        # short batches on the right with the padding token up to the widest
        # kernel height. Skipped when no padding index was configured.
        shortfall = self.min_len - text.shape[1]
        if shortfall > 0 and self.pad_idx is not None:
            filler = text.new_full((text.shape[0], shortfall), self.pad_idx)
            text = torch.cat([text, filler], dim = 1)
        
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
                
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

# TRAINING 

## Training Utilities

In [0]:
def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in a batch (8 right out of 10 gives 0.8, NOT 8).

    `preds` are raw scores: they are squashed with a sigmoid and rounded to
    0/1 before being compared element-wise with `y`.
    """
    hard_labels = torch.sigmoid(preds).round()
    # cast the boolean matches to float so they can be summed and divided
    hits = torch.eq(hard_labels, y).float()
    return hits.sum() / len(hits)

In [0]:
from sklearn.metrics import roc_auc_score, average_precision_score

def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    Returns a tuple (loss, accuracy, ROC AUC, average precision). Loss and
    accuracy are the per-batch values averaged over the number of batches;
    AUC and AP are computed once over all raw scores gathered during the pass.
    """
    model.train()

    loss_total = 0
    acc_total = 0
    score_chunks = []
    target_chunks = []

    for batch in iterator:
        optimizer.zero_grad()

        targets = batch.label
        scores = model(batch.text).squeeze(1)

        batch_loss = criterion(scores, targets)
        batch_acc = binary_accuracy(scores, targets)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

        # keep detached CPU copies for the epoch-level ranking metrics
        score_chunks.append(scores.detach().cpu().numpy())
        target_chunks.append(targets.detach().cpu().numpy())

    y_true = np.hstack(target_chunks)
    y_score = np.hstack(score_chunks)
    auc_score = roc_auc_score(y_true, y_score)
    ap_score = average_precision_score(y_true, y_score)

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches, auc_score, ap_score

In [0]:
def evaluate(model, iterator, criterion):
    """Score `model` on `iterator` without updating any weights.

    Returns a tuple (loss, accuracy, ROC AUC, average precision). Loss and
    accuracy are the per-batch values averaged over the number of batches;
    AUC and AP are computed once over all raw scores gathered during the pass.
    """
    model.eval()

    loss_total = 0
    acc_total = 0
    score_chunks = []
    target_chunks = []

    # forward passes only: gradient tracking is switched off in this context
    with torch.no_grad():
        for batch in iterator:
            targets = batch.label
            scores = model(batch.text).squeeze(1)

            batch_loss = criterion(scores, targets)
            batch_acc = binary_accuracy(scores, targets)

            score_chunks.append(scores.cpu().numpy())
            target_chunks.append(targets.cpu().numpy())

            loss_total += batch_loss.item()
            acc_total += batch_acc.item()

    y_true = np.hstack(target_chunks)
    y_score = np.hstack(score_chunks)
    auc_score = roc_auc_score(y_true, y_score)
    ap_score = average_precision_score(y_true, y_score)

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches, auc_score, ap_score

In [0]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and whole seconds."""
    span = end_time - start_time
    whole_minutes = int(span / 60)
    leftover_seconds = int(span - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [0]:
import matplotlib.pyplot as plt
from IPython.display import clear_output
from collections import OrderedDict

class StatsTracker():
    """Collects metric values per setting ('train' / 'valid') and plots them.

    Values are stored in insertion order per metric name; each metric gets its
    own subplot with one line for the training values and one for the
    validation values.
    """

    def __init__(self, plot_freq=5):
        """
        Args:
            plot_freq: plot() only draws on every `plot_freq`-th call.
        """
        self.counter = 0
        self.plot_freq = plot_freq
        self._metrics = OrderedDict({})

    def _make_plot(self):
        """Draw one subplot per logged metric and return the figure."""
        fig, ax = plt.subplots(1, len(self._metrics), squeeze=False, figsize=(18, 4))
        for i, (metric, series) in enumerate(self._metrics.items()):
            train_metric = series['train']
            valid_metric = series['valid']
            ax[0, i].plot(list(range(len(train_metric))), train_metric, c='b', label='Train', marker='.')
            ax[0, i].plot(list(range(len(valid_metric))), valid_metric, c='r', label='Valid', marker='.')
            ax[0, i].set_ylabel(metric)
            ax[0, i].set_xlabel('epochs')
            ax[0, i].legend(loc='upper right')
        return fig

    def plot(self):
        """Count the call and redraw the metric curves on every plot_freq-th call."""
        self.counter += 1
        if self.counter % self.plot_freq == 0:
            fig = self._make_plot()
            # wipe the previous cell output so only the newest figure is shown
            clear_output()
            plt.show()
            # release the figure so repeated calls do not accumulate open figures
            plt.close(fig)

    def save(self, path):
        """Render the current metric curves and write them to `path`."""
        fig = self._make_plot()
        try:
            fig.savefig(path)
        finally:
            # close even when saving fails, otherwise the figure leaks
            plt.close(fig)

    def log(self, value, metric, setting='train'):
        """Append `value` to the series of `metric` for the given setting.

        Args:
            value: the number to record.
            metric: metric name; a new entry is created on first use.
            setting: either 'train' or 'valid'.

        Raises:
            AssertionError: if `setting` is neither 'train' nor 'valid'.
        """
        assert setting in ('train', 'valid'), "setting must be 'train' or 'valid'"
        if metric not in self._metrics:
            self._metrics[metric] = {'train': [], 'valid': []}
        self._metrics[metric][setting].append(value)

## Define Model Parameters and Training Hyperparameters

In [0]:
# Upper bound passed to build_vocab as max_size for the text vocabulary.
MAX_VOCAB_SIZE = 100_000

# Build the text vocabulary from the training split only and attach the
# "glove.6B.100d" pre-trained vectors (the recorded output shows them being
# downloaded into .vector_cache).
# NOTE(review): unk_init presumably initialises vocabulary tokens that have no
# pre-trained vector - confirm against the torchtext documentation.
TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

# Label vocabulary, also built from the training split only.
LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip:  40%|███▉      | 343M/862M [02:19<58:21, 148kB/s]  

In [0]:
# Number of examples per batch for all three iterators.
BATCH_SIZE = 256

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split, all with the same batch size and target device.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)

In [0]:
# Model hyperparameters.
INPUT_DIM = len(TEXT.vocab)  # embedding table has one row per vocabulary entry
EMBEDDING_DIM = 100  # must match the width of the "glove.6B.100d" vectors copied in below
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 1  # single score per document; squeezed to one value per example in train()/evaluate()
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

# Vectors attached to the vocabulary by TEXT.build_vocab.
pretrained_embeddings = TEXT.vocab.vectors

# Overwrite the freshly initialised embedding weights with the vocabulary vectors.
model.embedding.weight.data.copy_(pretrained_embeddings)

UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Zero the rows of the unknown and padding tokens. This has to happen after
# the copy above, which would otherwise overwrite the zeros.
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [0]:
import torch.optim as optim

# Adam over all model parameters, with a small weight decay.
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.00001)

# Ratio of non-scum to scum documents, used to up-weight the positive class in the loss.
# NOTE(review): the ratio is computed over all of `docs`, i.e. including the
# rows that end up in the validation and test splits, and it divides by zero
# if there are no scum rows - confirm this is intended.
pos_weight = len(docs[docs['scum'] == False])/len(docs[docs['scum'] == True])
# Loss takes raw scores (logits); the sigmoid is applied inside the criterion.
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.Tensor([pos_weight]))
print('positive weight:', pos_weight)

# Move the model and the loss (which holds the pos_weight tensor) to the selected device.
model = model.to(device)
criterion = criterion.to(device)

## Training Loop

In [0]:
# Number of full passes over the training iterator.
N_EPOCHS = 50

# Lowest validation loss seen so far; decides when a checkpoint is written.
best_valid_loss = float('inf')
tracker = StatsTracker(plot_freq=1)
# Evaluate once before any training so the validation curves start with the
# untrained model. As a result the 'valid' series holds one more point than
# the 'train' series.
valid_loss, valid_acc, valid_auc, valid_ap = evaluate(model, valid_iterator, criterion)
tracker.log(valid_loss, 'loss', setting='valid')
tracker.log(valid_acc, 'accuracy', setting='valid')
tracker.log(valid_auc, 'AUC', setting='valid')
tracker.log(valid_ap, 'AP', setting='valid')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    # One training pass followed by one validation pass; both are included in the epoch time.
    train_loss, train_acc, train_auc, train_ap = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc, valid_auc, valid_ap = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint the weights whenever the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best-model.pt')

    # Record this epoch's metrics and refresh the plot (plot_freq=1 redraws on every call).
    tracker.log(train_loss, 'loss', setting='train')
    tracker.log(valid_loss, 'loss', setting='valid')
    tracker.log(train_acc, 'accuracy', setting='train')
    tracker.log(valid_acc, 'accuracy', setting='valid')
    tracker.log(train_auc, 'AUC', setting='train')
    tracker.log(valid_auc, 'AUC', setting='valid')
    tracker.log(train_ap, 'AP', setting='train')
    tracker.log(valid_ap, 'AP', setting='valid')
    tracker.plot()

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Train AUC: {train_auc:.3f} | Train AP: {train_ap:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% | Val. AUC: {valid_auc:.3f} | Val. AP: {valid_ap:.3f}')
    

Test results:

In [0]:
# Restore the checkpoint written by the training loop (the weights with the
# lowest validation loss) before scoring the held-out test split.
model.load_state_dict(torch.load('best-model.pt'))

test_loss, test_acc, test_auc, test_ap = evaluate(model, test_iterator, criterion)

print(f'\t Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}% | Test AUC: {test_auc:.3f} | Test AP: {test_ap:.3f}')