# Simple baseline with AllenNlP

I haven't seen anyone try to use AllenNLP for a kaggle competition before, so I wrote this kernel to show how it could be done.  
AllenNLP abstracts away most of the boilerplate code like training loops, loading pretrained embeddings, and keeping track of experiments which lets you write a lot less code. It also lets you change model architectures and hyperparameters by  creating new experiments entirely from configuration files instead of changing the code for each new experiment.

### Install AllenNLP from dataset

In [2]:
!pip install allennlp


Collecting allennlp
[?25l  Downloading https://files.pythonhosted.org/packages/b5/14/f0f9dd1ce012e7723742821b95b33dd9bdc53befe209600608bc7be1f650/allennlp-1.2.0-py3-none-any.whl (498kB)
[K     |████████████████████████████████| 501kB 3.4MB/s 
Collecting tensorboardX>=1.2
[?25l  Downloading https://files.pythonhosted.org/packages/af/0c/4f41bcd45db376e6fe5c619c01100e9b7531c55791b7244815bac6eac32c/tensorboardX-2.1-py2.py3-none-any.whl (308kB)
[K     |████████████████████████████████| 317kB 16.6MB/s 
[?25hCollecting boto3<2.0,>=1.14
[?25l  Downloading https://files.pythonhosted.org/packages/85/54/099a2ea5d4b2d5931a26f280a7585f613b1fafaac9189e489a9e25004a01/boto3-1.16.13-py2.py3-none-any.whl (129kB)
[K     |████████████████████████████████| 133kB 15.5MB/s 
Collecting jsonpickle
  Downloading https://files.pythonhosted.org/packages/af/ca/4fee219cc4113a5635e348ad951cf8a2e47fed2e3342312493f5b73d0007/jsonpickle-1.4.1-py2.py3-none-any.whl
Collecting overrides==3.1.0
  Downloading https://

In [3]:
!pip install overrides



In [5]:
!pip install mlcrate 

Collecting mlcrate
  Downloading https://files.pythonhosted.org/packages/78/7d/7a58b3eeae81efb695806eadf11e9290c6223ce1d3cf3a16b2a374901275/mlcrate-0.2.0-py3-none-any.whl
Collecting pathos
[?25l  Downloading https://files.pythonhosted.org/packages/10/9e/0100b1d500851fc8e093da5463ca38e013c86ea0855e7c510ca0d3e1f7c1/pathos-0.2.7-py2.py3-none-any.whl (81kB)
[K     |████████████████████████████████| 81kB 3.6MB/s 
Collecting multiprocess>=0.70.11
[?25l  Downloading https://files.pythonhosted.org/packages/8f/dc/426a82723c460cfab653ebb717590103d6e38cebc9d1f599b0898915ac1d/multiprocess-0.70.11.1-py36-none-any.whl (101kB)
[K     |████████████████████████████████| 102kB 5.2MB/s 
[?25hCollecting ppft>=1.6.6.3
[?25l  Downloading https://files.pythonhosted.org/packages/51/7b/e63dcf1f9b5ecd37691ee8a7029f71ddb7cafab780a60e312d913afc0f29/ppft-1.6.6.3-py3-none-any.whl (65kB)
[K     |████████████████████████████████| 71kB 5.9MB/s 
Collecting pox>=0.2.9
  Downloading https://files.pythonhosted.org/

In [7]:
%load_ext autoreload
%autoreload 2

FOLD = 0

import os
import sys
import random
import glob
import gc
import logging
import requests
import re

from typing import Dict, Tuple, List
from collections import OrderedDict
from overrides import overrides
from time import sleep

import cv2
import numpy as np
import pandas as pd

import mlcrate as mlc

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch import optim

import torchvision

import allennlp

from allennlp.common import Registrable, Params
from allennlp.common.util import START_SYMBOL, END_SYMBOL, JsonDict

from allennlp.data import DatasetReader, Instance
from allennlp.data.fields import ArrayField, TextField
#from allennlp.data.iterators import BucketIterator, MultiprocessIterator
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token, CharacterTokenizer
from allennlp.data.vocabulary import Vocabulary

from allennlp.models import Model

from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper # MIGHT USE FOR ABSTRACTION

from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.nn.beam_search import BeamSearch

from allennlp.training.metrics import F1Measure, BLEU
from allennlp.training import Trainer

sys.path.insert(0, './math_handwriting_recognition')

logger = logging.getLogger()

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load and split data

In [8]:
train = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
test = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')
sample_submission = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/sample_submission.csv')

FileNotFoundError: ignored

In [None]:
!mkdir jigsaw
!touch jigsaw/__init__.py

# Get a 5 fold cv
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
train_idx, val_idx = list(kfold.split(train))[0]
train_df, val_df = train.iloc[train_idx].reset_index(), train.iloc[val_idx].reset_index()
train_df.to_csv('train.csv')
val_df.to_csv('val.csv')

## Dataset reader

In [None]:
%%writefile jigsaw/dataset.py
import os
import random
from typing import Dict, Tuple, List
from overrides import overrides

import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch

import spacy

import allennlp

from allennlp.common.util import START_SYMBOL, END_SYMBOL, get_spacy_model

from allennlp.data import DatasetReader, Instance
from allennlp.data.fields import ArrayField, TextField, MetadataField, LabelField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token, Tokenizer, CharacterTokenizer, WordTokenizer

@Tokenizer.register("simple")
class LatexTokenizer(Tokenizer):
    def __init__(self) -> None:
        super().__init__()

    def _tokenize(self, text):
        return [Token(token) for token in text.split()]

    @overrides
    def tokenize(self, text: str) -> List[Token]:
        tokens = self._tokenize(text)

        return tokens

@DatasetReader.register('jigsaw')
class JigsawDatasetReader(DatasetReader):
    def __init__(self, root_path: str, tokenizer: Tokenizer, lazy: bool = True, subset: bool = False) -> None:
        super().__init__(lazy)
        
        self.root_path = root_path
        self.subset = subset
        
        self._tokenizer = tokenizer
        self._token_indexer = {"tokens": SingleIdTokenIndexer()}

    @overrides
    def _read(self, file: str):
        df = pd.read_csv(os.path.join(self.root_path, file))

        if self.subset:
            df = df.loc[:16]

        for _, row in df.iterrows():
            idx = row['id']
            comment_text = row['comment_text']
            
            if 'target' in df.columns:
                target = int(row['target'] > 0.5)
                yield self.text_to_instance(idx, comment_text, target)
            else:
                yield self.text_to_instance(idx, comment_text)
            
    @overrides
    def text_to_instance(self, idx: str, comment_text: str, target: float = None) -> Instance:
        comment_text = self._tokenizer.tokenize(comment_text)
        
        fields = {}
        fields['idx'] = MetadataField({'idx': idx})
        fields['comment_text'] = TextField(comment_text, self._token_indexer)

        if target is not None:
            fields['target'] = LabelField(target, skip_indexing=True)
        
        return Instance(fields)

Writing jigsaw/dataset.py


## Simple LSTM baseline model

In [None]:
 %%writefile jigsaw/model.py
import os
import random
from typing import Dict, Tuple
from overrides import overrides

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F

import torchvision

import allennlp

from allennlp.common import Registrable, Params
from allennlp.common.util import START_SYMBOL, END_SYMBOL

from allennlp.data.vocabulary import Vocabulary

from allennlp.models import Model

from allennlp.modules import FeedForward
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper

from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits

from allennlp.nn.beam_search import BeamSearch

from allennlp.training.metrics import F1Measure, BLEU, Auc, BooleanAccuracy

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

@Model.register('baseline')
class Baseline(Model):
    def __init__(self, embeddings: TextFieldEmbedder, encoder: Seq2VecEncoder, classifier: FeedForward, vocab: Vocabulary) -> None:
        super().__init__(vocab)

        self.embedding = embeddings
        
        self.encoder = encoder
        self.classifier = classifier
        
        self.loss = nn.BCEWithLogitsLoss()
        self.accuracy = BooleanAccuracy()
        
    @overrides
    def forward(self, idx: Dict[str, torch.Tensor], comment_text: Dict[str, torch.Tensor], target: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        mask = get_text_field_mask(comment_text)

        x = self.embedding(comment_text)
        x = self.encoder(x, mask)
        x = self.classifier(x).view(-1)
        
        logits = torch.sigmoid(x)
                
        out = {'idx': idx, 'pred': logits}

        if target is not None:
            if not self.training:
                self.accuracy((logits > 0.5).int(), target.int())

            out['loss'] = self.loss(x, target.float())

        return out

    @overrides
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        if not self.training:
            metrics = {
                "accuracy": self.accuracy.get_metric(reset)
            }
        else:
            metrics = {}
        
        return metrics

Writing jigsaw/model.py


## Predictor to get test predictions

In [None]:
%%writefile jigsaw/predictor.py
import os
import random
from typing import Dict, Tuple, List
from overrides import overrides
import json

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import skimage
import cv2

import torch
import torch.nn as nn
import torch.nn.functional as F

import torchvision

import mlcrate as mlc

import allennlp

from allennlp.common import Registrable, Params
from allennlp.common.util import START_SYMBOL, END_SYMBOL, JsonDict, sanitize

from allennlp.data import DatasetReader, Instance
from allennlp.data.vocabulary import Vocabulary

from allennlp.models import Model

from allennlp.predictors.predictor import Predictor

from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.nn.beam_search import BeamSearch

from allennlp.training.metrics import F1Measure, BLEU

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

@Predictor.register('jigsaw')
class JigsawPredictor(Predictor):
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        
    def dump_line(self, outputs: JsonDict) -> str:
        pred = str(outputs['pred'])

        return f'{pred}\n'

Writing jigsaw/predictor.py


## Config file to set up experiments without changing the code

In [None]:
%%writefile config.json
{
    "dataset_reader": {
        "type": "jigsaw",
        "root_path": "./",
        "lazy": true,
        "subset": false,
        "tokenizer": {
            "type": "simple"
        }
    },
    "train_data_path": "train.csv",
    "validation_data_path": "val.csv",
    "model": {
        "type": "baseline",
        "embeddings": {
          "tokens": {
            "type": "embedding",
            "pretrained_file": "../input/quoratextemb/embeddings/glove.840B.300d/glove.840B.300d.txt",
            "embedding_dim": 300,
            "trainable": false
          }
        },
        'encoder': {
            'type': 'lstm',
            'bidirectional': false,
            'input_size': 300,
            'hidden_size': 64,
            'num_layers': 1
        },
        'classifier': {
            'input_dim': 64,
            'num_layers': 1,
            'hidden_dims': 1,
            'activations': 'linear' # sigmoid activation is applied separately
        }
    },
    "iterator": {
        "type": "bucket",
        "sorting_keys":[["comment_text", "num_tokens"]],
        "batch_size": 512
    },
    "trainer": {
        "num_epochs": 4,
        "cuda_device": 0,
        "optimizer": {
            "type": "adam",
            "lr": 0.001
        },
        "grad_clipping": 5,
        "learning_rate_scheduler": {
            "type": "reduce_on_plateau",
            "factor": 0.5,
            "patience": 5
        },
        "num_serialized_models_to_keep": 1,
        "summary_interval": 10,
        "histogram_interval": 100,
        "should_log_parameter_statistics": true,
        "should_log_learning_rate": true
    },
    'vocabulary': {
        'max_vocab_size': 100000,
#         "directory_path": "./vocabulary"
    }
}

Writing config.json


## Train the model

In [None]:
!allennlp train config.json -s ./logs --include-package jigsaw
# !rm -rf logs/*

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
2019-04-06 20:45:08,618 - INFO - allennlp.common.params - random_seed = 13370
2019-04-06 20:45:08,618 - INFO - allennlp.common.params - numpy_seed = 1337
2019-04-06 20:45:08,618 - INFO - allennlp.common.params - pytorch_seed = 133
2019-04-06 20:45:08,619 - INFO - allennlp.common.checks - Pytorch version: 1.0.1.post2
2019-04-06 20:45:08,620 - INFO - allennlp.common.params - evaluate_on_test = False
2019-04-06 20:45:08,620 - INFO - allennlp.common.from_params - instantiating class <class 'allennlp.data.dataset_readers.dataset_reader.DatasetReader'> from params {'lazy': True, 'root_path': './', 'subset': False, 'tokenizer': {'type': 'simple'}, 'type': 'jigsaw'} and extras set()
2019-04-06 20:45:08,621 - INFO - allennlp.common.params - dataset_reader.type = jigsaw
2019-04-06 20:45:08,621 - INFO - allennlp.common.from_params - instantiating class <class 'jigsaw.dataset.JigsawDatasetReader'> fro

## Evaluate the model's performance on the train and val sets

In [None]:
!allennlp evaluate --cuda-device 0 --include-package jigsaw ./logs/model.tar.gz train.csv
!allennlp evaluate --cuda-device 0 --include-package jigsaw ./logs/model.tar.gz val.csv

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
2019-04-06 22:07:42,342 - INFO - allennlp.models.archival - loading archive file ./logs/model.tar.gz
2019-04-06 22:07:42,343 - INFO - allennlp.models.archival - extracting archive file ./logs/model.tar.gz to temp dir /tmp/tmpfk7rmpiw
2019-04-06 22:07:43,386 - INFO - allennlp.data.vocabulary - Loading token dictionary from /tmp/tmpfk7rmpiw/vocabulary.
2019-04-06 22:07:43,484 - INFO - allennlp.common.from_params - instantiating class <class 'allennlp.models.model.Model'> from params {'classifier': {'activations': 'linear', 'hidden_dims': 1, 'input_dim': 64, 'num_layers': 1}, 'embeddings': {'tokens': {'embedding_dim': 300, 'trainable': False, 'type': 'embedding'}}, 'encoder': {'bidirectional': False, 'hidden_size': 64, 'input_size': 300, 'num_layers': 1, 'type': 'lstm'}, 'type': 'baseline'} and extras {'vocab'}
2019-04-06 22:07:43,485 - INFO - allennlp.common.from_params - instantiating class <c

In [None]:
%%time
!allennlp predict --output-file ./train_preds.csv --batch-size 64 --cuda-device 0 --use-dataset-reader --predictor jigsaw --include-package jigsaw --silent ./logs/model.tar.gz train.csv
# From https://superuser.com/questions/246837/how-do-i-add-text-to-the-beginning-of-a-file-in-bash
!sed -i '1s/^/prediction\n/' train_preds.csv
train_preds = pd.read_csv('train_preds.csv')
train_roc_auc_score = roc_auc_score(train_df.target.values > 0.5, train_preds.prediction.values)

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
2019-04-06 22:24:28,401 - INFO - allennlp.models.archival - loading archive file ./logs/model.tar.gz
2019-04-06 22:24:28,402 - INFO - allennlp.models.archival - extracting archive file ./logs/model.tar.gz to temp dir /tmp/tmp720urff3
2019-04-06 22:24:29,448 - INFO - allennlp.common.params - vocabulary.type = default
2019-04-06 22:24:29,448 - INFO - allennlp.data.vocabulary - Loading token dictionary from /tmp/tmp720urff3/vocabulary.
2019-04-06 22:24:29,545 - INFO - allennlp.common.from_params - instantiating class <class 'allennlp.models.model.Model'> from params {'classifier': {'activations': 'linear', 'hidden_dims': 1, 'input_dim': 64, 'num_layers': 1}, 'embeddings': {'tokens': {'embedding_dim': 300, 'trainable': False, 'type': 'embedding'}}, 'encoder': {'bidirectional': False, 'hidden_size': 64, 'input_size': 300, 'num_layers': 1, 'type': 'lstm'}, 'type': 'baseline'} and extras {'vocab'}


In [None]:
%%time
!allennlp predict --output-file ./val_preds.csv --batch-size 64 --cuda-device 0 --use-dataset-reader --predictor jigsaw --include-package jigsaw --silent ./logs/model.tar.gz val.csv
# From https://superuser.com/questions/246837/how-do-i-add-text-to-the-beginning-of-a-file-in-bash
!sed -i '1s/^/prediction\n/' val_preds.csv
val_preds = pd.read_csv('val_preds.csv')
val_roc_auc_score = roc_auc_score(val_df.target.values > 0.5, val_preds.prediction.values)

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
2019-04-06 22:40:13,555 - INFO - allennlp.models.archival - loading archive file ./logs/model.tar.gz
2019-04-06 22:40:13,556 - INFO - allennlp.models.archival - extracting archive file ./logs/model.tar.gz to temp dir /tmp/tmpiw2mry9z
2019-04-06 22:40:14,773 - INFO - allennlp.common.params - vocabulary.type = default
2019-04-06 22:40:14,774 - INFO - allennlp.data.vocabulary - Loading token dictionary from /tmp/tmpiw2mry9z/vocabulary.
2019-04-06 22:40:14,888 - INFO - allennlp.common.from_params - instantiating class <class 'allennlp.models.model.Model'> from params {'classifier': {'activations': 'linear', 'hidden_dims': 1, 'input_dim': 64, 'num_layers': 1}, 'embeddings': {'tokens': {'embedding_dim': 300, 'trainable': False, 'type': 'embedding'}}, 'encoder': {'bidirectional': False, 'hidden_size': 64, 'input_size': 300, 'num_layers': 1, 'type': 'lstm'}, 'type': 'baseline'} and extras {'vocab'}


In [None]:
!cat logs/metrics.json

{
  "best_epoch": 3,
  "peak_cpu_memory_MB": 6563.652,
  "peak_gpu_0_memory_MB": 1329,
  "training_duration": "01:15:04",
  "training_start_epoch": 0,
  "training_epochs": 3,
  "epoch": 3,
  "training_loss": 0.1069820237150221,
  "training_cpu_memory_MB": 6563.652,
  "training_gpu_0_memory_MB": 1329,
  "validation_accuracy": 0.9607618256111919,
  "validation_loss": 0.11243568266354101,
  "best_validation_accuracy": 0.9607618256111919,
  "best_validation_loss": 0.11243568266354101
}

In [None]:
print(f'Train ROC-AUC: {round(train_roc_auc_score, 4)}')
print(f'Val ROC-AUC: {round(val_roc_auc_score, 4)}')

Train ROC-AUC: 0.9543
Val ROC-AUC: 0.9415


## Predict on the test set and save submission

In [None]:
%%time
!allennlp predict --output-file ./test_preds.csv --batch-size 64 --cuda-device 0 --use-dataset-reader --predictor jigsaw --include-package jigsaw --silent ./logs/model.tar.gz ../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv
# From https://superuser.com/questions/246837/how-do-i-add-text-to-the-beginning-of-a-file-in-bash
!sed -i '1s/^/prediction\n/' test_preds.csv
test_preds = pd.read_csv('test_preds.csv')
sample_submission['prediction'] = test_preds['prediction'].values
mlc.kaggle.save_sub(sample_submission, 'submission.csv')

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
2019-04-06 22:44:10,079 - INFO - allennlp.models.archival - loading archive file ./logs/model.tar.gz
2019-04-06 22:44:10,080 - INFO - allennlp.models.archival - extracting archive file ./logs/model.tar.gz to temp dir /tmp/tmpj_zryeci
2019-04-06 22:44:11,118 - INFO - allennlp.common.params - vocabulary.type = default
2019-04-06 22:44:11,118 - INFO - allennlp.data.vocabulary - Loading token dictionary from /tmp/tmpj_zryeci/vocabulary.
2019-04-06 22:44:11,214 - INFO - allennlp.common.from_params - instantiating class <class 'allennlp.models.model.Model'> from params {'classifier': {'activations': 'linear', 'hidden_dims': 1, 'input_dim': 64, 'num_layers': 1}, 'embeddings': {'tokens': {'embedding_dim': 300, 'trainable': False, 'type': 'embedding'}}, 'encoder': {'bidirectional': False, 'hidden_size': 64, 'input_size': 300, 'num_layers': 1, 'type': 'lstm'}, 'type': 'baseline'} and extras {'vocab'}


In [None]:
sample_submission.head()

Unnamed: 0,id,prediction
0,7000000,0.007651
1,7000001,0.000628
2,7000002,0.001997
3,7000003,0.000672
4,7000004,0.961583


## Delete unnecessary files to free up more space

In [None]:
!rm -rf logs
!rm out.txt
!rm config.json

rm: cannot remove 'out.txt': No such file or directory


In [None]:
!rm -rf packages