In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from typing import *
import torch
import torch.optim as optim
import numpy as np
import pandas as pd
from functools import partial
from overrides import overrides

from allennlp.data import Instance
from allennlp.data.token_indexers import TokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.nn import util as nn_util

In [3]:
class Config(dict):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)
    
    def set(self, key, val):
        self[key] = val
        setattr(self, key, val)
        
config = Config(
    testing=True,
    seed=1,
    #changed from 64
    batch_size=48,
    lr=3e-4,
    epochs=2,
    hidden_sz=64,
    max_seq_len=100, # necessary to limit memory usage
    max_vocab_size=100000,
)

In [4]:
from allennlp.common.checks import ConfigurationError

In [5]:
USE_GPU = torch.cuda.is_available()

In [6]:
DATA_ROOT = Path("/reddit-selfposts")

In [7]:
USE_GPU

True

Set random seed manually to replicate results

In [8]:
torch.manual_seed(config.seed)

<torch._C.Generator at 0x7f27080bd190>

# Load Data

In [9]:
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.dataset_readers import DatasetReader

In [12]:
config.testing

True

In [10]:
df = pd.read_csv('reddit-selfposts/rspct.tsv',sep='\t')

In [11]:
labs=df["subreddit"].unique()

In [29]:
labs.tolist()

['talesfromtechsupport',
 'teenmom',
 'Harley',
 'ringdoorbell',
 'intel',
 'residentevil',
 'BATProject',
 'hockeyplayers',
 'asmr',
 'rawdenim',
 'steinsgate',
 'DBZDokkanBattle',
 'Nootropics',
 'l5r',
 'NameThatSong',
 'homeless',
 'antidepressants',
 'absolver',
 'KissAnime',
 'sissyhypno',
 'oculusnsfw',
 'dpdr',
 'Garmin',
 'AskLiteraryStudies',
 'poetry_critics',
 'skiing',
 'shrimptank',
 'logorequests',
 'Stargate',
 'foreskin_restoration',
 'sharepoint',
 'synthesizers',
 'gravityfalls',
 'androiddev',
 'Grimdawn',
 'driving',
 'FORTnITE',
 'dndnext',
 'Magic',
 'MtvChallenge',
 'FoWtcg',
 'harrypotter',
 'TryingForABaby',
 'sewing',
 'foxholegame',
 'madmen',
 'JUSTNOMIL',
 'APStudents',
 'sharditkeepit',
 'amateurradio',
 'sleeptrain',
 'fatpeoplestories',
 'GameStop',
 'scuba',
 'Firefighting',
 'Mustang',
 'riverdale',
 'flying',
 'bartenders',
 'scooters',
 'trumpet',
 'projecteternity',
 'musictheory',
 'factorio',
 'SexToys',
 'EternalCardGame',
 'PLC',
 'sailing',
 '

### Prepare dataset

In [12]:
label_cols = labs.tolist()

In [66]:
labels = label_cols

In [13]:
df['combined']=df['title'].str.cat(df['selftext'], sep=' ')
df.head()

Unnamed: 0,id,subreddit,title,selftext,combined
0,6d8knd,talesfromtechsupport,Remember your command line switches...,"Hi there, <lb>The usual. Long time lerker, fi...",Remember your command line switches... Hi ther...
1,58mbft,teenmom,"So what was Matt ""addicted"" to?",Did he ever say what his addiction was or is h...,"So what was Matt ""addicted"" to? Did he ever sa..."
2,8f73s7,Harley,No Club Colors,Funny story. I went to college in Las Vegas. T...,No Club Colors Funny story. I went to college ...
3,6ti6re,ringdoorbell,"Not door bell, but floodlight mount height.",I know this is a sub for the 'Ring Doorbell' b...,"Not door bell, but floodlight mount height. I ..."
4,77sxto,intel,Worried about my 8700k small fft/data stress r...,"Prime95 (regardless of version) and OCCT both,...",Worried about my 8700k small fft/data stress r...


In [71]:
from allennlp.data.fields import TextField, MetadataField, ArrayField, LabelField

class JigsawDatasetReader(DatasetReader):
    def __init__(self, tokenizer: Callable[[str], List[str]]=lambda x: x.split(),
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_seq_len: Optional[int]=config.max_seq_len) -> None:
        super().__init__(lazy=False)
        self.tokenizer = tokenizer
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self.max_seq_len = max_seq_len

    @overrides
    def text_to_instance(self, tokens: List[Token], id: str=None,
                         label: str=None) -> Instance:
        sentence_field = TextField(tokens, self.token_indexers)
        fields = {"tokens": sentence_field}
        
        id_field = MetadataField(id)
        fields['id'] = id_field
        
        if label is not None:
            fields['label'] = LabelField(label, label_namespace="subreddit")
        

        return Instance(fields)
# changed to pass df instead of filename in order to use combined title +selfpost    
    @overrides
    def _read(self, df) -> Iterator[Instance]:
        if config.testing: df = df.head(1000)
        for i, row in df.iterrows():
            yield self.text_to_instance(
                [Token(x) for x in self.tokenizer(row["combined"])],
                row["id"], row["subreddit"],
            )

### Prepare token handlers

We will use the spacy tokenizer here

In [72]:
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
from allennlp.data.token_indexers.elmo_indexer import ELMoCharacterMapper, ELMoTokenCharactersIndexer

# the token indexer is responsible for mapping tokens to integers
token_indexer = ELMoTokenCharactersIndexer()

def tokenizer(x: str):
    return [w.text for w in
            SpacyWordSplitter(language='en_core_web_sm', 
                              pos_tags=False).split_words(x)[:config.max_seq_len]]

In [73]:
reader = JigsawDatasetReader(
    tokenizer=tokenizer,
    token_indexers={"tokens": token_indexer}
)

In [74]:
#split 80% train 20% test
train_ds = reader.read(df[:810399])
test_ds = reader.read(df[810399:]) 
val_ds = None


0it [00:00, ?it/s][A
157it [00:00, 1568.85it/s][A
287it [00:00, 1475.27it/s][A
391it [00:00, 916.06it/s] [A
465it [00:00, 852.49it/s][A
533it [00:00, 775.34it/s][A
600it [00:00, 723.99it/s][A
674it [00:00, 727.66it/s][A
745it [00:00, 715.82it/s][A
814it [00:01, 669.57it/s][A
880it [00:01, 661.27it/s][A
956it [00:01, 687.71it/s][A
1000it [00:01, 757.72it/s][A
0it [00:00, ?it/s][A
67it [00:00, 663.13it/s][A
131it [00:00, 650.34it/s][A
207it [00:00, 674.53it/s][A
277it [00:00, 680.17it/s][A
338it [00:00, 648.33it/s][A
400it [00:00, 638.63it/s][A
477it [00:00, 672.43it/s][A
543it [00:00, 667.68it/s][A
607it [00:01, 470.21it/s][A
681it [00:01, 523.61it/s][A
746it [00:01, 555.83it/s][A
810it [00:01, 576.28it/s][A
880it [00:01, 607.49it/s][A
944it [00:01, 612.74it/s][A
1000it [00:01, 611.71it/s][A

In [18]:
len(train_ds)

1000

In [19]:
train_ds[:10]

[<allennlp.data.instance.Instance at 0x7f2679259c18>,
 <allennlp.data.instance.Instance at 0x7f267936e080>,
 <allennlp.data.instance.Instance at 0x7f2679390a90>,
 <allennlp.data.instance.Instance at 0x7f267937ccf8>,
 <allennlp.data.instance.Instance at 0x7f26791c19b0>,
 <allennlp.data.instance.Instance at 0x7f26791bcb38>,
 <allennlp.data.instance.Instance at 0x7f267921b320>,
 <allennlp.data.instance.Instance at 0x7f267917f358>,
 <allennlp.data.instance.Instance at 0x7f2679193358>,
 <allennlp.data.instance.Instance at 0x7f26791a7ac8>]

In [20]:
vars(train_ds[0].fields["tokens"])

{'tokens': [Remember,
  your,
  command,
  line,
  switches,
  ...,
  Hi,
  there,
  ,,
  <,
  lb,
  >,
  The,
  usual,
  .,
  Long,
  time,
  lerker,
  ,,
  first,
  time,
  poster,
  ,,
  be,
  kind,
  etc,
  .,
  Sorry,
  if,
  this,
  is,
  n't,
  the,
  right,
  place,
  ...,
  <lb><lb,
  >,
  Alright,
  .,
  Here,
  's,
  the,
  story,
  .,
  I,
  'm,
  an,
  independent,
  developer,
  who,
  produces,
  my,
  own,
  software,
  .,
  We,
  're,
  going,
  to,
  call,
  me,
  well,
  ,,
  $,
  me.<lb><lb,
  >,
  I,
  work,
  with,
  $,
  dev,
  who,
  helps,
  to,
  produce,
  software,
  with,
  me,
  .,
  We,
  use,
  $,
  PopularVersionControl.<lb><lb,
  >,
  We're,
  trying,
  to,
  remove,
  a,
  branch,
  that,
  was,
  created,
  by,
  mistake,
  .,
  The,
  branch,
  is],
 '_token_indexers': {'tokens': <allennlp.data.token_indexers.elmo_indexer.ELMoTokenCharactersIndexer at 0x7f266d01dc18>},
 '_indexed_tokens': None,
 '_indexer_name_to_indexed_token': None,
 '_token_index

In [21]:
vars(train_ds[0].fields["label"])

{'label': 'talesfromtechsupport',
 '_label_namespace': 'subreddit',
 '_label_id': None,
 '_skip_indexing': False}

### Prepare vocabulary

We don't need to build the vocab: all that is handled by the token indexer

In [75]:
vocab = Vocabulary()

### Prepare iterator

The iterator is responsible for batching the data and preparing it for input into the model. We'll use the BucketIterator that batches text sequences of smilar lengths together.

In [76]:
from allennlp.data.iterators import BucketIterator

In [77]:
iterator = BucketIterator(batch_size=config.batch_size, 
                          sorting_keys=[("tokens", "num_tokens")],
                         )

We need to tell the iterator how to numericalize the text data. We do this by passing the vocabulary to the iterator. This step is easy to forget so be careful! 

In [78]:
iterator.index_with(vocab)

### Read sample

In [79]:
batch = next(iter(iterator(train_ds)))

In [80]:
batch["tokens"]["tokens"]

tensor([[[259,  73, 102,  ..., 261, 261, 261],
         [259,  45, 260,  ..., 261, 261, 261],
         [259,  68,  98,  ..., 261, 261, 261],
         ...,
         [259, 104,  98,  ..., 261, 261, 261],
         [259,  45, 260,  ..., 261, 261, 261],
         [259, 111, 112,  ..., 261, 261, 261]],

        [[259,  66, 111,  ..., 261, 261, 261],
         [259, 102, 121,  ..., 261, 261, 261],
         [259,  46, 260,  ..., 261, 261, 261],
         ...,
         [259, 118, 113,  ..., 261, 261, 261],
         [259, 103, 112,  ..., 261, 261, 261],
         [259,  77, 102,  ..., 261, 261, 261]],

        [[259,  66, 101,  ..., 261, 261, 261],
         [259, 103, 112,  ..., 261, 261, 261],
         [259, 100, 112,  ..., 261, 261, 261],
         ...,
         [259,  98, 100,  ..., 261, 261, 261],
         [259, 117, 112,  ..., 261, 261, 261],
         [259,  98, 260,  ..., 261, 261, 261]],

        ...,

        [[259,  88, 105,  ..., 261, 261, 261],
         [259,  98, 115,  ..., 261, 261, 261]

In [81]:
batch["tokens"]["tokens"].shape

torch.Size([48, 100, 50])

In [82]:
len(label_cols)

1013

# Prepare Model

In [83]:
import torch
import torch.nn as nn
import torch.optim as optim

In [153]:
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.nn.util import get_text_field_mask
from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder

class BaselineModel(Model):
    def __init__(self, word_embeddings: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 out_sz: int=len(label_cols)):
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        self.projection = nn.Linear(self.encoder.get_output_dim(), out_sz)        
        #self.loss = nn.BCEWithLogitsLoss()
        self.loss = nn.CrossEntropyLoss()
        
    def forward(self, tokens: Dict[str, torch.Tensor],
                id: Any, label: torch.Tensor) -> torch.Tensor:
        mask = get_text_field_mask(tokens)
        embeddings = self.word_embeddings(tokens)
        state = self.encoder(embeddings, mask)
        class_logits = self.projection(state)
        
        output = {"class_logits": class_logits}
        output["loss"] = self.loss(class_logits, label)

        return output

### Prepare embeddings

In [154]:
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import ElmoTokenEmbedder

options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'

elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})


In [155]:
# The dimension of the ELMo embedding will be 2 x [size of LSTM hidden states]
from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper
encoder: Seq2VecEncoder = PytorchSeq2VecWrapper(nn.LSTM(word_embeddings.get_output_dim(), config.hidden_sz, bidirectional=True, batch_first=True))

Notice how simple and modular the code for initializing the model is. All the complexity is delegated to each component.

In [156]:
model = BaselineModel(
    word_embeddings, 
    encoder, 
    
)

In [157]:
if USE_GPU: model.cuda()
else: model

In [158]:
word_embeddings.get_output_dim()

256

# Basic sanity checks

In [159]:
batch = nn_util.move_to_device(batch, 0 if USE_GPU else -1)

In [160]:
tokens = batch["tokens"]
labels = batch

In [161]:
len(batch)

3

In [162]:
mask = get_text_field_mask(tokens)
mask

tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')

In [163]:
embeddings = model.word_embeddings(tokens)
state = model.encoder(embeddings, mask)
class_logits = model.projection(state)
class_logits

tensor([[ 0.0980,  0.0321,  0.2379,  ...,  0.1722, -0.2032,  0.0974],
        [-0.1167, -0.0541, -0.0688,  ...,  0.1196, -0.0654, -0.0840],
        [-0.0474,  0.0684, -0.1666,  ...,  0.0241,  0.0820,  0.2559],
        ...,
        [-0.0172, -0.0043, -0.0644,  ...,  0.1197, -0.1370,  0.1961],
        [ 0.1296,  0.0076, -0.0863,  ...,  0.1371, -0.1519, -0.0759],
        [-0.0839, -0.0278, -0.0297,  ...,  0.2144, -0.3449,  0.0702]],
       device='cuda:0', grad_fn=<AddmmBackward>)

In [164]:
model(**batch)

{'class_logits': tensor([[ 0.0225,  0.0352,  0.1239,  ...,  0.0717, -0.1963,  0.1122],
         [-0.1217, -0.0098, -0.0310,  ..., -0.0282, -0.1682,  0.1307],
         [-0.0331,  0.0834, -0.1441,  ...,  0.0225, -0.2000, -0.0023],
         ...,
         [ 0.0147, -0.1620, -0.1021,  ..., -0.0043, -0.0536,  0.1194],
         [ 0.0688,  0.0390, -0.0924,  ...,  0.1304, -0.2704,  0.0936],
         [-0.0279, -0.1960, -0.1062,  ...,  0.3022, -0.1587,  0.1246]],
        device='cuda:0', grad_fn=<AddmmBackward>),
 'loss': tensor(6.9538, device='cuda:0', grad_fn=<NllLossBackward>)}

In [165]:
loss = model(**batch)["loss"]

In [166]:
loss

tensor(6.9708, device='cuda:0', grad_fn=<NllLossBackward>)

In [167]:
loss.backward()

In [168]:
[x.grad for x in list(model.encoder.parameters())]

[tensor([[ 8.1649e-05, -4.0192e-06, -1.7894e-04,  ...,  3.3296e-04,
          -4.5080e-04, -1.3943e-04],
         [-3.7892e-04, -2.8551e-04,  1.1013e-04,  ...,  4.9197e-04,
           3.4767e-04,  2.0401e-04],
         [-1.0234e-03, -3.2159e-04, -2.1497e-04,  ..., -6.4096e-04,
          -9.6384e-05,  3.3268e-04],
         ...,
         [ 2.2296e-04,  2.6854e-04,  2.5267e-05,  ..., -5.8279e-04,
          -4.8647e-04, -8.6643e-05],
         [-1.6347e-05, -6.5584e-05,  7.2844e-05,  ..., -8.6177e-05,
          -6.1994e-05, -6.5548e-05],
         [ 2.2110e-04,  4.4802e-04, -1.8429e-05,  ...,  6.7507e-05,
          -2.1316e-04, -7.5948e-05]], device='cuda:0'),
 tensor([[-4.0692e-05, -1.6632e-04,  2.8889e-04,  ...,  1.0030e-04,
          -2.5475e-04,  4.2535e-06],
         [ 3.0712e-05,  8.6505e-05, -1.3134e-04,  ..., -1.7953e-04,
           1.3291e-04,  1.2732e-04],
         [-2.1602e-04, -8.0404e-05,  4.1226e-05,  ...,  1.3235e-04,
          -1.2508e-04, -3.1627e-04],
         ...,
        

In [169]:
model.parameters()

<generator object Module.parameters at 0x7f2697701930>

# Train

In [170]:
optimizer = optim.Adam(model.parameters(), lr=config.lr)

In [171]:
from allennlp.training.trainer import Trainer

trainer = Trainer(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train_ds,
    cuda_device=0 if USE_GPU else -1,
    num_epochs=config.epochs,
)

In [172]:
metrics = trainer.train()


  0%|          | 0/21 [00:00<?, ?it/s][A
loss: 6.9842 ||:   5%|▍         | 1/21 [00:00<00:06,  3.18it/s][A
loss: 6.9582 ||:  10%|▉         | 2/21 [00:00<00:05,  3.45it/s][A
loss: 6.9339 ||:  14%|█▍        | 3/21 [00:00<00:05,  3.59it/s][A
loss: 6.9163 ||:  19%|█▉        | 4/21 [00:01<00:04,  3.70it/s][A
loss: 6.8950 ||:  24%|██▍       | 5/21 [00:01<00:04,  3.77it/s][A
loss: 6.8776 ||:  29%|██▊       | 6/21 [00:01<00:03,  4.10it/s][A
loss: 6.8626 ||:  33%|███▎      | 7/21 [00:01<00:03,  4.07it/s][A
loss: 6.8455 ||:  38%|███▊      | 8/21 [00:02<00:03,  4.04it/s][A
loss: 6.8283 ||:  43%|████▎     | 9/21 [00:02<00:03,  3.99it/s][A
loss: 6.8136 ||:  48%|████▊     | 10/21 [00:02<00:02,  3.96it/s][A
loss: 6.7943 ||:  52%|█████▏    | 11/21 [00:02<00:02,  4.09it/s][A
loss: 6.7761 ||:  57%|█████▋    | 12/21 [00:02<00:02,  4.05it/s][A
loss: 6.7608 ||:  62%|██████▏   | 13/21 [00:03<00:01,  4.02it/s][A
loss: 6.7442 ||:  67%|██████▋   | 14/21 [00:03<00:01,  3.98it/s][A
loss: 6.7283 |

# Generating Predictions

In [173]:
from allennlp.data.iterators import DataIterator
from tqdm import tqdm
from scipy.special import expit # the sigmoid function

def tonp(tsr): return tsr.detach().cpu().numpy()

class Predictor:
    def __init__(self, model: Model, iterator: DataIterator,
                 cuda_device: int=-1) -> None:
        self.model = model
        self.iterator = iterator
        self.cuda_device = cuda_device
        
    def _extract_data(self, batch) -> np.ndarray:
        out_dict = self.model(**batch)
        return expit(tonp(out_dict["class_logits"]))
    
    def predict(self, ds: Iterable[Instance]) -> np.ndarray:
        pred_generator = self.iterator(ds, num_epochs=1, shuffle=False)
        self.model.eval()
        pred_generator_tqdm = tqdm(pred_generator,
                                   total=self.iterator.get_num_batches(ds))
        preds = []
        with torch.no_grad():
            for batch in pred_generator_tqdm:
                batch = nn_util.move_to_device(batch, self.cuda_device)
                preds.append(self._extract_data(batch))
        return np.concatenate(preds, axis=0)

In [174]:
from allennlp.data.iterators import BasicIterator
# iterate over the dataset without changing its order
seq_iterator = BasicIterator(batch_size=64)
seq_iterator.index_with(vocab)

In [175]:
predictor = Predictor(model, seq_iterator, cuda_device=0 if USE_GPU else -1)
train_preds = predictor.predict(train_ds) 
test_preds = predictor.predict(test_ds) 


  0%|          | 0/16 [00:00<?, ?it/s][A
  6%|▋         | 1/16 [00:00<00:05,  2.57it/s][A
 12%|█▎        | 2/16 [00:00<00:04,  2.83it/s][A
 19%|█▉        | 3/16 [00:00<00:04,  3.05it/s][A
 25%|██▌       | 4/16 [00:01<00:03,  3.21it/s][A
 31%|███▏      | 5/16 [00:01<00:03,  3.35it/s][A
 38%|███▊      | 6/16 [00:01<00:02,  3.44it/s][A
 44%|████▍     | 7/16 [00:02<00:02,  3.51it/s][A
 50%|█████     | 8/16 [00:02<00:02,  3.56it/s][A
 56%|█████▋    | 9/16 [00:02<00:01,  3.60it/s][A
 62%|██████▎   | 10/16 [00:02<00:01,  3.63it/s][A
 69%|██████▉   | 11/16 [00:03<00:01,  3.63it/s][A
 75%|███████▌  | 12/16 [00:03<00:01,  3.61it/s][A
 81%|████████▏ | 13/16 [00:03<00:00,  3.63it/s][A
 88%|████████▊ | 14/16 [00:03<00:00,  3.65it/s][A
 94%|█████████▍| 15/16 [00:04<00:00,  3.69it/s][A
100%|██████████| 16/16 [00:04<00:00,  3.87it/s][A
  0%|          | 0/16 [00:00<?, ?it/s][A
100%|██████████| 16/16 [00:05<00:00,  3.70it/s]
