<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [6]:
import logging
from typing import List, Sequence, Iterable, Tuple, Dict

import pandas as pd
from allennlp.common.checks import ConfigurationError
from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import Field, LabelField, TextField, ArrayField
from allennlp.data.instance import Instance
from allennlp.data.tokenizers import Tokenizer, SpacyTokenizer
from allennlp.data.tokenizers import PretrainedTransformerTokenizer
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from overrides import overrides
from sklearn.utils import resample

In [4]:
# Module-level logger used by the reader below (it was previously referenced without
# ever being defined, which raised NameError on the first read).
logger = logging.getLogger(__name__)


@DatasetReader.register("aita_transformer_reader")
class AITATestReader(DatasetReader):
    """
    Reads a pickled pandas `DataFrame` of posts and converts every row into a
    sentence-pair classification `Instance`.

    The frame must provide the columns `title`, `selftext` and `label`.  For each row the
    title and the post body are tokenized together with `tokenizer.tokenize_sentence_pair()`
    and stored in a single `tokens` field; the label, when truthy, is stored in a `label`
    field.

    Before instances are produced, the rows of every label can be resampled with
    replacement so that all labels end up with as many rows as the most frequent one.

    # Parameters
    tokenizer : `Tokenizer`, optional (default=`SpacyTokenizer()`)
        Tokenizer applied to the (title, post) pair.  It must provide
        `tokenize_sentence_pair()`, which `text_to_instance` calls unconditionally.
        NOTE(review): the default `SpacyTokenizer` does not appear to offer that method,
        so in practice a `PretrainedTransformerTokenizer` has to be passed - confirm.
    token_indexers : `Dict[str, TokenIndexer]`, optional (default=`{"tokens": SingleIdTokenIndexer()}`)
        Indexers attached to the `tokens` field.  See :class:`TokenIndexer`.
    combine_input_fields : `bool`, optional
            (default=`isinstance(tokenizer, PretrainedTransformerTokenizer)`)
        Stored on the reader for configuration compatibility.
        NOTE(review): the value is currently not consulted; title and post are always
        combined into one `tokens` field.
    resample_labels : `bool`, optional (default=`True`)
        If True, oversample every label (with replacement, fixed seed) up to the size of
        the largest label before yielding instances.  If False, rows are yielded as stored.
    """

    # Fixed seed handed to `sklearn.utils.resample` so that repeated reads of the same
    # file produce the same oversampled rows.
    RESAMPLE_SEED = 420

    def __init__(
        self,
        tokenizer: Tokenizer = None,
        token_indexers: Dict[str, TokenIndexer] = None,
        combine_input_fields: bool = None,
        resample_labels: bool = True,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self._resample_labels = resample_labels
        if combine_input_fields is not None:
            self._combine_input_fields = combine_input_fields
        else:
            self._combine_input_fields = isinstance(self._tokenizer, PretrainedTransformerTokenizer)

    def _oversample_to_largest_label(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Returns a new frame in which every label has as many rows as the largest label.

        Rows are drawn with replacement for *every* label, including the largest one, so
        the result is a bootstrap sample per label rather than the original rows plus
        duplicates.  `df` must contain at least one row.
        """
        per_label = [df[df.label == label] for label in df.label.unique()]
        target_size = max(len(label_df) for label_df in per_label)
        return pd.concat(
            [
                resample(
                    label_df,
                    replace=True,
                    n_samples=target_size,
                    random_state=self.RESAMPLE_SEED,
                )
                for label_df in per_label
            ]
        )

    @overrides
    def _read(self, file_path: str):
        """
        Yields one `Instance` per (possibly resampled) row of the pickled frame at
        `file_path`, which may be a local path or a URL.
        """
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        # SECURITY NOTE: unpickling can execute arbitrary code.  Only point this reader
        # at pickle files from a trusted source, in particular when `file_path` is a URL.
        df = pd.read_pickle(file_path)
        logger.info("Label Initial Counts")
        logger.info(df.label.value_counts())

        # An empty frame has no label groups to balance (and `max()` of nothing raises),
        # so resampling is skipped and the loop below simply yields no instances.
        if self._resample_labels and not df.empty:
            logger.info("Resampling labels, since resample_labels"
                " was set to true.")
            df = self._oversample_to_largest_label(df)
            logger.info("New label sampling is:")
            logger.info(df.label.value_counts())

        # Iterating the three columns directly avoids building a Series per row.
        for title, post, label in zip(df["title"], df["selftext"], df["label"]):
            yield self.text_to_instance(title, post, label)

    @overrides
    def text_to_instance(
        self,  # type: ignore
        title: str,
        post: str,
        label: str = None,
    ) -> Instance:
        """
        Builds an `Instance` from a post title and body.

        # Parameters
        title : `str`
            Post title; first element of the tokenized sentence pair.
        post : `str`
            Post body; second element of the tokenized sentence pair.
        label : `str`, optional (default=`None`)
            Gold label.  Falsy values (`None`, `""`) produce an instance without a
            `label` field, e.g. at prediction time.

        # Returns
        An `Instance` with a `tokens` field and, if a label was given, a `label` field.
        """
        fields: Dict[str, Field] = {}
        tokens = self._tokenizer.tokenize_sentence_pair(title, post)
        fields["tokens"] = TextField(tokens, self._token_indexers)

        if label:
            fields["label"] = LabelField(label)

        return Instance(fields)

In [7]:
# Load the pickled DataFrame stored at ../data/aita-tiny-test.pkl.
# NOTE(review): unpickling runs arbitrary code, so this file must come from a trusted
# source; `df` is not referenced by any later cell visible in this notebook.
df = pd.read_pickle('../data/aita-tiny-test.pkl')

In [9]:
# Build AllenNLP's wrapper around the pretrained `roberta-base` tokenizer.
# NOTE(review): `max_length=128` presumably caps the tokenized sequence length - confirm
# against the installed AllenNLP version.
tokenizer = PretrainedTransformerTokenizer('roberta-base', max_length=128)

I0315 09:41:18.396000 140644183689024 configuration_utils.py:254] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json from cache at /home/wfu/.cache/torch/transformers/e1a2a406b5a05063c31f4dfdee7608986ba7c6393f7f79db5e69dcd197208534.a7ab0e5de2d8321d6d6a15b199110f2c99be72976b7d151423cb8d8c261a13b6
I0315 09:41:18.399602 140644183689024 configuration_utils.py:290] Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "do_sample": false,
  "eos_token_ids": 0,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-05,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings":

In [14]:
# Show the signature and docstring of the pair-tokenization method that
# `AITATestReader.text_to_instance` relies on.
help(tokenizer.tokenize_sentence_pair)

Help on method tokenize_sentence_pair in module allennlp.data.tokenizers.pretrained_transformer_tokenizer:

tokenize_sentence_pair(sentence_1:str, sentence_2:str) -> List[allennlp.data.tokenizers.token.Token] method of allennlp.data.tokenizers.pretrained_transformer_tokenizer.PretrainedTransformerTokenizer instance
    This methods properly handles a pair of sentences.



In [18]:
# `tokenize` accepts a single text; calling it with two texts failed with
# "TypeError: tokenize() takes 2 positional arguments but 3 were given".
# A (sentence_1, sentence_2) pair has to go through `tokenize_sentence_pair` instead.
tokenizer.tokenize_sentence_pair("Hi " * 1000, "Test " * 100)