In [1]:
from pathlib import Path
import torch
import torch.optim as optim
import numpy as np
import pandas as pd
from functools import partial
from overrides import overrides

from allennlp.data import Instance
from allennlp.data.token_indexers import TokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.nn import util as nn_util

In [2]:
class Config(dict):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)
    
    def set(self, key, val):
        self[key] = val
        setattr(self, key, val)
        
config = Config(
    testing=False,
    seed=1,
    batch_size=32,
    lr=3e-4,
    epochs=2,
    hidden_sz=64,
    max_seq_len=100, # necessary to limit memory usage
    max_vocab_size=100000,
)

In [3]:
from allennlp.common.checks import ConfigurationError


In [4]:
USE_GPU = torch.cuda.is_available()


In [5]:
DATA_ROOT = Path("../data") / "processed"

In [6]:
from typing import Dict, Iterable, List, Tuple

import torch
from allennlp.data import DatasetReader, Instance
from allennlp.data import Vocabulary
from allennlp.data.fields import LabelField, TextField
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token, Tokenizer, WhitespaceTokenizer
from allennlp.models import Model
from allennlp.modules import TextFieldEmbedder, Seq2VecEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2vec_encoders import BagOfEmbeddingsEncoder
from allennlp.nn import util
from allennlp.training.metrics import CategoricalAccuracy

In [7]:
ls ../data/processed

test.text.trimmed.tsv  train.text.trimmed.tsv  [0m[01;36mtrain.text.tsv[0m@


In [8]:
# @DatasetReader.register('classification-tsv')
class ClassificationTsvReader(DatasetReader):
    def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_tokens: int = None,
                 **kwargs):
        super().__init__(**kwargs)
        self.tokenizer = tokenizer or WhitespaceTokenizer()
        self.token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
        self.max_tokens = max_tokens

    def text_to_instance(self, text: str, label: str = None) -> Instance:
        tokens = self.tokenizer.tokenize(text)
        if self.max_tokens:
            tokens = tokens[:self.max_tokens]
        text_field = TextField(tokens, self.token_indexers)
        fields = {'text': text_field}
        if label:
            fields['label'] = LabelField((label))
        return Instance(fields)

#     def _read(self, file_path: str) -> Iterable[Instance]:
#         df = pd.read_csv(file_path, sep='\t', header=None, index_col=0)
#         df.columns = ['hyperpartisan', '1', '2', 'text', 'title']
#         df = df.drop(columns=['1','2'])
#         
#         print(df.head())
#         
#         for i, row in df.iterrows():
#             yield self.text_to_instance(row["text"], row["hyperpartisan"])

    def _read(self, file_path: str) -> Iterable[Instance]:
        with open(file_path, 'r') as lines:
            for line in lines:
                label, text, title = line.strip().split('\t')
                yield self.text_to_instance(text, label)


In [9]:
# Instantiate and use the dataset reader to read a file containing the data
reader = ClassificationTsvReader()
dataset = reader.read(DATA_ROOT / 'train.text.trimmed.tsv')

# Returned dataset is a list of Instances by default
print('type of dataset: ', type(dataset))
print('type of its first element: ', type(dataset[0]))
print('size of dataset: ', len(dataset))

HBox(children=(HTML(value='reading instances'), FloatProgress(value=1.0, bar_style='info', layout=Layout(width…

type of dataset:  <class 'allennlp.data.dataset_readers.dataset_reader.AllennlpDataset'>
type of its first element:  <class 'allennlp.data.instance.Instance'>
size of dataset:  545





In [10]:
# Nsamp = 1000
# maxtokens = 500
# maxtokenlen = 20
# 
# def tokenizer(row):
#     if row is None or row == '':
#         tokens = ""
#     else:
#         tokens = str(row).split(" ")[:maxtokens]
#     return tokens

In [11]:
type(dataset[1])

allennlp.data.instance.Instance

In [12]:
all_instance_fields_and_types: List[Dict[str, str]] = [
            {k: v.__class__.__name__ for k, v in x.fields.items()} for x in dataset
        ]

In [13]:
all_instance_fields_and_types

[{'text': 'TextField', 'label': 'LabelField'},
 {'text': 'TextField', 'label': 'LabelField'},
 {'text': 'TextField', 'label': 'LabelField'},
 {'text': 'TextField', 'label': 'LabelField'},
 {'text': 'TextField', 'label': 'LabelField'},
 {'text': 'TextField', 'label': 'LabelField'},
 {'text': 'TextField', 'label': 'LabelField'},
 {'text': 'TextField', 'label': 'LabelField'},
 {'text': 'TextField', 'label': 'LabelField'},
 {'text': 'TextField', 'label': 'LabelField'},
 {'text': 'TextField', 'label': 'LabelField'},
 {'text': 'TextField', 'label': 'LabelField'},
 {'text': 'TextField', 'label': 'LabelField'},
 {'text': 'TextField', 'label': 'LabelField'},
 {'text': 'TextField', 'label': 'LabelField'},
 {'text': 'TextField', 'label': 'LabelField'},
 {'text': 'TextField', 'label': 'LabelField'},
 {'text': 'TextField', 'label': 'LabelField'},
 {'text': 'TextField', 'label': 'LabelField'},
 {'text': 'TextField', 'label': 'LabelField'},
 {'text': 'TextField', 'label': 'LabelField'},
 {'text': 'Te

In [14]:
all(all_instance_fields_and_types[0] == x for x in all_instance_fields_and_types)

True

In [15]:
list(dataset[1].fields["text"])

[Fox,
 News,
 panel,
 with,
 Katie,
 Pavlich,
 and,
 Adrienne,
 Elrod,
 (,
 Photo,
 :,
 Screen,
 capture,
 ),
 <splt>,
 A,
 Fox,
 News,
 host,
 and,
 panelist,
 alleged,
 that,
 it,
 was,
 Hillary,
 Clinton,
 that,
 colluded,
 with,
 Russia,
 and,
 thus,
 should,
 be,
 taken,
 to,
 jail,
 —,
 not,
 President,
 Donald,
 Trump,
 .,
 <splt>,
 While,
 Russian,
 bots,
 sowed,
 division,
 among,
 Democratic,
 voters,
 ,,
 attacked,
 people,
 of,
 color,
 ,,
 attacked,
 Hillary,
 Clinton,
 ,,
 worked,
 to,
 suppress,
 voters,
 of,
 color,
 then,
 spread,
 fake,
 voter,
 fraud,
 claims,
 ,,
 Fox,
 News,
 thinks,
 this,
 was,
 a,
 campaign,
 in,
 favor,
 of,
 Clinton,
 's,
 2016,
 bid,
 .,
 <splt>,
 ",
 Well,
 ,,
 if,
 it,
 's,
 really,
 a,
 big,
 threat,
 and,
 we,
 want,
 to,
 stop,
 them,
 from,
 meddling,
 in,
 our,
 elections,
 ,,
 why,
 is,
 it,
 that,
 during,
 a,
 three,
 -,
 year,
 time,
 span,
 the,
 Obama,
 administration,
 really,
 did,
 n't,
 do,
 much,
 to,
 stop,
 this,
 from,
 h

In [16]:
len(dataset)

545

In [17]:
@Model.register('simple_classifier')
class SimpleClassifier(Model):
    def __init__(self,
                 vocab: Vocabulary,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2VecEncoder):
        super().__init__(vocab)
        self.embedder = embedder
        self.encoder = encoder
        num_labels = vocab.get_vocab_size("labels")
        self.classifier = torch.nn.Linear(encoder.get_output_dim(), num_labels)
        self.accuracy = CategoricalAccuracy()
        
    def forward(self,
                text: Dict[str, torch.Tensor],
                label: torch.Tensor) -> Dict[str, torch.Tensor]:
        # Shape: (batch_size, num_tokens, embedding_dim)
        embedded_text = self.embedder(text)
        # Shape: (batch_size, num_tokens)
        mask = util.get_text_field_mask(text)
        # Shape: (batch_size, encoding_dim)
        encoded_text = self.encoder(embedded_text, mask)
        # Shape: (batch_size, num_labels)
        logits = self.classifier(encoded_text)
        # Shape: (batch_size, num_labels)
        probs = torch.nn.functional.softmax(logits)
        # Shape: (1,)
        loss = torch.nn.functional.cross_entropy(logits, label)
        return {'loss': loss, 'probs': probs}
    
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {"accuracy": self.accuracy.get_metric(reset)}
    


In [18]:

import allennlp
from allennlp.data import PyTorchDataLoader, DatasetReader
from allennlp.nn import util
from allennlp.training.trainer import GradientDescentTrainer, Trainer
from allennlp.training.optimizers import AdamOptimizer
from allennlp.training.metrics import CategoricalAccuracy


In [27]:

def run_training_loop():
    dataset_reader = ClassificationTsvReader(max_tokens=64)

    train_data, dev_data = read_data(dataset_reader)

    vocab = build_vocab(train_data + dev_data)
    model = build_model(vocab)

    # outputs = model.forward_on_instances(instances[:4])
    # print(outputs)
    
    # This is the allennlp-specific functionality in the Dataset object;
    # we need to be able convert strings in the data to integers, and this
    # is how we do it.
    train_data.index_with(vocab)
    
    dev_data.index_with(vocab)

    # These are again a subclass of pytorch DataLoaders, with an
    # allennlp-specific collate function, that runs our indexing and
    # batching code.
    train_loader, dev_loader = build_data_loaders(train_data, dev_data)
    
    
    trainer = build_trainer(
        model,
        train_loader,
        dev_loader,
    )
    print("Starting training")
    trainer.train()
    print("Finished training")
    return model, dataset_reader

def build_vocab(instances: Iterable[Instance]) -> Vocabulary:
    print("Building the vocabulary")
    return Vocabulary.from_instances(instances)

def build_model(vocab: Vocabulary) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    embedder = BasicTextFieldEmbedder(
        {"tokens": Embedding(embedding_dim=10, num_embeddings=vocab_size)})
    encoder = BagOfEmbeddingsEncoder(embedding_dim=10)
    return SimpleClassifier(vocab, embedder, encoder)

def build_data_loaders(
    train_data: torch.utils.data.Dataset,
    dev_data: torch.utils.data.Dataset,
) -> Tuple[allennlp.data.DataLoader, allennlp.data.DataLoader]:
    # Note that DataLoader is imported from allennlp above, *not* torch.
    # We need to get the allennlp-specific collate function, which is
    # what actually does indexing and batching.
    train_loader = PyTorchDataLoader(train_data, batch_size=8, shuffle=True)
    dev_loader = PyTorchDataLoader(train_data, batch_size=8, shuffle=False)
    return train_loader, dev_loader

def build_trainer(
    model: Model,
    train_loader: PyTorchDataLoader,
    dev_loader: PyTorchDataLoader,
) -> Trainer:
    parameters = [
        [n, p]
        for n, p in model.named_parameters() if p.requires_grad
    ]
    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1
    optimizer = AdamOptimizer(parameters)
    trainer = GradientDescentTrainer(
        model=model,
        data_loader=train_loader,
        validation_data_loader=dev_loader,
        num_epochs=5,
        optimizer=optimizer,
    )
    return trainer

def read_data(
    reader: DatasetReader
) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    training_data = reader.read(DATA_ROOT / 'train.text.trimmed.tsv')
    validation_data = reader.read(DATA_ROOT / 'test.text.trimmed.tsv')
    # validation_data = reader.read("quick_start/data/movie_review/dev.tsv")
    return training_data, validation_data


In [28]:


model, dataset_reader = run_training_loop()

Reading data


HBox(children=(HTML(value='reading instances'), FloatProgress(value=1.0, bar_style='info', layout=Layout(width…




HBox(children=(HTML(value='reading instances'), FloatProgress(value=1.0, bar_style='info', layout=Layout(width…

Building the vocabulary





HBox(children=(HTML(value='building vocab'), FloatProgress(value=0.0, max=645.0), HTML(value='')))


You provided a validation dataset but patience was set to None, meaning that early stopping is disabled


Building the model
Starting training


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=69.0), HTML(value='')))

  probs = torch.nn.functional.softmax(logits)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=69.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=69.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=69.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=69.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=69.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=69.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=69.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=69.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=69.0), HTML(value='')))

Finished training





In [29]:
test_data = dataset_reader.read('../data/processed/test.text.trimmed.tsv')

HBox(children=(HTML(value='reading instances'), FloatProgress(value=1.0, bar_style='info', layout=Layout(width…




In [30]:
test_data.index_with(model.vocab)

In [31]:
data_loader = PyTorchDataLoader(test_data, batch_size=8)

In [35]:
from allennlp.training.util import evaluate

results = evaluate(model, data_loader, cuda_device=0)

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

  probs = torch.nn.functional.softmax(logits)



In [36]:
print(results)

{'accuracy': 0.0, 'loss': 0.8926467872582949}
