In [1]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

from allennlp.data.iterators import BasicIterator
from allennlp.data.iterators import BucketIterator
from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.seq2vec_encoders import CnnHighwayEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.training.trainer import Trainer

from scene.data.reader import DataReader
from scene.data.tokenizers import spacy_word_tokenizer
from scene.ml.encoders import YoonKimConv1DEncoder
from scene.ml.models import BaselineModel

In [2]:
# Directory containing the CSV splits; later cells read 'train', 'val' and
# 'test' from it and join 'test.csv' / 'val.csv' onto it.
# NOTE(review): hard-coded absolute path on one machine -- adjust before
# running this notebook elsewhere.
DATAPATH = '/home/ygx/dev/kaggle/scene/data/splits/small_val/csv'

In [3]:
# Build the dataset reader, passing spacy_word_tokenizer as its tokenizer.
reader = DataReader(
    tokenizer=spacy_word_tokenizer,
)

# Read the three splits stored under DATAPATH. They are kept as separate
# statements so a split that has already been loaded stays bound even if a
# later read fails (each read shows its own progress bar).
traindata = reader.read(DATAPATH, 'train')
valdata = reader.read(DATAPATH, 'val')
testdata = reader.read(DATAPATH, 'test')

20321it [01:46, 198.80it/s]
2258it [00:11, 198.47it/s]
5589it [00:28, 196.31it/s]


In [4]:
# The vocabulary is fitted on the concatenation of all three splits.
vocab = Vocabulary.from_instances(traindata + valdata + testdata)

# Fixed-size batches in dataset order. (A BucketIterator with batch_size=28 and
# sorting_keys=[("tokens", "num_tokens")] is the disabled alternative.)
iterator = BasicIterator(batch_size=28)
iterator.index_with(vocab)

# 300-dimensional embedding table with one row per entry of the 'tokens' namespace.
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=300)
word_embeddings = BasicTextFieldEmbedder(dict(tokens=token_embedding))

# CNN encoder: 100 filters for each of the widths 3, 4 and 5, followed by two
# highway layers and a projection down to 100 dimensions.
encoder = CnnHighwayEncoder(
    embedding_dim=word_embeddings.get_output_dim(),
    filters=[(width, 100) for width in (3, 4, 5)],
    num_highway=2,
    projection_dim=100,
    do_layer_norm=True,
)

# Nine-way classifier assembled from the embedder, vocabulary and encoder.
model = BaselineModel(word_embeddings, vocab, encoder, n_classes=9)

02/21/2019 14:47:33 - INFO - allennlp.data.vocabulary -   Fitting token dictionary from dataset.
100%|██████████| 28168/28168 [00:04<00:00, 6667.63it/s]


In [68]:
def to_numpy(x):
    """Convert a torch tensor to a NumPy array.

    The tensor is detached from the autograd graph and copied to host memory
    (if it is not there already) before the conversion.
    """
    detached = x.detach()
    on_host = detached.cpu()
    return on_host.numpy()


class Predictor:
    """Runs a model over a dataset in evaluation mode and gathers its outputs.

    Args:
        model: Callable invoked as ``model(tokens, id)`` (and as
            ``model(**batch)`` by ``_extract_data``) that returns a dict with
            ``"logits"`` and ``"id"`` entries. Must also provide ``eval()``.
        iterator: Callable invoked as
            ``iterator(data, num_epochs=1, shuffle=False)`` that yields batch
            dicts with ``"tokens"`` and ``"id"`` keys, and that provides
            ``get_num_batches(data)``.
        device: Stored on the instance only.
            NOTE(review): nothing in this class moves the model or the batches
            to this device -- confirm whether that is intended.
    """

    def __init__(self, model, iterator, device="cpu"):
        self.model = model
        self.iterator = iterator
        self.device = device

    def _extract_data(self, batch) -> np.ndarray:
        """Return the softmax of the model's logits for one batch as a NumPy array.

        Args:
            batch: Mapping that is unpacked into the model call as keyword
                arguments.

        Returns:
            Array with the same shape as the logits; softmax is taken over
            dimension 1.
        """
        out_dict = self.model(**batch)
        # Softmax has to run on the tensor: F.softmax does not accept NumPy
        # arrays, so the conversion to NumPy must come last.
        probs = F.softmax(out_dict["logits"], dim=1)
        return to_numpy(probs)

    def predict(self, data):
        """Run the model over ``data`` once, in order, and collect ids and logits.

        The model is switched to evaluation mode (and left in it) and gradients
        are disabled for the duration of the pass.

        Args:
            data: Dataset accepted by the iterator given at construction time.

        Returns:
            Dict with ``"id"`` (all batch ids concatenated along dim 0) and
            ``"logits"`` (all batch logits concatenated along dim 0). If the
            iterator yields no batches, both entries are empty tensors.
        """
        pred_generator = self.iterator(data, num_epochs=1, shuffle=False)
        self.model.eval()

        pred_generator_tqdm = tqdm(
            pred_generator,
            total=self.iterator.get_num_batches(data)
        )

        ids = []
        logits = []
        with torch.no_grad():
            for batch in pred_generator_tqdm:
                out = self.model(batch["tokens"], batch["id"])
                logits.append(out["logits"])
                # as_tensor accepts lists and tensors alike and avoids the
                # copy (and warning) torch.tensor() produces for tensor input.
                ids.append(torch.as_tensor(out["id"]))

        if not logits:
            # torch.cat raises on an empty list; return empty results instead.
            return {
                "id": torch.empty(0, dtype=torch.long),
                "logits": torch.empty(0),
            }

        return {
            "id": torch.cat(ids, dim=0),
            "logits": torch.cat(logits, dim=0),
        }

In [69]:
# Pair the model with the vocabulary-indexed iterator (device keeps its default).
predictor = Predictor(model=model, iterator=iterator)

In [70]:
# One unshuffled pass over the test split; yields a dict of ids and logits.
out = predictor.predict(data=testdata)





  0%|          | 0/200 [00:00<?, ?it/s][A[A[A[A



  1%|          | 2/200 [00:00<00:10, 18.81it/s][A[A[A[A



  2%|▎         | 5/200 [00:00<00:09, 19.75it/s][A[A[A[A



  4%|▍         | 8/200 [00:00<00:09, 20.54it/s][A[A[A[A



  6%|▌         | 11/200 [00:00<00:08, 22.42it/s][A[A[A[A



  7%|▋         | 14/200 [00:00<00:07, 23.54it/s][A[A[A[A



  8%|▊         | 17/200 [00:00<00:07, 23.96it/s][A[A[A[A



 10%|█         | 20/200 [00:00<00:07, 24.56it/s][A[A[A[A



 12%|█▏        | 23/200 [00:00<00:07, 24.97it/s][A[A[A[A



 13%|█▎        | 26/200 [00:01<00:07, 24.58it/s][A[A[A[A



 14%|█▍        | 29/200 [00:01<00:06, 24.91it/s][A[A[A[A



 16%|█▌        | 32/200 [00:01<00:06, 25.01it/s][A[A[A[A



 18%|█▊        | 35/200 [00:01<00:06, 24.93it/s][A[A[A[A



 19%|█▉        | 38/200 [00:01<00:06, 25.05it/s][A[A[A[A



 20%|██        | 41/200 [00:01<00:06, 24.66it/s][A[A[A[A



 22%|██▏       | 44/200 [00:01<00:06, 24.31it/s]

In [71]:
# Size of the concatenated id tensor (one id per predicted instance).
out["id"].size()

torch.Size([5589])

In [72]:
# Number of test instances, for comparison with the number of predicted ids.
len(testdata)

5589

In [73]:
# Size of the concatenated logits tensor.
out["logits"].size()

torch.Size([5589, 9])

In [79]:
# First ten predicted ids, to compare against the first rows of the test CSV.
out["id"][0:10]

tensor([ 1,  8, 13, 24, 38, 40, 53, 55, 65, 66])

In [81]:
# Load the raw test split from DATAPATH so its rows can be inspected next to
# the predictions. (`os` and `pd` come from the notebook's top import cell.)
testpath = os.path.join(DATAPATH, 'test.csv')
test_df = pd.read_csv(testpath)

In [83]:
# First ten rows of the raw test split.
test_df.head(10)

Unnamed: 0,id,text
0,1,glances at her. BOOK Maybe I ought to learn t...
1,8,hout breaking stride. Tatiana sees her and can...
2,13,dead bodies. GEORDI Mitchell... DePaul... LANG...
3,24,take myself. BRANDON How bad is the other thi...
4,38,"her body to shield his own. KAY Freeze it, Bug..."
5,40,im from ear to ear. Ya want me to make a state...
6,53,"BEN We need to help Reed Sue shakes her head,..."
7,55,slowly. At the entrance to the alley stands a ...
8,65,edge of the field. Neil steps closer. THE TOMB...
9,66,"special, take ya in the kitchen and suck your ..."


In [98]:
# Let the model turn the raw prediction dict into ids and label strings.
decoded = model.decode(out)
ids, genre = decoded['id'], decoded['label']

# Two-column frame of predictions: instance id and predicted genre.
preds = pd.DataFrame({'id': ids, 'genre': genre})

In [85]:
# Load the validation split and tally how often each genre label occurs.
valpath = os.path.join(DATAPATH, 'val.csv')
val_df = pd.read_csv(valpath)

uniq, counts = np.unique(val_df['genre'].values, return_counts=True)

In [88]:
# Distinct genre labels in the validation split, as returned by np.unique.
uniq

array(['action', 'adventure', 'comedy', 'drama', 'horror', 'other',
       'romance', 'sci-fi', 'thriller'], dtype=object)

In [89]:
# Number of validation rows per label, in the same order as `uniq`.
counts

array([239,  15, 294, 887,  46,  27,   6,  61, 683])

In [99]:
# First ten predictions (id and predicted genre).
preds.head(10)

Unnamed: 0,id,genre
0,1,drama
1,8,drama
2,13,drama
3,24,drama
4,38,drama
5,40,drama
6,53,drama
7,55,drama
8,65,drama
9,66,drama
