In [1]:
from torchtext import data, datasets
from torchtext.vocab import Vectors
from torch.nn import init
import torch
import spacy
import numpy as np

spacy_en = spacy.load("en_core_web_lg")
def tokenizer(text):  # create a tokenizer function
    return [tok.text for tok in spacy_en.tokenizer(text)]

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
LABEL = data.Field(sequential=False, use_vocab=False, is_target=True)
TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True)



In [2]:
train, val = data.TabularDataset.splits(
    path='./data', train='train.csv', validation='dev.csv', format='csv', skip_header=True,
    fields=[('label_id', LABEL), ('title', TEXT), ('description', TEXT)]
)
TEXT.build_vocab(train, vectors='glove.840B.300d')
vocab_size = len(TEXT.vocab)
print("Vocab size: " + str(vocab_size))
labels = np.unique(list(map(lambda x: int(x), train.label_id)))
num_classes = len(labels)
print("Number classes: " + str(num_classes))



Vocab size: 85049
Number classes: 4


In [4]:
train_iter = data.BucketIterator(train, batch_size=128,
                                 sort_key=lambda x: len(x.title) + len(x.description),
                                 shuffle=False, device=DEVICE)



In [5]:
train[0].label_id, train[0].title, train[0].description

('3',
 ['wall',
  'st.',
  'bears',
  'claw',
  'back',
  'into',
  'the',
  'black',
  '(',
  'reuters',
  ')'],
 ['reuters',
  '-',
  'short',
  '-',
  'sellers',
  ',',
  'wall',
  'street',
  "'s",
  'dwindling\\band',
  'of',
  'ultra',
  '-',
  'cynics',
  ',',
  'are',
  'seeing',
  'green',
  'again',
  '.'])

In [6]:
example_batch = None
for batch in train_iter:
    example_batch = batch
    break



In [7]:
example_batch.description.shape

torch.Size([83, 128])

In [8]:
example_batch.title[:, 0]

tensor([  452,   608,  1573, 14909,   114,    70,     2,   796,    15,    33,
           16,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1], device='cuda:0')

In [9]:
example_batch.label_id

tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 4], device='cuda:0')

In [10]:
TEXT.vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7f8998564e50>>,
            {'<unk>': 0,
             '<pad>': 1,
             'the': 2,
             ',': 3,
             '.': 4,
             'to': 5,
             '-': 6,
             'a': 7,
             'of': 8,
             'in': 9,
             'and': 10,
             'on': 11,
             'for': 12,
             ' ': 13,
             '#': 14,
             '(': 15,
             ')': 16,
             '39;s': 17,
             'that': 18,
             'with': 19,
             'as': 20,
             'at': 21,
             "'s": 22,
             'is': 23,
             'its': 24,
             'new': 25,
             ';': 26,
             'by': 27,
             'it': 28,
             'said': 29,
             'has': 30,
             ':': 31,
             'from': 32,
             'reuters': 33,
             'an': 34,
             'ap': 35,
             'his': 36,
             'after': 37,
          

In [11]:
TEXT.vocab.stoi['<pad>']

1

In [18]:
int(train[0].label_id)

3

In [19]:
LABEL.dtype

torch.int64