In [1]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Use one fixed seed for every random number source used in this notebook:
# Python's `random`, NumPy's legacy global RNG, and PyTorch (in that order).
SEED = 515
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Request deterministic cuDNN algorithms so GPU runs are repeatable.
torch.backends.cudnn.deterministic = True

# `TorchText` after version 0.7
Reference: https://github.com/pytorch/text/releases/tag/v0.7.0-rc3

## Build a Dataset Manually

In [2]:
from torchtext.experimental.datasets import AG_NEWS

# Load the AG_NEWS train/test splits, keeping the data files under
# `assets/data`. Each example is a (label, token-id tensor) pair, as the
# next cells show.
# NOTE(review): `ngrams=3` presumably adds n-gram features up to trigrams,
# which would explain the very large token ids below -- confirm against the
# torchtext documentation.
train, test = AG_NEWS(
    root="assets/data",
    ngrams=3,
)

120000lines [00:05, 23034.54lines/s]


In [3]:
# Inspect the first training example: a (label, token-id tensor) pair.
train[0]

(tensor(3),
 tensor([    609,     601,       2,    2505,   65419,     160,      96,       3,
            1211,      14,      32,      15,      32,      16,  963004,       4,
             609,     534,      17,      10, 2015073,       7, 1037580,       4,
              53,    7915,    1112,     475,       2,   16848,     715,  224520,
          658998,   74344,    8947,     436,   11338, 1650300,      43,      45,
             152,     155,  583047,  963005,   68633,     746,   16799,      23,
         3292797, 2015074,  879762, 1037581,    4135,  124783,  212521,  771712,
            4448,   24823,  975603,  586573,  658999,  691530,   29723,  201721,
         3626360, 1650301,      44,     154,     153, 3244489,  583048,  963006,
           68634,   18040,   17401, 1098193, 3292798, 2015075,  879763, 1037582,
          571700,  642791,  955903,  771713]))

In [4]:
# Inspect the second training example; its token-id tensor has a different
# length from the first one, so examples cannot be stacked without padding.
train[1]

(tensor(3),
 tensor([  74282,    1567,    1218,    1965,    8565,      14,      32,      15,
              32,      16,    1335,    1136,     469,   74282,     139,       4,
          226571,      33,       6,    9161,      12,     800, 1055729,       9,
         2871940,    3459,       8,       3,     740,     345,       4,      33,
            7610, 3061313,   16385,      11,     292,     517,       7,       3,
             182,       2, 1752657,  329496, 3789212, 1831434, 1382164,      43,
              45,     152,     155,  110330,   76571,   41184, 2141959,  112768,
            3154,  222433, 1310772,    2184,   40623,   59792,   26310, 2690242,
         3939874, 1470718, 2871941,   97029,      26,   10588, 1939337,    5977,
             635,   34522, 3158791, 3061314,   50193,   34752,  293241,     755,
              29,    1493,    1951, 1752658, 2663192, 3789213, 1831435, 1382165,
              44,     154,     153,  935444, 1181221,  209660, 2494282, 2141960,
          419087

In [5]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    """Split a batch of ``(label, text)`` pairs into two parallel lists.

    No padding or stacking is performed: each text is passed through
    untouched, so variable-length examples can share a batch.

    Args:
        batch: Iterable of 2-item ``(label, text)`` examples.

    Returns:
        A ``(texts, labels)`` tuple of lists. Note that the order is swapped
        relative to the input pairs: texts come first.
    """
    # Unpack each example exactly once, already in output order.
    swapped = [(txt, label) for label, txt in batch]
    texts = [pair[0] for pair in swapped]
    labels = [pair[1] for pair in swapped]
    return texts, labels

# Batch the training set eight examples at a time; `collate_fn` keeps each
# batch as plain Python lists instead of stacking into a single tensor.
dataloader = DataLoader(
    train,
    batch_size=8,
    collate_fn=collate_fn,
)

# Peek at the first batch only, then stop iterating.
for idx, batch in enumerate(dataloader):
    texts, labels = batch
    print(idx, texts, labels)
    break

0 [tensor([    609,     601,       2,    2505,   65419,     160,      96,       3,
           1211,      14,      32,      15,      32,      16,  963004,       4,
            609,     534,      17,      10, 2015073,       7, 1037580,       4,
             53,    7915,    1112,     475,       2,   16848,     715,  224520,
         658998,   74344,    8947,     436,   11338, 1650300,      43,      45,
            152,     155,  583047,  963005,   68633,     746,   16799,      23,
        3292797, 2015074,  879762, 1037581,    4135,  124783,  212521,  771712,
           4448,   24823,  975603,  586573,  658999,  691530,   29723,  201721,
        3626360, 1650301,      44,     154,     153, 3244489,  583048,  963006,
          68634,   18040,   17401, 1098193, 3292798, 2015075,  879763, 1037582,
         571700,  642791,  955903,  771713]), tensor([  74282,    1567,    1218,    1965,    8565,      14,      32,      15,
             32,      16,    1335,    1136,     469,   74282,     139, 

In [6]:
# Check the type of the first text in the batch fetched above: each collated
# entry is still an individual tensor.
type(texts[0])

torch.Tensor