-
Notifications
You must be signed in to change notification settings - Fork 138
/
trec.py
50 lines (39 loc) · 1.28 KB
/
trec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe, CharNGram
import torch
# Legacy torchtext device convention: 0 selects the first GPU when CUDA
# is available, -1 falls back to CPU.
device = 0 if torch.cuda.is_available() else -1
# Approach 1:
# Build the fields and vocab by hand, then batch with a BucketIterator.
# set up fields
# TEXT: lowercased question tokens, batch-first tensors, with the true
# sequence lengths returned alongside each batch.
TEXT = data.Field(lower=True, include_lengths=True, batch_first=True)
# LABEL: one class token per example, so no sequential processing.
LABEL = data.Field(sequential=False)
# make splits for data
# fine_grained=True selects the 50-way TREC label set instead of the
# coarse 6-way one; downloads the dataset on first use.
train, test = datasets.TREC.splits(TEXT, LABEL, fine_grained=True)
# print information about the data
print('train.fields', train.fields)
print('len(train)', len(train))
print('vars(train[0])', vars(train[0]))
# build the vocabulary
# Attach 300-dim GloVe (6B-token corpus) vectors to the text vocab;
# labels get a plain vocab with no pretrained vectors.
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
LABEL.build_vocab(train)
# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())
# make iterator for splits
# BucketIterator groups similar-length examples to minimize padding;
# `device` follows the legacy int convention set above (-1 CPU, 0 GPU).
train_iter, test_iter = data.BucketIterator.splits(
    (train, test), batch_size=3, device=device)
# print batch information
# Peek at a single batch; batch.text is a (tensor, lengths) pair
# because include_lengths=True on TEXT.
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)
# Approach 2:
# Let the dataset's canned `iters` helper do the setup.
# Rebuild TEXT's vocab with two pretrained embeddings concatenated:
# GloVe trained on the 840B-token corpus plus character n-gram vectors.
# Fixes vs. the original: `dim` is an int (matching Approach 1), and the
# bogus `device=` kwarg is dropped -- `build_vocab` accepts no such
# argument and would raise TypeError.
TEXT.build_vocab(train, vectors=[GloVe(name='840B', dim=300), CharNGram()])
LABEL.build_vocab(train)
# `iters` downloads TREC, builds its own fields/vocab, and returns
# ready-made train/test iterators.
train_iter, test_iter = datasets.TREC.iters(batch_size=4)
# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)