## How to use torchtext to (pre)process text data

In [1]:

from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe

# Approach 1:
# set up fields
TEXT = data.Field(lower=True, batch_first=True)

# make splits for data
train, valid, test = datasets.WikiText2.splits(TEXT)

# print information about the data
print('train.fields', train.fields)
print('len(train)', len(train))
print('vars(train[0])', vars(train[0])['text'][0:10])

# build the vocabulary
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))

# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))

# make iterator for splits
train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
    (train, valid, test), batch_size=3, bptt_len=30, device=0)

# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.target)

# Approach 2:
train_iter, valid_iter, test_iter = datasets.WikiText2.iters(batch_size=4, bptt_len=30)

# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.target)

downloading wikitext-2-v1.zip
extracting
train.fields {'text': <torchtext.data.field.Field object at 0x7fc504741f28>}
len(train) 1
vars(train[0]) ['<eos>', '=', 'valkyria', 'chronicles', 'iii', '=', '<eos>', '<eos>', 'senjō', 'no']


100%|██████████| 400000/400000 [00:30<00:00, 13280.34it/s]


len(TEXT.vocab) 28913
Variable containing:
     9     19    287
    11     45  10227
  3875    326    126
  3895   1707    104
   887      7  13435
    11   1820      4
     9      8     16
     9      2     29
 20079    456   1667
   110    402      8
  3875     98    430
    93      3  20036
    47   1785     26
     0     86      2
  3895    458    287
    23      8  10227
   786   1333    306
    47  18693      3
 28911    599  20036
     3   2677   2355
  6680      4     83
     4   8017      6
  3875    323      0
     5      2     83
     2    456     95
  5052    402  21582
    93     98   3967
    22     16     10
     3     10   8235
  1845     12   9818
[torch.cuda.LongTensor of size 30x3 (GPU 0)]

Variable containing:
    11     45  10227
  3875    326    126
  3895   1707    104
   887      7  13435
    11   1820      4
     9      8     16
     9      2     29
 20079    456   1667
   110    402      8
  3875     98    430
    93      3  20036
    47   1785     26
     0  

In [15]:
a = train.examples[0]

In [17]:
import pandas as pd
import numpy as np
from gensim.utils import tokenize, simple_preprocess, simple_tokenize
from torchtext import data, datasets
from torchtext.vocab import GloVe

## In order to split dataset into valid set and train set, we need to save each.
torchtext를 이용해 데이터셋을 나누기 위해서는 train set/valid set 을 각각 csv로 저장해둡니다.

In [18]:
all_datas = pd.read_csv('../spooky_author/data/train.csv')
all_datas.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [19]:
valid_ratio = 0.2
train_num = int(len(all_datas) * (1-valid_ratio))
train_datasets = all_datas.loc[0:train_num]
valid_datasets = all_datas.loc[train_num:]
valid_datasets.head()

Unnamed: 0,id,text,author
15663,id11838,It was not until the expiration of a week that...,EAP
15664,id26718,"In whatever way the shifting is managed, it is...",EAP
15665,id07373,In the darkness every faint noise of the night...,HPL
15666,id05788,Amount entered in Journal fifty cents which se...,EAP
15667,id23701,When they had thinned out enough to be glimpse...,HPL


In [20]:
train_datasets.to_csv('../spooky_author/data/train_cached.csv', index=False)
valid_datasets.to_csv('../spooky_author/data/valid_cached.csv', index=False)

## usage of torchtext
- data.Field는 데이터 처리를 위한 유용한 함수들이 모인 클래스,
- data.Dataset 은 거기 들어갈 데이터를 Field와 엮어주는 클래스 인 것 같습니다. 자세히는 모름

In [21]:
import torch

In [32]:
def tokenizer(string):
    return [token for token in tokenize(string)]

In [49]:
TEXT = data.Field(sequential=True,  
                  # 들어갈 데이터가 sequential 인가요? 우리는 tokenize한 word의 sequence를 다룰거니까 True입니다. Defualt로도 True임.
                  tokenize=tokenizer, 
                  # 그 데이터를 tokenize할 함수를 지정할 수 있습니다. 우리는 gensim library의 tokenize 함수를 쓸건데요
                  # 뭐 굳이 그거 말고도 직접 정의해도 되고 str.split 같은걸 써넣어도 됩니다.
                  # :: 그런 줄 알았는데 아무 tokenize 함수나 쓰면 안되고, generator가 아닌 tokenized list 를 반환하는 함수여야합니다..
                  # :::: 이게 아닐거같기도 함.
                  fix_length=20,
                 # 아마 tokenize된 길이 제한 같은데 한번 확인해볼게요. 특이사항으로는 length 넘으면 자르고, 안넘으면 padding을 채웁니다
                  # :: 그게 아니고 vector화 했을 때의 길이 제한일 것 같아요. 확인해보겠습니다.
                  pad_first=True,
                  # padding이 앞에서부터 붙냐, 뒤에서부터 붙냐는 겁니다.
                  tensor_type=torch.LongTensor
                  # cuda를 써도 됩니다
                 )
LABELS = data.Field(sequential=False)

train, val = data.TabularDataset.splits(
    path='../spooky_author/data',
    train='train_cached.csv',
    validation='valid_cached.csv', format='csv',
    skip_header=True,
    fields=[
        ('id', None),
        ('text', TEXT),
        ('author', LABELS),
    ])

In [50]:
# Field의 vocab을 build 하기 전에는 다음과 같이 내용을 확인할 수 있습니다.
for token in train.examples[1].text:
    print(token, end=" ")

It never once occurred to me that the fumbling might be a mere mistake 

In [51]:
TEXT.build_vocab(train, val, # 몇 개의 단어 셋을 줘도 괜찮습니다.
                 max_size=2000,  # 단어사전의 최대 크기입니다. 여기에는 padding word, unknown word는 포함되지 않습니다.
                 min_freq=5  # 몇 개 이상 나타나는 단어에 대해서만 vocab을 생성합니다. 
                )

In [52]:
# build_vocab 후에는 Field.vocab을 확인할 수 있습니다.
TEXT.vocab.freqs
TEXT.vocab.itos
TEXT.vocab.stoi
TEXT.vocab.vectors


In [53]:
TEXT.vocab.load_vectors('glove.twitter.27B.50d')

In [58]:
TEXT.vocab.vectors


 0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
 0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
 0.2532 -0.0149  0.5937  ...  -0.2893  0.4549  0.1866
          ...             ⋱             ...          
-0.6943 -0.3575 -0.2070  ...   0.5110 -1.3684  0.6470
-0.4406  1.3399  0.3456  ...  -0.0701 -0.5296 -0.1710
-0.0558 -0.2905 -1.0690  ...   0.1650  0.4894 -0.1557
[torch.FloatTensor of size 2002x50]

In [73]:
train_iter = data.BucketIterator((train ), batch_size=4)
train_iter

<torchtext.data.iterator.BucketIterator at 0x7fc470a21240>

In [74]:
for i in train_iter:
    break

TypeError: unorderable types: Example() < Example()

In [69]:
for i in train.examples:
    if isinstance(i, data.example.Example):
        continue
    print(type(i))
    break

In [64]:
for i in train_iter:
    break

AttributeError: 'Field' object has no attribute 'vocab'

In [63]:
train_iter.dataset.fields

{'author': <torchtext.data.field.Field at 0x7fc473c0b8d0>,
 'id': None,
 'text': <torchtext.data.field.Field at 0x7fc473c0b908>}

In [115]:
next(z.text)

'as'

In [93]:
df = pd.read_csv('../spooky_author/data/train_cached.csv')
df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [82]:
type(train)

torchtext.data.dataset.TabularDataset

In [83]:
TEXT.build_vocab(train, val, max_size=2000, min_freq=10)

In [89]:
from torch.utils.data import DataLoader

In [91]:
dl = DataLoader(train)

In [92]:
for i in dl:
    break

TypeError: batch must contain tensors, numbers, dicts or lists; found <class 'torchtext.data.example.Example'>

In [84]:
dataset_iter = data.Iterator(train, batch_size=5)

In [88]:
next(iter(dataset_iter))

TypeError: 'generator' object is not subscriptable

In [85]:
for examples in dataset_iter:
    x = examples.text
    y = examples.author
    break

TypeError: 'generator' object is not subscriptable

In [67]:
TEXT.vocab.load_vectors('glove.twitter.27B.50d')

In [68]:
TEXT.vocab.vectors


 0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
 0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
 0.2532 -0.0149  0.5937  ...  -0.2893  0.4549  0.1866
          ...             ⋱             ...          
-0.6943 -0.3575 -0.2070  ...   0.5110 -1.3684  0.6470
-0.4406  1.3399  0.3456  ...  -0.0701 -0.5296 -0.1710
-0.0558 -0.2905 -1.0690  ...   0.1650  0.4894 -0.1557
[torch.FloatTensor of size 2002x50]

In [45]:
t = train.fields['text']

In [48]:
a = next(iter(train))

TypeError: send() takes exactly one argument (0 given)