### 作業目的: 熟練以Torchtext進行文本資料讀取

本次作業主要會使用[polarity](http://www.cs.cornell.edu/people/pabo/movie-review-data/)的電影評論來進行使用torchtext資料讀取，學員可以在附件的polarity.tsv看到所使用的資料。

Hint: 這次作業同學可以嘗試使用[torchtext.data.TabularDataset](https://torchtext.readthedocs.io/en/latest/data.html#tabulardataset)，可以更簡易讀取資料

### 載入套件

In [1]:
import torch
import pandas as pd
import numpy as np
import torchtext
import nltk
import spacy
from torchtext import data, datasets
print(torchtext.__version__)

0.13.1


In [3]:
# 探索資料
# 可以發現資料為文本與類別，而類別即為正評與負評
input_data = pd.read_csv('./polarity.tsv', delimiter='\t', header=None, names=['text', 'label'])
input_data.head()

Unnamed: 0,text,label
0,films adapted from comic books have had plenty...,1
1,every now and then a movie comes along from a ...,1
2,you've got mail works alot better than it dese...,1
3,jaws is a rare film that grabs your attentio...,1
4,moviemaking is a lot like being the general ma...,1


### 建立Pipeline生成資料

In [7]:
# 建立Dataset
class Generator_data(object):
    def __init__(self, input_data):
        self.text = input_data['text']
        self.label = input_data['label']
        
    def token(self, x):
        return nltk.word_tokenize(x)
    
    def __iter__(self):
        for i in zip(self.text, self.label):
            yield (i[1], i[0])
            
    def __len__(self):
        return len(self.label)
    
    def __getitem__(self, idx):
        return self.label[idx], self.text[idx]

In [8]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer('basic_english')

dataset = Generator_data(input_data)
train_iter = dataset

def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [9]:
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x) -1

In [10]:
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
        label_list.append(torch.tensor(label_pipeline(_label)))
        process_text = torch.tensor(text_pipeline(_text), dtype = torch.int64)
        text_list.append(process_text)
        offsets.append(process_text.size(0))
        
    label_list = torch.tensor( label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    len_pad = int(offsets.max())
    text_pad = []
    for i in range(len(text_list)):
        t = torch.nn.functional.pad(text_list[i], (0,len_pad - text_list[i].shape[0]), 'constant', 0)
        t = torch.unsqueeze(t, 0)
        text_pad.append(t)
    text_pad = torch.cat(text_pad)
    return label_list, text_pad, offsets

In [11]:
from torch.utils.data import DataLoader
dataloader = DataLoader(train_iter, batch_size=3, shuffle=True, collate_fn=collate_batch)
for i in dataloader:
    print(i[0])
    print(i[1])
    break

tensor([ 0, -1,  0])
tensor([[  13,    8,   11,  ...,    0,    0,    0],
        [ 214, 1927,  192,  ...,    0,    0,    0],
        [ 323,  954,    3,  ...,    0,    0,    0]])
