### 作業目的: 熟練以Torchtext進行文本資料讀取

本次作業主要會使用[polarity](http://www.cs.cornell.edu/people/pabo/movie-review-data/)的電影評論資料，練習以torchtext讀取文本資料；學員可以在附件的polarity.tsv看到所使用的資料。

Hint: 這次作業同學可以嘗試使用[torchtext.data.TabularDataset](https://torchtext.readthedocs.io/en/latest/data.html#tabulardataset)，可以更簡易讀取資料

### 載入套件

In [17]:
import re
import torch
import pandas as pd
import numpy as np
from torchtext import data, datasets

In [18]:
# Explore the data.
# Each row holds a review text and its class; the class marks positive vs.
# negative reviews.
input_data = pd.read_csv(
    './polarity.tsv',
    sep='\t',
    header=None,
    names=['text', 'label'],
)
input_data

Unnamed: 0,text,label
0,films adapted from comic books have had plenty...,1
1,every now and then a movie comes along from a ...,1
2,you've got mail works alot better than it dese...,1
3,jaws is a rare film that grabs your attentio...,1
4,moviemaking is a lot like being the general ma...,1
...,...,...
1995,"if anything , "" stigmata "" should be taken as ...",0
1996,"john boorman's "" zardoz "" is a goofy cinematic...",0
1997,the kids in the hall are an acquired taste .it...,0
1998,there was a time when john carpenter was a gre...,0


### 建立Pipeline生成資料

In [19]:
def remove_non_char(x):
    """Keep only the ASCII-letter parts of a token sequence.

    The tokens in ``x`` are joined with single spaces, every character that
    is not in ``a-z``/``A-Z`` is replaced by a space, and the result is split
    on whitespace. Digits and punctuation therefore act as separators and are
    dropped (e.g. ``"you've"`` becomes ``"you"`` and ``"ve"``).

    Args:
        x: Iterable of strings, as accepted by ``' '.join``.

    Returns:
        list of str: the letter-only tokens, in their original order.
    """
    letters_only = re.sub(r'[^a-zA-Z]', ' ', ' '.join(x))
    return letters_only.split()

In [20]:
# Build the Field and Dataset.
# `import spacy` is needed in addition to the `Tokenizer` import: the
# `spacy.load` call below uses the package name itself, which
# `from spacy.tokenizer import Tokenizer` does not bind.
import spacy
from spacy.tokenizer import Tokenizer

# Load the English pipeline and build a tokenizer from its vocabulary.
nlp = spacy.load("en_core_web_sm")
tokenizer = Tokenizer(nlp.vocab)

def spacy_tokenize(x):
    """Tokenize ``x`` with the module-level spaCy ``tokenizer``.

    Args:
        x: Text passed to ``tokenizer``.

    Returns:
        list of str: the ``text`` attribute of each token yielded by
        ``tokenizer(x)``, in order.
    """
    doc = tokenizer(x)
    return [token.text for token in doc]

# Text field: tokenized with spaCy, then reduced to letter-only tokens.
# NOTE(review): dtype=torch.float64 makes the numericalized token ids
# floating point; torchtext's documented default for Field is torch.long.
# Confirm this is intended before feeding batches to an embedding layer.
TEXT = data.Field(
    sequential=True,
    dtype=torch.float64,
    tokenize=spacy_tokenize,
    preprocessing=remove_non_char,
)
LABEL = data.LabelField(dtype=torch.float)
fields = [('text', TEXT), ('label', LABEL)]

# One Example per (text, label) row of the DataFrame.
examples = [
    data.Example.fromlist(data=[text, label], fields=fields)
    for text, label in input_data.values
]

In [23]:
# Take the examples and shuffle them in place.
import random
random.shuffle(examples)

# Split the examples 8:2 into training and testing parts.
split_at = int(len(examples) * 0.8)
train_ex, test_ex = examples[:split_at], examples[split_at:]

# Build the training and testing datasets over the same fields.
train_data = data.Dataset(examples=train_ex, fields=dict(fields))
test_data = data.Dataset(examples=test_ex, fields=dict(fields))

# Show the label and the first 30 tokens of the first training example.
train_data[0].label, train_data[0].text[:30]

(0,
 ['i',
  'heard',
  'actor',
  'skeet',
  'ulrich',
  'discussing',
  'this',
  'film',
  'in',
  'a',
  'couple',
  'of',
  'interviews',
  'and',
  'in',
  'both',
  'instances',
  'he',
  'felt',
  'the',
  'strange',
  'compulsion',
  'to',
  'compare',
  'it',
  'a',
  'little',
  'series',
  'of',
  'films'])

In [22]:
# Build the vocabularies from the training dataset.
TEXT.build_vocab(train_data)
LABEL.build_vocab(train_data)

# The slice [:10] covers indices 0 through 9, so the label says 0-9
# (it previously claimed 0-5).
print(f"Vocabularies of index 0-9: {TEXT.vocab.itos[:10]} \n")
print(f"words to index {TEXT.vocab.stoi}")

u': 34923, 'umbrella': 34924, 'ummm': 34925, 'ummyup': 34926, 'umpire': 34927, 'umpteenth': 34928, 'umptenth': 34929, 'unabashed': 34930, 'unabated': 34931, 'unaccountably': 34932, 'unaddressed': 34933, 'unaffected': 34934, 'unaffecting': 34935, 'unambiguously': 34936, 'unanimous': 34937, 'unanswerable': 34938, 'unanticipated': 34939, 'unarguable': 34940, 'unarguably': 34941, 'unashamed': 34942, 'unassertive': 34943, 'unassociated': 34944, 'unattainable': 34945, 'unauthentic': 34946, 'unauthorized': 34947, 'unavailable': 34948, 'unawareness': 34949, 'unbelief': 34950, 'unbelieveably': 34951, 'unbelivable': 34952, 'unbiased': 34953, 'unbilled': 34954, 'unblemished': 34955, 'unboarded': 34956, 'unbrewed': 34957, 'unbusy': 34958, 'unbuttoning': 34959, 'unchained': 34960, 'unchristian': 34961, 'unclaimed': 34962, 'uncleanness': 34963, 'uncommunicative': 34964, 'uncompromisingly': 34965, 'unconcious': 34966, 'unconditionally': 34967, 'unconnected': 34968, 'unconnectedness': 34969, 'unconsci

In [24]:
# Batch iterators over the training and testing datasets, two examples per
# batch, with repeat disabled and example length (token count) as sort key.
# NOTE(review): both iterators rely on Iterator's default `train` setting;
# confirm whether the test iterator should be created with train=False.
train_iter = data.Iterator(
    dataset=train_data,
    batch_size=2,
    repeat=False,
    sort_key=lambda ex: len(ex.text),
)
test_iter = data.Iterator(
    dataset=test_data,
    batch_size=2,
    repeat=False,
    sort_key=lambda ex: len(ex.text),
)

In [25]:
# Preview the first three training batches: print each batch's text and
# label tensors together with their shapes, then stop.
i = 0
for i, train_batch in enumerate(train_iter, start=1):
    print(train_batch.text, train_batch.text.shape)
    print(train_batch.label, train_batch.label.shape)
    if i == 3:
        break

tensor([[1.7000e+01, 4.0000e+01],
        [1.0500e+03, 2.2000e+01],
        [8.3170e+03, 8.0200e+02],
        ...,
        [1.0000e+00, 1.0000e+01],
        [1.0000e+00, 6.0000e+00],
        [1.0000e+00, 3.4200e+02]], dtype=torch.float64) torch.Size([1107, 2])
tensor([1., 1.]) torch.Size([2])
tensor([[4.0000e+01, 2.4910e+03],
        [1.6400e+02, 5.1000e+02],
        [2.3000e+01, 5.6000e+02],
        ...,
        [1.0000e+00, 5.0000e+00],
        [1.0000e+00, 1.0937e+04],
        [1.0000e+00, 3.8780e+03]], dtype=torch.float64) torch.Size([1095, 2])
tensor([0., 0.]) torch.Size([2])
tensor([[8.2430e+03, 1.7200e+02],
        [2.0000e+00, 6.6100e+02],
        [3.8500e+02, 7.3900e+02],
        ...,
        [1.0000e+00, 7.3900e+02],
        [1.0000e+00, 6.9800e+02],
        [1.0000e+00, 3.2846e+04]], dtype=torch.float64) torch.Size([995, 2])
tensor([1., 0.]) torch.Size([2])
