# This notebook is a torchtext tutorial

In [2]:
from torchtext import data
from torchtext import vocab
from torchtext.data import Dataset
from torchtext.data import Field
from torchtext.data.example import  Example

## field 

主要功能
- tokenization(创建Example时): "hello world." --> ["hello", "world", "."]
- 构建 Vocab
- pad(创建Batch时): ["hello", "world", "."] --> ["hello", "world", ".", "<pad>", "<pad>"]。因为每个 Example 的长度不一定相等, 需要 pad 成相同长度才可以 batch 起来.
- numericalize(pad之后的操作, 需要有Vocab介入): ["hello", "world", ".", "<pad>", "<pad>"] --> [2, 3, 4, 0, 0]

In [12]:
# Field for the raw text. sequential=True marks the input as running text
# that has to be tokenized; include_lengths=True additionally requests the
# length of every example alongside the padded batch.
txt_field = Field(
    sequential=True,
    use_vocab=True,
    include_lengths=True,
    batch_first=True,
)

# Field for the label: a single value, so no tokenization, no padding token
# and no unknown token.
# If the label field used use_vocab=False, the label in every data row
# (['text', label]) would have to be a number; with use_vocab=True the label
# is a string.
label_field = Field(
    sequential=False,
    use_vocab=True,
    pad_token=None,
    unk_token=None,
    batch_first=True,
)

# (attribute name, field) pairs, in column order, for Example.fromlist
train_fields_list = [
    ('text', txt_field),
    ('label', label_field),
]
test_fields_list = [
    ('text', txt_field),
]

# {input key: (attribute name, field)} mappings for Example.fromdict
train_fields_dict = {
    'text': ('text', txt_field),
    'label': ('label', label_field),
}
test_fields_dic = {
    'text': ('text', txt_field),
}



## example

example 对象 是只有属性的简单对象


例如 对象只有 example.text example.label 两个属性

example 的构建需要输入 数据（一条） 和 对应的field
同时需要将数据处理成与example.fromXXX相对应的格式

In [17]:
# --- Example.fromlist -------------------------------------------------------
# A record is a plain list whose positions line up with the entries of the
# matching fields list (train_fields_list / test_fields_list).
data_list_train = ['this is an example', '1']
example_train_list = Example.fromlist(data_list_train, train_fields_list)

data_list_test = ['this is for test']
example_test_list = Example.fromlist(data_list_test, test_fields_list)

# --- Example.fromdict -------------------------------------------------------
# A record is a dict whose keys line up with the keys of the matching fields
# dict (train_fields_dict / test_fields_dic).
data_dict_train = {'text': 'this is an example for dict', 'label': '1'}
example_train_dict = Example.fromdict(data_dict_train, train_fields_dict)

data_dict_test = {'text': 'this is test for dict'}
example_text_dict = Example.fromdict(data_dict_test, test_fields_dic)

In [19]:
# Show the tokenized text stored on the list-built and the dict-built
# training examples.
print(example_train_list.text,example_train_dict.text)
# Printing an Example itself only shows the default object representation
# (see the cell output below).
print(example_test_list)

['this', 'is', 'an', 'example'] ['this', 'is', 'an', 'example', 'for', 'dict']
<torchtext.data.example.Example object at 0x7fa60b1f1438>


### get example list to construct dataset

In [None]:
# Build one Example per raw row. Example.fromlist pairs the values of a row
# positionally with (name, field) tuples, so the field lists defined earlier
# (train_fields_list / test_fields_list) are the ones to pass here.
# NOTE: train_data, val_data and test_data stand for the raw rows of each
# split (labelled rows shaped like data_list_train, unlabelled rows shaped
# like data_list_test); they have to be loaded before this cell is run.
train_examples = [Example.fromlist(row, train_fields_list) for row in train_data]
# The validation split is labelled, so it shares the training fields.
val_examples = [Example.fromlist(row, train_fields_list) for row in val_data]
test_examples = [Example.fromlist(row, test_fields_list) for row in test_data]

# dataset

In [None]:
# Wrap each list of examples in a Dataset. The fields argument must be the
# same (name, field) list that was used to build the examples: the labelled
# splits use train_fields_list, the unlabelled test split test_fields_list.
trainSet = Dataset(train_examples, train_fields_list)
valSet = Dataset(val_examples, train_fields_list)
testSet = Dataset(test_examples, test_fields_list)

## dataset construction -- vector and vocab

### vector
attribute :
- itos
- stoi
- dim
- vectors

vector 对象用来加载已经预训练过的vector，对象会加载vector文件中所有单词，建立映射，如果内存不够，则将max_vectors 赋值，限制整个加载的预训练的单词数量

In [None]:
import torch  # imported here because this cell runs before the later torch import


def _xavier_unk_init(vector):
    """Fill one out-of-vocabulary word vector with Xavier-uniform values.

    torchtext documents ``unk_init`` as a function that takes a tensor and
    returns a tensor of the same size. Xavier initialisation needs at least
    two dimensions to compute fan-in/fan-out, so the vector is viewed as a
    single row, filled in place, and returned in its original shape.

    :param vector: tensor holding one word vector
    :return: the same storage, Xavier-uniform initialised
    """
    return torch.nn.init.xavier_uniform_(vector.unsqueeze(0)).squeeze(0)


# Load the pre-trained vectors from the local cache directory.
# xavier_uniform_ replaces the deprecated torch.nn.init.xavier_uniform alias.
vec = vocab.Vectors('cc.zh.300.vec', r'/home/yinrongdi/vector',
                    unk_init=_xavier_unk_init)

### vocab 
vocab 对象是field对象的一个属性，vocab对象使用field.build_vocab构建，
- stoi
- itos
- freqs
- vectors

vocab 对象用来构建field下一系列dataset的词表，以及词表对应的预训练vector

In [None]:
# Text vocabulary: built over all three splits and aligned with the
# pre-trained vectors.
txt_field.build_vocab(trainSet, testSet, valSet, min_freq=1, vectors=vec)
# label_field was declared with use_vocab=True, so it needs a vocabulary of
# its own before labels can be numericalized. Only the labelled splits carry
# this field.
label_field.build_vocab(trainSet, valSet)

## batch and iterator
构建batch之后，产生iterator

### batch
Create a Batch from a list of examples
`__init__(self, data=None, dataset=None, device=None)`: 将 data（list(Example)）中所有的数据打包成 batch，同时进行 pad 和 numericalize 操作。


batch 的生成，是在声明iterator之后，内部自动构建的，所以无需进行特殊操作

## iterator
- sort: 
            Whether to sort examples according to self.sort_key.
            Note that shuffle and sort default to train and (not train).
- sort_within_batch: 
            Whether to sort (in descending order according to
            self.sort_key) within each batch. If None, defaults to self.sort.
            If self.sort is True and this is False, the batch is left in the
            original (ascending) sorted order.
- device (str or `torch.device`): 
            A string or instance of `torch.device`
            specifying which device the Variables are going to be created on.
            If left as default, the tensors will be created on cpu. Default: None.
将构建好的batch 通过generator的形式进行返回，从而构建好了迭代器，如果要指定GPU，直接在这里通过 device 参数指定就行了

In [None]:
def _text_length(example):
    """Sort key shared by all iterators: number of tokens in the text."""
    return len(example.text)


# Training iterator: reshuffled every epoch. `sort` has to be False here,
# because data.Iterator gives sorting priority over shuffling; with
# sort=True the shuffle=True flag would be silently ignored.
# sort_within_batch=True still orders each batch by descending length.
trainDl = data.Iterator(
    trainSet,
    batch_size=64,
    shuffle=True,
    sort=False,
    sort_key=_text_length,
    sort_within_batch=True,
    repeat=False,
)

# Validation / test iterators: deterministic, one example per batch.
# With sort=True the examples are yielded in order of text length rather
# than in dataset order.
valDl = data.Iterator(
    valSet,
    batch_size=1,
    shuffle=False,
    sort=True,
    sort_key=_text_length,
    sort_within_batch=True,
    repeat=False,
)
testDl = data.Iterator(
    testSet,
    batch_size=1,
    shuffle=False,
    sort=True,
    sort_key=_text_length,
    sort_within_batch=True,
    repeat=False,
)


### 至此，所有的前期处理工作完成，后续就是搭建模型进行处理

# For the PyTorch default DataLoader

In [1]:
import collections
import collections.abc

import numpy as np
import torch
import torch.nn.utils.rnn as rnn_utils
import torch.utils.data as data

# Toy data set: da[0] holds five 1-D tensors of lengths 1..5 (the k-th one
# is k copies of k: [1], [2, 2], ..., [5, 5, 5, 5, 5]); da[1] holds the
# matching integer labels 1..5.
da = [
    [torch.tensor([k] * k) for k in range(1, 6)],
    list(range(1, 6)),
]

def collate_fn_rand(batch):
    """Collate the samples of one batch, padding variable-length tensors.

    The function dispatches on the type of the first element of ``batch``:

    * 0-dim tensors (e.g. scalar labels) are stacked into one 1-D tensor;
    * tensors with at least one dimension are right-padded with zeros to the
      longest first dimension in the batch, stacked batch-first and cast to
      ``long``;
    * Python ints are turned into a 1-D ``long`` tensor;
    * sequences such as ``(context, label)`` tuples are transposed, and each
      resulting column is collated recursively.

    :param batch: non-empty list (or tuple) of samples of one of the kinds
        listed above
    :return: a tensor, or a list of collated columns for sequence samples
    :raises TypeError: if the sample type is not supported
    """
    first = batch[0]

    if torch.is_tensor(first):
        if first.dim() == 0:
            # Scalar tensors: nothing to pad.
            return torch.stack(list(batch))
        # pad_sequence pads every tensor to the longest one without the
        # round trip through numpy and Python lists.
        padded = rnn_utils.pad_sequence(list(batch), batch_first=True, padding_value=0)
        return padded.long()

    if isinstance(first, int):
        return torch.tensor(batch).long()

    # str/bytes are sequences too, but transposing them would recurse
    # forever (their items are again strings), so they are rejected below.
    if isinstance(first, collections.abc.Sequence) and not isinstance(first, (str, bytes)):
        return [collate_fn_rand(samples) for samples in zip(*batch)]

    raise TypeError(
        'collate_fn_rand: unsupported sample type {}'.format(type(first).__name__)
    )


    
    
class myds(data.Dataset):
    """Map-style dataset over parallel lists of sequences and labels.

    :param da: two-element container; ``da[0]`` is the list of sequences
        (anything supporting ``len``), ``da[1]`` the list of labels
    :param sort: if True, order the samples by descending sequence length
        once at construction time
    :raises ValueError: if sequences and labels differ in number
    """

    def __init__(self, da, sort=False):
        self.context = da[0]
        self.label = da[1]
        if len(self.context) != len(self.label):
            raise ValueError(
                'myds: got {} sequences but {} labels'.format(
                    len(self.context), len(self.label)
                )
            )
        if sort:
            self.sort_data()

    def sort_data(self):
        """Reorder sequences and labels together, longest sequence first."""
        # sorted() is stable, so samples of equal length keep their
        # original relative order.
        order = sorted(
            range(len(self.context)),
            key=lambda idx: len(self.context[idx]),
            reverse=True,
        )
        # New lists are built, so the lists passed in by the caller are
        # left untouched.
        self.context = [self.context[idx] for idx in order]
        self.label = [self.label[idx] for idx in order]

    def __len__(self):
        """Return the number of samples."""
        return len(self.context)

    def __getitem__(self, index):
        """Return the ``(sequence, label)`` pair stored at ``index``."""
        return self.context[index], self.label[index]


In [2]:
# for random select and pad
# Random batches of up to three samples; collate_fn_rand pads each batch to
# the length of its own longest sequence.
mydata = myds(da)
myloader = data.DataLoader(
    dataset=mydata,
    batch_size=3,
    shuffle=True,
    collate_fn=collate_fn_rand,
)
# The loop variables must not be called `data`: that would rebind the
# torch.utils.data alias and break every later use of data.DataLoader
# (including re-running this cell).
for batch_data, batch_label in myloader:
    print(batch_data, batch_label)
    print()

tensor([[1, 0, 0],
        [3, 3, 3],
        [2, 2, 0]]) tensor([1, 3, 2])

tensor([[5, 5, 5, 5, 5],
        [4, 4, 4, 4, 0]]) tensor([5, 4])



In [None]:
# for less pad