### 作業目的: 熟練自定義collate_fn與sampler進行資料讀取

本此作業主要會使用[IMDB](http://ai.stanford.edu/~amaas/data/sentiment/)資料集利用Pytorch的Dataset與DataLoader進行
客製化資料讀取。
下載後的資料有分成train與test，因為這份作業目的在讀取資料，所以我們取用train部分來進行練習。
(請同學先行至IMDB下載資料)

### 載入套件

In [104]:
# Import torch and other required modules
import glob
import torch
import re
import nltk
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import Sampler, SequentialSampler, RandomSampler
from sklearn.datasets import load_svmlight_file
from nltk.corpus import stopwords

nltk.download('stopwords') #下載stopwords
nltk.download('punkt') #下載word_tokenize需要的corpus

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yz830\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yz830\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### 探索資料與資料前處理
這份作業我們使用test資料中的pos與neg


In [8]:
# 讀取字典，這份字典為review內所有出現的字詞
def read_vocab():
    with open(r'C:\Users\yz830\project\NLP_asst\data\aclImdb\imdb.vocab', 'r', encoding = 'utf-8') as vocab:
        return vocab.read().split()

vocab = read_vocab()
print(vocab[:30])
# 以nltk stopwords移除贅字，過多的贅字無法提供有用的訊息，也可能影響模型的訓練
print(f"vocab length before removing stopwords: {len(vocab)}")
### <your code> ###
print(f"vocab length after removing stopwords: {len(vocab)}")

# 將字典轉換成dictionary
vocab_dict = {voc:i for i, voc in enumerate(vocab)}

['the', 'and', 'a', 'of', 'to', 'is', 'it', 'in', 'i', 'this', 'that', 'was', 'as', 'for', 'with', 'movie', 'but', 'film', 'on', 'not', 'you', 'he', 'are', 'his', 'have', 'be', 'one', '!', 'all', 'at']
vocab length before removing stopwords: 89527
vocab length after removing stopwords: 89527


In [53]:
# 將資料打包成(x, y)配對，其中x為review的檔案路徑，y為正評(1)或負評(0)
# 這裡將x以檔案路徑代表的原因是讓同學練習不一次將資料全讀取進來，若電腦記憶體夠大(所有資料檔案沒有很大)
# 可以將資料全一次讀取，可以減少在訓練時I/O時間，增加訓練速度

pos_path = glob.glob(r'C:\Users\yz830\project\NLP_asst\data\aclImdb\train\pos\*.txt')
neg_path = glob.glob(r'C:\Users\yz830\project\NLP_asst\data\aclImdb\train\neg\*.txt')
review_all = pos_path + neg_path
label = [1]*len(pos_path) + [0] * len(neg_path)
review_pairs = list(zip(review_all, label))

print(review_pairs[:2])
print(f"Total reviews: {len(review_pairs)}")

[('C:\\Users\\yz830\\project\\NLP_asst\\data\\aclImdb\\train\\pos\\0_9.txt', 1), ('C:\\Users\\yz830\\project\\NLP_asst\\data\\aclImdb\\train\\pos\\10000_8.txt', 1)]
Total reviews: 25000


### 建立Dataset, DataLoader, Sampler與Collate_fn讀取資料
這裡我們會需要兩個helper functions，其中一個是讀取資料與清洗資料的函式(load_review)，另外一個是生成詞向量函式
(generate_vec)，注意這裡我們用來產生詞向量的方法是單純將文字tokenize(為了使產生的文本長度不同，而不使用BoW)

In [24]:
def load_review(review_path):
    with open(review_path, 'r') as f:
        review = f.read()
    #移除non-alphabet符號、贅字與tokenize
    reveiw = re.sub('[^a-zA-Z]', '', review)
    review = nltk.word_tokenize(review)
    review = list(set(review).difference(set(stopwords.words('english'))))
    return review
    
def generate_vec(review, vocab_dic):
    word_vectors = np.zeros(len(review))
    for i, word in enumerate(review):
        if vocab_dic.get(word):
            word_vectors[i] = vocab_dic.get(word)  
    return word_vectors

samp_review = load_review(review_pairs[37][0])
[len(generate_vec(load_review(review_pairs[i][0]), vocab_dict)) for i in range(10)] #可以看見每個字都不等長

[77, 190, 93, 65, 57, 97, 69, 207, 211, 169]

In [99]:
#建立客製化dataset

class dataset(Dataset):
    '''custom dataset to load reviews and labels
    Parameters
    ----------
    data_pairs: list
        directory of all review-label pairs
    vocab: list
        list of vocabularies
    '''
    def __init__(self, data, vocab):
        self.data = data
        self.vocab = vocab

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        review = load_review(self.data[idx][0])
        review_vec = generate_vec(review, vocab_dict)
        review_label = self.data[idx][1]
        return torch.tensor(review_vec), torch.tensor(review_label)
    

#建立客製化collate_fn，將長度不一的文本pad 0 變成相同長度
def collate_fn(batch):
    corpus, labels = zip(*batch)

    lengths = [len(x) for x in corpus]
    max_length = max(lengths)

    batch_corpus = []
    for i in range(len(corpus)):
        temp_pads = torch.zeros(max_length)
        temp_pads[:lengths[i]] = corpus[i]
        batch_corpus.append(temp_pads.view(-1,1))
    return torch.cat(batch_corpus,dim=0), torch.tensor(labels), torch.tensor(lengths)

In [107]:
class RandomSequentialSampler(Sampler):
    
    def __init__(self, data_source, batch_size):
        self.data_source = data_source
        self.batch_size = batch_size
        
    def __iter__(self):
        n_batch = len(self) // self.batch_size #calculate number of batches
        leftover = len(self) % self.batch_size #calculate the remaining part
        index = np.zeros(len(self), dtype=int)
        
        #batch
        for i in range(n_batch):
            start_idx = random.randint(0, len(self) - self.batch_size)
            batch_idx = start_idx + np.arange(0, self.batch_size)
            index[self.batch_size * i : self.batch_size * (i+1)] = batch_idx
            
        #dealing with leftover
        if leftover:
            tail_start = random.randint(0, len(self) - leftover)
            tail_index = tail_start + np.arange(0, leftover)
            index[-leftover:] = tail_index
            
        return iter(index)
    
    def __len__(self):
        
        return len(self.data_source)

In [113]:
# 使用Pytorch的RandomSampler來進行indice讀取並建立dataloader
custom_dataset = dataset(review_pairs, vocab_dict)
custom_dataloader = DataLoader(custom_dataset, shuffle=True, collate_fn=collate_fn, batch_size=2)
next(iter(custom_dataloader))

(tensor([[0.0000e+00],
         [3.7400e+02],
         [1.3812e+04],
         [5.3030e+03],
         [2.0690e+03],
         [8.2500e+02],
         [2.8070e+03],
         [1.7561e+04],
         [2.2067e+04],
         [8.3880e+03],
         [1.7653e+04],
         [3.1170e+03],
         [1.1330e+03],
         [5.6450e+03],
         [2.4085e+04],
         [0.0000e+00],
         [0.0000e+00],
         [6.1000e+02],
         [5.1560e+03],
         [0.0000e+00],
         [0.0000e+00],
         [3.1600e+02],
         [0.0000e+00],
         [3.6000e+01],
         [7.4720e+03],
         [3.2770e+03],
         [3.7199e+04],
         [0.0000e+00],
         [3.3000e+02],
         [1.3580e+04],
         [0.0000e+00],
         [3.5818e+04],
         [0.0000e+00],
         [7.3000e+02],
         [1.0740e+03],
         [4.5700e+02],
         [3.8220e+03],
         [1.6048e+04],
         [3.1716e+04],
         [1.9660e+03],
         [4.4660e+03],
         [1.5900e+03],
         [1.0000e+02],
         [2

In [114]:

next(iter(custom_dataloader))

(tensor([[1.0100e+02],
         [0.0000e+00],
         [8.9332e+04],
         [4.8000e+01],
         [9.3000e+01],
         [2.8000e+02],
         [1.7000e+01],
         [5.7000e+01],
         [1.0700e+02],
         [0.0000e+00],
         [0.0000e+00],
         [3.9200e+02],
         [6.2800e+02],
         [6.2000e+01],
         [4.0200e+02],
         [3.5689e+04],
         [4.6000e+01],
         [0.0000e+00],
         [2.4300e+02],
         [1.5860e+03],
         [1.5260e+03],
         [7.5325e+04],
         [2.0840e+03],
         [1.8680e+03],
         [0.0000e+00],
         [7.8800e+02],
         [3.6900e+02],
         [8.7683e+04],
         [5.6590e+03],
         [5.2740e+03],
         [1.2200e+03],
         [7.6600e+02],
         [1.3040e+03],
         [3.4300e+03],
         [1.1900e+02],
         [3.9810e+03],
         [2.3100e+02],
         [7.8900e+02],
         [3.7340e+03],
         [4.5800e+02],
         [1.1400e+02],
         [2.2800e+02],
         [1.1600e+02],
         [0