### 作業目的: 熟練Pytorch Dataset與DataLoader進行資料讀取

本此作業主要會使用[IMDB](http://ai.stanford.edu/~amaas/data/sentiment/)資料集利用Pytorch的Dataset與DataLoader進行
客製化資料讀取。
下載後的資料有分成train與test，因為這份作業目的在讀取資料，所以我們取用train部分來進行練習。
(請同學先行至IMDB下載資料)

### 載入套件

In [2]:
# Import torch and other required modules
import glob
import torch
import re
import nltk
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.datasets import load_svmlight_file
from nltk.corpus import stopwords

nltk.download('stopwords') #下載stopwords
nltk.download('punkt') #下載word_tokenize需要的corpus

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yz830\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yz830\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

### 探索資料與資料前處理
在train資料中，有分成pos(positive)與neg(negative)，分別為正評價與負評價，此評價即為label。

In [7]:
load_svmlight_file

<function sklearn.datasets._svmlight_format_io.load_svmlight_file(f, *, n_features=None, dtype=<class 'numpy.float64'>, multilabel=False, zero_based='auto', query_id=False, offset=0, length=-1)>

In [9]:
!tar zxvf aclImdb_v1.tar.gz

lImdb/test/neg/347_1.txt
x aclImdb/test/neg/346_2.txt
x aclImdb/test/neg/345_3.txt
x aclImdb/test/neg/344_4.txt
x aclImdb/test/neg/343_1.txt
x aclImdb/test/neg/342_1.txt
x aclImdb/test/neg/341_1.txt
x aclImdb/test/neg/340_2.txt
x aclImdb/test/neg/339_1.txt
x aclImdb/test/neg/338_1.txt
x aclImdb/test/neg/337_1.txt
x aclImdb/test/neg/336_1.txt
x aclImdb/test/neg/335_2.txt
x aclImdb/test/neg/334_2.txt
x aclImdb/test/neg/333_3.txt
x aclImdb/test/neg/332_1.txt
x aclImdb/test/neg/331_4.txt
x aclImdb/test/neg/330_1.txt
x aclImdb/test/neg/329_1.txt
x aclImdb/test/neg/328_4.txt
x aclImdb/test/neg/327_1.txt
x aclImdb/test/neg/326_1.txt
x aclImdb/test/neg/325_1.txt
x aclImdb/test/neg/324_3.txt
x aclImdb/test/neg/323_2.txt
x aclImdb/test/neg/322_1.txt
x aclImdb/test/neg/321_1.txt
x aclImdb/test/neg/320_3.txt
x aclImdb/test/neg/319_1.txt
x aclImdb/test/neg/318_3.txt
x aclImdb/test/neg/317_2.txt
x aclImdb/test/neg/316_1.txt
x aclImdb/test/neg/315_4.txt
x aclImdb/test/neg/314_2.txt
x aclImdb/test/neg

In [17]:
# 讀取字典，這份字典為review內所有出現的字詞
import os

def read_vocab():
    with open(os.path.join('aclImdb', 'imdb.vocab'), 'r', encoding='utf8') as vocab:
        return vocab.read().split()

vocab = read_vocab()
print(vocab[:30])

# 以nltk stopwords移除贅字，過多的贅字無法提供有用的訊息，也可能影響模型的訓練
print(f"vocab length before removing stopwords: {len(vocab)}")
###<your code>###
print(f"vocab length after removing stopwords: {len(vocab)}")

# 將字典轉換成dictionary
vocab_dic = {voc:i for i, voc in enumerate(vocab)}
vocab_dic

['the', 'and', 'a', 'of', 'to', 'is', 'it', 'in', 'i', 'this', 'that', 'was', 'as', 'for', 'with', 'movie', 'but', 'film', 'on', 'not', 'you', 'he', 'are', 'his', 'have', 'be', 'one', '!', 'all', 'at']
vocab length before removing stopwords: 89527
vocab length after removing stopwords: 89527


{'the': 0,
 'and': 1,
 'a': 2,
 'of': 3,
 'to': 4,
 'is': 5,
 'it': 6,
 'in': 7,
 'i': 8,
 'this': 9,
 'that': 10,
 'was': 11,
 'as': 12,
 'for': 13,
 'with': 14,
 'movie': 15,
 'but': 16,
 'film': 17,
 'on': 18,
 'not': 19,
 'you': 20,
 'he': 21,
 'are': 22,
 'his': 23,
 'have': 24,
 'be': 25,
 'one': 26,
 '!': 27,
 'all': 28,
 'at': 29,
 'by': 30,
 'an': 31,
 'who': 32,
 'they': 33,
 'from': 34,
 'so': 35,
 'like': 36,
 'there': 37,
 'her': 38,
 'or': 39,
 'just': 40,
 'about': 41,
 'if': 42,
 'has': 43,
 'out': 44,
 'what': 45,
 '?': 46,
 'some': 47,
 'good': 48,
 'more': 49,
 'when': 50,
 'she': 51,
 'very': 52,
 'even': 53,
 'my': 54,
 'no': 55,
 'up': 56,
 'time': 57,
 'would': 58,
 'which': 59,
 'only': 60,
 'story': 61,
 'really': 62,
 'their': 63,
 'see': 64,
 'had': 65,
 'can': 66,
 'were': 67,
 'me': 68,
 'we': 69,
 'than': 70,
 'well': 71,
 'much': 72,
 'been': 73,
 'get': 74,
 'people': 75,
 'will': 76,
 'bad': 77,
 'other': 78,
 'also': 79,
 'into': 80,
 'do': 81,
 'becau

In [20]:
# 將資料打包成(x, y)配對，其中x為review的檔案路徑，y為正評(1)或負評(0)
# 這裡將x以檔案路徑代表的原因是讓同學練習不一次將資料全讀取進來，若電腦記憶體夠大(所有資料檔案沒有很大)
# 可以將資料全一次讀取，可以減少在訓練時I/O時間，增加訓練速度

review_pos = glob.glob('aclImdb/train/pos/*.txt')
review_neg = glob.glob('aclImdb/train/neg/*.txt')
review_all = review_pos + review_neg
y = [1] * len(review_pos) + [0] * len(review_neg)
review_pairs = list(zip(review_all, y))

print(review_pairs[:2])
print(f"Total reviews: {len(review_pairs)}")

[('aclImdb/train/pos\\0_9.txt', 1), ('aclImdb/train/pos\\10000_8.txt', 1)]
Total reviews: 25000


### 建立Dataset與DataLoader讀取資料
這裡我們會需要兩個helper functions，其中一個是讀取資料與清洗資料的函式(load_review)，另外一個是生成詞向量BoW的函式
(generate_bow)

In [29]:
def load_review(review_path):
    
    with open(review_path, 'r') as f:
        review = f.read()
        
    #移除non-alphabet符號、贅字與tokenize
    reveiw = re.sub('[^a-zA-Z]', '', review)
    review = nltk.word_tokenize(review)
    review = list(set(review).difference(set(stopwords.words('english'))))
    
    return review

In [22]:
def generate_bow(review, vocab_dic):
    bag_vector = np.zeros(len(vocab_dic))
    for word in review:
        if vocab_dic.get(word):
            bag_vector[vocab_dic.get(word)] += 1
            
    return bag_vector

In [30]:
class dataset(Dataset):
    '''custom dataset to load reviews and labels
    Parameters
    ----------
    data_pairs: list
        directory of all review-label pairs
    vocab: list
        list of vocabularies
    '''
    def __init__(self, data_dirs, vocab):
        self.data_dirs = data_dirs
        self.vocab = vocab

    def __len__(self):
        return len(self.data_dirs)

    def __getitem__(self, idx):
        pair = self.data_dirs[idx]
        review = pair[0]
        review = load_review(review)
        review = generate_bow(review, self.vocab)
        
        return review, pair[1]

In [35]:
# 建立客製化dataset
custom_dst = dataset(review_pairs, vocab_dic)
custom_dst[20000]


(array([0., 0., 0., ..., 0., 0., 0.]), 0)

In [39]:
# 建立dataloader
custom_dataloader = DataLoader(dataset=custom_dst, batch_size=12, shuffle=True)
next(iter(custom_dataloader))

[tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64),
 tensor([0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0])]