# 4. IMDbのDataLoaderを作成

## 4.1. IMDbのデータをダウンロード

In [None]:
import urllib
import tarfile

url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
save_path = "/content/drive/My Drive/Transformer/data/aclImdb_v1.tar.gz"
urllib.request.urlretrieve(url, save_path)

tar = tarfile.open(save_path)
tar.extractall("/content/drive/My Drive/Transformer/data/")
tar.close()

## 4.2. tsv形式のデータセットを作成

In [15]:
import glob
import os
import io

# 学習データ
train_file = open("/content/drive/My Drive/Transformer/data/IMDb_train.tsv", "w")

path = "/content/drive/My Drive/Transformer/data/aclImdb/train/"
for fname in glob.glob(os.path.join(path + "pos", "*.txt")):
    with io.open(fname, "r", encoding="utf-8") as f:
        text = f.readline()
        text = text.replace("\t", " ")  # タブを半角スペースに置換
        text = text + "\t" + "1" + "\t" + "\n"  # ポジティブ(1)のラベルを付与
        train_file.write(text)

for fname in glob.glob(os.path.join(path + "neg", "*.txt")):
    with io.open(fname, "r", encoding="utf-8") as f:
        text = f.readline()
        text = text.replace("\t", " ")  # タブを半角スペースに置換
        text = text + "\t" + "0" + "\t" + "\n"  # ネガティブ(0)のラベルを付与
        train_file.write(text)

train_file.close()



In [16]:
# テストデータ
test_file = open("/content/drive/My Drive/Transformer/data/IMDb_test.tsv", "w")

path = "/content/drive/My Drive/Transformer/data/aclImdb/test/"
for fname in glob.glob(os.path.join(path + "pos", "*.txt")):
    with io.open(fname, "r", encoding="utf-8") as f:
        text = f.readline()
        text = text.replace("\t", " ")  # タブを半角スペースに置換
        text = text + "\t" + "1" + "\t" + "\n"  # ポジティブ(1)のラベルを付与
        test_file.write(text)

for fname in glob.glob(os.path.join(path + "neg", "*.txt")):
    with io.open(fname, "r", encoding="utf-8") as f:
        text = f.readline()
        text = text.replace("\t", " ")  # タブを半角スペースに置換
        text = text + "\t" + "0" + "\t" + "\n"  # ネガティブ(0)のラベルを付与
        test_file.write(text)

test_file.close()



## 4.3. 前処理と単語分割関数の定義

In [1]:
import re
import string

print(string.punctuation)   # 句点扱いの記号

# 前処理
def preprocessing_text(text):
    text = re.sub("<br />", "", text)  # 改行コードの削除

    for p in string.punctuation:
        if (p == ".") or (p == ","):    # カンマとピリオドはそのまま
            continue
        else:
            text = text.replace(p, " ")

    # カンマとピリオドの前後に半角スペースをいれる
    text = text.replace(".", " . ")
    text = text.replace(",", " , ")
    return text

# 単語分割
def tokenizer_janome(text):
    # 英語なので半角スペースで分割するのみ
    return text.strip().split()

# 前処理＋単語分割
def tokenizer_with_preprocessing(text):
    return tokenizer_janome(preprocessing_text(text))

print(tokenizer_with_preprocessing("I like cats."))

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['I', 'like', 'cats', '.']


## 4.4. Datasetの作成

In [4]:
import torchtext

max_length = 256 # 1つの文章内の最大単語数

TEXT = torchtext.data.Field(
    sequential=True,    # 可変データか
    tokenize=tokenizer_with_preprocessing,  # 単語分割の関数
    use_vocab=True,     # 単語を辞書に追加するか
    lower=True,         # アルファベットを小文字にするか
    include_lengths=True,   # 単語数を保持するか
    batch_first=True,       # バッチの次元が先にくるか
    fix_length=max_length,  # 各文章をパディングして同じ長さにする
    init_token="<cls>",     # 文章の最初の単語
    eos_token="<eos>"       # 文章の最後の単語
)

LABEL = torchtext.data.Field(
    sequential=False,    # 可変データか
    use_vocab=False,     # 単語を辞書に追加するか
)

# ※GoogleDriveに接続
train_val_ds, test_ds = torchtext.data.TabularDataset.splits(
    path="/content/drive/My Drive/Transformer/data/",
    train="IMDb_train.tsv",
    test="IMDb_test.tsv",
    format="tsv",
    fields=[("Text", TEXT), ("Label", LABEL)]
)

# 学習データと検証データを分割する
import random
train_ds, val_ds = train_val_ds.split(split_ratio=0.8, random_state=random.seed(1234))

print("学習データ数: {}".format(len(train_ds)))
print("1つ目の学習データ: {}".format(vars(train_ds[0])))
print("検証データ数: {}".format(len(val_ds)))
print("1つ目の検証データ: {}".format(vars(val_ds[0])))
print("テストデータ数: {}".format(len(test_ds)))
print("1つ目のテストデータ: {}".format(vars(test_ds[0])))

学習データ数: 20000
1つ目の学習データ: {'Text': ['there', 'was', 'such', 'a', 'hype', 'about', 'a', 'game', 'show', 'with', 'bill', 'shatner', '.', '.', '.', 'and', 'especially', 'right', 'in', 'the', 'wake', 'of', 'deal', 'or', 'no', 'deal', 'and', '1', 'vs', '100', '.', 'so', ',', 'of', 'course', 'everyone', 'had', 'to', 'tune', 'in', 'to', 'see', 'what', 'all', 'the', 'fuss', 'was', 'about', 'on', 'the', 'new', 'game', 'show', '.', 'what', 'a', 'disappointment', 'as', 'ben', 'stein', 'so', 'stoically', 'and', 'nasally', 'says', ',', 'wooww', '.', 'the', 'only', 'thing', 'likable', 'about', 'this', 'show', 'was', 'the', 'fact', 'that', 'you', 'knew', 'it', 'would', 'eventually', 'be', 'over', '.', 'sitting', 'through', 'a', 'full', 'hour', 'of', 'it', 'was', 'like', 'going', 'to', 'the', 'dentist', '.', '.', '.', 'you', 'find', 'yourself', 'looking', 'at', 'the', 'clock', 'in', 'what', 'you', 'think', 'are', '10', 'minute', 'intervals', ',', 'only', 'to', 'find', 'out', 'that', 'only', 'a', 'minut

## 4.5. fasttextの単語ベクトルを作成

In [6]:
from torchtext.vocab import Vectors

english_fasttext_vectors = Vectors("/content/drive/My Drive/Transformer/data/wiki-news-300d-1M.vec")
print("1単語の次元数: {}".format(english_fasttext_vectors.dim))
print("単語数: {}".format(len(english_fasttext_vectors.itos)))

  0%|          | 0/999994 [00:00<?, ?it/s]Skipping token b'999994' with 1-dimensional vector [b'300']; likely a header
100%|█████████▉| 999746/999994 [01:56<00:00, 8635.16it/s]

1単語の次元数: 300
単語数: 999994


NameError: ignored

In [7]:
# ボキャブラリの作成
TEXT.build_vocab(train_ds, vectors=english_fasttext_vectors, min_freq=1)
print(TEXT.vocab.vectors.shape)
print(TEXT.vocab.vectors)  # 単語のベクトルを表示
print(TEXT.vocab.stoi)     # 単語のIDを表示

torch.Size([69959, 300])
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])


## 4.6. DataLoaderの作成

In [8]:
train_dl = torchtext.data.Iterator(train_ds, batch_size=32, train=True)
val_dl = torchtext.data.Iterator(val_ds, batch_size=32, train=False, sort=False)
test_dl = torchtext.data.Iterator(test_ds, batch_size=32, train=False, sort=False)

batch = next(iter(train_dl))
print(batch.Text)
print(batch.Label)

(tensor([[   2,  105,   43,  ...,    1,    1,    1],
        [   2,   14,   87,  ...,    4,   93,    3],
        [   2,   12, 1926,  ...,    1,    1,    1],
        ...,
        [   2,   52,   50,  ...,    5,    5,    3],
        [   2,   19,  213,  ..., 1325,    4,    3],
        [   2,   14,  382,  ...,  106,   16,    3]]), tensor([131, 256, 167, 129, 147, 145, 237, 147, 256, 227, 208, 158, 186, 196,
        256, 236, 177, 221, 160, 215, 256, 159, 256, 219, 256, 256, 256, 129,
        256, 256, 256, 256]))
tensor([0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0,
        1, 1, 0, 1, 0, 0, 1, 0])
