## Warning!!
### 이 jupyter notebook은 왠만하면 새로운 conda환경을 만들거나, colab에서 진행하세요...

In [1]:
import torchdata
import torchtext

In [None]:
# !pip install torchtext==0.14.1
# !pip install torchdata==0.5.1

In [9]:
import numpy as np
import pandas as pd
import torch

In [10]:
trn_file_path = 'data/AG_news_train.csv'
tst_file_path = 'data/AG_news_test.csv'

In [11]:
df_trn = pd.read_csv(trn_file_path, names=['class', 'title', 'description'])
df_trn.head()

Unnamed: 0,class,title,description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [12]:
df_tst = pd.read_csv(tst_file_path, names=['class', 'title', 'description'])
df_tst.head()

Unnamed: 0,class,title,description
0,3,Fears for T N pension after talks,Unions representing workers at Turner Newall...
1,4,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o..."
2,4,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...
3,4,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...
4,4,Calif. Aims to Limit Farm-Related Smog (AP),AP - Southern California's smog-fighting agenc...


# 1. torchtext를 활용한 Dataset 생성

In [13]:
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)

VOCABULARY_SIZE = 20000
LEARNING_RATE = 2e-4
BATCH_SIZE = 128
NUM_EPOCHS = 15
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

EMBEDDING_DIM = 128
HIDDEN_DIM = 256
NUM_CLASSES = 4

## (1) Dataset 준비하기
- <span style = 'font-size:1.2em;line-height:1.5em'>torchdata의 datapipe API를 활용</span>

In [14]:
from torchdata.datapipes.iter import IterableWrapper, FileOpener

In [15]:
trn_datapipe = IterableWrapper([trn_file_path])
tst_datapipe = IterableWrapper([tst_file_path])

trn_datapipe = FileOpener(trn_datapipe, mode='b')
tst_datapipe = FileOpener(tst_datapipe, mode='b')

trn_datapipe = trn_datapipe.parse_csv()
tst_datapipe = tst_datapipe.parse_csv()

for trn_sample in trn_datapipe:
    print(trn_sample)
    break
    
for tst_sample in tst_datapipe:
    print(tst_sample)
    break

['3', 'Wall St. Bears Claw Back Into the Black (Reuters)', "Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again."]
['3', 'Fears for T N pension after talks', "Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul."]


## (2) Train/Validation set split and Test set

In [16]:
# Get the number of rows in dataset
N_ROWS = len(list(trn_datapipe))

# Split into training and val datapipes early on. Will build vocabulary from training datapipe only.
trn_dp, val_dp = trn_datapipe.random_split(total_length=N_ROWS,
                                           weights={"train": 0.8, "valid": 0.2},
                                           seed=0)

print(f'Num Train: {len(list(trn_dp))}')
print(f'Num Validate: {len(list(val_dp))}')

Num Train: 96000
Num Validate: 24000


In [17]:
# Get the number of rows in dataset
N_ROWS = len(list(tst_datapipe))

tst_dp = tst_datapipe

print(f'Num Test: {len(list(tst_dp))}')

Num Test: 7600


## (3) Vocabulary set 생성하기

- <span style = 'font-size:1.3em;line-height:1.5em'><b>build_vocab_from_iterator()</b>를 활용하여 `VOCABULARY_SIZE`만큼의 vocabulary set을 생성</span>
- <span style = 'font-size:1.3em;line-height:1.5em'><b>yield_token(data_iter)</b></span>
    - <span style = 'font-size:1.2em;line-height:1.5em'>data_iter에서 순차적으로 생성되는 값을 활용하여 다음과 같은 역할 수행</span>
        - <span style = 'font-size:1.1em;line-height:1.5em'>Text column에 대해서는 tokenize한 결과를 반환</span>
        - <span style = 'font-size:1.1em;line-height:1.5em'>Category column에 대해서는 그 자체를 반환</span>
    - <span style = 'font-size:1.2em;line-height:1.5em'>위와 같은 역할을 수행하는 generator이다.</span>
    - <span style = 'font-size:1.2em;line-height:1.5em'>참고: iterable(iterator, generator 등)에 관한 설명 (https://wikidocs.net/134909) </span>

- <span style = 'font-size:1.3em;line-height:1.5em'><b>build_vocab_from_iterator()</b></span>
    - <span style = 'font-size:1.2em;line-height:1.5em'>Iterative한 객체가 iterative하게 생성하는 값을 활용하여 vocabulary dict 를 생성</span>

In [18]:
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

In [19]:
import copy
import nltk
import re
from nltk.tokenize import TreebankWordTokenizer

def preprocess_english(text):
    my_text = copy.copy(text)
    my_text = my_text.replace('\n', '')
    sents = nltk.sent_tokenize(my_text)
    tokenizer = TreebankWordTokenizer()
    stopwords = nltk.corpus.stopwords.words('english')
    
    p = re.compile('[^A-Za-z]')
    result = []
    for sent in sents:
        sent = sent.lower() # 소문자로 변환
        sent = p.sub(' ', sent) # 각 문장에서 특수문자 제거
        word_tokens = tokenizer.tokenize(sent) # word tokenization
        for token in word_tokens:
            if token not in stopwords:
                result.append(token) # stopwords removal
    return result

In [20]:
tokenizer = get_tokenizer(preprocess_english) # 별도로 정의한 함수로 tokenizing하기
# tokenizer = get_tokenizer('basic_english') # 기본설정으로 tokenizing하기

In [21]:
## 혹시 위에서 에러가 발생하는 분들은 이 코드를 실행해주세요,
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')

In [22]:
def yield_tokens(data_iter, tokenizer, data_type = 'description'):
    assert data_type in ['class','title','description']
    for label, title, text in data_iter:
        if data_type == 'description':
            yield tokenizer(text)
        elif data_type == 'title':
            yield tokenizer(title)
        elif data_type == 'class':
            yield [label]

def get_vocab(train_datapipe, tokenizer, data_type = 'description', specials = ["<UNK>", "<PAD>"]):
    assert data_type in ['class','title','description']
    vocab = build_vocab_from_iterator(yield_tokens(train_datapipe, tokenizer, data_type),
                                      min_freq=3,
                                      specials=specials,
                                      max_tokens=VOCABULARY_SIZE)
    vocab.set_default_index(vocab["<UNK>"])
    return vocab

### yield_tokens의 역할을 한번 봅시다.

In [23]:
class_generator = yield_tokens(trn_dp, tokenizer, data_type='class')
title_generator = yield_tokens(trn_dp, tokenizer, data_type='title')
text_generator = yield_tokens(trn_dp, tokenizer, data_type='description')

print(list(trn_dp)[0]) ## train_dp의 0번째 값 보기
print()
print(next(class_generator)) ## class_generator에서 처음으로 생성된 값 보기
print()
print(next(title_generator)) ## label_generator에서 처음으로 생성된 값 보기
print()
print(next(text_generator)) ## text_generator에서 처음으로 생성된 값 보기


['3', 'Carlyle Looks Toward Commercial Aerospace (Reuters)', 'Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.']

['3']

['carlyle', 'looks', 'toward', 'commercial', 'aerospace', 'reuters']

['reuters', 'private', 'investment', 'firm', 'carlyle', 'group', 'reputation', 'making', 'well', 'timed', 'occasionally', 'controversial', 'plays', 'defense', 'industry', 'quietly', 'placed', 'bets', 'another', 'part', 'market']


### get_vocab 역할을 한번 봅시다.

In [24]:
text_vocab = get_vocab(trn_dp, tokenizer, data_type = 'description')
label_vocab = get_vocab(trn_dp, tokenizer, data_type = 'class', specials = ["<UNK>"])
print("Text vocabulary size: ", len(text_vocab))
print("Label vocabulary size: ", len(label_vocab))

Text vocabulary size:  20000
Label vocabulary size:  5


### vocab.get_itos(): 각 index에 해당하는 vocab들이 index 순으로 저장된 list

In [27]:
text_vocab.get_itos()

['<UNK>',
 '<PAD>',
 'said',
 'new',
 'reuters',
 'two',
 'us',
 'quot',
 'year',
 'first',
 'ap',
 'gt',
 'lt',
 'monday',
 'wednesday',
 'tuesday',
 'one',
 'world',
 'company',
 'thursday',
 'inc',
 'friday',
 'u',
 'last',
 'yesterday',
 'york',
 'president',
 'three',
 'week',
 'million',
 'corp',
 'oil',
 'sunday',
 'time',
 'would',
 'united',
 'game',
 'government',
 'today',
 'people',
 'years',
 'could',
 'group',
 'com',
 'second',
 'n',
 'percent',
 'iraq',
 'saturday',
 'software',
 'third',
 'night',
 'next',
 'season',
 'prices',
 'fullquote',
 'day',
 'security',
 'microsoft',
 'stocks',
 'quarter',
 'team',
 'announced',
 'internet',
 'minister',
 'state',
 'international',
 'four',
 'back',
 'high',
 'washington',
 'market',
 'billion',
 'may',
 'news',
 'former',
 'officials',
 'top',
 'win',
 'business',
 'states',
 'says',
 'month',
 'city',
 'victory',
 'record',
 'country',
 'end',
 'european',
 'open',
 'service',
 'reported',
 'technology',
 'largest',
 'league

### vocab.get_stoi(): 각 vocab이 어떤 index와 mapping되는지 저장된 dict

In [28]:
text_vocab.get_stoi()

{'picked': 2217,
 'kamal': 12855,
 'last': 23,
 'introduction': 5207,
 'rebounded': 3813,
 'greater': 1829,
 'weight': 3181,
 'waters': 3298,
 'helicopters': 4045,
 'conscious': 12151,
 'monday': 13,
 'honda': 3749,
 'southern': 378,
 'pensacola': 11849,
 'spawned': 13070,
 'sleek': 8975,
 'crashed': 1692,
 'headaches': 12803,
 'us': 6,
 'alongside': 5058,
 'two': 5,
 'underdog': 9752,
 'forming': 7526,
 'races': 3271,
 'sneaking': 19032,
 'n': 45,
 'grenada': 6192,
 'unintended': 17640,
 'olympics': 698,
 'entrenched': 12207,
 'qcom': 11400,
 'new': 3,
 'television': 484,
 'aid': 814,
 'savannah': 18949,
 'late': 312,
 'lp': 6300,
 'spirits': 6681,
 'francisco': 271,
 'night': 51,
 'list': 725,
 'ringers': 8722,
 'definite': 18068,
 'first': 9,
 '<UNK>': 0,
 'involving': 2170,
 'depicting': 19787,
 'participation': 9141,
 'stars': 1632,
 '<PAD>': 1,
 'shorter': 12487,
 'toll': 1598,
 'paypass': 15126,
 'inception': 15912,
 'ireland': 960,
 'passes': 1365,
 'organisations': 7748,
 'sec

In [29]:
print(label_vocab.get_itos())

['<UNK>', '3', '2', '1', '4']


### vocabulary set을 활용하여 vocabulary를 integer index로 변환

In [30]:
print(text_vocab['<UNK>'])
print(text_vocab['reuters'])
print(text_vocab['<PAD>'])

0
4
1


In [31]:
UNK_VALUE = text_vocab['<UNK>']
PADDING_VALUE = text_vocab['<PAD>']

### vocabulary set에 존재하지 않는 token은 \<UNK\>으로 취급됨

In [32]:
print(text_vocab['abcbacbac']) # unknown(<UNK>) token

0


## (4) text와 label을 전처리하는 `text_transform`, `label_transform`을 정의

In [33]:
text_transform = lambda x: [text_vocab[token] for token in tokenizer(x)]
label_transform = lambda x: label_vocab[x]

- <span style = 'font-size:1.3em;line-height:1.5em'>transform을 적용한 결과를 확인</span>

In [34]:
# Print out the output of text_transform
my_str = "Unions representing workers at Turner Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul."
print("input string:", my_str)
print()
print("tokenize result:", tokenizer(my_str))
print()
print("text_transform result:", text_transform(my_str))

input string: Unions representing workers at Turner Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.

tokenize result: ['unions', 'representing', 'workers', 'turner', 'newall', 'say', 'disappointed', 'talks', 'stricken', 'parent', 'firm', 'federal', 'mogul']

text_transform result: [1785, 2911, 339, 7060, 0, 167, 4093, 201, 8132, 1977, 251, 100, 8901]


# 2. DataLoader 생성하기

- <span style = 'font-size:1.3em;line-height:1.5em'>Pytorch의 DataLoader를 활용하여 batch를 자동으로 생성되도록 하자</span>
    - <span style = 'font-size:1.2em;line-height:1.5em'>`torch.utils.data.DataLoader`</span>
- <span style = 'font-size:1.3em;line-height:1.5em'>이 때, Pytorch 모델에 입력하기 위해서는 다음과 같은 작업이 필요하다</span>
    - <span style = 'font-size:1.2em;line-height:1.5em'>(1) index가 들어있는 list들을 torch tensor형태로 변환</span>
    - <span style = 'font-size:1.2em;line-height:1.5em'>(2) RNN 계열의 모델에 입력할 경우, batch내에서 가장 긴 sequence길이로 batch내 모든 sequence의 길이를 맞춰줘야함(padding)</span>

In [35]:
from torch.utils.data import Sampler, Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [36]:
def collate_batch(batch):
    desc_list, label_list = [], []
    for (_label, _title, _desc) in batch:
        processed_desc = torch.tensor(text_transform(_desc))
        desc_list.append(processed_desc)
        label_list.append(label_transform(_label))     
        
    labels = torch.tensor(label_list).to(DEVICE)
    descs = pad_sequence(desc_list, padding_value = PADDING_VALUE, batch_first=True).to(DEVICE)
    
    return labels, descs

In [37]:
BATCH_SIZE = 32

trn_dp_list = list(trn_dp)
trn_loader = DataLoader(trn_dp_list, 
                        batch_size = BATCH_SIZE, 
                        collate_fn = collate_batch, shuffle=True)

In [38]:
for i, (label_batch, text_batch) in enumerate(trn_loader):
    print(label_batch.size(), text_batch.size())
    if i == 5:
        break

torch.Size([32]) torch.Size([32, 44])
torch.Size([32]) torch.Size([32, 55])
torch.Size([32]) torch.Size([32, 41])
torch.Size([32]) torch.Size([32, 45])
torch.Size([32]) torch.Size([32, 45])
torch.Size([32]) torch.Size([32, 45])


In [39]:
text_batch

tensor([[9803,  744, 2255,  ...,    1,    1,    1],
        [ 653, 1323,  246,  ...,    1,    1,    1],
        [1087,  411,  113,  ...,    1,    1,    1],
        ...,
        [2558, 1112,   73,  ...,    1,    1,    1],
        [ 786,  846, 3055,  ...,    1,    1,    1],
        [   4,  177, 7304,  ...,    1,    1,    1]])

- <span style = 'font-size:1.3em;line-height:1.5em'>array of integers를 corresponding string token으로 변환하려면?</span>

In [152]:
vocab_itos = text_vocab.get_itos()
vec_vocab_itos = np.vectorize(lambda x: vocab_itos[x])
vec_vocab_itos(text_batch[1])

array(['according', 'international', 'ice', 'hockey', 'federation', 'n',
       'h', 'l', 'players', 'signed', 'european', 'teams', 'could',
       'hockey', 'headed', 'new', 'world', 'order', '<PAD>', '<PAD>',
       '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>',
       '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>',
       '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>',
       '<PAD>', '<PAD>', '<PAD>'], dtype='<U13')

### [Optional] batch를 만들 때, 길이가 비슷한 것들끼리 생성하자

In [153]:
# Create our own sampler, to ensure we function with multiple worker threads
# See https://discuss.pytorch.org/t/using-distributedsampler-in-combination-with-batch-sampler-to-make-sure-batches-have-sentences-of-similar-length/119824/3

import random

class BatchSamplerSimilarLength(Sampler):
    def __init__(self, dataset, batch_size, indices=None, shuffle=True):
        self.batch_size = batch_size
        self.shuffle = shuffle
        # get the indices and length
        self.indices = [(i, len(tokenizer(s[1]))) for i, s in enumerate(dataset)]
        # if indices are passed, then use only the ones passed (for ddp)
        if indices is not None:
            self.indices = torch.tensor(self.indices)[indices].tolist()

    def __iter__(self):
        if self.shuffle:
            random.shuffle(self.indices)

        pooled_indices = []
        # create pool of indices with similar lengths
        for i in range(0, len(self.indices), self.batch_size * 10):
            pooled_indices.extend(sorted(self.indices[i:i + self.batch_size * 10], key=lambda x: x[1]))
        self.pooled_indices = [x[0] for x in pooled_indices]

        # Comment in for validation
        #self.pooled_lengths = [x[1] for x in pooled_indices]
        #print(self.pooled_lengths)
        #print(self.pooled_indices)

        # yield indices for current batch
        batches = [self.pooled_indices[i:i + self.batch_size] for i in
                   range(0, len(self.pooled_indices), self.batch_size)]

        if self.shuffle:
            random.shuffle(batches)
        for batch in batches:
            yield batch

    def __len__(self):
        return len(self.pooled_indices) // self.batch_size

In [154]:
%%time

trn_dp_list = list(trn_dp)
trn_loader = DataLoader(trn_dp_list,
                        batch_sampler=BatchSamplerSimilarLength(dataset = trn_dp_list,
                                                                batch_size=BATCH_SIZE),
                        collate_fn=collate_batch)

CPU times: total: 33.5 s
Wall time: 33.6 s


In [155]:
for i, (label_batch, text_batch) in enumerate(trn_loader):
    print(label_batch.size(), text_batch.size())
    if i == 5:
        break

torch.Size([32]) torch.Size([32, 36])
torch.Size([32]) torch.Size([32, 29])
torch.Size([32]) torch.Size([32, 28])
torch.Size([32]) torch.Size([32, 29])
torch.Size([32]) torch.Size([32, 32])
torch.Size([32]) torch.Size([32, 38])


In [156]:
vocab_itos = text_vocab.get_itos()
vec_vocab_itos = np.vectorize(lambda x: vocab_itos[x])
vec_vocab_itos(text_batch[1])

array(['president', 'thabo', 'mbeki', 'publicly', 'clear', 'name',
       'amidst', 'claims', 'allowed', 'deposed', 'haitian', 'leader',
       'jean', 'bertrand', 'aristide', 'incite', 'violence', 'south',
       'africa', 'says', 'african', 'christian', 'democratic', 'party',
       '<UNK>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>',
       '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>'],
      dtype='<U10')

In [157]:
text_batch[1]

tensor([   26,  6808,  6413,  3594,   929,   768, 11552,   705,  1434, 11651,
         4822,   155,  2478,  4197,  4317, 18409,   577,   140,   686,    81,
          602,  3282,   729,   360,     0,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1])