## 기초조작

In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# squeeze() removes every axis whose size is 1
torch.FloatTensor([[[1, 2, 3]]]).squeeze()

tensor([1., 2., 3.])

In [10]:
# Plain integer arguments are interpreted as the size of the tensor, not as data.
# NOTE(review): the contents of a tensor built this way are presumably
# uninitialized memory, so the zeros shown in the output should not be relied
# on -- confirm against the PyTorch docs.
torch.FloatTensor(2, 4, 3)

tensor([[[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]],

        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]]])

In [8]:
# One-hot encoding with scatter_: lt holds, for each row, the column that gets a 1.
lt = torch.LongTensor([0, 1, 2, 0]).unsqueeze(-1)  # column vector of indices
oh = torch.zeros(lt.size(0), 3)
# Along dim 1, write the value 1 at oh[i][lt[i][0]] for every row i
oh.scatter_(1, lt, 1)

tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [1., 0., 0.]])

In [13]:
# Booleans are stored as 0/1 with dtype uint8
torch.ByteTensor([True, False, True])

tensor([1, 0, 1], dtype=torch.uint8)

In [24]:
# unsqueeze: the nested literal and unsqueeze(1) on a 1-D tensor both give a 2x1 column
torch.FloatTensor([[0],[1]])
torch.FloatTensor([0,1]).unsqueeze(1)

tensor([[0.],
        [1.]])

In [36]:
# cat: concatenating along dim 0 stacks column vectors vertically,
# while row vectors become the rows of a matrix.
a = torch.arange(2)
b = torch.arange(2, 4)
c = torch.arange(5, 7)
columns = [t.unsqueeze(1) for t in (a, b, c)]  # each piece is 2x1
rows = [t.unsqueeze(0) for t in (a, b, c)]  # each piece is 1x2
tmp1 = torch.cat(columns, dim=0)
tmp2 = torch.cat(rows, dim=0)
print(tmp1, '\n', tmp2)

tensor([[0],
        [1],
        [2],
        [3],
        [5],
        [6]]) 
 tensor([[0, 1],
        [2, 3],
        [5, 6]])


In [53]:
# In-place ops (trailing underscore) can be chained; x ends up as [16., 20.]
x = torch.FloatTensor([4, 5])
x.mul_(2).mul_(2)

tensor([16., 20.])

## Loss 이해

In [None]:
# Expanding the log-likelihood inside the cross entropy shows algebraically
# that minimizing the cross entropy == minimizing the negative log-likelihood.

# From the MLE point of view, cross entropy evaluates the prediction
# y_hat (P_theta) under the true label distribution y (P).
# Minimizing the cross entropy is therefore maximizing the log-likelihood, i.e. MLE.
# NOTE(review): z and y_train are not defined in the visible cells; presumably
# z holds logits and y_train holds class indices -- confirm.
# The two calls below express the same loss in two ways.
F.nll_loss(F.log_softmax(z, dim=1), y_train)
F.cross_entropy(z, y_train)

## l2_norm 규제

In [None]:
# Accumulate the norm of every model parameter and add the total to the cost.
# NOTE(review): the norm is used unsquared and without a weighting coefficient,
# and `model` / `cost` must already exist -- confirm this is intended.
l2_reg = 0
for param in model.parameters():
    l2_reg += torch.norm(param)
cost += l2_reg

In [4]:
# Default torch.norm of a 1-D tensor: sqrt(4**2 + 5**2 + 6**2) ~= 8.775
torch.norm(torch.FloatTensor([4,5,6]))

tensor(8.7750)

## torchtext

- TabularDataset.splits()  
테이블형 데이터를 data.Field와 mapping 해서 불러오는 함수  

- data.BucketIterator.splits()  
train과 valid 데이터셋의 iterator를 한 번에 만들고, 길이가 비슷한 example끼리 묶어(bucket) batch를 구성하는 함수  

In [54]:
from torchtext import data, datasets

In [50]:
# TAB으로 구분된 text 데이터 불러오기

class dataloader():
    """Load tab-separated ``label<TAB>text`` files and build bucketed iterators.

    Args:
        train_fn: Path of the training TSV file.
        valid_fn: Path of the validation TSV file.
        batch_size: Batch size forwarded to the iterators.
        device: GPU index (a negative index selects the CPU), or a device
            value such as ``'cpu'`` that is passed through unchanged.
        max_vocab: Maximum size of the text vocabulary.
        min_freq: Minimum token frequency for the text vocabulary.
        use_eos: When true, ``'<eos>'`` is used as the end-of-sentence token.
        shuffle: Forwarded to the iterators.

    Attributes:
        label, text: The fields used to parse the two TSV columns.
        train_iter, valid_iter: Iterators over the training / validation data.
        valid_idter: Alias of ``valid_iter`` kept for the original
            (misspelled) attribute name.
    """

    def __init__(self, train_fn, valid_fn, batch_size=64, device=-1, max_vocab=9e+10, min_freq=1, use_eos=False, shuffle=True):
        super().__init__()
        # An int is a GPU index (negative means CPU); anything else is used as-is.
        if isinstance(device, int):
            device = 'cuda:%d' % device if device >= 0 else 'cpu'

        self.label = data.Field(
            sequential=False,
            use_vocab=True,
            unk_token=None,
        )
        self.text = data.Field(
            use_vocab=True,
            batch_first=True,
            include_lengths=False,
            eos_token='<eos>' if use_eos else None,
        )
        train, valid = data.TabularDataset.splits(
            path='',
            train=train_fn,
            validation=valid_fn,
            format='tsv',
            fields=[('label', self.label), ('text', self.text)],
        )
        self.train_iter, self.valid_iter = data.BucketIterator.splits(
            (train, valid),
            batch_size=batch_size,
            device=device,
            shuffle=shuffle,
            # x.text is an attribute of the example, not a method
            sort_key=lambda x: len(x.text),
            sort_within_batch=True,
        )
        # Keep the old misspelled name working for existing callers.
        self.valid_idter = self.valid_iter

        self.label.build_vocab(train)
        self.text.build_vocab(train, max_size=max_vocab, min_freq=min_freq)

In [59]:
import torchtext

In [63]:
# Special-token indices.
# NOTE(review): presumably the vocabulary indices of the pad / <BOS> / <EOS>
# tokens; they are not referenced in the visible code -- confirm.
PAD, BOS, EOS = 1, 2, 3

# Load a dataset that consists of sentences only, with no labels

class LMdataset(data.Dataset):
    """Dataset of unlabeled sentences: one example per non-empty line of a file.

    Args:
        path: Path of the text file to read.
        fields: Either a list of ``(name, field)`` pairs, or a sequence whose
            first element is a bare field; the bare field is registered under
            the name ``'text'``.
        max_length: Lines whose whitespace-token count is greater than or
            equal to this value are skipped. ``None`` disables the filter.
        **kwargs: Forwarded to the parent dataset constructor.
    """

    def __init__(self, path, fields, max_length=None, **kwargs):
        # A bare field (not a tuple/list pair) is wrapped as a named pair.
        if not isinstance(fields[0], (tuple, list)):
            fields = [('text', fields[0])]

        examples = []
        # Explicit encoding so the result does not depend on the platform default.
        with open(path, encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                # Only filter by length when a limit was actually given.
                if max_length is not None and max_length <= len(line.split()):
                    continue
                examples.append(data.Example.fromlist([line], fields))
        super(LMdataset, self).__init__(examples, fields, **kwargs)
        


class dataloader():
    """Build bucketed iterators over two sentence-only (language-model) files.

    Args:
        train_fn: Path of the training text file.
        valid_fn: Path of the validation text file.
        batch_size: Batch size forwarded to both iterators.
        device: GPU index (a negative index selects the CPU), or a device
            value such as ``'cpu'`` that is passed through unchanged.
        max_vocab: Maximum size of the vocabulary.
        max_length: Length limit forwarded to ``LMdataset``.
        fix_length: Forwarded to the text field.
        use_bos: When true, ``'<BOS>'`` is used as the initial token.
        use_eos: When true, ``'<EOS>'`` is used as the end-of-sentence token.
        shuffle: Forwarded to the training iterator; the validation iterator
            is never shuffled.

    Attributes:
        text: The field shared by both datasets.
        train_iter, valid_iter: Iterators over the training / validation data.
    """

    def __init__(self, train_fn, valid_fn, batch_size=64, device='cpu', max_vocab=9e+10, max_length=255,
                 fix_length=None, use_bos=True, use_eos=True, shuffle=True):
        super().__init__()
        # An int is a GPU index (negative means CPU); anything else, including
        # the default 'cpu', is used as-is instead of being compared with 0.
        if isinstance(device, int):
            device = 'cuda:%d' % device if device >= 0 else 'cpu'

        self.text = data.Field(
            sequential=True,
            use_vocab=True,
            batch_first=True,
            include_lengths=True,
            fix_length=fix_length,
            init_token='<BOS>' if use_bos else None,
            eos_token='<EOS>' if use_eos else None,
        )
        train = LMdataset(
            path=train_fn,
            fields=[('text', self.text)],
            max_length=max_length,
        )
        valid = LMdataset(
            path=valid_fn,
            fields=[('text', self.text)],
            max_length=max_length,
        )

        self.train_iter = data.BucketIterator(
            train,
            batch_size=batch_size,
            device=device,
            shuffle=shuffle,
            # Negated length: longer sentences get smaller sort keys.
            sort_key=lambda x: -len(x.text),
            sort_within_batch=True,
        )
        self.valid_iter = data.BucketIterator(
            valid,
            batch_size=batch_size,
            device=device,
            shuffle=False,
            sort_key=lambda x: -len(x.text),
            sort_within_batch=True,
        )

        self.text.build_vocab(train, max_size=max_vocab)
        

AttributeError: module 'torchtext.data' has no attribute 'Dataset'

In [64]:
# Load two text-only data files (a source file and a target file)

# Special-token indices.
# NOTE(review): presumably the vocabulary indices of the pad / <BOS> / <EOS>
# tokens; they are not referenced in the visible code -- confirm.
PAD, BOS, EOS = 1, 2, 3

class TranslationDataset(data.Dataset):
    """Parallel-text dataset read from two line-aligned files.

    The two files are ``path + exts[0]`` (source) and ``path + exts[1]``
    (target); a ``'.'`` is appended to ``path`` first when it is missing.

    Args:
        path: Common path prefix of the two files.
        exts: Pair of file extensions, source first.
        fields: Either a list of ``(name, field)`` pairs, or a pair of bare
            fields, which are registered as ``'src'`` and ``'trg'``.
        max_length: When truthy, sentence pairs in which either side has more
            whitespace tokens than this value are skipped.
        **kwargs: Forwarded to the parent dataset constructor.
    """

    @staticmethod
    def sort_key(ex):
        """Return a sort key built from the lengths of ``ex.src`` and ``ex.trg``."""
        return data.interleave_keys(len(ex.src), len(ex.trg))

    def __init__(self, path, exts, fields, max_length=None, **kwargs):
        # Local import: the visible file-level imports do not provide `os`.
        import os

        if not isinstance(fields[0], (tuple, list)):
            fields = [('src', fields[0]), ('trg', fields[1])]
        if not path.endswith('.'):
            path += '.'

        src_path, trg_path = tuple(os.path.expanduser(path + x) for x in exts)

        examples = []
        with open(src_path, encoding='utf-8') as src_file, open(trg_path, encoding='utf-8') as trg_file:
            for src_line, trg_line in zip(src_file, trg_file):
                src_line, trg_line = src_line.strip(), trg_line.strip()
                # Compare against the longer of the two sides.
                longest = max(len(src_line.split()), len(trg_line.split()))
                if max_length and max_length < longest:
                    continue
                if src_line != '' and trg_line != '':
                    examples.append(data.Example.fromlist([src_line, trg_line], fields))
        super().__init__(examples, fields, **kwargs)
    

class dataloader():
    """Build source/target fields and bucketed iterators for parallel text.

    The iterators and vocabularies are only created when ``train_fn``,
    ``valid_fn`` and ``exts`` are all given; otherwise only the two fields
    exist and vocabularies can be attached later with ``load_vocab``.

    Args:
        train_fn: Path prefix of the training files, or ``None``.
        valid_fn: Path prefix of the validation files, or ``None``.
        exts: Pair of file extensions (source first), or ``None``.
        batch_size: Batch size forwarded to both iterators.
        device: GPU index (a negative index selects the CPU), or a device
            value such as ``'cpu'`` that is passed through unchanged.
        max_vocab: Maximum size of each vocabulary.
        max_length: Length limit forwarded to ``TranslationDataset``; also
            used as the weight of the source length in the sort key.
        fix_length: Forwarded to both fields.
        use_bos: When true, the target field uses ``'<BOS>'`` as initial token.
        use_eos: When true, the target field uses ``'<EOS>'`` as end token.
        shuffle: Forwarded to the training iterator; the validation iterator
            is never shuffled.
        dsl: When true, the source field also uses ``'<BOS>'`` / ``'<EOS>'``.

    Attributes:
        src, tgt: The source and target fields.
        train_iter, valid_iter: Set only when the datasets were loaded.
    """

    def __init__(self, train_fn=None, valid_fn=None, exts=None, batch_size=64, device='cpu', max_vocab=9e+10,
                 max_length=255, fix_length=None, use_bos=True, use_eos=True, shuffle=True, dsl=False):
        super().__init__()
        # An int is a GPU index (negative means CPU); anything else, including
        # the default 'cpu', is used as-is instead of being compared with 0.
        if isinstance(device, int):
            device = 'cuda:%d' % device if device >= 0 else 'cpu'

        self.src = data.Field(
            sequential=True,
            use_vocab=True,
            batch_first=True,
            include_lengths=True,
            fix_length=fix_length,
            init_token='<BOS>' if dsl else None,
            eos_token='<EOS>' if dsl else None,
        )
        self.tgt = data.Field(
            sequential=True,
            use_vocab=True,
            batch_first=True,
            include_lengths=True,
            fix_length=fix_length,
            init_token='<BOS>' if use_bos else None,
            eos_token='<EOS>' if use_eos else None,
        )

        if train_fn is not None and valid_fn is not None and exts is not None:
            train = TranslationDataset(
                path=train_fn,
                exts=exts,
                fields=[('src', self.src), ('tgt', self.tgt)],
                max_length=max_length,
            )
            valid = TranslationDataset(
                path=valid_fn,
                exts=exts,
                fields=[('src', self.src), ('tgt', self.tgt)],
                max_length=max_length,
            )
            self.train_iter = data.BucketIterator(
                train,
                batch_size=batch_size,
                device=device,
                shuffle=shuffle,
                # Source length is the primary key, target length the tie-breaker.
                sort_key=lambda x: len(x.tgt) + (max_length * len(x.src)),
                sort_within_batch=True,
            )
            self.valid_iter = data.BucketIterator(
                valid,
                batch_size=batch_size,
                device=device,
                shuffle=False,
                sort_key=lambda x: len(x.tgt) + (max_length * len(x.src)),
                sort_within_batch=True,
            )
            self.src.build_vocab(train, max_size=max_vocab)
            self.tgt.build_vocab(train, max_size=max_vocab)

    def load_vocab(self, src_vocab, tgt_vocab):
        """Attach existing vocabularies to the source and target fields."""
        self.src.vocab = src_vocab
        self.tgt.vocab = tgt_vocab

AttributeError: module 'torchtext.data' has no attribute 'Dataset'