# BERT pytorch 버전에 쓰인 각종 문법

## BERT model 의 실행 구조
#### hugging face 코드를 보면 안쪽에서부터 class 구조를 만들고 있음 
#### 위에서부터 순차적으로 봐도 무방 

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import copy
import json
import logging
import math
import os
import shutil
import tarfile
import tempfile
import sys
from io import open

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss

## __init__(생성자) 선언 시 보이는 super()

#### child class 에서 parent class 의 내용을 사용하고 싶을 경우에 이용

#### 오버라이딩 발생

In [114]:
class father():  # parent class
    # The single method that both child classes below inherit.
    def handsome(self):
        print("잘생겼다")
        
class brother(father):  # child class (parent class): inherits father's methods
    '''Son: defines nothing of its own, so father.handsome is used unchanged.'''

class sister(father):  # child class (parent class): inherits father's methods
    # New method that exists only on the child class.
    def pretty(self):
        print("예쁘다")
 
    # Same name as father.handsome, so this definition overrides it.
    def handsome(self):
        '''Override with an empty body: calling it prints nothing.'''

In [115]:
# Bind the instance to its own name. `brother = brother()` would replace the
# class with an instance, so running this cell a second time raises
# TypeError ('brother' object is not callable).
boy = brother()
boy.handsome()  # inherited from father, prints "잘생겼다"
 
girl = sister()
girl.handsome()  # overridden with an empty body, so nothing is printed
girl.pretty()

잘생겼다
예쁘다


#### super로 parent class method 이용

In [116]:
class father():  # parent class
    # The method the children inherit or delegate to.
    def handsome(self):
        print("잘생겼다")

class brother(father):  # child class (parent class): inherits father's methods
    '''Son: defines nothing of its own, so father.handsome is used unchanged.'''

class sister(father):  # child class (parent class): inherits father's methods
    def pretty(self):
        print("예쁘다")
 
    # Overrides father.handsome, but delegates to the parent implementation
    # through super(), so the parent's message is printed again.
    def handsome(self):
        super().handsome()

In [117]:
# Bind the instance to its own name. `brother = brother()` would replace the
# class with an instance, so running this cell a second time raises
# TypeError ('brother' object is not callable).
boy = brother()
boy.handsome()  # inherited from father
 
girl = sister()
girl.handsome()  # sister.handsome forwards to father.handsome via super()
girl.pretty()

잘생겼다
잘생겼다
예쁘다


#### 응용

In [118]:
class mother():
    # Stores `who` so that pretty() can interpolate it into its message.
    def __init__(self, who):
        self.who = who
        
    def pretty(self):
        print("{}를 닮아 예쁘다".format(self.who))

class daughter(mother):
    # Takes one extra argument (`where`) on top of the parent's `who`.
    def __init__(self, who, where):
        # Let mother.__init__ set self.who instead of duplicating that code.
        super().__init__(who)
        self.where = where
        
    # Child-only method; prints the stored `where`.
    def part(self):
        print("{} 말이야".format(self.where))

In [119]:
girl = daughter('엄마', '얼굴')
girl.pretty()
girl.part()

엄마를 닮아 예쁘다
얼굴 말이야


In [120]:
class mother():
    # Stores `who` so that pretty() can interpolate it into its message.
    def __init__(self, who):
        self.who = who
        
    def pretty(self):
        print("{}를 닮아 예쁘다".format(self.who))

class daughter(mother):
    def __init__(self, who, where):
        # Let mother.__init__ set self.who instead of duplicating that code.
        super().__init__(who)
        self.where = where
        
    def part(self):
        print("{} 말이야".format(self.where))
        
    # Extends (rather than replaces) the parent method: first the parent's
    # message via super(), then the child's own part() message.
    def pretty(self):
        super().pretty()
        self.part()

In [121]:
girl = daughter('엄마', '얼굴')
girl.pretty()

엄마를 닮아 예쁘다
얼굴 말이야


### 다중 상속에서 super()

#### 여러 번의 상속으로 class A 2번 호출

In [122]:
# Diamond inheritance (D -> B, C -> A) where every class calls its parent
# initializer explicitly by class name.
class A:
    def __init__(self):
        print("Class A __init__()")

class B(A):
    def __init__(self):
        print("Class B __init__()")
        # Direct call by name: always runs A.__init__.
        A.__init__(self)

class C(A):
    def __init__(self):
        print("Class C __init__()")
        # Direct call by name: always runs A.__init__.
        A.__init__(self)

class D(B, C):
    def __init__(self):
        print("Class D __init__()")
        # B.__init__ and C.__init__ each call A.__init__ themselves,
        # so A.__init__ runs twice for a single D().
        B.__init__(self)
        C.__init__(self)

d = D()

Class D __init__()
Class B __init__()
Class A __init__()
Class C __init__()
Class A __init__()


#### super() 를 통해 최상단 클래스 한 번만 호출

In [123]:
# Same diamond, but every class delegates with the explicit two-argument
# super(Class, self) form instead of naming its parent.
class A:
    def __init__(self):
        print("Class A __init__()")

class B(A):
    def __init__(self):
        print("Class B __init__()")
        # For a D instance the next class after B is C (not A), as the
        # recorded output D, B, C, A shows.
        super(B, self).__init__()

class C(A):
    def __init__(self):
        print("Class C __init__()")
        super(C, self).__init__()

class D(B, C):
    def __init__(self):
        print("Class D __init__()")
        # One super() call walks the whole chain; A.__init__ runs only once.
        super(D, self).__init__()

d = D()

Class D __init__()
Class B __init__()
Class C __init__()
Class A __init__()


#### 생각해보기

In [124]:
# Exercise: B and C use super(), while D calls both parents by name.
class A:
    def __init__(self):
        print("Class A __init__()")

class B(A):
    def __init__(self):
        print("Class B __init__()")
        # self is a D instance here, so this continues to C and then A.
        super(B, self).__init__()

class C(A):
    def __init__(self):
        print("Class C __init__()")
        super(C, self).__init__()

class D(B, C):
    def __init__(self):
        # B.__init__ already runs the B -> C -> A chain through super() ...
        B.__init__(self)
        print("Class D __init__()")
        # ... so this explicit call runs C -> A a second time.
        C.__init__(self)

d = D()

Class B __init__()
Class C __init__()
Class A __init__()
Class D __init__()
Class C __init__()
Class A __init__()


In [125]:
# Same diamond using the zero-argument super() form.
class A:
    def __init__(self):
        print("Class A __init__()")

class B(A):
    def __init__(self):
        print("Class B __init__()")
        super().__init__()

class C(A):
    def __init__(self):
        print("Class C __init__()")
        super().__init__()

class D(B, C):
    def __init__(self):
        # The parents are initialized first, so D's own message is printed
        # last (after B, C, A).
        super().__init__()
        print("Class D __init__()")
        

d = D()

Class B __init__()
Class C __init__()
Class A __init__()
Class D __init__()


#### 실제 코딩에서 자주 쓰이는 상속 형태 (nn.Module)

In [126]:
# Single inheritance, style 1: call the parent initializer by class name.
class A:
    def __init__(self):
        print("Class A __init__()")

class B(A):
    def __init__(self):
        print("Class B __init__()")
        A.__init__(self)

class C(A):
    def __init__(self):
        print("Class C __init__()")
        A.__init__(self)
        
# B and C are independent children of A; each instantiation runs A.__init__ once.
b = B()
c = C()

Class B __init__()
Class A __init__()
Class C __init__()
Class A __init__()


In [127]:
# Single inheritance, style 2: explicit two-argument super(Class, self).
# The recorded output is identical to the call-by-name version.
class A:
    def __init__(self):
        print("Class A __init__()")

class B(A):
    def __init__(self):
        print("Class B __init__()")
        super(B, self).__init__()

class C(A):
    def __init__(self):
        print("Class C __init__()")
        super(C, self).__init__()

b = B()
c = C()

Class B __init__()
Class A __init__()
Class C __init__()
Class A __init__()


In [128]:
# Single inheritance, style 3: zero-argument super().
# The recorded output is identical to the two previous styles.
class A:
    def __init__(self):
        print("Class A __init__()")

class B(A):
    def __init__(self):
        print("Class B __init__()")
        super().__init__()

class C(A):
    def __init__(self):
        print("Class C __init__()")
        super().__init__()

b = B()
c = C()

Class B __init__()
Class A __init__()
Class C __init__()
Class A __init__()


## nn.Parameter(텐서 객체, requires_grad=True)

#### Tensor를 감싸 module의 attribute로 할당하면 그 module의 학습 가능한 parameter로 자동 등록됨 (parameters() 로 조회 가능)
#### requires_grad 는 True가 디폴트이므로 변화도(gradient) 추적 가능

In [129]:
param1 = nn.Parameter(torch.ones(5))
param1

Parameter containing:
tensor([1., 1., 1., 1., 1.], requires_grad=True)

In [130]:
param2 = nn.Parameter(torch.zeros(5))
param2

Parameter containing:
tensor([0., 0., 0., 0., 0.], requires_grad=True)

## tensor.mean(input, dim, keepdim=False)

#### 지정한 dim 방향으로 값들의 평균을 계산, method로 호출할 때 input은 자기 자신

In [131]:
x = torch.rand(2, 5)
x

tensor([[4.6759e-01, 7.1681e-01, 1.4085e-04, 9.5262e-01, 2.3398e-01],
        [3.6850e-01, 7.8970e-01, 8.4010e-01, 6.7154e-01, 4.8803e-01]])

In [132]:
x.size()

torch.Size([2, 5])

#### dim 은 평균을 내면서 없어지는 차원(axis)의 index를 의미

In [133]:
x1 = x.mean(0)
x1

tensor([0.4180, 0.7533, 0.4201, 0.8121, 0.3610])

In [134]:
x1.size()

torch.Size([5])

In [135]:
x2 = x.mean(1)
x2

tensor([0.4742, 0.6316])

In [136]:
x2.size()

torch.Size([2])

#### keepdim 이 True이면 원래 차원 규격을 유지

In [137]:
x3 = x.mean(-1, keepdim=True)
x3

tensor([[0.4742],
        [0.6316]])

In [138]:
x3.size()

torch.Size([2, 1])

In [139]:
x = torch.rand(2, 3, 4)
x

tensor([[[0.2836, 0.1740, 0.6098, 0.5093],
         [0.0673, 0.2343, 0.6839, 0.7901],
         [0.4197, 0.4017, 0.3716, 0.4157]],

        [[0.3331, 0.2045, 0.0594, 0.0957],
         [0.0530, 0.0109, 0.1657, 0.2136],
         [0.6505, 0.6651, 0.6175, 0.6870]]])

In [140]:
x4 = x.mean(-1, keepdim=True)
x4

tensor([[[0.3942],
         [0.4439],
         [0.4021]],

        [[0.1732],
         [0.1108],
         [0.6550]]])

In [141]:
x4.size()

torch.Size([2, 3, 1])

## torch.sqrt(input, out=None)

#### tensor 안의 각 요소들에 대해 루트를 적용한 텐서 객체 반환 (음수인 요소의 결과는 nan)

In [142]:
x = torch.randn(2, 3)
x

tensor([[-0.5563,  0.1403, -0.0220],
        [-1.1635,  0.7729,  0.2909]])

In [143]:
torch.sqrt(x)

tensor([[   nan, 0.3746,    nan],
        [   nan, 0.8792, 0.5394]])

## nn.Embedding(총 단어의 갯수, 임베딩할 벡터 차원)

#### 고정된 단어 사전에 임베딩할 weight를 저장할 lookup table

In [144]:
embed1 = nn.Embedding(10, 8)
embed1.weight

Parameter containing:
tensor([[ 2.1487, -2.0854, -0.0240, -1.4967, -0.9739, -1.0213,  1.0601, -1.1535],
        [-0.8450,  0.6175, -0.3833, -0.5997, -1.0670,  0.7624,  0.6398, -1.7637],
        [-0.6543,  0.9907,  0.0610,  2.3914, -0.0079,  0.1553,  2.2415, -0.0107],
        [ 0.9340,  1.7949,  0.4663, -2.3766, -0.7469,  0.5045,  1.6657, -0.9562],
        [-0.6904, -0.3072, -0.0780,  0.2416, -0.8489, -0.3077,  0.1233,  0.6495],
        [ 0.8699,  0.4275, -0.6509, -0.5794,  0.0172, -1.2987,  0.0784, -0.5232],
        [ 1.6806, -0.2645, -0.6750,  0.6117,  1.0130,  2.5118, -0.9972, -0.7037],
        [ 0.4704,  0.7933, -0.5795,  0.9972, -1.3991,  0.6818, -0.1558, -0.9399],
        [-1.4944, -0.8453,  1.7152,  0.0736,  0.6591, -0.7722, -0.2458, -1.9559],
        [ 1.0944, -0.6446, -0.3205, -0.2747,  0.3047, -0.0378,  0.5062,  0.4221]],
       requires_grad=True)

#### padding_idx 에 지정한 index의 embedding vector는 0으로 초기화되고, gradient가 항상 0이라 학습 중에도 갱신되지 않음

In [145]:
embed2 = nn.Embedding(10, 8, padding_idx=0)
embed2.weight

Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.1690,  0.1047,  2.1901,  1.3962,  0.2741, -0.5491, -0.2370, -0.2545],
        [ 0.7586, -0.5991, -0.4017, -1.6649, -0.7006, -0.2541, -0.1193, -1.4311],
        [-0.6885,  1.7901, -0.6420,  1.3866, -0.2242, -2.7974,  0.1684,  0.7262],
        [ 1.0062,  0.2971, -0.9541, -2.0617, -1.4385, -0.0613, -1.2690,  0.6394],
        [ 0.6941,  1.1420,  0.2144,  0.1156,  0.5988,  2.0249, -1.1006, -0.6832],
        [-0.3595, -0.3049, -0.0070, -0.2759,  0.6016, -0.4880, -0.3908,  0.5550],
        [ 0.5912,  0.6910, -0.6278, -0.4928,  1.5725,  1.3432,  0.8676, -0.0503],
        [-0.7057,  0.8965, -2.3492, -0.3322,  1.8872,  1.1978, -0.6464,  0.2613],
        [-1.1715,  0.1227, -2.4001,  0.7240, -1.6506,  2.4978,  1.5286, -0.4391]],
       requires_grad=True)

## torch.arange(start=0, end, step=1, dtype=None)

#### numpy의  arange 함수와 비슷한 기능
#### [start, end) 의 1-D tensor 객체 생성

In [146]:
x = torch.arange(5)
x

tensor([0, 1, 2, 3, 4])

In [147]:
x = torch.arange(2, 7)
x

tensor([2, 3, 4, 5, 6])

#### 차원 규격을 바로 하고 싶을 때 팁

In [148]:
x = torch.arange(12).view(3, 4)
x

tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]])

#### bert코드에서 device는 cpu 혹은 gpu 를 쓸 지 설정하는 인자임

## tensor.unsqueeze(input, dim, out=None)

#### 원하는 dimension의 위치에 dimension을 추가하는 기능

In [149]:
x = torch.randn(2, 3, 2)
x

tensor([[[ 0.0791,  2.5146],
         [ 1.2468,  0.5602],
         [-0.5160, -1.2563]],

        [[-0.0624, -0.4611],
         [ 0.1054, -0.8319],
         [ 0.0084, -0.7752]]])

In [150]:
x1 = x.unsqueeze(0)
x1

tensor([[[[ 0.0791,  2.5146],
          [ 1.2468,  0.5602],
          [-0.5160, -1.2563]],

         [[-0.0624, -0.4611],
          [ 0.1054, -0.8319],
          [ 0.0084, -0.7752]]]])

In [151]:
x1.size()

torch.Size([1, 2, 3, 2])

In [152]:
x2 = x.unsqueeze(3)
x2

tensor([[[[ 0.0791],
          [ 2.5146]],

         [[ 1.2468],
          [ 0.5602]],

         [[-0.5160],
          [-1.2563]]],


        [[[-0.0624],
          [-0.4611]],

         [[ 0.1054],
          [-0.8319]],

         [[ 0.0084],
          [-0.7752]]]])

In [153]:
x2.size()

torch.Size([2, 3, 2, 1])

In [154]:
x3 = x.unsqueeze(-1)
x3

tensor([[[[ 0.0791],
          [ 2.5146]],

         [[ 1.2468],
          [ 0.5602]],

         [[-0.5160],
          [-1.2563]]],


        [[[-0.0624],
          [-0.4611]],

         [[ 0.1054],
          [-0.8319]],

         [[ 0.0084],
          [-0.7752]]]])

In [155]:
x3.size()

torch.Size([2, 3, 2, 1])

## tensor.expand_as(other_tensor)

#### 현재 tensor를 other_tensor의  규격으로 맞춰주는 기능
#### 크기가 1인 차원(singleton)만 확장할 수 있고, 부족한 차원은 앞쪽에 새로 추가됨

#### 앞쪽에 차원만 추가되는 경우는 정상 동작 (크기가 1이 아닌 차원의 규격이 서로 다르면 아래 In [159], [160], [163] 처럼 에러 발생)

In [156]:
x4 = x.expand_as(x1)
x4

tensor([[[[ 0.0791,  2.5146],
          [ 1.2468,  0.5602],
          [-0.5160, -1.2563]],

         [[-0.0624, -0.4611],
          [ 0.1054, -0.8319],
          [ 0.0084, -0.7752]]]])

In [157]:
x = torch.rand(1, 3)
x

tensor([[0.8555, 0.6935, 0.0826]])

In [158]:
y = torch.rand(2, 5)
y

tensor([[0.4865, 0.2137, 0.4214, 0.2160, 0.6377],
        [0.5618, 0.2899, 0.3697, 0.8104, 0.7490]])

In [159]:
x.expand_as(y)

RuntimeError: The expanded size of the tensor (5) must match the existing size (3) at non-singleton dimension 1.  Target sizes: [2, 5].  Tensor sizes: [1, 3]

In [160]:
y.expand_as(x)

RuntimeError: The expanded size of the tensor (3) must match the existing size (5) at non-singleton dimension 1.  Target sizes: [1, 3].  Tensor sizes: [2, 5]

In [161]:
x = torch.rand(2)
x

tensor([0.0004, 0.0269])

In [162]:
y = torch.rand(4, 3)
y

tensor([[0.1054, 0.1722, 0.8643],
        [0.0146, 0.4907, 0.7806],
        [0.1931, 0.1039, 0.1156],
        [0.9902, 0.3669, 0.4768]])

In [163]:
x.expand_as(y)

RuntimeError: The expanded size of the tensor (3) must match the existing size (2) at non-singleton dimension 1.  Target sizes: [4, 3].  Tensor sizes: [2]

#### dimension 수가 달라도 앞쪽에 크기 1인 차원이 있는 것으로 간주하고 작동

In [164]:
x = torch.rand(3)
x

tensor([0.3268, 0.4271, 0.0936])

In [165]:
y = torch.rand(4, 3)
y

tensor([[0.1345, 0.2745, 0.2929],
        [0.7050, 0.5366, 0.8102],
        [0.6568, 0.8906, 0.4919],
        [0.5044, 0.7277, 0.5602]])

In [166]:
x.expand_as(y)

tensor([[0.3268, 0.4271, 0.0936],
        [0.3268, 0.4271, 0.0936],
        [0.3268, 0.4271, 0.0936],
        [0.3268, 0.4271, 0.0936]])

In [167]:
x.expand_as(y).size()

torch.Size([4, 3])

#### 보통의 경우 확장할 dim이 1이어야 확장이 가능

In [168]:
x = torch.rand(1, 4, 1)
y = torch.rand(3, 4, 5)
x

tensor([[[0.7446],
         [0.7955],
         [0.7044],
         [0.6244]]])

In [169]:
x.expand_as(y)

tensor([[[0.7446, 0.7446, 0.7446, 0.7446, 0.7446],
         [0.7955, 0.7955, 0.7955, 0.7955, 0.7955],
         [0.7044, 0.7044, 0.7044, 0.7044, 0.7044],
         [0.6244, 0.6244, 0.6244, 0.6244, 0.6244]],

        [[0.7446, 0.7446, 0.7446, 0.7446, 0.7446],
         [0.7955, 0.7955, 0.7955, 0.7955, 0.7955],
         [0.7044, 0.7044, 0.7044, 0.7044, 0.7044],
         [0.6244, 0.6244, 0.6244, 0.6244, 0.6244]],

        [[0.7446, 0.7446, 0.7446, 0.7446, 0.7446],
         [0.7955, 0.7955, 0.7955, 0.7955, 0.7955],
         [0.7044, 0.7044, 0.7044, 0.7044, 0.7044],
         [0.6244, 0.6244, 0.6244, 0.6244, 0.6244]]])

## torch.zeros_like(input)

#### input tensor의 규격과 같고 각 요소가 0인 텐서 객체를 반환

In [170]:
x = torch.rand(2, 3)
x

tensor([[0.6619, 0.8975, 0.3214],
        [0.6543, 0.2491, 0.5895]])

In [171]:
y = torch.zeros_like(x)
y

tensor([[0., 0., 0.],
        [0., 0., 0.]])

In [172]:
z = torch.ones_like(x)
z

tensor([[1., 1., 1.],
        [1., 1., 1.]])

## nn.Linear(input, output)

#### layer 사이의 matrix를 형성하는 기능
#### ex) input size: [32, 512, 768], output size: [32, 512, 768]
#### weight matrix size: [768, 768] (out_features, in_features) — input의 마지막 차원에만 적용되며 batch size(32)와는 무관

In [173]:
class example_linear(nn.Module):
    """Multi-head self-attention built from three ``nn.Linear`` projections.

    The size of every intermediate tensor is printed so the shape changes
    can be followed step by step. No dropout is applied.

    Args:
        hidden_size: Size of the last dimension of the input. Defaults to
            768, the value that was previously hard-coded.
        num_attention_heads: Number of attention heads. Defaults to 12.
            Must be positive and divide ``hidden_size`` evenly.

    Raises:
        ValueError: If ``hidden_size`` cannot be split evenly across
            ``num_attention_heads``.
    """

    def __init__(self, hidden_size=768, num_attention_heads=12):
        super(example_linear, self).__init__()
        if num_attention_heads <= 0 or hidden_size % num_attention_heads != 0:
            raise ValueError(
                "hidden_size ({}) must be a multiple of num_attention_heads ({})".format(
                    hidden_size, num_attention_heads))

        self.num_attention_heads = num_attention_heads
        self.attention_head_size = hidden_size // num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(hidden_size, self.all_head_size)
        self.key = nn.Linear(hidden_size, self.all_head_size)
        self.value = nn.Linear(hidden_size, self.all_head_size)

    def transpose_for_scores(self, x):
        """Split the last dimension into heads and move the head axis to position 1.

        The last dimension of ``x`` is viewed as
        ``(num_attention_heads, attention_head_size)`` and axes 1 and 2 are
        then swapped. The 4-index permute requires the viewed tensor to be
        4-D, i.e. ``x`` must be 3-D.
        """
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        print('transpose view shape: \t\t', x.size())
        x = x.permute(0, 2, 1, 3)
        print('transpose permute shape: \t', x.size())
        return x

    def forward(self, hidden_states, attention_mask=None):
        """Run scaled dot-product attention of ``hidden_states`` with itself.

        Args:
            hidden_states: 3-D tensor whose last dimension equals the
                ``hidden_size`` given to the constructor.
            attention_mask: Optional tensor added to the raw attention
                scores before the softmax; it must be broadcastable to the
                score tensor. ``None`` (the default) applies no mask.

        Returns:
            Tensor with the same first two dimensions as ``hidden_states``
            and ``all_head_size`` as its last dimension.
        """
        print('input shape: \t\t', hidden_states.size())
        mixed_query_layer = self.query(hidden_states)
        print('mixed_query_layer shape: \t', mixed_query_layer.size())
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        transposed_key_layer = key_layer.transpose(-1, -2)
        print('query_layer shape: \t', query_layer.size())
        print('key_layer transpose shape: \t', transposed_key_layer.size())

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, transposed_key_layer)
        print('attention_scores shpae: \t', attention_scores.size())
        # Scale by sqrt(head size) before the softmax.
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        # Apply the additive attention mask only when the caller supplies one.
        if attention_mask is not None:
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities. The functional
        # form avoids constructing a new nn.Softmax module on every call.
        attention_probs = F.softmax(attention_scores, dim=-1)
        print('attention_probs shape: \t', attention_probs.size())

        print('value_layer shape: \t', value_layer.size())
        context_layer = torch.matmul(attention_probs, value_layer)
        print('context_layer before permute shape: \t', context_layer.size())
        # permute() returns a non-contiguous view; view() below needs
        # contiguous memory, hence the contiguous() call.
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        print('context_layer after permute shape: \t', context_layer.size())
        # Merge the (heads, head_size) axes back into one hidden axis.
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        return context_layer

In [174]:
single = torch.randn(32, 512, 768)

In [175]:
att = example_linear()
out = att(single)

input shape: 		 torch.Size([32, 512, 768])
mixed_query_layer shape: 	 torch.Size([32, 512, 768])
transpose view shape: 		 torch.Size([32, 512, 12, 64])
transpose permute shape: 	 torch.Size([32, 12, 512, 64])
transpose view shape: 		 torch.Size([32, 512, 12, 64])
transpose permute shape: 	 torch.Size([32, 12, 512, 64])
transpose view shape: 		 torch.Size([32, 512, 12, 64])
transpose permute shape: 	 torch.Size([32, 12, 512, 64])
query_layer shape: 	 torch.Size([32, 12, 512, 64])
key_layer transpose shape: 	 torch.Size([32, 12, 64, 512])
attention_scores shpae: 	 torch.Size([32, 12, 512, 512])
attention_probs shape: 	 torch.Size([32, 12, 512, 512])
value_layer shape: 	 torch.Size([32, 12, 512, 64])
context_layer before permute shape: 	 torch.Size([32, 12, 512, 64])
context_layer after permute shape: 	 torch.Size([32, 512, 12, 64])


In [176]:
out.size()

torch.Size([32, 512, 768])

## tensor.permute(dimension)

#### view는 메모리상의 원소 순서를 그대로 둔 채 규격만 다시 나누고, permute는 축(dimension)의 순서 자체를 바꾼다는 점에서 다름

In [177]:
x = torch.arange(24).view(1, 2, 3, 4)
x

tensor([[[[ 0,  1,  2,  3],
          [ 4,  5,  6,  7],
          [ 8,  9, 10, 11]],

         [[12, 13, 14, 15],
          [16, 17, 18, 19],
          [20, 21, 22, 23]]]])

In [178]:
y = x.view(1, 3, 2 ,4)
y

tensor([[[[ 0,  1,  2,  3],
          [ 4,  5,  6,  7]],

         [[ 8,  9, 10, 11],
          [12, 13, 14, 15]],

         [[16, 17, 18, 19],
          [20, 21, 22, 23]]]])

In [179]:
z = x.permute(0, 2, 1, 3)
z

tensor([[[[ 0,  1,  2,  3],
          [12, 13, 14, 15]],

         [[ 4,  5,  6,  7],
          [16, 17, 18, 19]],

         [[ 8,  9, 10, 11],
          [20, 21, 22, 23]]]])

## tensor.contiguous()

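#### permute / transpose 를 거친 tensor는 메모리 배치가 규격과 어긋나(non-contiguous) 바로 view를 쓸 수 없음
#### contiguous()는 같은 값을 메모리에 연속적으로 다시 배치한 tensor를 반환 (이미 연속적이면 자기 자신을 반환)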

In [180]:
w = x.permute(0, 2, 1, 3).contiguous()
w

tensor([[[[ 0,  1,  2,  3],
          [12, 13, 14, 15]],

         [[ 4,  5,  6,  7],
          [16, 17, 18, 19]],

         [[ 8,  9, 10, 11],
          [20, 21, 22, 23]]]])

## nn.Softmax(dim=None)

#### 선택한 dimension에 softmax를 취한 텐서 객체 반환

In [181]:
x = torch.randn(4, 3)
x

tensor([[ 0.1389, -0.1301,  0.4512],
        [ 0.4560,  0.6331, -0.1089],
        [ 0.9279, -0.8859,  0.5154],
        [-1.1721,  1.4257, -2.1610]])

In [182]:
softmax = nn.Softmax()
softmax(x)

  


tensor([[0.3194, 0.2441, 0.4365],
        [0.3620, 0.4322, 0.2058],
        [0.5479, 0.0893, 0.3627],
        [0.0675, 0.9073, 0.0251]])

In [183]:
softmax = nn.Softmax(dim=0)
softmax(x)

tensor([[0.2064, 0.1197, 0.3689],
        [0.2835, 0.2568, 0.2107],
        [0.4544, 0.0562, 0.3934],
        [0.0556, 0.5673, 0.0271]])

In [184]:
softmax = nn.Softmax(dim=1)
softmax(x)

tensor([[0.3194, 0.2441, 0.4365],
        [0.3620, 0.4322, 0.2058],
        [0.5479, 0.0893, 0.3627],
        [0.0675, 0.9073, 0.0251]])

In [185]:
softmax = nn.Softmax(dim=-1)
softmax(x)

tensor([[0.3194, 0.2441, 0.4365],
        [0.3620, 0.4322, 0.2058],
        [0.5479, 0.0893, 0.3627],
        [0.0675, 0.9073, 0.0251]])

In [186]:
nn.Softmax(dim=-1)(x)

tensor([[0.3194, 0.2441, 0.4365],
        [0.3620, 0.4322, 0.2058],
        [0.5479, 0.0893, 0.3627],
        [0.0675, 0.9073, 0.0251]])

## nn.ModuleList(modules=None)

#### submodule 을 list에 담는 기능

In [187]:
class testModuleList(nn.Module):
    """Stack of ten ``nn.Linear(5, 5)`` layers held in an ``nn.ModuleList``."""

    def __init__(self):
        super(testModuleList, self).__init__()
        stacked = [nn.Linear(5, 5) for _ in range(10)]
        self.layers = nn.ModuleList(stacked)

    def forward(self, x):
        """Apply each layer followed by ReLU, printing the layer index as it runs."""
        for position, linear in enumerate(self.layers):
            projected = linear(x)
            x = F.relu(projected)
            print('{}'.format(position), end='\t')

        return x

In [188]:
x = torch.randn(3, 5)
x

tensor([[ 0.5448, -0.2920,  0.7299,  0.8775, -0.4369],
        [ 0.4264, -0.1957,  0.8240,  0.2185,  0.2184],
        [ 0.1578,  0.6535,  0.2172,  0.2155, -0.4410]])

In [189]:
net = testModuleList()
out = net(x)

0	1	2	3	4	5	6	7	8	9	

In [190]:
out

tensor([[0.2128, 0.1179, 0.0000, 0.0000, 0.3976],
        [0.2129, 0.1179, 0.0000, 0.0000, 0.3976],
        [0.2129, 0.1179, 0.0000, 0.0000, 0.3976]], grad_fn=<ReluBackward0>)

## isinstance(객체, 자료형)

#### 객체가 자료형과 맞는지 여부를 반환

In [191]:
isinstance(1, int)

True

In [192]:
isinstance('hi', str)

True

In [193]:
myList = []
isinstance(myList, list)

True

#### 클래스 객체 여부도 확인 가능

In [194]:
class myClass:
    """Empty class that only serves as a target for isinstance() checks."""

In [195]:
testClass = myClass()
isinstance(testClass, myClass)

True

## tensor.to(device=None, dtype=None)

#### tensor의 device혹은 data type을 바꾸고 싶을 때 사용

In [196]:
x = torch.rand(2, 3)
x

tensor([[0.2508, 0.7258, 0.2338],
        [0.1036, 0.5457, 0.9753]])

In [197]:
x.dtype

torch.float32

In [198]:
y = torch.ones(2, 3)
y

tensor([[1., 1., 1.],
        [1., 1., 1.]])

In [199]:
y.dtype

torch.float32

In [200]:
x = torch.rand(2, 3, dtype=torch.double)
x

tensor([[0.4087, 0.6687, 0.6007],
        [0.1189, 0.8963, 0.2887]], dtype=torch.float64)

In [201]:
y = y.to(dtype=x.dtype)
y.dtype

torch.float64

## iter & next method

#### iter() 로 iterator 객체 형태 반환
#### next() method로 객체의 처음부터 하나씩 반환

In [2]:
x = iter(['a', 'b', 'c'])
y = next(x)
print(y)
y = next(x)
print(y)
y = next(x)
print(y)

a
b
c


In [5]:
x = iter(['a', 'b', 'c'])
y = next(x)
print(y)
print(x)
y = next(x)
print(y)
print(x)

a
<list_iterator object at 0x000001CE4D34DCF8>
b
<list_iterator object at 0x000001CE4D34DCF8>


#### 마지막 요소를 지나면 에러(StopIteration) 발생

In [203]:
next(x)

StopIteration: 

## class NN.parameters()

#### 신경망 객체에 선언된 parameters 를 가져오는 method

In [204]:
class Net(nn.Module):
    """Small network: two 5x5 convolutions followed by three linear layers.

    ``fc1`` expects 16 * 5 * 5 = 400 flattened features, which is what the
    conv/pool stack produces for a 1-channel 32x32 input.
    """

    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution
        # kernel
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)

        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run the network on ``x``, printing intermediate sizes along the way.

        Returns:
            The output of ``fc3``, with 10 features per sample.
        """
        # nn.Module provides parameters(), not parameter(); the old call
        # raised AttributeError on every forward pass. parameters() returns
        # a generator, so print the size of each parameter instead.
        print([tuple(p.size()) for p in self.parameters()])

        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))

        # If the size is a square you can only specify a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        print('conv2 > max_pooling 후의 x shpae\t', x.size())
        # Flatten everything except the batch dimension for the linear layers.
        x = x.view(-1, self.num_flat_features(x))
        print('after reshape of x\t', x.size())
        x = F.relu(self.fc1(x))
        print('after linear function\t', x.size())
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        """Return the product of all dimensions of ``x`` except the first."""
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1

        for s in size:
            num_features *= s
        return num_features
    

In [205]:
net = Net()
print(net)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [206]:
# torch.autograd.Variable is deprecated: a tensor tracks gradients by itself
# when it is created with requires_grad=True, so no wrapper is needed.
# NOTE(review): the name `input` shadows the builtin; it is kept so that any
# later cell referring to it keeps working.
input = torch.randn(1, 1, 32, 32,
                    requires_grad=True)  # nSample, nChannel, Height, Width

In [207]:
for p in net.parameters():
    print(p.name, p.size())

None torch.Size([6, 1, 5, 5])
None torch.Size([6])
None torch.Size([16, 6, 5, 5])
None torch.Size([16])
None torch.Size([120, 400])
None torch.Size([120])
None torch.Size([84, 120])
None torch.Size([84])
None torch.Size([10, 84])
None torch.Size([10])


## class NN.named_parameters()

#### 신경망의 parameter를 (이름, 데이터 값) tuple 형태로 반환 

In [208]:
for n, p in list(net.named_parameters()):
    print(n)
    print(p)

conv1.weight
Parameter containing:
tensor([[[[-1.3551e-02,  7.1723e-02,  1.3095e-01, -1.7946e-01, -1.9144e-01],
          [-6.3771e-02,  5.0846e-02, -1.6471e-01, -1.5248e-01, -1.9750e-01],
          [ 1.7952e-01,  9.8860e-03, -4.5322e-02, -7.7297e-03,  5.0834e-02],
          [ 1.4284e-01, -6.8309e-02, -1.1847e-01,  1.3641e-01,  1.9590e-01],
          [ 1.7032e-01,  6.5213e-02,  8.5182e-02,  5.6357e-02,  1.7980e-02]]],


        [[[ 7.1066e-02, -1.3164e-01, -1.0162e-01,  1.3573e-01,  5.5445e-02],
          [-6.8366e-02, -1.5092e-01, -2.4752e-03, -3.5418e-03,  1.6175e-01],
          [ 4.9975e-02, -1.0374e-01,  1.4506e-01, -9.8533e-02,  1.1970e-01],
          [-1.6559e-01,  1.6889e-01, -5.4293e-02,  9.1411e-02, -1.4294e-01],
          [-1.3175e-01, -8.1450e-02, -1.3395e-02,  4.7471e-02,  1.0807e-01]]],


        [[[-8.5364e-02, -8.2596e-02,  5.4527e-02, -1.6190e-01,  1.3557e-01],
          [-1.2136e-04, -1.3412e-01, -1.2526e-01, -7.2968e-02,  1.0480e-01],
          [ 4.9474e-03,  1.9435e-

       requires_grad=True)
conv2.bias
Parameter containing:
tensor([ 0.0326, -0.0050, -0.0077, -0.0480, -0.0015, -0.0358,  0.0006,  0.0553,
        -0.0527,  0.0112, -0.0385,  0.0051, -0.0284,  0.0680, -0.0107, -0.0194],
       requires_grad=True)
fc1.weight
Parameter containing:
tensor([[ 0.0262,  0.0459,  0.0487,  ...,  0.0470, -0.0373,  0.0406],
        [ 0.0166,  0.0253,  0.0189,  ..., -0.0276, -0.0463,  0.0207],
        [-0.0243, -0.0230, -0.0048,  ..., -0.0162,  0.0360, -0.0359],
        ...,
        [-0.0294, -0.0273,  0.0160,  ...,  0.0118,  0.0343,  0.0084],
        [-0.0492,  0.0010, -0.0361,  ...,  0.0125, -0.0481,  0.0306],
        [-0.0320, -0.0362,  0.0367,  ...,  0.0168,  0.0160, -0.0276]],
       requires_grad=True)
fc1.bias
Parameter containing:
tensor([-1.1413e-03,  2.4427e-02,  3.8341e-02,  4.7606e-03,  4.5988e-02,
         1.7020e-02, -2.7459e-02, -2.4151e-02, -1.5629e-02, -2.2665e-02,
        -1.2155e-02, -8.9984e-04,  5.1179e-03, -4.7015e-02, -3.4684e-03,
        

         -0.0779, -0.0652,  0.0259,  0.0619]], requires_grad=True)
fc3.bias
Parameter containing:
tensor([ 0.0735,  0.0597, -0.0657, -0.0325, -0.1080,  0.0378,  0.0782,  0.0246,
        -0.0205, -0.0889], requires_grad=True)


In [209]:
list(net.named_parameters())

[('conv1.weight', Parameter containing:
  tensor([[[[-1.3551e-02,  7.1723e-02,  1.3095e-01, -1.7946e-01, -1.9144e-01],
            [-6.3771e-02,  5.0846e-02, -1.6471e-01, -1.5248e-01, -1.9750e-01],
            [ 1.7952e-01,  9.8860e-03, -4.5322e-02, -7.7297e-03,  5.0834e-02],
            [ 1.4284e-01, -6.8309e-02, -1.1847e-01,  1.3641e-01,  1.9590e-01],
            [ 1.7032e-01,  6.5213e-02,  8.5182e-02,  5.6357e-02,  1.7980e-02]]],
  
  
          [[[ 7.1066e-02, -1.3164e-01, -1.0162e-01,  1.3573e-01,  5.5445e-02],
            [-6.8366e-02, -1.5092e-01, -2.4752e-03, -3.5418e-03,  1.6175e-01],
            [ 4.9975e-02, -1.0374e-01,  1.4506e-01, -9.8533e-02,  1.1970e-01],
            [-1.6559e-01,  1.6889e-01, -5.4293e-02,  9.1411e-02, -1.4294e-01],
            [-1.3175e-01, -8.1450e-02, -1.3395e-02,  4.7471e-02,  1.0807e-01]]],
  
  
          [[[-8.5364e-02, -8.2596e-02,  5.4527e-02, -1.6190e-01,  1.3557e-01],
            [-1.2136e-04, -1.3412e-01, -1.2526e-01, -7.2968e-02,  1.0480e-0

## model.train() 과 model.eval()

#### model 혹은 하나의 신경망을 생성하면 train()과 eval() method가 내장되어 있음 (학습 / 평가 모드를 전환하고 자기 자신을 반환)

In [210]:
net.train()

Net(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)

In [211]:
net.eval()

Net(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)

#### predict() method는 없음

In [212]:
net.predict()

AttributeError: 'Net' object has no attribute 'predict'

In [213]:
net.pred()

AttributeError: 'Net' object has no attribute 'pred'

## self.apply(fn)

#### fn을 자기 자신과 모든 하위 module에 재귀적으로 적용하고 자기 자신을 반환 (fn은 Module 하나를 인자로 받는 함수)
#### 주로 model의 parameter 를 초기화할 때 이용한다

In [214]:
def init_weights(m):
    """Print module ``m`` and, if it is a linear layer, fill its weight with 1.0.

    Intended to be passed to ``nn.Module.apply``. Modules that are not
    linear layers are only printed. The bias is left untouched.

    Args:
        m: The module to inspect.
    """
    print(m)
    # isinstance (rather than an exact type comparison) also initializes
    # subclasses of nn.Linear.
    if isinstance(m, nn.Linear):
        # Modify the parameter in place under no_grad instead of going
        # through .data, so autograd does not record the fill.
        with torch.no_grad():
            m.weight.fill_(1.0)
        print(m.weight)

In [215]:
net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))
net.apply(init_weights)

Linear(in_features=2, out_features=2, bias=True)
Parameter containing:
tensor([[1., 1.],
        [1., 1.]], requires_grad=True)
Linear(in_features=2, out_features=2, bias=True)
Parameter containing:
tensor([[1., 1.],
        [1., 1.]], requires_grad=True)
Sequential(
  (0): Linear(in_features=2, out_features=2, bias=True)
  (1): Linear(in_features=2, out_features=2, bias=True)
)


Sequential(
  (0): Linear(in_features=2, out_features=2, bias=True)
  (1): Linear(in_features=2, out_features=2, bias=True)
)

## getattr(class 객체, '가져올 객체 이름')

#### 어떠한 신경망의 구성에서 가져오고 싶은 weight를 가져옴
#### 단, 단계적으로 접근해야함

#### 현재 net에는 2가지의 레이어가 존재 각자 이름은 주어지지 않아 0 과 1로 되어 있다

In [216]:
net

Sequential(
  (0): Linear(in_features=2, out_features=2, bias=True)
  (1): Linear(in_features=2, out_features=2, bias=True)
)

In [217]:
getattr(net, '0')

Linear(in_features=2, out_features=2, bias=True)

#### 하나의 레이어에는 weight와 bias가 기본적으로 내장

In [218]:
getattr(getattr(net, '0'), 'weight')

Parameter containing:
tensor([[1., 1.],
        [1., 1.]], requires_grad=True)

In [219]:
getattr(getattr(net, '0'), 'bias')

Parameter containing:
tensor([-0.1563, -0.1596], requires_grad=True)