以下、Bをバッチサイズ、Cをチャネル数、Wをウィンドウサイズ、Lをコンテキスト長、Tを予測長とする。

* Context: (B, C, W, L, 1)
* Positive Sampleを1個（B, C, W, T, 1)
* Negative Samples (B, C, W, T)
  * same seq, different loc
  * same user, different seq
  * different user, different seq

* 方針1：Samplerで頑張る
  * Pro：データセットは基本的に今のままでいい
  * Con:NegativeのSamplerもSamplerを複数作れば色んな種類を試しやすい
  * Con:Samplerの仕様をよく把握してない、最終的なコードがわかりにくくなる
* 方針2:データセット自体で頑張る
  * Pro:多分簡単
  * 

In [2]:
import torch
from torch import nn


class Flatten(nn.Module):
    def __init__(self):
        super(Flatten, self).__init__()

    def forward(self, x):
        x = x.view(x.size(0), -1)
        return x


class Encoder(nn.Module):
    def __init__(self, input_shape, hidden_size=400, activation='relu'):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.input_shape = input_shape
        linear_size = 20 * input_shape[1] * 2

        if activation == 'relu':
            activation = nn.ReLU
        elif activation == 'lrelu':
            activation = nn.LeakyReLU
        
        self.feature = nn.Sequential(
            nn.Conv2d(input_shape[0], 50, kernel_size=(1, 5)), 
            activation(),
            nn.MaxPool2d(kernel_size=(1, 2)), 
            nn.Conv2d(50, 40, kernel_size=(1, 5)), 
            activation(),
            nn.MaxPool2d(kernel_size=(1, 2)), 
            nn.Conv2d(40, 20, kernel_size=(1, 3)), 
            activation(),
            nn.Dropout(0.5),
            Flatten(),
            nn.Linear(linear_size, self.hidden_size), 
            activation(),
            nn.Dropout(0.5),
        )

    def forward(self, input_data):
        feature = self.feature(input_data)
        return feature

    def output_shape(self):
        return (None, self.hidden_size)
    
    
class ContextEncoder(nn.Module):
    def __init__(self, input_shape, hidden_size=200, num_layers=2):
        super(ContextEncoder, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = 200
        self.gru = nn.GRU(input_shape[1], hidden_size=self.hidden_size, num_layers=self.num_layers)
        
    def forward(self, X):
        h0 = Variable(torch.zeros(self.num_layers, X.shape[1], self.hidden_size)).cuda()
        return self.gru(X, h0)
    

        
class Predictor(nn.Module):
    def __init__(self, input_shape, hidden_size, max_steps):
        super(Predictor, self).__init__()
        self.max_steps = max_steps
        self.linears = nn.ModuleList([nn.Linear(input_shape[1], hidden_size) for i in range(max_steps)])
        
    def forward(self, c, k):
        """
        predict the k step forward future from the context vector c
        
        Parameter
        ---------
        c : torch.Variable
            context vector
        k : int
            the number of forward steps, which is used to determine the weight matrix 
        """
        
        return self.linears[k](c)
    
    
def predictor_init(m):
    if isinstance(m, nn.Linear):
        m.weight.data.fill_(0.00) # copy_(torch.zeros_like(m.weight.data))
        m.bias.data.fill_(0.00)
    
# predictor.apply(predictor_init)
"""
class ContextEncoderCell(nn.Module):
    def __init__(self, input_shape, hidden_size=200):
        super(ContextEncoderCell, self).__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRUCell(input_shape[1], hidden_size=self.hidden_size)
        
    def forward(self, X, h):
        return self.gru(X, h) 
"""

'\nclass ContextEncoderCell(nn.Module):\n    def __init__(self, input_shape, hidden_size=200):\n        super(ContextEncoderCell, self).__init__()\n        self.hidden_size = hidden_size\n        self.gru = nn.GRUCell(input_shape[1], hidden_size=self.hidden_size)\n        \n    def forward(self, X, h):\n        return self.gru(X, h) \n'

In [3]:
from datasets import _SingleUserSingleADL, OppG
import torch.utils.data as data
T = 3 # maximum prediction steps (sequence length of future sequences)
L = 12  # context size

dataset = OppG('S1', 'Gestures', l_sample=30, interval=15, T=T+L)
loader = data.DataLoader(dataset, batch_size=128, shuffle=True)
neg_dataset = OppG('S2', 'Gestures', l_sample=30, interval=15, T=T)  # marginal sample come from a different user for simplicity
neg_loader = data.DataLoader(neg_dataset, batch_size=128, shuffle=True)

In [4]:
g_enc = Encoder(input_shape=dataset.get('input_shape'), hidden_size=100).cuda()
c_enc = ContextEncoder(input_shape=g_enc.output_shape(), num_layers=2, hidden_size=50).cuda()
predictor = Predictor((None, c_enc.hidden_size), g_enc.output_shape()[1], max_steps=T).cuda()
# predictor.apply(predictor_init)

print(g_enc)
print(c_enc)
print(predictor)

Encoder(
  (feature): Sequential(
    (0): Conv2d(1, 50, kernel_size=(1, 5), stride=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=(1, 2), stride=(1, 2), dilation=(1, 1), ceil_mode=False)
    (3): Conv2d(50, 40, kernel_size=(1, 5), stride=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=(1, 2), stride=(1, 2), dilation=(1, 1), ceil_mode=False)
    (6): Conv2d(40, 20, kernel_size=(1, 3), stride=(1, 1))
    (7): ReLU()
    (8): Dropout(p=0.5)
    (9): Flatten(
    )
    (10): Linear(in_features=4520, out_features=100, bias=True)
    (11): ReLU()
    (12): Dropout(p=0.5)
  )
)
ContextEncoder(
  (gru): GRU(100, 200, num_layers=2)
)
Predictor(
  (linears): ModuleList(
    (0): Linear(in_features=200, out_features=100, bias=True)
    (1): Linear(in_features=200, out_features=100, bias=True)
    (2): Linear(in_features=200, out_features=100, bias=True)
  )
)


In [5]:
def get_context(X, g_enc, c_enc):
    z_context = []
    nb_context = X.shape[-1]

    h = Variable(torch.zeros(X.shape[0], c_enc.hidden_size).cuda(), requires_grad=False)
    for i in range(nb_context):
        z_context.append(g_enc(X[..., i]))

    o, h = c_enc(torch.stack(z_context))
    c = h[-1]
    return c

In [7]:
from torch.autograd import Variable
from torch import optim
optimizer = optim.Adam(list(g_enc.parameters()) + list(c_enc.parameters()) + list(predictor.parameters()), lr=0.0001)

criterion = nn.BCELoss()


for num_iter in range(10000):
    X, _ = loader.__iter__().__next__()
    X_m, _ = neg_loader.__iter__().__next__()

    optimizer.zero_grad()
    X = Variable(X.float()).cuda()
    X_m = Variable(X_m.float()).cuda()
    c = get_context(X[..., :L], g_enc, c_enc)
    loss = 0
    for i in range(T):
        z_j = g_enc(X[..., L+i]) 
        z_m = g_enc(X_m[..., i])
        z_p = predictor(c, i)
        score_j = torch.sigmoid(torch.bmm(z_j.unsqueeze(1), z_p.unsqueeze(2)).squeeze(2))
        score_m = torch.sigmoid(torch.bmm(z_m.unsqueeze(1), z_p.unsqueeze(2)).squeeze(2))
        loss += criterion(score_j, Variable(torch.ones((len(score_j), 1))).cuda()) + criterion(score_m, Variable(torch.zeros((len(score_j), 1))).cuda())
        # loss += criterion(score_m, Variable(torch.zeros((len(score_j), 1))).cuda())
    loss = loss / (2*T) 
    loss.backward()
    optimizer.step()
    
    if (num_iter+1) % 10 == 0:
        print(num_iter+1, loss.data[0])

(10, 0.6918329000473022)
(20, 0.6870383620262146)
(30, 0.6842116117477417)
(40, 0.6404867172241211)
(50, 0.566656231880188)
(60, 0.48404228687286377)
(70, 0.4226868152618408)
(80, 0.3439512252807617)
(90, 0.3530319333076477)
(100, 0.2860561013221741)
(110, 0.2278597056865692)
(120, 0.19529810547828674)
(130, 0.17787134647369385)
(140, 0.11570079624652863)
(150, 0.1130598783493042)
(160, 0.1038263663649559)
(170, 0.12228446453809738)
(180, 0.11137580871582031)
(190, 0.08600395917892456)
(200, 0.07485218346118927)
(210, 0.04993542283773422)
(220, 0.0547170452773571)
(230, 0.054931506514549255)
(240, 0.053934745490550995)
(250, 0.029630355536937714)
(260, 0.03375762328505516)
(270, 0.028681498020887375)
(280, 0.050952088087797165)
(290, 0.030947193503379822)
(300, 0.02747584506869316)
(310, 0.01894466020166874)
(320, 0.03502185270190239)
(330, 0.04334148019552231)
(340, 0.01848229393362999)
(350, 0.02440539561212063)
(360, 0.013370182365179062)
(370, 0.019521210342645645)
(380, 0.02374095

KeyboardInterrupt: 

In [8]:
criterion(score_j, Variable(torch.ones((len(score_j), 1))).cuda()), criterion(score_m, Variable(torch.zeros((len(score_j), 1))).cuda())

(Variable containing:
 1.00000e-03 *
   1.3218
 [torch.cuda.FloatTensor of size 1 (GPU 0)], Variable containing:
 1.00000e-03 *
   3.8247
 [torch.cuda.FloatTensor of size 1 (GPU 0)])

Variable containing:
-0.5376
 0.0000
-0.2556
 0.2598
 0.0334
 0.3356
 0.1516
 0.3498
 0.1674
-0.2075
-0.5773
-0.4136
-0.4082
-0.2005
-0.2326
 0.1670
 0.4051
-0.3416
 0.1434
 0.0000
-0.3829
 0.2251
 0.2381
-0.3092
-0.3388
 0.5055
 0.3117
-0.3075
 0.1922
 0.2009
 0.3714
-0.2663
-0.3674
-0.4595
 0.2275
 0.1806
 0.5348
-0.2914
 0.3089
 0.4450
-0.3086
-0.3533
-0.3393
 0.1842
-0.3149
-0.3159
 0.3443
 0.1646
-0.4574
 0.2962
 0.3052
 0.2909
-0.4097
-0.1889
 0.1828
 0.3752
-0.3989
-0.1065
 0.4027
 0.4138
-0.3753
 0.4191
-0.0154
-0.1927
 0.5721
 0.1792
-0.1974
-0.3906
-0.4063
-0.9203
 0.4499
-0.4651
-0.4710
 0.1755
 0.0000
 0.2089
 0.3390
-0.3094
-0.4811
 0.1548
 0.3919
 0.4328
-0.5044
 0.2818
-0.3609
-0.4347
-0.1564
-0.0942
 0.2925
-0.3358
-0.3748
 0.2301
-0.4198
-0.4209
-0.3177
-0.3782
-0.0321
 0.3876
-0.2524
 0.3748
[torch.cuda.FloatTensor of size 100 (GPU 0)]

In [78]:
criterion = nn.BCEWithLogitsLoss()

for i in range(T):
    z_j = g_enc(X[..., L+i]) 
    z_m = g_enc(X[..., -i])  # Replace here
    z_p = predictor(c, i)
    score_j = torch.sigmoid(torch.bmm(z_j.unsqueeze(1), z_p.unsqueeze(2)).squeeze(1))
    score_m = torch.sigmoid(torch.bmm(z_m.unsqueeze(1), z_m.unsqueeze(2)).squeeze(1))
    loss = criterion(score_j, Variable(torch.ones((len(score_j), 1))).cuda()) + criterion(score_m, Variable(torch.zeros((len(score_j), 1))).cuda())
loss.backward()

In [77]:
loss

Variable containing:
 1.6947
[torch.cuda.FloatTensor of size 1 (GPU 0)]

(torch.Size([128, 1, 400]), torch.Size([128, 400, 1]))

In [26]:
predictor(c).shape

torch.Size([128, 400])