以下、Bをバッチサイズ、Cをチャネル数、Wをウィンドウサイズ、Lをコンテキスト長、Kを予測長とする。

* Context: (B, C, W, L)
* Positive Sampleを1個（B, C, W, K)
* Negative Samples (B, C, W, K)
  * same seq, different loc
  * same user, different seq
  * different user, different seq

* 方針1：Samplerで頑張る
  * Pro：データセットは基本的に今のままでいい
  * Con:NegativeのSamplerもSamplerを複数作れば色んな種類を試しやすい
  * Con:Samplerの仕様をよく把握してない、最終的なコードがわかりにくくなる
* 方針2:データセット自体で頑張る
  * Pro:多分簡単
  * 

In [1]:
import torch
from torch import nn


class Flatten(nn.Module):
    def __init__(self):
        super(Flatten, self).__init__()

    def forward(self, x):
        x = x.view(x.size(0), -1)
        return x


class Encoder(nn.Module):
    """
    Correspond to g_enc in the CPC paper
    """
    def __init__(self, input_shape, hidden_size=400, activation='relu'):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.input_shape = input_shape
        linear_size = 20 * input_shape[1] * 2

        if activation == 'relu':
            activation = nn.ReLU
        elif activation == 'lrelu':
            activation = nn.LeakyReLU
        
        self.feature = nn.Sequential(
            nn.Conv2d(input_shape[0], 50, kernel_size=(1, 5)), 
            activation(),
            nn.MaxPool2d(kernel_size=(1, 2)), 
            nn.Conv2d(50, 40, kernel_size=(1, 5)), 
            activation(),
            nn.MaxPool2d(kernel_size=(1, 2)), 
            nn.Conv2d(40, 20, kernel_size=(1, 3)), 
            activation(),
            nn.Dropout(0.5),
            Flatten(),
            nn.Linear(linear_size, self.hidden_size), 
            activation(),
            nn.Dropout(0.5),
        )

    def forward(self, input_data):
        feature = self.feature(input_data)
        return feature

    def output_shape(self):
        return (None, self.hidden_size)
    
    
class ContextEncoder(nn.Module):
    """
    Some autoregressive models to emmbedding observations into a context vector. We use GRU here. 
    
    Caution: in the original paper, they say, "The output of the GRU at every timestep is used as the context c", 
    but this code only uses final output of the GRU. 
    """
    
    def __init__(self, input_shape, hidden_size=200, num_layers=2):
        super(ContextEncoder, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = 200
        self.gru = nn.GRU(input_shape[1], hidden_size=self.hidden_size, num_layers=self.num_layers)
        
    def forward(self, X):
        h0 = Variable(torch.zeros(self.num_layers, X.shape[1], self.hidden_size)).cuda()
        return self.gru(X, h0)
    

class Predictor(nn.Module):
    """
    Predict the k step forward future using a context vector c, and k dependent weight matrix. 
    """
    def __init__(self, input_shape, hidden_size, max_steps):
        super(Predictor, self).__init__()
        self.max_steps = max_steps
        self.linears = nn.ModuleList([nn.Linear(input_shape[1], hidden_size) for i in range(max_steps)])
        
    def forward(self, c, k):
        """
        predict the k step forward future from the context vector c
        
        Parameter
        ---------
        c : torch.Variable
            context vector
        k : int
            the number of forward steps, which is used to determine the weight matrix 
        """
        
        return self.linears[k](c)
    


In [2]:
# prepare datasets

from datasets import OppG
import torch.utils.data as data
K = 3 # maximum prediction steps (sequence length of future sequences)
L = 12  # context size

dataset = OppG('S1', 'Gestures', l_sample=30, interval=15, T=K+L)
loader = data.DataLoader(dataset, batch_size=128, shuffle=True)
neg_dataset = OppG('S2', 'Gestures', l_sample=30, interval=15, T=K)  # marginal sample come from a different user for simplicity
neg_loader = data.DataLoader(neg_dataset, batch_size=128, shuffle=True)

In [3]:
# initialize network

g_enc = Encoder(input_shape=dataset.get('input_shape'), hidden_size=100).cuda()
c_enc = ContextEncoder(input_shape=g_enc.output_shape(), num_layers=2, hidden_size=50).cuda()
predictor = Predictor((None, c_enc.hidden_size), g_enc.output_shape()[1], max_steps=K).cuda()

print(g_enc)
print(c_enc)
print(predictor)

Encoder(
  (feature): Sequential(
    (0): Conv2d(1, 50, kernel_size=(1, 5), stride=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=(1, 2), stride=(1, 2), padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(50, 40, kernel_size=(1, 5), stride=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=(1, 2), stride=(1, 2), padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(40, 20, kernel_size=(1, 3), stride=(1, 1))
    (7): ReLU()
    (8): Dropout(p=0.5)
    (9): Flatten()
    (10): Linear(in_features=4520, out_features=100, bias=True)
    (11): ReLU()
    (12): Dropout(p=0.5)
  )
)
ContextEncoder(
  (gru): GRU(100, 200, num_layers=2)
)
Predictor(
  (linears): ModuleList(
    (0): Linear(in_features=200, out_features=100, bias=True)
    (1): Linear(in_features=200, out_features=100, bias=True)
    (2): Linear(in_features=200, out_features=100, bias=True)
  )
)


In [4]:
def get_context(X, g_enc, c_enc):
    z_context = []
    nb_context = X.shape[-1]

    h = Variable(torch.zeros(X.shape[0], c_enc.hidden_size).cuda(), requires_grad=False)
    for i in range(nb_context):
        z_context.append(g_enc(X[..., i]))

    o, h = c_enc(torch.stack(z_context))
    c = h[-1]
    return c

In [None]:
from torch.autograd import Variable
from torch import optim
optimizer = optim.Adam(list(g_enc.parameters()) + list(c_enc.parameters()) + list(predictor.parameters()), lr=0.0001)

criterion = nn.BCELoss()


for num_iter in range(1000):
    X, _ = loader.__iter__().__next__()
    X_m, _ = neg_loader.__iter__().__next__()

    optimizer.zero_grad()
    X = Variable(X.float()).cuda()
    X_m = Variable(X_m.float()).cuda()
    c = get_context(X[..., :L], g_enc, c_enc)
    loss = 0
    for i in range(K):
        z_j = g_enc(X[..., L+i]) 
        z_m = g_enc(X_m[..., i])
        z_p = predictor(c, i)
        score_j = torch.sigmoid(torch.bmm(z_j.unsqueeze(1), z_p.unsqueeze(2)).squeeze(2))
        score_m = torch.sigmoid(torch.bmm(z_m.unsqueeze(1), z_p.unsqueeze(2)).squeeze(2))
        loss += criterion(score_j, Variable(torch.ones((len(score_j), 1))).cuda()) + criterion(score_m, Variable(torch.zeros((len(score_j), 1))).cuda())
        # loss += criterion(score_m, Variable(torch.zeros((len(score_j), 1))).cuda())
    loss = loss / (2*K)
    loss.backward()
    optimizer.step()
    
    if (num_iter+1) % 10 == 0:
        print(num_iter+1, loss.item())

10 0.6900404691696167
20 0.6866204142570496
30 0.6744613647460938
40 0.6368961334228516
50 0.5539376735687256
60 0.4298185110092163
70 0.3314841091632843
80 0.2630160450935364
90 0.22764258086681366
100 0.16308638453483582
110 0.1586257666349411
120 0.14230069518089294
130 0.15530186891555786
140 0.09757449477910995
150 0.0972842201590538
160 0.08111138641834259
170 0.06529131531715393
180 0.059362005442380905


In [8]:
criterion(score_j, Variable(torch.ones((len(score_j), 1))).cuda()), criterion(score_m, Variable(torch.zeros((len(score_j), 1))).cuda())

(Variable containing:
 1.00000e-03 *
   1.3218
 [torch.cuda.FloatTensor of size 1 (GPU 0)], Variable containing:
 1.00000e-03 *
   3.8247
 [torch.cuda.FloatTensor of size 1 (GPU 0)])