# Maximum Likelihood Density Ratio Estimation for MI

paper: http://proceedings.mlr.press/v4/suzuki08a/suzuki08a.pdf

In [5]:
import os 
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import sys
import numpy as np
import pycuda.driver as cuda
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd
from models import DNN, CNN

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
use_cuda = torch.cuda.is_available()
print("Setting Info")
print("=========")
print("- use_cuda: ", use_cuda)
print("- Path: ", os.getcwd())
print("- PyTorch", torch.__version__)
print("- Python: ", sys.version)

Setting Info
- use_cuda:  True
- Path:  /home/uchiumi/JNNS2019/mnist_pytorch
- PyTorch 1.0.1.post2
- Python:  3.5.2 (default, Nov 12 2018, 13:43:14) 
[GCC 5.4.0 20160609]


## Load Model

In [7]:
# model reload
model = DNN()
PRETRAINED_MODEL_PATH = "/home/uchiumi/JNNS2019/mnist_pytorch/train_log/dnn_mnist__2019-0425-1923.pth"
model.load_state_dict(torch.load(PRETRAINED_MODEL_PATH))

## Data Loader

cf.) https://www.aiworkbox.com/lessons/examine-mnist-dataset-from-pytorch-torchvision

In [8]:
mnist_trainset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=None)
mnist_testset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=None)

In [9]:
X_train_0 = np.asarray(mnist_trainset[0][0]) # image
y_train_0 = mnist_trainset[0][1] # label

In [10]:
# train
train_loader_for_MINE = torch.utils.data.DataLoader(
  torchvision.datasets.MNIST('./data', train=True, download=True,
                             transform=torchvision.transforms.Compose([
                               torchvision.transforms.ToTensor(),
                               torchvision.transforms.Normalize(
                                 (0.1307,), (0.3081,))
                             ])),
  batch_size=1, shuffle=False)

In [11]:
# test
test_loader_for_MINE = torch.utils.data.DataLoader(
  torchvision.datasets.MNIST('./data', train=False, download=True,
                             transform=torchvision.transforms.Compose([
                               torchvision.transforms.ToTensor(),
                               torchvision.transforms.Normalize(
                                 (0.1307,), (0.3081,))
                             ])),
  batch_size=1, shuffle=False)

## Get layer values (the state of each nodes)

In [12]:
def get_nodes_with_train_data(model):
    model.eval()
    list = []
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(train_loader_for_MINE):
            result = model(data)
            list.append(result)
    return list

In [13]:
def get_nodes_with_test_data(model):
    model.eval()
    list = []
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(test_loader_for_MINE):            
            result = model(data)
            list.append(result)
    return list

In [14]:
list = get_nodes_with_train_data(model)

In [15]:
len(list)

60000

In [16]:
list[0].keys()

dict_keys(['model_output', 'model_input', 'fc1_output', 'fc3_output', 'fc4_output', 'fc2_output'])

In [17]:
print("model_input", list[0]["model_input"].shape)
print("fc1_output",list[0]["fc1_output"].shape)
print("fc2_output",list[0]["fc2_output"].shape)
print("fc3_output",list[0]["fc3_output"].shape)
print("fc4_output",list[0]["fc4_output"].shape)
print("model_output",list[0]["model_output"].shape)

model_input torch.Size([1, 784])
fc1_output torch.Size([1, 1024])
fc2_output torch.Size([1, 512])
fc3_output torch.Size([1, 256])
fc4_output torch.Size([1, 128])
model_output torch.Size([1, 10])


In [18]:
model_input = []
fc1_output = []
fc2_output = []
fc3_output = []
fc4_output = []
model_output = []

for i in range(len(train_loader_for_MINE)):
    model_input.append(list[i]["model_input"].data.numpy().flatten())
    fc1_output.append(list[i]["fc1_output"].data.numpy().flatten())
    fc2_output.append(list[i]["fc2_output"].data.numpy().flatten())
    fc3_output.append(list[i]["fc3_output"].data.numpy().flatten())
    fc4_output.append(list[i]["fc4_output"].data.numpy().flatten())
    model_output.append(list[i]["model_output"].data.numpy().flatten())

In [19]:
model_input = np.array(model_input)
fc1_output = np.array(fc1_output)
fc2_output = np.array(fc1_output)
fc3_output = np.array(fc1_output)
fc4_output = np.array(fc1_output)
model_output = np.array(model_output)

## Maximum Likelihood Density Ratio Estimation for MI

In [20]:
x = model_input
y = fc3_output
z = np.concatenate([x, y], axis=1)

In [21]:
print(x.shape)
print(y.shape)
print(z.shape)

(60000, 784)
(60000, 1024)
(60000, 1808)


#### Kernel function

In [22]:
class RBFkernel():
    def __init__(self, sigma=0.5):
        self.sigma = sigma
        
    def __call__(self, x, y):
        numerator = -1 * np.sum((x - y)**2)
        denominator = 2 * (self.sigma**2)
        return np.exp(numerator / denominator)
    
    def get_params(self):
        return self.sigma
    
    def set_params(self, sigma):
        self.sigma = sigma

$$
    \underset{\alpha \in \mathbb{R}^{b}}{\rm maximize} \, \sum_{i=1}^{n} \log \left( {\alpha}^{\mathrm{T}} \phi(x_i, y_i) \right) \\
    s.t. \, \frac{1}{n(n-1)} \sum_{i \neq j}^{n(n - 1)} {\alpha}^{\mathrm{T}} \phi(x_i, y_j) = 1, \, \alpha \geq 0
$$

つまり．

$$
    \underset{\alpha \in \mathbb{R}^{b}}{\rm maximize} L(\alpha) \\
    L(\alpha) = \sum_{i=1}^{n} \log \left( {\alpha}^{\mathrm{T}} \phi(x_i, y_i) \right) 
    - \lambda \left( \frac{1}{n(n-1)} \sum_{i \neq j}^{n(n - 1)} {\alpha}^{\mathrm{T}} \phi(x_i, y_j) - 1 \right)
$$

$$
    \alpha \in \mathbb{R}^{b}, \, \phi() \in \mathbb{R}^{b}
$$

$$
    \phi(z_i) = \left( \begin{array}{c} \phi_1(z_i) \\ \phi_2(z_i) \\ \vdots \\ \phi_b(z_i) \end{array} \right)
    = \left( \begin{array}{c} k(z_i, c_1) \\ k(z_i, c_2) \\ \vdots \\ k(z_i, c_b) \end{array} \right)
    = \left( \begin{array}{c} \exp\left( - \frac{{|| z_i - c_1||}^{2}}{2 \sigma^2} \right) \\ \exp\left( - \frac{{|| z_i - c_2||}^{2}}{2 \sigma^2} \right) \\ \vdots \\ \exp\left( - \frac{{|| z_i - c_b||}^{2}}{2 \sigma^2} \right) \end{array} \right)
$$


$$
    \frac{\partial L(\alpha)}{\partial \alpha} = 
    \left( \begin{array}{c} 
        \sum_{i=1}^{n} \frac{\phi_{1}(z_i)}{{\alpha}^{\mathrm{T}}\phi(z_i)} \\
        \sum_{i=1}^{n} \frac{\phi_{2}(z_i)}{{\alpha}^{\mathrm{T}}\phi(z_i)} \\
        \vdots \\ 
        \sum_{i=1}^{n} \frac{\phi_{b}(z_i)}{{\alpha}^{\mathrm{T}}\phi(z_i)} \\
     \end{array} \right) 
     + \frac{\lambda}{n(n-1)}
     \left( \begin{array}{c} 
         \sum_{i \neq j}^{n(n-1)} \phi_{1}(x_i, y_j) \\
         \sum_{i \neq j}^{n(n-1)} \phi_{2}(x_i, y_j) \\
        \vdots \\ 
        \sum_{i \neq j}^{n(n-1)} \phi_{b}(x_i, y_j) 
     \end{array} \right)
$$

#### Density Ratio Estimation

In [41]:
class DensityRatioEstimation():
    def __init__(self, kernel, b):
        self.kernel = kernel # basis function
        self.b = b # dim of the basis function
        self.alpha = np.ones(self.b)
        self.n = x.shape[0] # sample size
    
    def fit(self, x, y, lr=0.01, nb_epoch=10000, log_freq=100):
        self.x = x
        self.y = y 
        self.z = np.concatenate([x, y], axis=1)
        
        c_index = np.random.randint(0, len(self.z), self.b) # random choise from z
        c = self.z[c_index] # center for the basis function
        
        for epoch in range(nb_epoch):
            loss, grad_loss = self.loss(self.alpha, self.c)
            self.alpha = self.alpha + lr * grad_loss
            
            if epoch % log_freq == 0:
                print("epoch: {} \t loss of sample data: {:.4f}".format(epoch, self.loss))
        
    def loss(self, alpha, c):
        loss = 0.0
        loss_grad = np.zeros(self.b)
        inner_product = np.zeros(self.n)
        
        # loss
        for i in range(self.n):
            basis_fn = np.zeros(self.b)
            for bi in range(self.b):
                basis_fn[bi] = self.kernel(self.z[i], c[bi])
            
            inner_product[i] = np.dot(alpha, basis_fn)
            loss += np.log(inner_product[i])
        loss = -1 * loss
            
        # loss_grad
        for bi in range(self.b):
            for i in range(self.n):
                loss_grad[bi] += self.kernel(self.z[i], c[bi]) / inner_product[i]
        grad_loss = -1 * grad_loss
        
        return loss, grad_loss

In [35]:
print(x.shape)
print(y.shape)
print(z.shape)

(60000, 784)
(60000, 1024)
(60000, 1808)


In [36]:
len(z)

60000

In [38]:
c_index = np.random.randint(1, len(z), 20) # random choise from z
c = z[c_index]

In [40]:
c.shape

(20, 1808)

In [37]:
np.random.randint(1, len(z), self.b)

NameError: name 'self' is not defined

In [None]:
class KernelRegression():
    def __init__(self, kernel):
        self.kernel = kernel
        
    def fit_kernel(self, X, y, lr=0.01, nb_epoch=1000, log_freq=50):
        self.X = X
        self.y = y
        self.n = X.shape[0] # sample size
        self.alpha = np.full(self.n, 1) # param alpha: initialize
        self.gram_matrix = np.zeros((self.n, self.n))
        
        # Gradient Descent Algorithm to optimize alpha
        for epoch in range(nb_epoch):
            
            # Gram Matrix
            for i in range(self.n):
                for j in range(self.n):
                    self.gram_matrix[i][j] = self.kernel(self.X[i], self.X[j])
                    self.loss, self.loss_grad = self.mse(self.X, self.y, self.alpha, self.gram_matrix)
                    self.alpha = self.alpha - lr * self.loss_grad
                    
            if epoch % log_freq == 0:
                print("epoch: {} \t MSE of sample data: {:.4f}".format(epoch, self.loss))
                        
                        
    def mse(self, X, y, alpha, gram_matrix):
        loss = np.dot((y - np.dot(gram_matrix, alpha)), (y - np.dot(gram_matrix, alpha)))
        loss_grad = -2 * np.dot(gram_matrix.T, (y - np.dot(gram_matrix, alpha)))
        return loss, loss_grad
    
    def predict(self, X_new):
        n_new = X_new.shape[0]
        y_new = np.zeros(n_new)
        for i in range(n_new):
            for j in range(self.n):
                y_new[i] += self.alpha[j] * self.kernel(X_new[i], self.X[j])
        return y_new