In [1]:
import os
import torch
import torch.nn as nn
from torch.autograd import Variable
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torch.nn.functional as F
import torch.optim as optim
from sklearn.cluster import KMeans
## load mnist dataset
use_cuda = torch.cuda.is_available()
root = './data'
if not os.path.exists(root):
    os.mkdir(root)
#trans = transforms.Compose([transforms.Resize(32), transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))])
trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))])
# if not exist, download mnist dataset
train_set = dset.MNIST(root=root, train=True, transform=trans, download=True)
test_set = dset.MNIST(root=root, train=False, transform=trans, download=True)
batch_size = 100
train_loader = torch.utils.data.DataLoader(
                 dataset=train_set,
                 batch_size=batch_size,
                 shuffle=True)
test_loader = torch.utils.data.DataLoader(
                dataset=test_set,
                batch_size=batch_size,
                shuffle=False)

In [2]:
import time
timingResult = {}
def logTime(theName, currentTime):
    if theName not in timingResult:
        timingResult[theName] = time.time() - currentTime
    else:
        timingResult[theName] = timingResult[theName] + (time.time() - currentTime)
    currentTime = time.time()
    return currentTime

def printTiming(name):
    print('======== timing for {}: {} ======='.format(name,timingResult[name]))


In [3]:
class DEC_AE(nn.Module):
    def __init__(self):
        super(DEC_AE,self).__init__()
        self.dropout = nn.Dropout(p=0.1)
        self.conv_ae1 = nn.Conv2d(1,50,4,stride=2,padding=2)
        self.conv_ae2 = nn.Conv2d(50,50,5,stride=2,padding=2)
        self.leReLU = nn.LeakyReLU()
        self.fc1 = nn.Linear(50*8*8,48)
        self.tanh = nn.Tanh()
        self.fc_de = nn.Linear(48,50*8*8)
        self.conv_de2 = nn.ConvTranspose2d(50,50,5,stride=2,padding=2)
        self.conv_de1 = nn.ConvTranspose2d(50,1,4,stride=2,padding=2)
        self.pretrainMode = True
        for m in self.modules():
            if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
                torch.nn.init.xavier_uniform(m.weight)

    def pretrainMode(self,mode):
        """To set training mode to pretrain or not, 
        so that it can control to run only the AE or AE+DECODER"""
        self.pretrainMode = mode
        
    def forward(self,x):
        # 32x32x1
        x = self.dropout(x)
        # 32x32x1
        x = self.conv_ae1(x)
        # 17x17x50
        x = self.leReLU(x)
        # 17x17x50
        x = self.dropout(x)
        # 17x17x50
        x = self.conv_ae2(x)
        # 9x9x50
        x = self.leReLU(x)
        # 9x9x50
        x = self.dropout(x)
        # 9x9x50
        x = x.view(-1, 50*8*8)
        # 1x4050
        x = self.fc1(x)
        # 1x68
        x = self.tanh(x)
        
        x_ae = x # this is the returned auto encoder
        
        #if not in pretrain mode, we only need encoder
        if self.pretrainMode == False:
            return x, x
        # 1x68
        ##### auto encoder is done, followed by decoder #####
        # 1x68
        x = self.fc_de(x)
        # 1x4050
        x = self.tanh(x)
        # 1x4050
        x = x.view(-1,50,8,8)
        # 9*9*50
        x = self.conv_de2(x)
        # 17x17x50
        x = self.leReLU(x)
        # 17x17x50
        x = self.conv_de1(x)
        # 32x32x1
        x = self.tanh(x)
        x_de = x # this is the returned decoder
        
        return x_ae, x_de

In [4]:
import numpy as np
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score

nmi = normalized_mutual_info_score
ari = adjusted_rand_score


def acc(y_true, y_pred):
    """
    Calculate clustering accuracy. Require scikit-learn installed
    # Arguments
        y: true labels, numpy.array with shape `(n_samples,)`
        y_pred: predicted labels, numpy.array with shape `(n_samples,)`
    # Return
        accuracy, in [0,1]
    """
    y_true = y_true.astype(np.int64)
    assert y_pred.size == y_true.size
    D = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)
    for i in range(y_pred.size):
        w[y_pred[i], y_true[i]] += 1
    from sklearn.utils.linear_assignment_ import linear_assignment
    ind = linear_assignment(w.max() - w)
    return sum([w[i, j] for i, j in ind]) * 1.0 / y_pred.size

In [5]:
class DEC:
    """The class for controlling the training process of DEC"""
    def __init__(self,n_clusters,alpha=1.0):
        self.n_clusters=n_clusters
        self.alpha = alpha
        
    @staticmethod
    def target_distribution(q):
        weight = q ** 2 / q.sum(0)
        return Variable((weight.t() / weight.sum(1)).t().data)
    def logAccuracy(self,pred,label):
        print(' '*8 + '|==>  acc: %.4f,  nmi: %.4f  <==|'
          % (acc(label, pred), nmi(label, pred)))
    
    def pretrain(self,train_loader, test_loader, epochs):
        
        dec_ae = DEC_AE().cuda() #auto encoder
        mseloss = nn.MSELoss()
        optimizer = optim.SGD(dec_ae.parameters(),lr = 1, momentum=0.9)
        best_acc = 0.0
        for epoch in range(epochs):
            dec_ae.train()
            running_loss=0.0
            x_eval = []
            label_eval = []
            for i,data in enumerate(train_loader):
                x, label = data
                x,label=Variable(x).cuda(),Variable(label).cuda()
                optimizer.zero_grad()
                x_ae,x_de = dec_ae(x)
                loss = F.mse_loss(x_de,x,reduce=True) #mseloss(x_de,x) # so the aim is to minimize the reconstruct error
                loss.backward()
                optimizer.step()
                x_eval = x.data.cpu().numpy()
                label_eval = label.data.cpu().numpy()
                # print statistics
                running_loss += loss.data.cpu().numpy()[0]
                if i % 100 == 99:    # print every 2000 mini-batches
                    print('[%d, %5d] loss: %.7f' %
                          (epoch + 1, i + 1, running_loss / 100))
                    #print('x_de:',x_de, x)
                    running_loss = 0.0
            #now we evaluate the accuracy with AE
            dec_ae.eval()
            print(x_eval.shape)
            x_ae,_ = dec_ae(Variable(torch.from_numpy(x_eval)).cuda())
            x_ae = x_ae.data.cpu().numpy()
            print(label_eval.shape)
            km = KMeans(n_clusters=len(np.unique(label_eval)), n_init=20, n_jobs=4)
            y_pred = km.fit_predict(x_ae)
            print(y_pred.shape)
            print(' '*8 + '|==>  acc: %.4f,  nmi: %.4f  <==|'
                      % (acc(label_eval, y_pred), nmi(label_eval, y_pred)))
            if acc(label_eval, y_pred) > best_acc:
                best_acc = acc(label_eval, y_pred)
                torch.save(dec_ae,'bestModel'.format(best_acc))
    def getTDistribution(self,x, clusterCenter):
        """ student t-distribution, as same as used in t-SNE algorithm.
         q_ij = 1/(1+dist(x_i, u_j)^2), then normalize it."""
        xe = torch.unsqueeze(x,1) - Variable(torch.from_numpy(clusterCenter.astype(np.float32))).cuda()
        q = 1.0 / (1.0 + (torch.sum(torch.mul(xe,xe), 2) / self.alpha))
        q = q ** (self.alpha + 1.0) / 2.0
        q = (q.t() / torch.sum(q, 1)).t() #due to divison, we need to transpose q
        return q
    
    def train(self,train_loader, test_loader, epochs):
        """This method will start training for DEC cluster"""
        ct = time.time()
        model = torch.load("bestModel").cuda()
        optimizer = optim.SGD(model.parameters(),lr = 0.00001, momentum=0.9)
        print('Initializing cluster center with pre-trained weights')
        km = KMeans(n_clusters=self.n_clusters, n_init=20)
        got_cluster_center = False
        for epoch in range(epochs):
            running_loss=0.0
            for i,data in enumerate(train_loader):
                x, label = data
                x = Variable(x).cuda()
                #step 1 - get cluster center from batch
                if not got_cluster_center:                
                    model.eval()
                    y_pred_ae,_ = model(x)
                    y_pred_ae = y_pred_ae.data.cpu().numpy()
                    print('ae prediction', y_pred_ae,y_pred_ae.shape)
                    y_pred = km.fit_predict(y_pred_ae) #seems we can only get a centre from batch
                    print('cluster center:',km.cluster_centers_.shape)
                    self.cluster_centers = km.cluster_centers_ #keep the cluster centers
                    got_cluster_center = True
                else:
                    model.train()
                    #now we start training with acquired cluster center
                    feature_pred,_ = model(x)
                    #output (batchSize,n_cluster)
                    q =  self.getTDistribution(feature_pred, self.cluster_centers)
                    #get target distribution
                    p = self.target_distribution(q)
                    #loss = kld(q,p)
                    loss = F.kl_div(q,p)
                    loss.backward()
                    optimizer.step()
                    running_loss = running_loss + loss.data.cpu().numpy()[0]
                    if i % 100 == 99:    # print every 2000 mini-batches
                        print('[%d, %5d] loss: %.7f' %
                              (epoch + 1, i + 1, running_loss / 100))
                        running_loss = 0.0
                        y_pred = np.argmax(q.data.cpu().numpy(),axis = 1)
                        self.logAccuracy(y_pred,label.cpu().numpy())
        
                

In [6]:
#now start training
import random
random.seed(1234)
dec = DEC(10)
dec.pretrain(train_loader, test_loader, 200)
dec.train(train_loader, test_loader, 50)

[1,   100] loss: 0.0931961
[1,   200] loss: 0.0362829
[1,   300] loss: 0.0219840
[1,   400] loss: 0.0169257
[1,   500] loss: 0.0141389
[1,   600] loss: 0.0124637
(60000, 1, 28, 28)
(60000,)
(60000,)
        |==>  acc: 0.5678,  nmi: 0.5325  <==|


  "type " + obj.__name__ + ". It won't be checked "


[2,   100] loss: 0.0113251
[2,   200] loss: 0.0104553
[2,   300] loss: 0.0098675
[2,   400] loss: 0.0094984
[2,   500] loss: 0.0090172
[2,   600] loss: 0.0086762
(60000, 1, 28, 28)


RuntimeError: cuda runtime error (2) : out of memory at /opt/conda/conda-bld/pytorch_1518241554738/work/torch/lib/THC/generic/THCStorage.cu:58