In [1]:
import os
import torch
import torch.nn as nn
from torch.autograd import Variable
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torch.nn.functional as F
import torch.optim as optim
from sklearn.cluster import KMeans
## load mnist dataset
use_cuda = torch.cuda.is_available()
root = './data'
if not os.path.exists(root):
    os.mkdir(root)
#trans = transforms.Compose([transforms.Resize(32), transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))])
trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))])
# if not exist, download mnist dataset
train_set = dset.MNIST(root=root, train=True, transform=trans, download=True)
test_set = dset.MNIST(root=root, train=False, transform=trans, download=True)
batch_size = 100
train_loader = torch.utils.data.DataLoader(
                 dataset=train_set,
                 batch_size=batch_size,
                 shuffle=True)
test_loader = torch.utils.data.DataLoader(
                dataset=test_set,
                batch_size=batch_size,
                shuffle=False)

In [2]:
import time
timingResult = {}
def logTime(theName, currentTime):
    if theName not in timingResult:
        timingResult[theName] = time.time() - currentTime
    else:
        timingResult[theName] = timingResult[theName] + (time.time() - currentTime)
    currentTime = time.time()
    return currentTime

def printTiming(name):
    print('======== timing for {}: {} ======='.format(name,timingResult[name]))


In [13]:
class DEC_AE(nn.Module):
    def __init__(self, num_classes, num_features):
        super(DEC_AE,self).__init__()
        self.dropout = nn.Dropout(p=0.1)
        self.fc1 = nn.Linear(28*28,500)
        self.fc2 = nn.Linear(500,500)
        self.fc3 = nn.Linear(500,2000)
        self.fc4 = nn.Linear(2000,num_features)
        self.relu = nn.ReLU()
        self.fc_d1 = nn.Linear(500,28*28)
        self.fc_d2 = nn.Linear(500,500)
        self.fc_d3 = nn.Linear(2000,500)
        self.fc_d4 = nn.Linear(num_features,2000)
        self.alpha = 1.0
        self.clusterCenter = nn.Parameter(torch.zeros(num_classes,num_features))
        self.pretrainMode = True
        for m in self.modules():
            if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
                torch.nn.init.xavier_uniform(m.weight)

    def setPretrain(self,mode):
        """To set training mode to pretrain or not, 
        so that it can control to run only the AE or AE+DECODER"""
        self.pretrainMode = mode
    
    def updateClusterCenter(self, cc):
        self.clusterCenter.data = torch.from_numpy(cc)
        
    def getTDistribution(self,x):
        """ student t-distribution, as same as used in t-SNE algorithm.
         q_ij = 1/(1+dist(x_i, u_j)^2), then normalize it."""
        xe = torch.unsqueeze(x.cuda(),1) - self.clusterCenter
        q = 1.0 / (1.0 + (torch.sum(torch.mul(xe,xe), 2) / self.alpha))
        q = q ** (self.alpha + 1.0) / 2.0
        q = (q.t() / torch.sum(q, 1)).t() #due to divison, we need to transpose q
        return q
        
    def forward(self,x):
        x = x.view(-1, 1*28*28)
        # 32x32x1
        x = self.dropout(x)
        # 32x32x1
        x = self.fc1(x)
        # 17x17x50
        x = self.relu(x)
        # 17x17x50
        x = self.fc2(x)
        # 17x17x50
        x = self.relu(x)
        # 9x9x50
        x = self.fc3(x)
        # 17x17x50
        x = self.relu(x)
        
        x = self.fc4(x)
        
        # 9x9x50
        x_ae = x
        #if not in pretrain mode, we only need encoder
        if self.pretrainMode == False:
            p = self.getTDistribution(x)
            return x, p
        # 1x68
        ##### auto encoder is done, followed by decoder #####
        # 1x68
        x = self.fc_d4(x)
        # 1x4050
        x = self.relu(x)
        # 1x4050
        x = self.fc_d3(x)
        # 1x4050
        x = self.relu(x)
        x = self.fc_d2(x)
        # 1x4050
        x = self.relu(x)
        x = self.fc_d1(x)
        x_de = x.view(-1,1,28,28)
        # 1x4050
        return x_ae, x_de

In [14]:
import numpy as np
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score

nmi = normalized_mutual_info_score
ari = adjusted_rand_score


def acc(y_true, y_pred):
    """
    
    Calculate clustering accuracy. Require scikit-learn installed
    # Arguments
        y: true labels, numpy.array with shape `(n_samples,)`
        y_pred: predicted labels, numpy.array with shape `(n_samples,)`
    # Return
        accuracy, in [0,1]
    """
    y_true = y_true.astype(np.int64)
    assert y_pred.size == y_true.size
    D = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)
    for i in range(y_pred.size):
        w[y_pred[i], y_true[i]] += 1
    from sklearn.utils.linear_assignment_ import linear_assignment
    ind = linear_assignment(w.max() - w)
    return sum([w[i, j] for i, j in ind]) * 1.0 / y_pred.size

In [19]:
class DEC:
    """The class for controlling the training process of DEC"""
    def __init__(self,n_clusters,alpha=1.0):
        self.n_clusters=n_clusters
        self.alpha = alpha
        
    @staticmethod
    def target_distribution(q):
        weight = q ** 2 / q.sum(0)
        return Variable((weight.t() / weight.sum(1)).t().data)
    def logAccuracy(self,pred,label):
        print(' '*8 + '|==>  acc: %.4f,  nmi: %.4f  <==|'
          % (acc(label, pred), nmi(label, pred)))
    
    def pretrain(self,train_loader, test_loader, epochs):
        
        dec_ae = DEC_AE(10,10).cuda() #auto encoder
        mseloss = nn.MSELoss()
        print([i for i in dec_ae.parameters()])
        optimizer = optim.SGD(dec_ae.parameters(),lr = 1, momentum=0.9)
        best_acc = 0.0
        for epoch in range(epochs):
            dec_ae.train()
            running_loss=0.0
            x_eval = []
            label_eval = []
            for i,data in enumerate(train_loader):
                x, label = data
                x,label=Variable(x).cuda(),Variable(label).cuda()
                optimizer.zero_grad()
                x_ae,x_de = dec_ae(x)
                loss = F.mse_loss(x_de,x,reduce=True) #mseloss(x_de,x) # so the aim is to minimize the reconstruct error
                loss.backward()
                optimizer.step()
                x_eval = x.data.cpu().numpy()
                label_eval = label.data.cpu().numpy()
                # print statistics
                running_loss += loss.data.cpu().numpy()[0]
                if i % 100 == 99:    # print every 2000 mini-batches
                    print('[%d, %5d] loss: %.7f' %
                          (epoch + 1, i + 1, running_loss / 100))
                    #print('x_de:',x_de, x)
                    running_loss = 0.0
            #now we evaluate the accuracy with AE
            dec_ae.eval()
            print(x_eval.shape)
            x_ae,_ = dec_ae(Variable(torch.from_numpy(x_eval)).cuda())
            x_ae = x_ae.data.cpu().numpy()
            print(label_eval.shape)
            km = KMeans(n_clusters=len(np.unique(label_eval)), n_init=20, n_jobs=4)
            y_pred = km.fit_predict(x_ae)
            print(y_pred.shape)
            print(' '*8 + '|==>  acc: %.4f,  nmi: %.4f  <==|'
                      % (acc(label_eval, y_pred), nmi(label_eval, y_pred)))
            currentAcc = acc(label_eval, y_pred)
            print(dec_ae.state_dict().keys())
            if currentAcc > best_acc:                
                torch.save(dec_ae,'bestModel'.format(best_acc))
                best_acc = currentAcc
                
    def getTDistribution(self,x, clusterCenter):
        """ student t-distribution, as same as used in t-SNE algorithm.
         q_ij = 1/(1+dist(x_i, u_j)^2), then normalize it."""
        xe = torch.unsqueeze(x,1).cuda() - Variable(torch.from_numpy(clusterCenter.astype(np.float32))).cuda()
        q = 1.0 / (1.0 + (torch.sum(torch.mul(xe,xe), 2) / self.alpha))
        q = q ** (self.alpha + 1.0) / 2.0
        q = (q.t() / torch.sum(q, 1)).t() #due to divison, we need to transpose q
        return q
    
    def train(self,train_loader, test_loader, epochs):
        """This method will start training for DEC cluster"""
        ct = time.time()
        model = torch.load("bestModel").cuda()
        model.setPretrain(False)
        optimizer = optim.SGD([\
             {'params': model.parameters()}, \
            ],lr = 0.01, momentum=0.9)
        print('Initializing cluster center with pre-trained weights')
        km = KMeans(n_clusters=self.n_clusters, n_init=20)
        got_cluster_center = False
        for epoch in range(epochs):
            running_loss=0.0
            for i,data in enumerate(train_loader):
                x, label = data
                x = Variable(x).cuda()
                #step 1 - get cluster center from batch
                if not got_cluster_center:                
                    model.eval()
                    y_pred_ae,_ = model(x)
                    y_pred_ae = y_pred_ae.data.cpu().numpy()
                    print('ae prediction', y_pred_ae.shape)
                    y_pred = km.fit_predict(y_pred_ae) #seems we can only get a centre from batch
                    print('cluster center:',km.cluster_centers_.shape)
                    self.cluster_centers = km.cluster_centers_ #keep the cluster centers
                    model.updateClusterCenter(self.cluster_centers)
                    print('model',model.state_dict())
                    got_cluster_center = True
                else:
                    model.train()
                    
                    #now we start training with acquired cluster center
                    feature_pred,_ = model(x)
                    #output (batchSize,n_cluster)
                    q =  self.getTDistribution(feature_pred, self.cluster_centers)
                    #get target distribution
                    p = self.target_distribution(q)
                    #loss = kld(q,p)
                    loss = F.kl_div(q,p)
                    loss.backward()
                    optimizer.step()
                    running_loss = running_loss + loss.data.cpu().numpy()[0]
                    if i % 100 == 99:    # print every 2000 mini-batches
                        print('[%d, %5d] loss: %.7f' %
                              (epoch + 1, i + 1, running_loss / 100))
                        running_loss = 0.0
                        y_pred = np.argmax(q.data.cpu().numpy(),axis = 1)
                        self.logAccuracy(y_pred,label.cpu().numpy())
        
                

In [20]:
#now start training
import random
random.seed(1234)
dec = DEC(10)
#dec.pretrain(train_loader, test_loader, 10)
dec.train(train_loader, test_loader, 3)

Initializing cluster center with pre-trained weights
ae prediction (100, 10)
cluster center: (10, 10)
model OrderedDict([('clusterCenter', 
 2.4902 -0.4237  0.5157 -3.1956  0.0619  2.5189 -0.0285  2.6575 -1.2623 -3.0487
 0.4079 -0.2695 -4.3435  1.5530 -1.8933 -2.9724  1.4410 -1.6743  1.6491  0.1381
 3.4787 -1.2061 -0.8199  3.3801 -0.5616  0.0545  0.4076 -2.0650 -2.5112 -0.3577
-0.3212 -0.8037 -2.4732  2.0979  0.3179  0.1294  0.3345  2.3224 -0.0704  0.9107
 1.2109  0.3397 -0.0121  0.3399 -0.9986 -0.9832 -2.5460  2.3720  3.5069  1.7639
-0.5699 -3.7199 -3.9909 -2.3479  1.1653 -1.7321 -1.0598  2.2875  0.1055 -2.4138
 4.0967  0.3714 -0.0314  0.1961  3.2683 -1.7943 -1.5224  0.0201 -3.6618  0.7124
 2.1682 -2.6849 -1.2289  2.0534  0.2311  0.0047 -2.7016 -1.8598 -0.1003  0.6921
 1.6858  0.8306  0.2000 -0.8013 -2.7713 -1.1341 -1.7663  0.5399 -1.2303 -1.1217
 0.1444 -2.2742 -2.8888 -3.6588  1.8432 -0.8338  0.4286 -0.1951 -1.4972  3.1681
[torch.FloatTensor of size 10x10]
), ('fc1.weight', 
-3.4459

RuntimeError: Expected object of type Variable[torch.cuda.FloatTensor] but found type Variable[torch.FloatTensor] for argument #1 'other'