# Data Preparation

In [3]:
import numpy as np
from PIL import Image, ImageFilter
import os
import matplotlib.pyplot as plt
import torch
import torch.optim
import torch.utils.data as data
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from scipy.cluster.vq import kmeans, whiten
from sklearn.decomposition import PCA
from tqdm import tqdm
#import clustering
import Models
from sklearn.preprocessing import normalize

def load_data(settings, batchSize, transformation=None, train_ratio=0.8, test_ratio=0.2):
    """Load an image-folder dataset and split it into train and test loaders.

    Parameters
    ----------
    settings : str
        Root directory handed to ``torchvision.datasets.ImageFolder``.
    batchSize : int
        Batch size used by both returned loaders.
    transformation : callable, optional
        Transform applied to every image. When ``None`` the default
        pipeline is used: resize to 256, center-crop to 224, convert to a
        tensor and normalize with the mean/std constants below.
    train_ratio : float
        Fraction of the dataset assigned to the training split.
    test_ratio : float
        Fraction of the dataset assigned to the test split.

    Returns
    -------
    trainloader, testloader : torch.utils.data.DataLoader
        Loaders over the two random, non-overlapping subsets.

    Raises
    ------
    ValueError
        If a ratio is negative or the two ratios sum to more than 1.
    """
    if train_ratio < 0 or test_ratio < 0 or train_ratio + test_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and test_ratio must be non-negative and sum to at most 1"
        )

    if transformation is None:
        # Named ``channel_norm`` so it does not shadow the module-level
        # ``normalize`` imported from sklearn.
        channel_norm = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                            std=[0.229, 0.224, 0.225])
        transformation = transforms.Compose([transforms.Resize(256),
                                             transforms.CenterCrop(224),
                                             transforms.ToTensor(),
                                             channel_norm])

    dataset = datasets.ImageFolder(settings, transform=transformation)

    total = len(dataset)
    n_train = min(int(train_ratio * total), total)
    if abs(train_ratio + test_ratio - 1.0) <= 1e-9:
        # The ratios cover the whole dataset: give the images lost to
        # integer truncation to the test split instead of dropping them.
        n_test = total - n_train
    else:
        n_test = int(test_ratio * total)

    # random_split requires the lengths to add up to len(dataset) exactly,
    # so any intentionally unused remainder goes into a throw-away split.
    lengths = [n_train, n_test]
    n_unused = total - n_train - n_test
    if n_unused > 0:
        lengths.append(n_unused)

    splits = torch.utils.data.random_split(dataset, lengths)
    data_train, data_test = splits[0], splits[1]

    trainloader = torch.utils.data.DataLoader(data_train,
                                              batch_size=batchSize,
                                              pin_memory=True)

    testloader = torch.utils.data.DataLoader(data_test,
                                             batch_size=batchSize,
                                             pin_memory=True)

    return trainloader, testloader

    

In [5]:
def cluster_assign(images_lists, dataset):
    """Build a pseudo-labelled dataset from a clustering result.

    Every image index found in ``images_lists[c]`` receives the label ``c``.

    Args:
        images_lists (list of list): one list of image indexes per cluster
        dataset (list): initial dataset, forwarded unchanged
    Returns:
        ReassignedDataset: constructed from the flattened image indexes,
            their cluster labels, ``dataset`` and the augmentation pipeline
            defined below
    """
    assert images_lists is not None

    # Flatten the per-cluster lists into two parallel sequences.
    image_indexes, pseudolabels = [], []
    for label, members in enumerate(images_lists):
        image_indexes += members
        pseudolabels += [label] * len(members)

    # Augmentation used for training on the pseudo-labels.
    steps = [
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ]
    augmentation = transforms.Compose(steps)

    return ReassignedDataset(image_indexes, pseudolabels, dataset, augmentation)

In [4]:
def random(ds, k, random_state=42):
    """
    Create random cluster centroids by picking rows of the dataset.

    Parameters
    ----------
    ds : numpy array
        The dataset to be used for centroid initialization; rows are
        the candidate centroids.
    k : int
        The desired number of clusters for which centroids are required.
    random_state : int
        Seed of the private random generator used for the draw.

    Returns
    -------
    centroids : numpy array
        Collection of k centroids as a numpy array. The rows are distinct
        whenever ``k`` does not exceed the number of rows in ``ds``.

    Raises
    ------
    ValueError
        If centroids are requested from an empty dataset.
    """
    data = np.asarray(ds)
    m = data.shape[0]

    if k <= 0:
        return data[:0]
    if m == 0:
        raise ValueError("cannot draw centroids from an empty dataset")

    # A private generator keeps the draw reproducible without reseeding
    # NumPy's global random state for the rest of the program.
    rng = np.random.RandomState(random_state)

    # Every row, including the last one, is a candidate. Sampling without
    # replacement avoids duplicate centroids; fall back to replacement only
    # when more centroids than rows are requested.
    rows = rng.choice(m, size=k, replace=k > m)
    return data[rows]


In [None]:
def preprocessing(model, features):
    """Reduce, whiten and L2-normalize a feature matrix before clustering.

    Parameters
    ----------
    model : object
        Unused; kept so existing callers keep working.
    features : array-like
        Feature matrix; assumed to be 2-D with one row per sample
        (TODO confirm against the feature extractor).

    Returns
    -------
    numpy array
        The processed features, one L2-normalized row per sample.
    """
    features = np.asarray(features)

    # PCA cannot produce more components than min(n_samples, n_features),
    # so cap the target dimensionality of 256 for small inputs.
    n_components = min(256, features.shape[0], features.shape[1])
    pca_reduced = PCA(n_components=n_components).fit_transform(features)

    # whitening: scale every reduced dimension to unit variance
    whitened = whiten(pca_reduced)

    # l2 normalization of every row
    return normalize(whitened)

In [None]:
def clustering(pre_data, k = 2):
    """Cluster the preprocessed features with k-means.

    Parameters
    ----------
    pre_data : numpy array
        Preprocessed feature matrix, one row per sample.
    k : int
        Number of initial centroids drawn from ``pre_data``.

    Returns
    -------
    list of list of int
        For each cluster, the row indexes of ``pre_data`` assigned to it.
    """
    # Local import: only ``kmeans`` and ``whiten`` are imported at file level.
    from scipy.cluster.vq import vq

    random_cen = random(pre_data, k)
    # kmeans returns (codebook, distortion); only the codebook is needed.
    codebook, _ = kmeans(pre_data, random_cen)

    # Assign every sample to its nearest centroid.
    labels, _ = vq(pre_data, codebook)

    # Size the result from the returned codebook rather than from ``k``.
    images_lists = [[] for _ in range(len(codebook))]
    for index, label in enumerate(labels):
        images_lists[int(label)].append(index)
    return images_lists

In [None]:
def train(dataLoader, model, crit, optimizer, epoch, lr=0.05, wd=-5):
    """Train the model for one pass over ``dataLoader``.

    Parameters
    ----------
    dataLoader : iterable
        Yields ``(input_tensor, target)`` batches.
    model : torch.nn.Module
        Network to train; must expose a ``top_layer`` sub-module.
    crit : callable
        Loss function called as ``crit(output, target)``.
    optimizer : torch.optim.Optimizer
        Optimizer stepped once per batch alongside the top-layer optimizer.
    epoch : int
        Current epoch index (not used inside this function).
    lr : float
        Learning rate of the optimizer created for ``model.top_layer``.
    wd : float
        Exponent of the weight decay of that optimizer (decay is ``10**wd``).

    Returns
    -------
    float
        Sample-weighted average loss over all batches, or 0.0 when the
        loader yields no batch.
    """
    # switch to train mode
    model.train()

    # Run on whatever device the model lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # create an optimizer for the last fc layer (built once, not per batch)
    optimizer_tl = torch.optim.SGD(
        model.top_layer.parameters(),
        lr=lr,
        weight_decay=10**wd,
    )

    loss_sum = 0.0
    n_seen = 0
    for input_tensor, target in dataLoader:
        # ``non_blocking`` replaces the removed ``async`` keyword argument.
        input_var = input_tensor.to(device, non_blocking=True)
        target_var = target.to(device, non_blocking=True)

        output = model(input_var)
        loss = crit(output, target_var)

        # record loss, weighted by the number of samples in the batch
        batch_n = input_tensor.size(0)
        loss_sum += loss.item() * batch_n
        n_seen += batch_n

        # compute gradient and do SGD step
        optimizer.zero_grad()
        optimizer_tl.zero_grad()
        loss.backward()
        optimizer.step()
        optimizer_tl.step()

    return loss_sum / n_seen if n_seen else 0.0

In [None]:
#main method
def main(lr = 0.05, sobel = True, momentum = 0.9, wd = -5, ep = 10, bs = 2, k = 10,
         workers = 0, reassign = 1.0):
    """Alternate between clustering features and training on the clusters.

    Each epoch strips the classification head, extracts features for the
    training split, clusters them, and trains the network with the cluster
    ids as pseudo-labels.

    Parameters
    ----------
    lr : float
        Learning rate of the main SGD optimizer.
    sobel : bool
        Forwarded as the only argument to the ``vgg16`` model factory.
    momentum : float
        Momentum of the main SGD optimizer.
    wd : float
        Exponent of the weight decay (decay is ``10**wd``).
    ep : int
        Number of epochs.
    bs : int
        Batch size of the pseudo-label training loader.
    k : int
        Number of clusters requested from ``clustering``.
    workers : int
        Worker processes of the pseudo-label training loader.
    reassign : float
        Multiplier on the dataset size handed to ``UnifLabelSampler``.

    NOTE(review): ``compute_features`` and ``UnifLabelSampler`` are neither
    defined nor imported in the visible part of this file, and
    ``cluster_assign`` needs ``ReassignedDataset`` — confirm where they come
    from before running. ``path`` is read from module scope.
    """
    nn = torch.nn

    # load the data
    trainL, testL = load_data(path, 1)
    # load_data wraps random_split subsets of an ImageFolder, so the image
    # list of the training split is reached through Subset.dataset/indices.
    train_subset = trainL.dataset
    train_imgs = [train_subset.dataset.imgs[i] for i in train_subset.indices]

    # load vgg (the file imports ``Models``, not ``models``)
    model = Models.__dict__["vgg16"](sobel)
    # second dimension of the top layer's weight: the width the rebuilt
    # nn.Linear head below receives as its input size
    fd = int(model.top_layer.weight.size()[1])
    # The head is rebuilt every epoch with one output per cluster, so it is
    # dropped before the main optimizer collects the parameters.
    model.top_layer = None
    model.features = torch.nn.DataParallel(model.features)
    model.cuda()
    torch.backends.cudnn.benchmark = True

    # create optimizer
    optimizer = torch.optim.SGD(
            filter(lambda x: x.requires_grad, model.parameters()),
            lr=lr,
            momentum=momentum,
            weight_decay=10**wd,
       )

    # define loss function
    criterion = nn.CrossEntropyLoss().cuda()

    # for all epochs
    for epoch in range(ep):
        # remove head
        model.top_layer = None
        model.classifier = nn.Sequential(*list(model.classifier.children())[:-1])

        # get the features for the whole training split
        # (assumes compute_features keeps the loader's sample order — TODO confirm)
        features = compute_features(trainL, model, len(train_subset))
        pre_data = preprocessing(model, features)
        clus_data = clustering(pre_data, k)

        # pseudo labels
        train_dataset = cluster_assign(clus_data, train_imgs)

        # uniformly sample per target
        sampler = UnifLabelSampler(int(reassign * len(train_dataset)),
                                   clus_data)

        train_dataloader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=bs,
            num_workers=workers,
            sampler=sampler,
            pin_memory=True,
        )

        # set last fully connected layer: one output per cluster
        mlp = list(model.classifier.children())
        mlp.append(nn.ReLU(inplace=True).cuda())
        model.classifier = nn.Sequential(*mlp)
        model.top_layer = nn.Linear(fd, len(clus_data))
        model.top_layer.weight.data.normal_(0, 0.01)
        model.top_layer.bias.data.zero_()
        model.top_layer.cuda()

        # train network with clusters as pseudo-labels
        loss = train(train_dataloader, model, criterion, optimizer, epoch)
        print(loss)

In [2]:
path = '/home/space/datasets/imagenet/2012/train_set_small'

# ``trainL`` is only a local inside ``main``; build the loaders here so the
# preview loop below has something to iterate over.
trainL, testL = load_data(path, 1)

for item in tqdm(trainL):
    print(item)
    # item[0] is the image batch; show its first image with the channel
    # axis moved last, which is the layout matplotlib expects.
    plt.imshow(item[0][0].permute(1, 2, 0).numpy())
    plt.show()
    
    

FileNotFoundError: [WinError 3] Das System kann den angegebenen Pfad nicht finden: '/home/space/datasets/imagenet/2012/train_set_small'

# Data Visualization

In [None]:
def show(data, outfile):
    """Save an image to ``outfile``.

    Parameters
    ----------
    data : PIL image or array-like
        An object with a ``save`` method is written as-is; anything else is
        converted with ``PIL.Image.fromarray`` first (assumes an array
        layout and dtype that PIL accepts, e.g. uint8 H x W x C — TODO
        confirm against callers).
    outfile : str
        Destination path; passed to the image's ``save`` method.
    """
    if callable(getattr(data, "save", None)):
        image = data
    else:
        image = Image.fromarray(np.asarray(data))

    # save the image; ``outfile`` is handed to ``save`` unchanged
    image.save(outfile)

# Model Prototype

# Training

In [None]:
# Epoch loop: strip the classification head, recompute features, cluster
# them, rebuild the head with one output per cluster, then train on the
# cluster ids as pseudo-labels.
# NOTE(review): this cell relies on names that are not defined or imported in
# the visible part of this file (args, time, nn, model, dataloader, dataset,
# deepcluster, clustering, compute_features, UnifLabelSampler, fd, criterion,
# optimizer); `import clustering` is commented out at the top — confirm where
# they are expected to come from.
for epoch in range(args.start_epoch, args.epochs):
        # timestamp is stored but never read inside this loop
        end = time.time()

        # remove head: drop the top layer and the last child of the classifier
        model.top_layer = None
        model.classifier = nn.Sequential(*list(model.classifier.children())[:-1])

        # get the features for the whole dataset
        features = compute_features(dataloader, model, len(dataset))

        # cluster the features
        if args.verbose:
            print('Cluster the features')
        # the returned value is stored but not used later in this loop
        clustering_loss = deepcluster.cluster(features, verbose=args.verbose)

        # assign pseudo-labels
        if args.verbose:
            print('Assign pseudo labels')
        train_dataset = clustering.cluster_assign(deepcluster.images_lists,
                                                  dataset.imgs)

        # uniformly sample per target
        sampler = UnifLabelSampler(int(args.reassign * len(train_dataset)),
                                   deepcluster.images_lists)

        train_dataloader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=args.batch,
            num_workers=args.workers,
            sampler=sampler,
            pin_memory=True,
        )

        # set last fully connected layer: append a ReLU to the classifier and
        # create a fresh linear top layer with one output per cluster
        mlp = list(model.classifier.children())
        mlp.append(nn.ReLU(inplace=True).cuda())
        model.classifier = nn.Sequential(*mlp)
        model.top_layer = nn.Linear(fd, len(deepcluster.images_lists))
        # re-initialise the new layer: weights ~ N(0, 0.01), zero bias
        model.top_layer.weight.data.normal_(0, 0.01)
        model.top_layer.bias.data.zero_()
        model.top_layer.cuda()

        # train network with clusters as pseudo-labels
        end = time.time()
        loss = train(train_dataloader, model, criterion, optimizer, epoch)