In [1]:
import torch
import torch.nn as nn
from IPython.display import clear_output
from IPython.core.debugger import set_trace
from torch.nn import functional as F
import matplotlib.pyplot as plt
from torch import optim
import pickle
import copy
import math
%load_ext autoreload
%autoreload 2
import dlc_practical_prologue as dl

In [21]:
# Banner of 20 '#' characters placed on both sides of section titles in the printed reports.
sep = "#" * 20

In [2]:
# Load the images unflattened so they keep the 2-D layout needed for pooling and convolution.
train_input, train_target, test_input, test_target = dl.load_data(flatten=False)
# Halve the spatial resolution with 2x2 average pooling.
# `F` is the public torch.nn.functional alias imported at the top of the notebook;
# `torch.functional.F` only exists as a side effect of torch's internal imports.
train_input = F.avg_pool2d(train_input, kernel_size=2)
test_input = F.avg_pool2d(test_input, kernel_size=2)

* Using MNIST
** Reduce the data-set (use --full for the full thing)
** Use 1000 train and 1000 test samples


In [3]:
class Net2(nn.Module):
    """Small convolutional classifier producing 10 class scores per image.

    Two conv -> max-pool -> batch-norm stages (the first one followed by a
    ReLU) feed a two-layer linear head. The head expects 256 flattened
    features, i.e. 64 channels on a 2x2 map, which is what e.g. 14x14 inputs
    produce after the two stages.

    Args:
        n_hidden: width of the intermediate linear layer.
        chan: number of channels of the input images.
    """

    def __init__(self, n_hidden=100, chan=1):
        super().__init__()
        self.hidden = n_hidden

        # Stage 1: chan -> 32 feature maps, halved spatially, normalised, rectified.
        stage1 = [
            nn.Conv2d(chan, 32, kernel_size=3),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
        ]
        self.conv_block1 = nn.Sequential(*stage1)

        # Stage 2: 32 -> 64 feature maps, halved spatially, normalised (no activation).
        stage2 = [
            nn.Conv2d(32, 64, kernel_size=3),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.BatchNorm2d(64),
        ]
        self.conv_block2 = nn.Sequential(*stage2)

        # Head: 256 flattened features -> n_hidden -> 10 raw class scores.
        head = [
            nn.Linear(256, n_hidden),
            nn.Linear(n_hidden, 10),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return the (batch, 10) raw class scores for a batch of images."""
        features = self.conv_block2(self.conv_block1(x))
        batch = features.size(0)
        flat = features.view(batch, -1)
        return self.classifier(flat)

In [4]:
def get_accuracy(model,inputs,targets):
    """Print and return the classification accuracy of `model` on a data set.

    The model is evaluated in eval mode and without gradient tracking, so
    batch-norm layers use their running statistics and are not updated by the
    evaluation data. The model's previous train/eval mode is restored before
    returning.

    Args:
        model: module mapping a batch of inputs to per-class scores.
        inputs: tensor of samples, batched along dimension 0.
        targets: tensor of class indices, one per sample.

    Returns:
        The fraction of samples whose arg-max prediction equals its target.
    """
    assert(inputs.size(0) == targets.size(0))
    batch_size = 20
    nb_correct = 0

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for batch, target in zip(inputs.split(batch_size),
                                     targets.split(batch_size)):
                pred = torch.argmax(model(batch), dim=1)
                nb_correct += (pred == target).sum().item()
    finally:
        # Leave the model in the mode the caller had it in, even on error.
        model.train(was_training)

    accuracy = nb_correct / inputs.size(0)
    print("accuracy: %.2f" % (accuracy) )
    return accuracy

In [5]:
def train_model(model,train_input,train_target,nb_epochs=25):
    """Train `model` in place with Adam (lr=1e-3) and a cross-entropy loss.

    The data is visited in its given order, in mini-batches of 100 samples,
    for `nb_epochs` passes. A progress percentage is printed (replacing the
    previous cell output) at the start of each epoch.

    Args:
        model: module mapping a batch of inputs to per-class scores.
        train_input: tensor of training samples, batched along dimension 0.
        train_target: tensor of class indices, one per sample.
        nb_epochs: number of passes over the training data.
    """
    optimizer = optim.Adam(model.parameters(), lr = 1e-3)
    criterion = nn.CrossEntropyLoss()
    batch_size = 100
    # Make sure batch-norm/dropout layers are in training mode, even if the
    # caller last used the model for evaluation.
    model.train()
    for e in range(nb_epochs):
        clear_output(wait=True)
        print("Progression:{:.2f}".format(e/nb_epochs*100))
        for inputs,targets in zip(train_input.split(batch_size),
                                  train_target.split(batch_size)):
            optimizer.zero_grad()
            loss = criterion(model(inputs),targets)
            loss.backward()
            optimizer.step()

In [6]:
# Train a fresh Net2 on the pooled training images, then report its accuracy
# on the training split and on the held-out test split.
model = Net2()
train_model(model,train_input,train_target)
get_accuracy(model,train_input,train_target)
get_accuracy(model,test_input,test_target)

Progression:48.00


KeyboardInterrupt: 

In [7]:
def Kfold_CV(classtype,inputs,targets,K=4):
    """Run K-fold cross-validation of a model class and print the accuracies.

    The samples are shuffled and partitioned into exactly K folds whose sizes
    differ by at most one, so every sample is used for testing exactly once.
    For each fold a fresh `classtype()` is trained on the other folds with
    `train_model` and scored on the held-out fold with `get_accuracy`.

    Args:
        classtype: zero-argument callable returning a new, untrained model.
        inputs: tensor of samples, batched along dimension 0.
        targets: tensor of class indices, one per sample.
        K: number of folds (at least 2, at most the number of samples).

    Raises:
        ValueError: if there are fewer samples than folds.
    """
    assert(K>=2)
    N = inputs.size(0)
    if N < K:
        raise ValueError("Kfold_CV needs at least K samples (got N={}, K={})".format(N,K))
    # The first N % K folds take one extra sample; with N divisible by K this
    # is the plain equal split.
    fold_sizes = [N // K + (1 if k < N % K else 0) for k in range(K)]
    indxes = torch.randperm(N).split(fold_sizes)
    accs = torch.empty(K)
    for k in range(K):
        model = classtype()

        test_indx = indxes[k]
        # `indxes` is a tuple of index tensors: join every fold except the k-th.
        train_indx = torch.cat(indxes[:k] + indxes[k+1:], 0)

        train_inp,train_targ = inputs[train_indx],targets[train_indx]
        test_inp,test_targ = inputs[test_indx],targets[test_indx]
        train_model(model,train_inp,train_targ)
        accs[k] = get_accuracy(model,test_inp,test_targ)
    print("Accuracies for {}-fold:{}".format(K,accs.tolist()))
    print("Mean acc:{}".format(accs.mean()))

In [8]:
# Estimate Net2's accuracy with the default 4-fold cross-validation on the training split.
Kfold_CV(Net2,train_input,train_target)

Progression:48.00


KeyboardInterrupt: 

## Dealing with double Images

### Models

In [11]:
class Naive(nn.Module):
    """Baseline pair model: classify each digit separately, then compare the predicted classes.

    There is no learned comparison head. The comparison output is a one-hot
    encoding of `predicted_digit_0 <= predicted_digit_1`, so no gradient flows
    through it.
    """
    def __init__(self):
        super().__init__()
        self.net0 = Net2()
        self.net1 = Net2()

    def forward(self,x):
        """Return `(x0, x1, ret)` for a batch of two-channel image pairs.

        `x0` / `x1` are the class scores of channel 0 / channel 1, each
        computed by its own network. `ret` is a (batch, 2) one-hot float
        tensor with column 1 set where arg-max(x0) <= arg-max(x1), else
        column 0.
        """
        x0 = self.net0(x[:,0].unsqueeze(1))
        x1 = self.net1(x[:,1].unsqueeze(1))
        comp = (x0.max(1)[1] <= x1.max(1)[1]).long()
        # Allocate on the same device as the predictions instead of a CPU-only FloatTensor.
        ret = torch.zeros(comp.size(0), 2, device=comp.device)
        ret.scatter_(1, comp.unsqueeze(1), 1)
        return x0,x1,ret

    def __str__(self):
        # This model has no weightshare/auxloss flags (reading them raised
        # AttributeError), so it is labelled by name only.
        return "Naive"

In [12]:
class WeightAux(nn.Module):
    """Pair-comparison model: two digit classifiers feeding a small MLP comparison head.

    Args:
        weightshare: when True both images go through `net0`; when False the
            second image goes through its own network `net1`.
        auxloss: stored flag, read by the training code to decide whether the
            digit-classification loss is added to the comparison loss.
    """
    def __init__(self,weightshare=True,auxloss=True):
        super().__init__()
        self.weightshare = weightshare
        self.auxloss = auxloss
        # Both networks are always created; `net1` is only used when weights are not shared.
        self.net0 = Net2()
        self.net1 = Net2()
        # Maps the 2 x 10 concatenated class scores to 2 comparison scores.
        self.linblock = nn.Sequential(nn.Linear(20,40),
                                     nn.ReLU(),
                                     nn.Linear(40,80),
                                     nn.ReLU(),
                                     nn.Linear(80,2))

    def forward(self,x):
        """Return `(x0, x1, comp)` for a batch of two-channel image pairs.

        `x0` / `x1` are the class scores of channel 0 / channel 1 and `comp`
        is the (batch, 2) output of the comparison head.
        """
        x0 = self.net0(x[:,0].unsqueeze(1))
        # Without weight sharing the second image must use `net1`; previously
        # both branches called `net0`, so the flag had no effect.
        second_net = self.net0 if self.weightshare else self.net1
        x1 = second_net(x[:,1].unsqueeze(1))
        comp = self.linblock(torch.cat((x0,x1),dim=1))
        return x0,x1,comp

    def __str__(self):
        # Short label: "Arch" plus "W" for weight sharing and "A" for auxiliary loss.
        stro = "Arch"
        if self.weightshare:
            stro += "W"
        if self.auxloss:
            stro += "A"
        return stro

### Methods

In [16]:
def accuracy_double_model(model,train_input,train_target,train_classes,verbose=False):
    """Compute the three accuracies of a pair model on a data set.

    The model is run in eval mode without gradient tracking, so batch-norm
    layers use their running statistics and are not updated by the evaluation
    data. Its previous train/eval mode is restored before returning.

    Args:
        model: module returning `(x0, x1, comp)` for a batch of pairs, i.e.
            class scores for each image and comparison scores.
        train_input: tensor of image pairs, batched along dimension 0.
        train_target: comparison target (class index) per pair.
        train_classes: tensor whose columns 0 and 1 hold the class of each image.
        verbose: when True, print the three accuracies.

    Returns:
        `(acc0, acc1, acc_comp)`: accuracy of the first classifier, of the
        second classifier, and of the comparison output.
    """
    assert(train_input.size(0) == train_target.size(0))
    N = train_input.size(0)
    batch_size = 20

    def count_hits(pred, target):
        # Number of rows whose arg-max prediction equals the target.
        return (torch.argmax(pred, dim=1) == target).sum().item()

    score0 = 0
    score1 = 0
    scorecomp = 0

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for inputs,comp_targs,classes in zip(train_input.split(batch_size),
                                                 train_target.split(batch_size),
                                                 train_classes.split(batch_size)):
                x0,x1,comp = model(inputs)
                score0 += count_hits(x0,classes[:,0])
                score1 += count_hits(x1,classes[:,1])
                scorecomp += count_hits(comp,comp_targs)
    finally:
        # Leave the model in the mode the caller had it in, even on error.
        model.train(was_training)

    acc0 = score0 / N
    acc1 = score1 / N
    acc_comp = scorecomp / N

    if verbose:
        print("Accuracy 1st Network: {:^10.2f}".format(acc0) )
        print("Accuracy 2nd Network: {:^10.2f}".format(acc1))
        print("Accuracy comparison: {:^12.2f}".format(acc_comp))

    return acc0,acc1,acc_comp

In [17]:
def train_double_model(train_input,train_target,train_classes,\
                       model,crit_comp,optimizer,lr,lambd_,nb_epochs=5,verbose=False):
    """Train a pair model in place on comparison and/or digit-classification losses.

    The data is visited in its given order in mini-batches of 100 pairs.

    Args:
        train_input: tensor of image pairs, batched along dimension 0.
        train_target: comparison target (class index 0/1) per pair.
        train_classes: tensor whose columns 0 and 1 hold the class of each image.
        model: module returning `(x0, x1, comp)` for a batch of pairs.
        crit_comp: loss *class* for the comparison output; it is instantiated
            here with no arguments.
        optimizer: optimizer *class*; instantiated here as
            `optimizer(model.parameters(), lr=lr)`.
        lr: learning rate.
        lambd_: mixing weight; with the auxiliary loss enabled the total loss is
            `lambd_ * comp_loss + (1 - lambd_) * class_loss`.
        nb_epochs: number of passes over the training data.
        verbose: when True, print a progress percentage at each epoch.

    Which loss is optimised:
        * `Naive` models: the classification loss only.
        * models with `auxloss` set: the `lambd_` mixture above.
        * otherwise: the comparison loss only.
    """
    crit_comp = crit_comp()
    optimizer = optimizer(model.parameters(), lr = lr)

    batch_size = 100
    crit_class = nn.CrossEntropyLoss()
    # Make sure batch-norm/dropout layers are in training mode.
    model.train()

    for e in range(nb_epochs):
        if verbose:
            clear_output(wait=True)
            print("Progression:{:.2f}".format(e/nb_epochs*100))
        for inputs,comp_targs,classes in zip(train_input.split(batch_size),
                                           train_target.split(batch_size),
                                           train_classes.split(batch_size)):
            x0,x1,comp = model(inputs)
            loss_class = crit_class(x0,classes[:,0]) + crit_class(x1,classes[:,1])

            if isinstance(crit_comp,nn.NLLLoss):
                # NLLLoss expects log-probabilities, not raw scores. log_softmax
                # leaves inputs that already are log-probabilities unchanged.
                loss_comp = crit_comp(torch.log_softmax(comp,dim=1),comp_targs)
            elif isinstance(crit_comp,nn.CrossEntropyLoss):
                loss_comp = crit_comp(comp,comp_targs)
            else:
                # Regression-style losses (e.g. MSE) are compared against a one-hot target.
                y_onehot = torch.zeros(inputs.size(0),2,dtype=comp.dtype,device=comp.device)
                y_onehot.scatter_(1, comp_targs.unsqueeze(1), 1)
                loss_comp = crit_comp(comp,y_onehot)

            if (isinstance(model,Naive)):
                totloss = loss_class
            elif model.auxloss:
                totloss = lambd_ * loss_comp + (1- lambd_) *loss_class
            else:
                totloss = loss_comp
            optimizer.zero_grad()
            totloss.backward()
            optimizer.step()

In [18]:
def Kfold_CVdouble(inputs,targets,classes,\
                   model_template,crit_comp,optimizer,lr,lambd_,K=4,nb_epochs=5,verbose=False):
    """Run K-fold cross-validation of a pair model and return the comparison accuracies.

    The samples are shuffled and partitioned into exactly K folds whose sizes
    differ by at most one, so every sample is tested exactly once. Each fold
    trains a deep copy of `model_template` (the template itself is never
    trained) with `train_double_model` and scores it on the held-out fold with
    `accuracy_double_model`.

    Args:
        inputs: tensor of image pairs, batched along dimension 0.
        targets: comparison target per pair.
        classes: tensor whose columns 0 and 1 hold the class of each image.
        model_template: model instance copied afresh for every fold.
        crit_comp: loss class for the comparison output.
        optimizer: optimizer class.
        lr: learning rate.
        lambd_: mixing weight in `lambd_ * comp_loss + (1 - lambd_) * class_loss`.
        K: number of folds (at least 2, at most the number of samples).
        nb_epochs: training epochs per fold.
        verbose: when True, print training progress and the mean accuracies.

    Returns:
        A tensor of K comparison accuracies, one per fold.

    Raises:
        ValueError: if there are fewer samples than folds.
    """
    assert(K>=2)
    N = inputs.size(0)
    if N < K:
        raise ValueError("Kfold_CVdouble needs at least K samples (got N={}, K={})".format(N,K))
    # The first N % K folds take one extra sample; with N divisible by K this
    # is the plain equal split.
    fold_sizes = [N // K + (1 if k < N % K else 0) for k in range(K)]
    indxes = torch.randperm(N).split(fold_sizes)
    # Column 0: first classifier, column 1: second classifier, column 2: comparison.
    accs = torch.empty(K,3)
    for k in range(K):
        model = copy.deepcopy(model_template)

        test_indx = indxes[k]
        # `indxes` is a tuple of index tensors: join every fold except the k-th.
        train_indx = torch.cat(indxes[:k] + indxes[k+1:], 0)

        train_inp = inputs[train_indx]
        train_targ = targets[train_indx]
        train_classes = classes[train_indx]

        test_inp  = inputs[test_indx]
        test_targ = targets[test_indx]
        test_classes = classes[test_indx]

        train_double_model(train_inp,train_targ,train_classes,\
                          model,crit_comp,optimizer,lr,lambd_,nb_epochs=nb_epochs,verbose=verbose)
        res = accuracy_double_model(model,test_inp,test_targ,test_classes)
        accs[k] = torch.Tensor(res)
    if verbose:
        print(sep + "Accuracies for {}-fold:".format(K) + sep)
        print("1st group acc:{:^14.2f}".format(accs[:,0].mean().item()))
        print("2nd group acc:{:^14.2f}".format(accs[:,1].mean().item()))
        print("Comparison acc:{:^12.2f}".format(accs[:,2].mean().item()))
    return accs[:,2]

### Implementations

In [19]:
# Size argument passed to dl.mnist_to_pairs for both splits
# (presumably the number of pairs generated; confirm in dlc_practical_prologue).
N_SAMPLES = 500
# Reload the raw, unpooled data. NOTE(review): a, b, c, d are throwaway names
# that later cells reassign, so do not rely on them after this cell.
a, b, c, d = dl.load_data(flatten=False)
# Build (pair input, comparison target, per-image classes) triples for the train and test splits.
train2_input, train2_target, train2_classes = dl.mnist_to_pairs(N_SAMPLES,a,b)
test2_input, test2_target, test2_classes = dl.mnist_to_pairs(N_SAMPLES,c,d)

* Using MNIST
** Reduce the data-set (use --full for the full thing)
** Use 1000 train and 1000 test samples


In [28]:
# Train one WeightAux model (weight sharing + auxiliary loss) for 25 epochs with
# SGD, lr = 1 and lambd_ = 0.75, then report accuracies on the train and test pairs.
net2 = WeightAux(True,True)
train_double_model(train2_input,train2_target,train2_classes,\
                   net2,nn.CrossEntropyLoss,optim.SGD,1e-0,0.75,nb_epochs=25)
print(sep + "Train accuracy:" + sep)
accuracy_double_model(net2,train2_input,train2_target,train2_classes,verbose=True)
print(sep+ "Test accuracy:" + sep)
# Assigned to `_` so the returned tuple is not echoed as the cell output.
_ = accuracy_double_model(net2,test2_input,test2_target,test2_classes,verbose=True)

####################Train accuracy:####################
Accuracy 1st Network:    1.00   
Accuracy 2nd Network:    1.00   
Accuracy comparison:     1.00    
####################Test accuracy:####################
Accuracy 1st Network:    0.94   
Accuracy 2nd Network:    0.93   
Accuracy comparison:     0.90    


In [20]:
# 5-fold cross-validation of the same configuration (default 5 epochs per fold);
# the cell output is the tensor of per-fold comparison accuracies.
Kfold_CVdouble(train2_input,train2_target,train2_classes,\
               WeightAux(True,True),nn.CrossEntropyLoss,optim.SGD,1e-0,0.75,K=5,verbose=True)

Progression:80.00
####################Accuracies for 5-fold:####################
1st group acc:     0.91     
2nd group acc:     0.91     
Comparison acc:    0.83    


tensor([0.8000, 0.8600, 0.8400, 0.8400, 0.8300])

## Grid Search Model

In [29]:
#1 Architecture
Archis = [WeightAux(True,True),WeightAux(True,False),WeightAux(False,True),WeightAux(False,False)]
#2 Comparison Loss Function
CompLoss = [nn.CrossEntropyLoss,nn.NLLLoss,nn.MSELoss]
#3 Optimizers
Optimizers = [optim.SGD,optim.Adam,optim.Adagrad,optim.AdamW]
#4 Learning Rates
LRs = [1e-4,1e-3,1e-2,1e-1,1]
#5 Ratios
Lambdas = [0.2,0.4,0.7,0.9]
HYPER_PARAMS = [Archis,CompLoss,Optimizers,LRs,Lambdas]

In [45]:
class Param():
    """One hyper-parameter configuration together with its cross-validation scores.

    `params` maps the names in `param_names` to one value from each axis of
    `hyper_params`. `scores`, `score_mean` and `score_std` hold the K-fold
    comparison accuracies; `score_mean == -1` marks a configuration that has
    not been evaluated yet. `individuality` is filled in by the genetic
    search (-1 until then).
    """
    hyper_params = HYPER_PARAMS
    # Keys of `params`, in the same order as the axes of `hyper_params`.
    param_names = ("arch","loss","optim","lr","lambda")

    @staticmethod
    def rando(lista):
        """Return one element of `lista` chosen uniformly at random (torch RNG)."""
        return lista[torch.randint(len(lista),(1,)).item()]

    @staticmethod
    def parse(classi):
        """Return a short label: the last dotted component of `str(classi)`, cut at the first quote."""
        return str(classi).split('.')[-1].split("'")[0]

    def __init__(self,arch=None,loss=None,optimizer=None,lr=None,lambd_=None):
        """Build a configuration; if any argument is None, all five values are drawn at random."""
        given = (arch,loss,optimizer,lr,lambd_)
        # Identity test: `None in [...]` would compare with `==` instead.
        if any(value is None for value in given):
            self.params = self.generate_rand_params()
        else:
            self.params = dict(zip(Param.param_names,given))
        self.scores = []
        self.score_mean = -1
        self.score_std = -1
        self.individuality = -1

    def get_params(self):
        """Return the values as a list ordered arch, loss, optimizer, lr, lambda."""
        return list(self.params.values())

    def generate_rand_params(self):
        """Return a dict with one random value per axis of `hyper_params`."""
        rand_values = [Param.rando(param) for param in Param.hyper_params]
        return dict(zip(Param.param_names,rand_values))

    def mutate(self):
        """Replace one randomly chosen hyper-parameter by a random value from its axis.

        Any existing scores are discarded, because they no longer describe
        the mutated configuration.
        """
        param_indx = torch.randint(len(Param.hyper_params),(1,)).item()
        key = Param.param_names[param_indx]
        self.params[key] = Param.rando(Param.hyper_params[param_indx])
        self.scores = []
        self.score_mean = -1
        self.score_std = -1

    def KFold(self,K=5,verbose=False):
        """Cross-validate this configuration on the global train2_* data and store the scores.

        Returns the tensor of per-fold comparison accuracies.
        """
        arch,loss,opt,lr,lambd_ = self.get_params()
        scores = Kfold_CVdouble(train2_input,train2_target,train2_classes,\
                arch,loss,opt,lr,lambd_,K=K,verbose=verbose)
        self.set_scores(scores)
        return scores

    def set_scores(self,scores):
        """Store a tensor of scores as a list together with its mean and standard deviation."""
        self.scores = scores.tolist()
        self.score_mean = scores.mean().item()
        self.score_std = scores.std().item()

    def __str__(self):
        returned = "{}_{}_{}_{}_{}_#ind#_{:.2f}_#score#_{:.2f}".format(Param.parse(self.params["arch"]),
                                               Param.parse(self.params["loss"]),
                                               Param.parse(self.params["optim"]),
                                               self.params["lr"],
                                               self.params["lambda"],
                                               self.individuality,
                                               self.score_mean)
        return returned

    def __repr__(self):
        return str(self)


In [31]:
# Full grid: HYPER_GRID[a][b][c][d][e] is the Param for architecture a, loss b,
# optimizer c, learning rate d and lambda e. A nested comprehension builds the
# list directly and keeps its loop variables local, so no global names are
# overwritten by this cell.
HYPER_GRID = [[[[[Param(archi,loss_cls,optim_cls,lr_val,lambd_val)
                  for lambd_val in Lambdas]
                 for lr_val in LRs]
                for optim_cls in Optimizers]
               for loss_cls in CompLoss]
              for archi in Archis]

In [32]:
def lin_view(HYPER_GRID):
    """Flatten the five-level nested grid into one flat list.

    Elements appear in nested-iteration order, i.e. the innermost axis varies fastest.
    """
    flat = []
    for by_loss in HYPER_GRID:
        for by_optim in by_loss:
            for by_lr in by_optim:
                for by_lambda in by_lr:
                    flat.extend(by_lambda)
    return flat

In [33]:
def GridSearch(linGRID):
    """Cross-validate every Param in `linGRID`, then pickle the global HYPER_GRID.

    Args:
        linGRID: iterable-with-length of Param objects to evaluate (for
            example the output of `lin_view`, or a slice of it).

    The file written is './results/HYPERPARAM.pkl'. The output directory is
    created before the search starts, so a missing folder cannot discard the
    results of a long run.

    NOTE(review): the object pickled is the global HYPER_GRID, not `linGRID`;
    entries of the grid that were not part of `linGRID` are saved unevaluated.
    """
    import os

    results_path = './results/HYPERPARAM.pkl'
    os.makedirs(os.path.dirname(results_path), exist_ok=True)

    for i,param in enumerate(linGRID):
        clear_output(wait=True)
        print("Grid Search progression: {} %".format(i/len(linGRID)*100))
        param.KFold()
    with open(results_path, 'wb') as f:
        pickle.dump(HYPER_GRID,f)
    print("Grid Search done! Hyperparam saved.")

In [34]:
# Smoke run: only the first two configurations of the flattened grid are evaluated.
linGRID = lin_view(HYPER_GRID)[:2]
GridSearch(linGRID)

Grid Search progression: 0.0 %


KeyboardInterrupt: 

In [35]:
def GetMax(HYPER_GRID):
    """Return the grid entry with the highest `score_mean` (the first one on ties)."""
    candidates = lin_view(HYPER_GRID)

    def mean_score(entry):
        return entry.score_mean

    return max(candidates, key=mean_score)

In [36]:
# Best configuration by mean score. Unevaluated entries keep score_mean == -1,
# so this is only meaningful once the grid has been searched.
bestparam = GetMax(HYPER_GRID)

In [37]:
# Retrain the best configuration on all training pairs and report train/test accuracy.
net,loss,opt,lr,lambd = bestparam.get_params()
# Train a private copy: the instance stored in the Param is the shared template
# that cross-validation deep-copies, so it must stay untrained.
net = copy.deepcopy(net)
train_double_model(train2_input,train2_target,train2_classes,\
                   net,loss,opt,lr=lr,lambd_=lambd,nb_epochs=25,verbose=True)
print(sep + "Train accuracy:" + sep)
accuracy_double_model(net,train2_input,train2_target,train2_classes,verbose=True)
print(sep+ "Test accuracy:" + sep)
# Assigned to `_` so the returned tuple is not echoed as the cell output.
_ = accuracy_double_model(net,test2_input,test2_target,test2_classes,verbose=True)

Progression:96.00
####################Train accuracy:####################
Accuracy 1st Network:    0.14   
Accuracy 2nd Network:    0.13   
Accuracy comparison:     0.47    
####################Test accuracy:####################
Accuracy 1st Network:    0.11   
Accuracy 2nd Network:    0.14   
Accuracy comparison:     0.45    


## Genetic Algorithm

START<br>
Generate the initial population<br>
Compute fitness<br>
REPEAT<br>
    Selection<br>
    Crossover<br>
    Mutation<br>
    Compute fitness<br>
UNTIL population has converged<br>
STOP<br>

Assumption: the search landscape is complex, so a plain hill-climbing algorithm might get stuck in a local optimum.

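We apply elitist selection: only the top-ranked fraction of the population survives each generation. This selection ratio is itself a hyper-hyperparameter; its default value is 0.5.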

In [38]:
def generate_population(n_pop=100):
    """Return a list of `n_pop` Param objects, each built with no arguments (random configuration)."""
    population = []
    for _ in range(n_pop):
        population.append(Param())
    return population

In [39]:
def compute_individuality(population):
    """Assign every individual a diversity score in `individuality`, in place.

    For each hyper-parameter, count how many individuals hold a different
    value from this one. The raw score is the inverse of the sum of the
    inverses of those counts, where a count of zero contributes 6 to the sum.
    Raw scores are then min-max rescaled to [0, 1], unless all of them are
    equal, in which case they are left as they are.
    """
    # One tally per hyper-parameter, pre-seeded with every value of the search space.
    tallies = []
    for choices in Param.hyper_params:
        tallies.append(dict.fromkeys(choices, 0))

    for member in population:
        for value, tally in zip(member.params.values(), tallies):
            tally[value] = tally.get(value, 0) + 1

    for member in population:
        inverse_sum = 0
        for value, tally in zip(member.params.values(), tallies):
            # Number of individuals whose value for this hyper-parameter differs.
            others = sum(tally.values()) - tally[value]
            inverse_sum += 6 if others == 0 else 1 / others
        member.individuality = 1 / inverse_sum

    raw = [member.individuality for member in population]
    highest, lowest = max(raw), min(raw)
    if highest == lowest:
        return
    for member in population:
        member.individuality = (member.individuality - lowest) / (highest - lowest)

In [40]:
def compute_fitness(population,K=5,verbose=False):
    """Cross-validate every individual that has no score yet (`score_mean == -1`).

    `K` and `verbose` are forwarded to each individual's `KFold` method.
    """
    pending = (member for member in population if member.score_mean == -1)
    for member in pending:
        member.KFold(K=K, verbose=verbose)

In [41]:
def selection(population,selec_ratio=0.5,lambd_=0.2):
    """Return the best `selec_ratio` fraction of the population, best first.

    Individuals are ranked by `score_mean + lambd_ * individuality`. The
    ranking is an ascending stable sort that is then reversed, so among equal
    fitness values the individual that came later in `population` ranks first.
    """
    def fitness(member):
        return member.score_mean + lambd_ * member.individuality

    ranked = sorted(population, key=fitness)
    ranked.reverse()
    n_keep = int(len(population) * selec_ratio)
    return ranked[:n_keep]

In [42]:
def breed(top_pop,n_pop=100,N_CHANCE=50):
    """Refill a population to `n_pop` individuals by uniform crossover of the survivors.

    Args:
        top_pop: surviving individuals (the parents). The list is not modified.
        n_pop: size of the returned population; must be >= len(top_pop).
        N_CHANCE: each child mutates with probability 1 / N_CHANCE.

    Returns:
        A new list holding the survivors themselves followed by the children.
        Survivors are not copied, so their scores are kept. Hyper-parameter
        values also keep their identity, which the diversity count relies on,
        and no model is duplicated.

    Raises:
        ValueError: if children are needed but fewer than two parents are given.
    """
    assert(n_pop >= len(top_pop))
    n_top = len(top_pop)
    n_miss = n_pop - n_top
    if n_miss > 0 and n_top < 2:
        raise ValueError("breed needs at least two parents to create children")

    next_gen = list(top_pop)
    for _ in range(n_miss):
        # Two distinct parents drawn at random.
        parindx1,parindx2 = torch.randperm(n_top).tolist()[:2]
        genes1 = top_pop[parindx1].get_params()
        genes2 = top_pop[parindx2].get_params()
        # Uniform crossover: each hyper-parameter comes from either parent with equal probability.
        bits = torch.randint(2,(len(genes1),)).tolist()
        child_genes = [g1 if bit == 0 else g2 for g1,g2,bit in zip(genes1,genes2,bits)]
        child = Param(*child_genes)
        if torch.randint(N_CHANCE,(1,)).item() == 0:
            print("mutation!")
            child.mutate()
        next_gen.append(child)
    return next_gen

In [43]:
def plot_population(population):
    """Draw a bar chart of each individual's mean score on a new figure, with the y-axis fixed to [0, 1]."""
    _, axis = plt.subplots(1)
    axis.set_ylim([0,1])
    positions = range(len(population))
    heights = [member.score_mean for member in population]
    axis.bar(positions, heights)

In [46]:
# Population size and number of generations of the genetic search.
N_POP = 10
N_ITER = 100
# Placeholder for the best individual so far: a random, unevaluated Param whose
# score_mean is -1, so any evaluated individual with a higher mean replaces it.
best_indiv = Param()
population = generate_population(N_POP)
# Score the initial population with 2-fold CV and compute its diversity before the first selection.
compute_fitness(population,K=2,verbose=True)
compute_individuality(population)
for i in range(N_ITER):
    clear_output(wait=False)
    print("Population Progression: {} %".format(i/N_ITER * 100))
    # NOTE(review): plot_population opens a new figure every generation and nothing here closes it.
    plot_population(population)
    plt.pause(0.05)
    # Keep the top 60%, ranked by score_mean + lambd_ * individuality (best first).
    population = selection(population,selec_ratio=0.6)
    # population[0] tops that combined ranking, which is not necessarily the highest raw score_mean.
    if population[0].score_mean > best_indiv.score_mean:
        best_indiv = population[0]
    # Refill to N_POP with children; each child mutates with probability 1/N_CHANCE.
    population = breed(population,N_POP,N_CHANCE=10)
    # Only individuals whose score_mean is still -1 (the new children) are evaluated.
    compute_fitness(population,K=2)
    compute_individuality(population)

Progression:80.00


KeyboardInterrupt: 

In [47]:
# Unpack one individual's configuration as (arch, loss, optimizer, lr, lambda).
# NOTE(review): index 6 is hard-coded and a..e overwrite earlier globals of the same names.
a,b,c,d,e = population[6].get_params()

In [None]:
# Re-evaluate that configuration more thoroughly: 5 folds, 25 epochs per fold.
Kfold_CVdouble(train2_input,train2_target,train2_classes,\
               a,b,c,d,e,K=5,nb_epochs=25,verbose=True)

Progression:16.00
