<a href="https://colab.research.google.com/github/vdvf96/robustness/blob/master/MNIST_SMART_ENSEMBLE_CVXPY.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
import argparse
import numpy as np

In [5]:
class SelectNet(nn.Module):
    """Small CNN that produces one raw score per ensemble member.

    Two 3x3 convolutions, a 2x2 max-pool and two fully connected layers map a
    single-channel image batch to ``n_models`` un-normalised scores.
    """

    def __init__(self, n_models):
        super().__init__()
        hidden_width = 2 * n_models
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1)
        self.dropout1 = nn.Dropout(p=0.25)
        self.dropout2 = nn.Dropout(p=0.5)
        # fc1 expects 9216 flattened features (64 channels x 12 x 12 for a 28x28 input).
        self.fc1 = nn.Linear(9216, hidden_width)
        self.fc2 = nn.Linear(hidden_width, n_models)

    def forward(self, x):
        """Return a (batch, n_models) tensor of raw selection scores."""
        features = F.relu(self.conv1(x))
        features = F.relu(self.conv2(features))
        features = self.dropout1(F.max_pool2d(features, 2))
        flat = torch.flatten(features, 1)
        hidden = self.dropout2(F.relu(self.fc1(flat)))
        # Scores are returned as-is; applying F.log_softmax(..., dim=1) here was
        # noted by the author as an alternative to try.
        return self.fc2(hidden)

In [6]:
# Loader settings: small batches for training, larger batches for evaluation.
train_kwargs = {'batch_size': 32}
test_kwargs = {'batch_size': 1000}

# Convert images to tensors, then normalise with the given mean / std.
transform=transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
    ])

# MNIST train / test splits stored under ../data (downloaded on first use by
# the training-split constructor).
dataset1 = datasets.MNIST('../data', train=True, download=True,
                   transform=transform)
dataset2 = datasets.MNIST('../data', train=False,
                       transform=transform)

train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs)

# Fix: the evaluation loader previously reused train_kwargs, which left
# test_kwargs defined but unused.
test_loader = torch.utils.data.DataLoader(dataset2,**test_kwargs)

In [7]:
class Net(nn.Module):
    """Compact CNN classifier used as one member of the ensemble.

    Two 5x5 convolutions (each followed by 2x2 max-pooling and ReLU) feed three
    fully connected layers; the output is a (batch, 10) tensor of
    log-probabilities.
    """

    def __init__(self):
        super().__init__()
        self.name = 'Net'

        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d(p=0.1)
        # Narrower dense layers than the commented-out 320 -> 50 -> 10 variant.
        self.fc1 = nn.Linear(320, 32)
        self.fc2 = nn.Linear(32, 32)
        self.fc3 = nn.Linear(32, 10)

    def forward(self, x):
        """Return log-softmax class scores for the image batch ``x``."""
        stage1 = F.relu(F.max_pool2d(self.conv1(x), 2))
        stage2 = self.conv2_drop(self.conv2(stage1))
        stage2 = F.relu(F.max_pool2d(stage2, 2))
        flat = stage2.view(-1, 320)
        # Functional dropout follows the module's train/eval flag explicitly.
        hidden = F.dropout(F.relu(self.fc1(flat)), training=self.training)
        hidden = F.relu(self.fc2(hidden))
        logits = self.fc3(hidden)
        return F.log_softmax(logits, dim=1)

In [8]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()

device = torch.device("cuda" if use_cuda else "cpu")

# Ensemble of 55 separately constructed Net instances, each moved to `device`.
model = [Net().to(device) for _ in range(55)]

In [9]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
# Directory holding the saved ensemble checkpoints (files named model_<k>).
path='drive/MyDrive/MNIST_spec_models'

# The checkpoint numbers are not contiguous: each inclusive (first, last) span
# below is loaded in order into consecutive slots of `model`
# (model[0] <- model_0, ..., model[11] <- model_12, ..., model[54] <- model_90).
checkpoint_spans = [
    (0, 10), (12, 20), (23, 30), (34, 40), (45, 50),
    (56, 60), (67, 70), (78, 80), (89, 90),
]
checkpoint_ids = [k for first, last in checkpoint_spans for k in range(first, last + 1)]

for slot, checkpoint_id in enumerate(checkpoint_ids):
    # Weights are deserialised onto the CPU; load_state_dict copies them into
    # the already-constructed network in that slot.
    state = torch.load(path+'/model_'+str(checkpoint_id),map_location=torch.device('cpu'))
    model[slot].load_state_dict(state)

<All keys matched successfully>

In [11]:
import cvxpy as cp
import cvxpylayer

In [12]:
from cvxpylayer import CvxpyLayer

In [None]:
n_models = len(model)  # how many models in the overall / initial ensemble

# Freeze every ensemble member: only the selection network is optimised.
for member in model:
    for param in member.parameters():
        param.requires_grad_(False)
selection_net = SelectNet(n_models).to(device)

params=selection_net.parameters()

# Alternatives that were tried and left disabled by the author:
#loss_fun = utils.HammingLoss()
loss_fun = torch.nn.CrossEntropyLoss()
optimizer = optim.Adadelta(params)
#optimizer = optim.SGD(params, lr=0.001, momentum=0.9)


def _dump_history(file_name, history):
    """Write every recorded run to ``file_name``.

    ``history[k]`` holds the accuracy log for selection size ``k + 1``. Each
    section is the selection size on its own line followed by one value per
    line, i.e. the same layout the file had before, repeated per size.
    """
    with open(file_name, 'w') as fp:
        for size, values in enumerate(history, start=1):
            fp.write(str(size)+"\n")
            for item in values:
                # write each item on a new line
                fp.write(str(item)+"\n")


train_data, test_data = [], []
# Sweep the number of selected models from 1 up to the full ensemble size
# (previously hard-coded as 55).
for n_selected in range(1, n_models + 1):
    print(n_selected)
    print("TRAINING")
    train_avg = []
    train_selection(selection_net, model, device, train_loader,optimizer,n_selected,loss_fun, train_avg)
    train_data.append(train_avg)
    print("TEST")
    test_avg = []
    test(selection_net, model, device, test_loader,n_selected,test_avg)
    test_data.append(test_avg)

    # Fix: the files used to be truncated and rewritten with only the current
    # selection size, discarding all earlier results. The full history so far
    # is now written after every size, so an interrupted sweep keeps its data
    # and re-running the cell still starts from a clean file.
    _dump_history('test.txt', test_data)
    _dump_history(r'train.txt', train_data)

1
TRAINING




Loss function value: 
0.05391145497560501
Accuracy per batch size: 
0.96875
Average accuracy: 
0.9340625
Loss function value: 
0.06385114043951035
Accuracy per batch size: 
0.96875
Average accuracy: 
0.93296875


In [1]:
def train_selection(selection_net,model, device, train_loader, optimizer, c, loss_fun, avg):
    """Run one pass over ``train_loader``, optimising ``selection_net``.

    For every batch the selection network scores each ensemble member, the
    ``c`` best-scored members are kept, and their outputs are combined into a
    single prediction weighted by the L2-normalised scores.

    Args:
        selection_net: module returning a (batch, n_models) score tensor.
        model: sequence of ensemble members; each is called on the batch and
            is switched to eval mode here.
        device: device the batch tensors are moved to.
        train_loader: iterable of ``(data, target)`` batches; ``target`` holds
            integer class labels.
        optimizer: optimiser stepping the selection network's parameters.
        c: how many ensemble members to keep per sample.
        loss_fun: callable invoked as ``loss_fun(ensemble_pred, one_hot_target)``.
        avg: list; the running accuracy is appended every 100 iterations.

    Returns:
        None. Progress is printed and recorded in ``avg``.
    """
    n_models = len(model)  # how many models in the overall / initial ensemble
    for m in model:
        m.eval()
    # Make the training mode explicit so dropout is active regardless of what
    # ran before this call.
    selection_net.train()

    C = c  # how many models should be selected among n_models

    # simple unweighted knapsack solver, chooses C items with the largest scores
    # output is 0-1 vector where the 1's indicate chosen items
    def batch_knapsack(scores):  # scores: batch x n_models
        top_indices = torch.topk(scores, C).indices  # batch x C
        choice = torch.zeros_like(scores)
        # A scalar fill keeps the mask on the same device/dtype as `scores`.
        choice.scatter_(1, top_indices, 1.0)
        return choice

    # A 'differentiable' knapsack solver
    # See Berthet et al. 'Learning with Differentiable Perturbed Optimizers'
    # The parameters to this function may need adjusting
    x = cp.Variable(n_models)
    # Named distinctly so it no longer shadows the `c` argument.
    score_param = cp.Parameter(n_models)
    eps = 0.5
    constraints = [cp.sum(x) == C, x >= 0]
    problem = cp.Problem(cp.Maximize(score_param @ x - eps * cp.norm(x, p=2)), constraints)
    assert problem.is_dpp()
    knapsack_layer = CvxpyLayer(problem, parameters=[score_param], variables=[x])

    total = 0  # correctly classified samples so far
    nIm = 0    # samples seen so far

    for iteration, (data, target) in enumerate(train_loader, start=1):
        data, target = data.to(device), target.to(device)
        dim = target.shape[0]

        optimizer.zero_grad()

        predictions = torch.stack([m(data) for m in model])  # n_models x batch x n_classes
        selection_vals = selection_net(data)  # batch x n_models

        continuous_selections = knapsack_layer(selection_vals)[0]
        # Hard 0/1 mask of the C chosen members; built from topk/scatter, so no
        # gradient flows through it.
        selections = batch_knapsack(continuous_selections)

        selection_vals = F.normalize(selection_vals)

        # Transpose to n_models x batch and add a trailing axis so both factors
        # broadcast over the class dimension. This replaces `.repeat(10,1,1).T`,
        # which relied on the deprecated `.T` for 3-D tensors and a fixed 10 classes.
        mask = selections.t().unsqueeze(-1)
        weights = selection_vals.t().unsqueeze(-1)
        ensemble_pred = (predictions * mask * weights).sum(dim=0)  # batch x n_classes

        # One-hot float target, as expected by the loss call below.
        binary_target = F.one_hot(
            target.long(), num_classes=ensemble_pred.shape[1]
        ).to(ensemble_pred.dtype)

        # Choose a suitable loss function that compares the ensemble prediction
        # to the labeled one-hot target (author's note: a simple dot product or
        # Hamming distance were other candidates).
        loss = loss_fun(ensemble_pred, binary_target)

        correct = (ensemble_pred.argmax(dim=1) == target).sum().item()
        total += correct
        nIm += dim

        loss.backward()
        optimizer.step()

        if iteration % 100 == 0:
            print("Loss function value: ")
            print(str(loss.item()))
            print("Accuracy per batch size: ")
            print(str(correct/dim))
            print("Average accuracy: ")
            print(str(total/nIm))
            avg.append(total/nIm)
            

In [2]:
def test(selection_net, age_model, device, test_loader,c,avg):
    """Evaluate the selection network plus ensemble on ``test_loader``.

    Args:
        selection_net: module returning a (batch, n_models) score tensor.
        age_model: sequence of ensemble members (parameter name kept for
            backward compatibility); each is called on the batch.
        device: device the batch tensors are moved to.
        test_loader: iterable of ``(data, target)`` batches; ``target`` holds
            integer class labels.
        c: how many ensemble members to keep per sample.
        avg: list; after every batch the running accuracy is appended as a
            string (kept as ``str`` for compatibility with existing callers).

    Returns:
        None. The running accuracy is printed after every batch.
    """
    # Fix: derive the ensemble size from the argument instead of relying on a
    # notebook-global `n_models` defined in another cell.
    n_models = len(age_model)
    C = c

    def batch_knapsack(scores):  # scores: batch x n_models
        top_indices = torch.topk(scores, C).indices  # batch x C
        choice = torch.zeros_like(scores)
        # A scalar fill keeps the mask on the same device/dtype as `scores`.
        choice.scatter_(1, top_indices, 1.0)
        return choice

    x = cp.Variable(n_models)
    # Named distinctly so it no longer shadows the `c` argument.
    score_param = cp.Parameter(n_models)
    eps = 0.5
    constraints = [cp.sum(x) == C, x >= 0]
    problem = cp.Problem(cp.Maximize(score_param @ x - eps * cp.norm(x, p=2)), constraints)
    assert problem.is_dpp()
    knapsack_layer = CvxpyLayer(problem, parameters=[score_param], variables=[x])

    # Fix: evaluate with dropout disabled. Previous train/eval flags are
    # restored afterwards so a following training pass is unaffected.
    modules = [selection_net, *age_model]
    previous_modes = [m.training for m in modules]
    for m in modules:
        m.eval()

    total, nIm = 0, 0  # correct predictions / samples seen so far
    try:
        with torch.no_grad():
            for data, target in test_loader:
                # `device` alone decides placement; the former unconditional
                # `.cuda()` calls could contradict it.
                data, target = data.to(device), target.to(device)
                dim = target.shape[0]

                age_predictions = torch.stack(
                    [m(data) for m in age_model]
                )  # n_models x batch x n_classes
                selection_vals = selection_net(data)  # batch x n_models

                continuous_selections = knapsack_layer(selection_vals)[0]
                selection_vals = F.normalize(selection_vals)
                selections = batch_knapsack(continuous_selections)

                # n_models x batch x 1 factors broadcast over the class axis;
                # replaces `.repeat(10, 1, 1).T` (deprecated `.T` on 3-D tensors).
                mask = selections.t().unsqueeze(-1)
                weights = selection_vals.t().unsqueeze(-1)
                age_majority_pred = (age_predictions * mask * weights).sum(dim=0)

                age_correct = (age_majority_pred.argmax(dim=1) == target).sum().item()

                total += age_correct
                nIm += dim

                print("Average accuracy: ")
                print(str(total / (nIm)))
                avg.append(str(total / (nIm)))
    finally:
        for m, was_training in zip(modules, previous_modes):
            m.train(was_training)

In [10]:
# Installs diffcp pinned to version 1.0.19.
# NOTE(review): presumably needed by the cvxpylayer import in an earlier cell - confirm,
# and run this cell before that import on a fresh runtime.
!pip install diffcp==1.0.19  

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting diffcp==1.0.19
  Downloading diffcp-1.0.19-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[K     |████████████████████████████████| 5.2 MB 15.3 MB/s 
[?25hCollecting pybind11>=2.4
  Downloading pybind11-2.10.0-py3-none-any.whl (213 kB)
[K     |████████████████████████████████| 213 kB 75.0 MB/s 
Installing collected packages: pybind11, diffcp
Successfully installed diffcp-1.0.19 pybind11-2.10.0
