# Characterizing Datasets

TJ Kim <br/>
3/14/21

Updated <br/>
3/14/21

#### Objective: 
Characterize datasets that are not independent and identically distributed (non-i.i.d.): <br/>

Deliverables
- Gaussian estimate (mean and variance) of each "feature node"
- PCA representation of the point clusters 

The different clusters will be divided along:
- nth data set for each client
- across class lines
- a combination of the two criteria above (per client and per class)


In [1]:
# Switch the kernel's working directory to the FedAtk checkout, presumably so the
# project-local imports below resolve.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
cd '/home/ubuntu/FedAtk/' 

/home/ubuntu/FedAtk


### Load Relevant Libraries and Modules

Load the relevant libraries for the federated learning code.

In [2]:
# Transferer
from transfer_attacks.Transferer import *
from configs.overwrite_config import *

# General Libraries
import torch
import numpy as np
import os
import pandas as pd

In [3]:
from federated_training.femnist_dataloader import Dataloader
from federated_training.cnn_head import CNN_Head
from federated_training.cnn_neck import CNN_Neck
from federated_training.cnn_server import Server
from federated_training.cnn_client import Client
from federated_training.data_manager import DataManager
from federated_training.utils import cuda, where
from federated_training.utilities import freeze_layers

from cw_attack.cw import *

import numpy as np
import torch
import matplotlib.pyplot as plt
import random
import csv
import os
import pickle
from torch.autograd import Variable
import copy

In [4]:
import itertools

# Import Relevant Libraries
from transfer_attacks.Transferer import *

class DA_Transferer(Transferer): 
    """
    Data-analysis extension of ``Transferer`` for characterizing non-i.i.d. data.

    - Load all the datasets but keep them separated per client
    - Record the intermediate values of features after the network neck
      (2 convolution layers according to the original notes)
    - Summarize those features with a per-feature Gaussian (mean / variance)

    Attributes:
        DA_x, DA_y:   selected inputs / labels, keyed by client id ('client' mode),
                      by class ('class' mode) or by client id then class ('both').
        DA_intermed:  neck outputs for DA_x, same key layout as DA_x.
        DA_gaussian:  per-feature mean / variance of DA_intermed, same key layout.
        loader_i:     one Dataloader per pre-loaded client id.
        mode:         mode passed to the last successful set_data call (or None).
        client_idxs:  client ids passed to the last load_niid_data call (or None).
    """

    # Analysis modes accepted by set_data
    _VALID_MODES = ('client', 'class', 'both')
    
    def __init__(self, filename:str, config_name = None):
        super(DA_Transferer, self).__init__(filename=filename, config_name=config_name)
        
        # Hold onto the data
        self.DA_x = {} # Indexed by client id of dataset
        self.DA_y = {} # Also can be indexed by class - double dictionary for class-client pair
        self.DA_intermed = {}
        self.DA_gaussian = {}
        self.loader_i = {}
        
        self.mode = None
        self.client_idxs = None
        
    def load_niid_data(self, clients = None):
        """
        Store all data in dictionary (pre-load) separated by client idx.

        Args:
            clients: iterable of client ids to pre-load; defaults to clients 0-7.

        Each client gets its own Dataloader over a contiguous slice of the file
        indices, with both its training and testing datasets loaded.
        """
        
        if clients is None:
            clients = range(8)
        # Keep a private copy so instance state never aliases the caller's list
        # (or a shared default object).
        self.client_idxs = list(clients)
        
        # Import Data Loader for this FL set
        file_indices = list(range(self.config['num_sets']))
        client_slice = len(file_indices)//self.config['num_clients']
        
        for client_idx in self.client_idxs:
            start = client_idx*client_slice
            # NOTE(review): 35 is a hard-coded upper bound, presumably the number of
            # data files (config['num_sets']) - confirm before generalizing.
            stop = min((client_idx+1)*client_slice, 35)
            loader = Dataloader(file_indices, [start, stop])
            loader.load_training_dataset()
            loader.load_testing_dataset()
            self.loader_i[client_idx] = loader
        
         
    def set_data(self, mode='client', datasets = range(8), batch_size = 50, classes = (0,1)):
        """
        Fill DA_x, DA_y with test data arranged according to ``mode``.

        modes:
            - 'client' - load a test batch for each specified client without class filtering
            - 'class'  - filter by class, each class separately, for a single client
            - 'both'   - filter by class, each class separately, for multiple clients
        datasets:
            which clients to take data from (must have been loaded by load_niid_data);
            in 'class' mode only the first entry is used
        batch_size:
            number of samples per group; in 'class' / 'both' modes a group holds
            fewer samples when the class has fewer than batch_size test samples
        classes:
            class labels to extract in 'class' / 'both' modes

        Raises:
            ValueError: if ``mode`` is not one of the modes above. The previously
                selected data is left untouched in that case.
        """
        
        # Validate before touching any state so a bad call cannot wipe valid data
        if mode not in self._VALID_MODES:
            raise ValueError("Invalid data analysis mode")
        
        self.mode = mode
        self.DA_x = {} # Reset
        self.DA_y = {}
        
        # store data differently based on what the desired mode is
        if mode == 'client':
            for i in datasets:
                image_data = self.loader_i[i].load_batch(batch_size, mode='test')
                # assumes load_batch yields exactly batch_size flattened 28x28 images - TODO confirm
                self.DA_x[i] = torch.Tensor(image_data['input']).reshape(batch_size,1,28,28)
                self.DA_y[i] = torch.Tensor(image_data['label']).type(torch.LongTensor)
            
        elif mode == 'class':
            idx = datasets[0] # If given multiple clients take the first one
            x_all, y_all = self._test_arrays(self.loader_i[idx])
            for c in classes:
                # 'class' mode draws a random subset of each class
                self.DA_x[c], self.DA_y[c] = self._class_batch(x_all, y_all, c, batch_size, shuffle=True)
        
        else: # mode == 'both'
            for i in datasets:
                x_all, y_all = self._test_arrays(self.loader_i[i])
                
                self.DA_x[i] = {}
                self.DA_y[i] = {}
                
                for c in classes:
                    # 'both' mode takes the first matching samples in dataset order (no shuffle)
                    self.DA_x[i][c], self.DA_y[i][c] = self._class_batch(x_all, y_all, c, batch_size, shuffle=False)

    @staticmethod
    def _test_arrays(loader):
        """Return the loader's test inputs and labels as numpy arrays (converted once per client)."""
        user_data = loader.test_dataset['user_data']
        return np.array(user_data['x']), np.array(user_data['y'])

    @staticmethod
    def _class_batch(x_all, y_all, label, batch_size, shuffle):
        """
        Select up to ``batch_size`` samples whose label equals ``label``.

        Returns a tuple (x, y) of tensors; x is reshaped to (n,1,28,28) where n is
        the number of selected samples.
        NOTE(review): y is a float tensor here, whereas 'client' mode stores
        LongTensor labels - kept as-is to preserve existing behavior.
        """
        args = np.argwhere(y_all == label)
        if shuffle:
            np.random.shuffle(args)
        
        # Slicing clamps automatically when the class has fewer than batch_size samples
        args = args[0:batch_size].ravel()
        
        x = torch.Tensor(x_all[args]).reshape(len(args),1,28,28)
        y = torch.Tensor(y_all[args])
        return x, y
        
    def forward_neck(self, x):
        """
        Only forward through neck to get up to the intermediate flattened layer.

        The input is moved to the GPU when CUDA is available.
        """
    
        if torch.cuda.is_available():
            x = x.cuda()
        
        x = self.advNN.neck.forward(x)
        
        return x
    
    def forward_pass(self):
        """
        Run every tensor in DA_x through the neck and store the outputs in
        DA_intermed, using the same key layout as DA_x.
        """
        
        # Turn off dropout 
        self.advNN.eval()
        
        self.DA_intermed = {}

        if self.mode == 'client' or self.mode == 'class':
            for client_idx, value in self.DA_x.items():
                self.DA_intermed[client_idx] = self.forward_neck(value)

        elif self.mode == 'both':
            for client_idx, classes in self.DA_x.items():
                self.DA_intermed[client_idx] = {}
                for class_idx, value in classes.items():
                    self.DA_intermed[client_idx][class_idx] = self.forward_neck(value)
    
    
    def obtain_gaussian(self):
        """
        Run data points through the neck of the model and fit an independent
        Gaussian (mean and variance) to each feature of the flattened output.

        The forward pass is re-run first so the statistics always correspond to
        the data currently held in DA_x.

        Returns:
            dict with the same key layout as DA_intermed; every leaf is
            ``{'mean': tensor, 'var': tensor}`` reduced over dim 0 (the sample
            dimension of the neck output). Also stored in ``self.DA_gaussian``.

        Raises:
            ValueError: if set_data has not been called with a valid mode.
        """
        
        if self.mode not in self._VALID_MODES:
            raise ValueError("Invalid data analysis mode")
        
        # Collect data outputs
        self.forward_pass()
        
        if self.mode == 'both':
            self.DA_gaussian = {
                client_idx: {class_idx: self._gaussian_stats(value)
                             for class_idx, value in classes.items()}
                for client_idx, classes in self.DA_intermed.items()
            }
        else: # 'client' or 'class': single-level dictionary
            self.DA_gaussian = {key: self._gaussian_stats(value)
                                for key, value in self.DA_intermed.items()}
        
        return self.DA_gaussian

    @staticmethod
    def _gaussian_stats(features):
        """Per-feature mean and variance over the sample dimension (dim 0)."""
        # Statistics do not need autograd history
        features = features.detach()
        # Population variance (unbiased=False) stays finite for a single sample;
        # an empty group still yields NaN entries.
        return {'mean': features.mean(dim=0),
                'var': features.var(dim=0, unbiased=False)}
            

In [5]:
# --- Federated-learning architecture ---
# client_idx is handed to generate_advNN() and victim_idxs to generate_victims()
# in the cells that follow.
client_idx, victim_idxs = 1, list(range(4))

# --- Saved neural networks to test on ---
exp_names = ["exp4_neck2_0_head3"]

# --- Parameters to record for excel printing ---
num_clients = len(victim_idxs)
metrics = [
    'orig_acc',
    'orig_sim',
    'orig_acc_robust',
    'orig_sim_robust',
    'orig_acc_adv',
    'orig_sim_adv',
    'adv_sim',
    'adv_hit',
    'g_align',
    'g_align_robust',
    'g_align_adv',
]

In [6]:
# Build the analysis wrapper from the first saved experiment, then set up the
# adversary network for `client_idx` and the victim networks for `victim_idxs`.
# NOTE(review): generate_advNN / generate_victims are defined on the Transferer
# base class; presumably generate_advNN populates `transferer.advNN` - confirm.
transferer = DA_Transferer(filename = exp_names[0])
transferer.generate_advNN(client_idx = client_idx)
transferer.generate_victims(client_idxs = victim_idxs)
print('generated model')

Loading  all_data_0_niid_0_keep_0_train_9.json
Loading  all_data_34_niid_0_keep_0_train_9.json
Loading  all_data_17_niid_0_keep_0_train_9.json
Loading  all_data_13_niid_0_keep_0_train_9.json
generated model




In [7]:
# Pre-load the training and testing datasets for clients 0 and 1 only
# (one Dataloader per client is stored in transferer.loader_i).
transferer.load_niid_data(clients=[0,1])

Loading  all_data_12_niid_0_keep_0_train_9.json
Loading  all_data_20_niid_0_keep_0_train_9.json
Loading  all_data_11_niid_0_keep_0_train_9.json
Loading  all_data_18_niid_0_keep_0_train_9.json
Loading  all_data_0_niid_0_keep_0_train_9.json
Loading  all_data_34_niid_0_keep_0_train_9.json
Loading  all_data_17_niid_0_keep_0_train_9.json
Loading  all_data_13_niid_0_keep_0_train_9.json


In [8]:
# Select up to 2 test samples per (client, class) pair for clients 0-1 and
# classes 0-1, then display the selected labels as a sanity check.
transferer.set_data(mode='both', datasets = range(2), batch_size = 2, classes = [0,1])
transferer.DA_y

{0: {0: tensor([0., 0.]), 1: tensor([1., 1.])},
 1: {0: tensor([0., 0.]), 1: tensor([1., 1.])}}

In [9]:
# Run the selected inputs through the network neck; the outputs are stored in
# transferer.DA_intermed with the same client -> class layout as DA_x.
transferer.forward_pass()

In [21]:
# Inspect feature 0 of the neck output for client 1, class 1
# (one value per selected sample).
transferer.DA_intermed[1][1][:,0]

tensor([ 0.7373, -0.3862], device='cuda:0', grad_fn=<SelectBackward>)