In [1]:
#Train set creation
import numpy as np
import tensorflow as tf

# Load MNIST, flatten every image to a 784-long float32 vector and divide by 255.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_train = x_train.reshape(60000, 784).astype('float32')
x_test = x_test.reshape(10000, 784).astype('float32')
x_train /= 255
x_test /= 255
nb_classes = 10
# Only the test labels are one-hot encoded; y_train keeps integer labels, which
# the label-wise sharding below relies on.
y_test = tf.keras.utils.to_categorical(y_test, nb_classes)

# One shard of each kind per client.
NUM_SHARDS = 100
NUM_CLIENTS = 100

# Label-pure shards: each label fills NUM_SHARDS // 10 consecutive shards with
# up to 480 of its (shuffled) samples apiece.
num_shards_per_label = NUM_SHARDS // 10
shards_480 = [[] for _ in range(NUM_SHARDS)]
for label in range(10):
    label_indices = np.where(y_train == label)[0]
    np.random.shuffle(label_indices)
    first_shard = label * num_shards_per_label
    for offset in range(num_shards_per_label):
        chunk = label_indices[offset * 480:(offset + 1) * 480]
        shards_480[first_shard + offset].extend((x_train[idx], label) for idx in chunk)

# Mixed shards: 120 samples drawn without replacement from the whole training
# set (independently per shard, so samples may repeat across shards).
all_indices = np.arange(len(y_train))
shards_120 = []
for _ in range(NUM_SHARDS):
    picked = np.random.choice(all_indices, size=120, replace=False)
    shards_120.append([(x_train[idx], y_train[idx]) for idx in picked])

# Client k = label-pure shard k + mixed shard k, shuffled together.
clients = []
for client_id in range(NUM_CLIENTS):
    samples = shards_480[client_id] + shards_120[client_id]
    np.random.shuffle(samples)
    clients.append(samples)

# Put the clients themselves in random order so that position does not reveal
# the majority label.
indices = np.random.permutation(NUM_CLIENTS)
clients = [clients[pos] for pos in indices]



In [2]:
# Inspect the first client's list of (image vector, label) pairs.
clients[0]

[(array([0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0

In [3]:
def create_client(image_list, label_list, num_clients,initial='clients'):
    """Shuffle (image, label) pairs and split them into equal-sized client shards.

    args:
        image_list: sequence of images
        label_list: sequence of labels, aligned with image_list
        num_clients: number of shards/clients to create (must be >= 1)
        initial: prefix for the generated client names ('<initial>_<k>', k from 1)
    return:
        (shards, client_names): shards is a list of num_clients lists, each
        holding len(data) // num_clients (image, label) tuples; pairs left over
        after the even split are dropped.
    raises:
        ValueError: if num_clients is not positive or exceeds the number of pairs.
    """
    # Local import: no cell visible in this notebook imports `random` at module level.
    import random

    if num_clients <= 0:
        raise ValueError("num_clients must be a positive integer")

    #create a list of client names
    client_names = ['{}_{}'.format(initial, i+1) for i in range(num_clients)]

    #randomize the data
    data = list(zip(image_list, label_list))
    if len(data) < num_clients:
        # A shard size of 0 would otherwise surface as an obscure
        # "range() arg 3 must not be zero" error.
        raise ValueError("cannot split {} samples across {} clients".format(len(data), num_clients))
    random.shuffle(data)

    #shard data and place at each client
    size = len(data)//num_clients
    shards = [data[i:i + size] for i in range(0, size*num_clients, size)]

    #number of clients must equal number of shards
    assert(len(shards) == len(client_names))
    return shards, client_names

In [2]:
#Test set creation
import numpy as np
import tensorflow as tf

# Load MNIST and keep only the test split (the training split is bound to the
# throwaway names bla/sla); flatten to 784-long float32 vectors divided by 255.
(bla, sla), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_test = x_test.reshape(10000, 784).astype('float32')
x_test /= 255
nb_classes = 10

# One shard of each kind per client.
NUM_SHARDS = 100
NUM_CLIENTS = 100

# Label-pure shards: each label fills NUM_SHARDS // 10 consecutive shards with
# up to 80 of its (shuffled) test samples apiece.
num_shards_per_label = NUM_SHARDS // 10
shards_80 = [[] for _ in range(NUM_SHARDS)]
for label in range(10):
    label_indices = np.where(y_test == label)[0]
    np.random.shuffle(label_indices)
    first_shard = label * num_shards_per_label
    for offset in range(num_shards_per_label):
        chunk = label_indices[offset * 80:(offset + 1) * 80]
        shards_80[first_shard + offset].extend((x_test[idx], label) for idx in chunk)

# Mixed shards: 20 samples drawn without replacement from the whole test set
# (independently per shard, so samples may repeat across shards).
all_indices = np.arange(len(y_test))
shards_20 = []
for _ in range(NUM_SHARDS):
    picked = np.random.choice(all_indices, size=20, replace=False)
    shards_20.append([(x_test[idx], y_test[idx]) for idx in picked])

# Client k = label-pure shard k + mixed shard k, shuffled together.
clients = []
for client_id in range(NUM_CLIENTS):
    samples = shards_80[client_id] + shards_20[client_id]
    np.random.shuffle(samples)
    clients.append(samples)

# Put the clients themselves in random order.
indices = np.random.permutation(NUM_CLIENTS)
clients = [clients[pos] for pos in indices]



In [4]:
# train data creation extreme
import numpy as np
import tensorflow as tf


# Load MNIST and keep only the training split (the test split is bound to the
# throwaway names bla/sla); flatten to 784-long float32 vectors divided by 255.
(x_train, y_train), (bla, sla) = tf.keras.datasets.mnist.load_data()
x_train = x_train.reshape(60000, 784).astype('float32')
x_train /= 255
nb_classes = 10

# Two single-label shards per client.
NUM_SHARDS = 200
NUM_CLIENTS = 100
SHARD_SIZE = 300

# Single-label shards: each label fills NUM_SHARDS // 10 consecutive shards
# with up to SHARD_SIZE of its (shuffled) samples apiece.
# NOTE(review): a label with fewer than num_shards_per_label * SHARD_SIZE
# samples leaves its trailing shards short or empty — confirm this is intended.
num_shards_per_label = NUM_SHARDS // 10
shards = [[] for _ in range(NUM_SHARDS)]
for label in range(10):
    label_indices = np.where(y_train == label)[0]
    np.random.shuffle(label_indices)
    first_shard = label * num_shards_per_label
    for offset in range(num_shards_per_label):
        chunk = label_indices[offset * SHARD_SIZE:(offset + 1) * SHARD_SIZE]
        shards[first_shard + offset].extend((x_train[idx], label) for idx in chunk)

# Pair the shards up at random: client k receives the shards at positions 2k
# and 2k+1 of a random shard ordering, shuffled together.
shard_indices = np.arange(NUM_SHARDS)
np.random.shuffle(shard_indices)
client_shards = []
for client_id in range(NUM_CLIENTS):
    first, second = shard_indices[2 * client_id], shard_indices[2 * client_id + 1]
    merged = shards[first] + shards[second]
    np.random.shuffle(merged)
    client_shards.append(merged)


In [6]:
import pickle

# Persist the extreme non-IID training partition (client_shards).
# NOTE(review): "mnistt" looks like a typo, but the name is kept unchanged
# because other code may load the file under exactly this name.
file_name = "non_iid_extreme_mnistt.pkl"

# A context manager closes the file even if pickling fails, and avoids binding
# a global called `open_file`, which would clash with the open_file() helper
# defined later in this notebook.
with open(file_name, "wb") as pkl_file:
    pickle.dump(client_shards, pkl_file)

In [7]:
# test data creation extreme
import numpy as np
import tensorflow as tf


# Load MNIST and keep only the test split (the training split is bound to the
# throwaway names bla/sla); flatten to 784-long float32 vectors divided by 255.
(bla, sla), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

x_test = x_test.reshape(10000, 784)
x_test = x_test.astype('float32')
x_test /= 255
nb_classes= 10

# Define the number of shards and clients
NUM_SHARDS = 200
NUM_CLIENTS = 100
SHARD_SIZE = 50

# Create 200 single-label shards from the TEST split. The shards must be built
# from x_test / y_test: indexing x_train / y_train here would silently fill the
# "test" partition with training samples (or fail on a fresh kernel where the
# training arrays are not defined).
# NOTE(review): a label with fewer than num_shards_per_label * SHARD_SIZE test
# samples leaves its trailing shards short or empty — confirm this is acceptable.
shards = [[] for i in range(NUM_SHARDS)]
num_shards_per_label = NUM_SHARDS // 10
for label in range(10):
    label_indices = np.where(y_test == label)[0]
    np.random.shuffle(label_indices)
    for i in range(num_shards_per_label):
        shard_indices = label_indices[i*SHARD_SIZE:(i+1)*SHARD_SIZE]
        shards[label*num_shards_per_label+i].extend((x_test[j], label) for j in shard_indices)

# Randomly combine two shards to create 100 clients
client_shards = [[] for i in range(NUM_CLIENTS)]
shard_indices = np.arange(NUM_SHARDS)
np.random.shuffle(shard_indices)
for i in range(NUM_CLIENTS):
    client_shards[i].extend(shards[shard_indices[i*2]])
    client_shards[i].extend(shards[shard_indices[i*2+1]])
    np.random.shuffle(client_shards[i])


In [8]:
import pickle

# Persist the extreme non-IID test partition (client_shards).
file_name = "non_iid_extreme_mnist_test.pkl"

# A context manager closes the file even if pickling fails, and avoids binding
# a global called `open_file`, which would clash with the open_file() helper
# defined later in this notebook.
with open(file_name, "wb") as pkl_file:
    pickle.dump(client_shards, pkl_file)

In [3]:
import pickle

# Persist the current `clients` partition.
# NOTE(review): `clients` is rebound by both the train-set and the test-set
# cells; make sure the test-set cell ran last before writing this file.
file_name = "non_iid_mnist_test.pkl"

# A context manager closes the file even if pickling fails, and avoids binding
# a global called `open_file`, which would clash with the open_file() helper
# defined later in this notebook.
with open(file_name, "wb") as pkl_file:
    pickle.dump(clients, pkl_file)

In [90]:
# Unzip every client's (image, label) pairs into two parallel per-client lists:
# client_data[k][n] is the n-th image of client k, client_labels[k][n] its label.
client_data = [[sample[0] for sample in members] for members in clients]
client_labels = [[sample[1] for sample in members] for members in clients]

In [91]:
# Inspect the labels of the second client; the recorded output is dominated by
# a single label with a few other labels mixed in.
client_labels[1]

[8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 0,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 6,
 8,
 8,
 8,
 8,
 9,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 0,
 9,
 8,
 8,
 8,
 8,
 8,
 7,
 8,
 0,
 8,
 7,
 9,
 4,
 8,
 8,
 8,
 8,
 8,
 5,
 8,
 9,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 3,
 8,
 8,
 9,
 8,
 0,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 5,
 8,
 8,
 8,
 8,
 8,
 3,
 6,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 6,
 8,
 7,
 8,
 9,
 9,
 8,
 8,
 8,
 8,
 8,
 5,
 8,
 8,
 8,
 8,
 8,
 6,
 5,
 8,
 8,
 8,
 9,
 8,
 2,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 0,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 2,
 8,
 8,
 8,
 8,
 8,
 2,
 8,
 8,
 8,
 8,
 8,
 4,
 8,
 8,
 8,
 6,
 8,
 8,
 8,
 8,
 8,
 6,
 8,
 8,
 8,
 8,
 7,
 8,
 8,
 8,
 8,
 0,
 8,
 8,
 6,
 7,
 8,
 8,
 8,
 8,
 4,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 5,
 8,
 8,
 8,
 9,
 8,
 8,
 8,
 8,
 3,
 5,
 8,
 0,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 0,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,


In [92]:
def batch_data_non_iid(client_data, client_label, bs=32, num_classes=10):
    '''Takes in one client's samples and labels and creates a tfds object off it
    args:
        client_data: sequence of the client's feature vectors
        client_label: sequence of integer class labels, aligned with client_data
        bs: batch size
        num_classes: width of the one-hot label encoding (default 10)
    return:
        tfds object yielding (features, one-hot labels) batches, shuffled with
        a buffer as large as the client's sample count'''
    # One-hot encode the integer labels.
    one_hot = tf.keras.utils.to_categorical(client_label, num_classes)
    dataset = tf.data.Dataset.from_tensor_slices((list(client_data), list(one_hot)))
    # A buffer covering every sample gives a full shuffle of the client's data.
    return dataset.shuffle(len(one_hot)).batch(bs)

In [93]:
#process and batch the training data for each client
# Uses batch_data_non_iid (defined above); `batch_data` is not defined anywhere
# in the visible notebook and raises NameError on a fresh kernel. The loop runs
# over however many clients exist instead of assuming exactly 100.
clients_batched = dict()
for i in range(len(client_data)):
    clients_batched[i] = batch_data_non_iid(client_data[i], client_labels[i], bs=32)

In [94]:
# Show the mapping from client index to its batched dataset.
clients_batched

{0: <BatchDataset element_spec=(TensorSpec(shape=(None, 784), dtype=tf.float32, name=None), TensorSpec(shape=(None, 10), dtype=tf.float32, name=None))>,
 1: <BatchDataset element_spec=(TensorSpec(shape=(None, 784), dtype=tf.float32, name=None), TensorSpec(shape=(None, 10), dtype=tf.float32, name=None))>,
 2: <BatchDataset element_spec=(TensorSpec(shape=(None, 784), dtype=tf.float32, name=None), TensorSpec(shape=(None, 10), dtype=tf.float32, name=None))>,
 3: <BatchDataset element_spec=(TensorSpec(shape=(None, 784), dtype=tf.float32, name=None), TensorSpec(shape=(None, 10), dtype=tf.float32, name=None))>,
 4: <BatchDataset element_spec=(TensorSpec(shape=(None, 784), dtype=tf.float32, name=None), TensorSpec(shape=(None, 10), dtype=tf.float32, name=None))>,
 5: <BatchDataset element_spec=(TensorSpec(shape=(None, 784), dtype=tf.float32, name=None), TensorSpec(shape=(None, 10), dtype=tf.float32, name=None))>,
 6: <BatchDataset element_spec=(TensorSpec(shape=(None, 784), dtype=tf.float32, na

In [95]:
# Serve the whole test set as a single batch (batch size = number of test samples).
# NOTE(review): the recorded element_spec shows 10-wide labels, so y_test must be
# the one-hot version produced in the train-set cell, not the raw labels reloaded
# by the test-set cells — confirm the execution order.
test_batched = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(len(y_test))

In [96]:
# Show the element spec of the batched test dataset.
test_batched

<BatchDataset element_spec=(TensorSpec(shape=(None, 784), dtype=tf.float32, name=None), TensorSpec(shape=(None, 10), dtype=tf.float32, name=None))>

In [98]:
# Bundle the per-client training datasets with the test dataset: [train dict, test].
dataset=[clients_batched,test_batched]

In [104]:

def open_file(file_name):
    """Load and return the object pickled in ``file_name``.

    args:
        file_name: path of a pickle file
    return:
        the unpickled object
    raises:
        OSError: if the file cannot be opened
        pickle.UnpicklingError (or similar): if the content is not a valid pickle
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only call this on files you created yourself or otherwise trust.
    # The context manager closes the handle even when unpickling fails.
    with open(file_name, "rb") as pkl_file:
        return pickle.load(pkl_file)

In [105]:
# Load a previously pickled client partition.
# NOTE(review): no cell visible in this notebook writes "non_iid_mnist.pkl"
# (the visible dumps use other file names) — confirm where this file comes from.
file_name = "non_iid_mnist.pkl"
bal= open_file(file_name)

In [109]:
def create_non_iid(x_train, y_train, num_client, percent, num_classes=10):
    """Partition a labelled dataset into label-skewed (non-IID) client datasets.

    Every client receives ``len(x_train) // num_client`` samples: a fraction
    ``percent`` of them from one label (its "majority" shard) and the rest
    drawn at random from the whole dataset.

    args:
        x_train: indexable collection of samples (x_train[j] is one sample)
        y_train: numpy array of integer labels in range(num_classes), aligned
            with x_train
        num_client: number of clients to create (must be >= 1)
        percent: fraction in [0, 1] of each client's samples taken from its
            majority label
        num_classes: number of distinct labels (default 10)
    return:
        list of num_client lists of (sample, label) tuples, in random client
        order. Labels from the majority shard are Python ints; labels from the
        random shard are the elements of y_train.
    raises:
        ValueError: if num_client is not positive or percent is outside [0, 1].

    Note: if a label has fewer samples than its shards need, the trailing
    majority shards of that label are short (plain slicing, no error).
    Random shards are drawn independently, so samples may repeat across clients.
    """
    if num_client <= 0:
        raise ValueError("num_client must be a positive integer")
    if not 0 <= percent <= 1:
        raise ValueError("percent must lie in [0, 1]")

    data_point = int(len(x_train) / num_client)   # samples per client
    major = int(round(data_point * percent))      # from the majority label
    minor = int(data_point - major)               # drawn at random

    # Majority shards. Shards are dealt to labels in contiguous groups whose
    # sizes differ by at most one, so no shard is left without majority data
    # when num_client is not a multiple of num_classes. When it is a multiple,
    # the layout is label * (num_client // num_classes) + i.
    major_shards = [[] for _ in range(num_client)]
    shard_groups = np.array_split(np.arange(num_client), num_classes)
    for label, shard_ids in enumerate(shard_groups):
        label_indices = np.where(y_train == label)[0]
        np.random.shuffle(label_indices)
        for i, shard_id in enumerate(shard_ids):
            shard_indices = label_indices[i * major:(i + 1) * major]
            major_shards[shard_id].extend((x_train[j], label) for j in shard_indices)

    # Random shards: `minor` samples drawn without replacement from everything.
    all_indices = np.arange(len(y_train))
    minor_shards = []
    for _ in range(num_client):
        shard_indices = np.random.choice(all_indices, size=minor, replace=False)
        minor_shards.append([(x_train[j], y_train[j]) for j in shard_indices])

    # Each client takes one shard of each kind, shuffled together.
    clients = []
    for i in range(num_client):
        samples = major_shards[i] + minor_shards[i]
        np.random.shuffle(samples)
        clients.append(samples)

    # Randomly shuffle the clients so position does not reveal the majority label.
    indices = np.random.permutation(num_client)
    return [clients[i] for i in indices]
    

In [110]:
# Build a 100-client partition in which percent=.5 of each client's samples come
# from its majority-label shard.
# NOTE(review): this rebinds `bla`, which earlier cells use as a throwaway name
# for the unused MNIST split, and it needs y_train to hold integer labels.
bla=create_non_iid(x_train, y_train, num_client=100, percent=.5)

In [None]:
# Display the generated partition.
# NOTE(review): this renders every sample of every client; inspecting len(bla)
# or a single sample would keep the output readable.
bla

In [118]:
import multiprocessing as mp
import time

def append_value(i, my_list):
    """Sleep for two seconds, append ``i`` to ``my_list`` and return that list.

    The list is modified in place and the same object is returned. When this
    runs in a ``multiprocessing`` pool worker, the recorded output of this cell
    (``[[10], [11], ...]``) shows that each call worked on its own copy of the
    list, so the caller's list is not accumulated across calls.
    """
    time.sleep(2)  # stand-in for slow work
    my_list.append(i)
    return my_list


def parallel_loop(func, start, end, my_list):
    """Run ``func(i, my_list)`` for every i in range(start, end) on a process pool.

    All calls are submitted asynchronously first and then collected, so the
    returned list holds one result per i, in submission order.
    """
    with mp.Pool() as pool:
        pending = []
        for i in range(start, end):
            pending.append(pool.apply_async(func, args=(i, my_list)))
        # Block until every submitted call has finished.
        collected = [job.get() for job in pending]
    return collected


if __name__ == '__main__':
    # Demo: run append_value for i = 10 .. 1099 on the pool and print the
    # per-call results.
    start, end = 10, 1100
    my_list = []
    updated_list = parallel_loop(append_value, start, end, my_list)
    print(updated_list)


[[10], [11], [12], [13], [14], [15], [16], [17], [18], [19], [20], [21], [22], [23], [24], [25], [26], [27], [28], [29], [30], [31], [32], [33], [34], [35], [36], [37], [38], [39], [40], [41], [42], [43], [44], [45], [46], [47], [48], [49], [50], [51], [52], [53], [54], [55], [56], [57], [58], [59], [60], [61], [62], [63], [64], [65], [66], [67], [68], [69], [70], [71], [72], [73], [74], [75], [76], [77], [78], [79], [80], [81], [82], [83], [84], [85], [86], [87], [88], [89], [90], [91], [92], [93], [94], [95], [96], [97], [98], [99], [100], [101], [102], [103], [104], [105], [106], [107], [108], [109], [110], [111], [112], [113], [114], [115], [116], [117], [118], [119], [120], [121], [122], [123], [124], [125], [126], [127], [128], [129], [130], [131], [132], [133], [134], [135], [136], [137], [138], [139], [140], [141], [142], [143], [144], [145], [146], [147], [148], [149], [150], [151], [152], [153], [154], [155], [156], [157], [158], [159], [160], [161], [162], [163], [164], [165