# Medical insurance dataset Federated
https://www.tensorflow.org/federated/tutorials/federated_learning_for_image_classification?hl=en
### Stand 09.11.
* Tensorflow Federated scheint zu funktionieren
    * Ergebnisse sehen deutlich schlechter aus als zentralisiert.
    * MAE geht nicht unter ~8700 (vs. ~2900 im zentralisierten Modell)
        * R² ist negativ!

In [1]:
import nest_asyncio
nest_asyncio.apply()

In [2]:
%reload_ext tensorboard

In [3]:
import collections

import numpy as np
import tensorflow as tf
import tensorflow_federated as tff

np.random.seed(0)

tff.federated_computation(lambda: 'Hello, World!')()

2023-01-11 15:07:06.961766: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-11 15:07:07.415546: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-01-11 15:07:07.415562: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-01-11 15:07:07.458607: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-01-11 15:07:08.716289: W tensorflow/stream_executor/platform/de

b'Hello, World!'

In [4]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

dataset = pd.read_csv('data/med-insurance/insurance.csv')
# categorical data columns
dataset['sex'] = dataset['sex'].astype('category').cat.codes
dataset['region'] = dataset['region'].astype('category').cat.codes
dataset['smoker'] = dataset['smoker'].astype('category').cat.codes

In [5]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# von 0-1 skalieren
scaler.fit(dataset[['age', 'bmi', 'children']])
dataset

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.900,0,1,3,16884.92400
1,18,1,33.770,1,0,2,1725.55230
2,28,1,33.000,3,0,2,4449.46200
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.880,0,0,1,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,1,10600.54830
1334,18,0,31.920,0,0,0,2205.98080
1335,18,0,36.850,0,0,2,1629.83350
1336,21,0,25.800,0,0,3,2007.94500


## One client per region (default scenario):

In [6]:
NUM_CLIENTS = 4
NUM_EPOCHS = 5#2
BATCH_SIZE = 10#5
SHUFFLE_BUFFER = 20
PREFETCH_BUFFER = 5

In [7]:
# create federated dataset, one per region
# federated_train_data = [dataset[dataset['region'] == i].drop(columns=['region']) for i in range(NUM_CLIENTS-1)]

In [8]:
dataset.shape

(1338, 7)

In [11]:
#from sdv.tabular import CTGAN

test_size_per_region = 20
syn_samples_per_region = 1000

def get_dataset_for_region(dataset, region_index):
    region_ds = dataset[dataset['region'] == region_index]
    region_ds = region_ds.drop(columns=['region'])
    len = region_ds.shape[0]

    # synthetic data:
    #syn_model = CTGAN()

    #syn_model.fit(region_ds)
    #syn_region_ds = syn_model.sample(num_rows=syn_samples_per_region)

    #syn_region_ds[['age', 'bmi', 'children']] = scaler.transform(syn_region_ds[['age', 'bmi', 'children']])
    region_ds[['age', 'bmi', 'children']] = scaler.transform(region_ds[['age', 'bmi', 'children']])

    X_test = region_ds.head(test_size_per_region)
    y_test = X_test.pop('charges')

    X_train = region_ds.tail(len - test_size_per_region)
    #X_train = pd.concat([X_train, syn_region_ds], sort=False)
    y_train = X_train.pop('charges')

    fed_train_dataset = tf.data.Dataset.from_tensor_slices((tf.convert_to_tensor(X_train), tf.convert_to_tensor(y_train)))

    return (
        fed_train_dataset.repeat(NUM_EPOCHS).shuffle(SHUFFLE_BUFFER, seed=1).batch(BATCH_SIZE).prefetch(PREFETCH_BUFFER),
        (X_test, y_test)
    )

federated_insurance_data = [get_dataset_for_region(dataset, i) for i in range(NUM_CLIENTS-1)]

In [38]:
federated_insurance_data[0][0]

<PrefetchDataset element_spec=(TensorSpec(shape=(None, 5), dtype=tf.float64, name=None), TensorSpec(shape=(None,), dtype=tf.float64, name=None))>

In [43]:
# test: random clients, independent from region:
dataset[['region0', 'region1', 'region2', 'region3']] = pd.get_dummies(dataset['region'])
dataset.pop('region')

KeyError: 'region'

In [44]:
dataset[['age', 'bmi', 'children']] = scaler.transform(dataset[['age', 'bmi', 'children']])
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges,region0,region1,region2,region3
0,0.021739,0,0.321227,0.0,1,16884.92400,0,0,0,1
1,0.000000,1,0.479150,0.2,0,1725.55230,0,0,1,0
2,0.217391,1,0.458434,0.6,0,4449.46200,0,0,1,0
3,0.326087,1,0.181464,0.0,0,21984.47061,0,1,0,0
4,0.304348,1,0.347592,0.0,0,3866.85520,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,0.695652,1,0.403820,0.6,0,10600.54830,0,1,0,0
1334,0.000000,0,0.429379,0.0,0,2205.98080,1,0,0,0
1335,0.000000,0,0.562012,0.0,0,1629.83350,0,0,1,0
1336,0.065217,0,0.264730,0.0,0,2007.94500,0,0,0,1


In [46]:
test_size_per_client = 20
size_of_client_ds = int(dataset.shape[0] / 4)

dataset_to_split = dataset.copy()
random_client_ds = []
for i in range(4):
    sampled = dataset_to_split.sample(n=size_of_client_ds)
    dataset_to_split.drop(sampled.index)

    X_test = sampled.head(test_size_per_region)
    y_test = X_test.pop('charges')

    X_train = sampled.tail(size_of_client_ds - test_size_per_region)
    #X_train = pd.concat([X_train, syn_region_ds], sort=False)
    y_train = X_train.pop('charges')

    fed_train_dataset = tf.data.Dataset.from_tensor_slices((tf.convert_to_tensor(X_train), tf.convert_to_tensor(y_train)))


    train_set = fed_train_dataset.repeat(NUM_EPOCHS).shuffle(SHUFFLE_BUFFER, seed=1).batch(BATCH_SIZE).prefetch(PREFETCH_BUFFER)
    test_set = (X_test, y_test)


    random_client_ds.append((train_set, test_set))

In [48]:
def create_keras_model():
    return tf.keras.models.Sequential([
        # without region: tf.keras.layers.InputLayer(input_shape=(5,)),
        tf.keras.layers.InputLayer(input_shape=(9,)),
        tf.keras.layers.Dense(16, kernel_initializer='zeros'),
        tf.keras.layers.Dense(6, kernel_initializer='zeros'),
        tf.keras.layers.Dense(1, kernel_initializer='zeros'),
    ])

In [51]:
import tensorflow_addons as tfa
from keras import backend as K

def coeff_determination(y_true, y_pred):
    SS_res =  K.sum(K.square( y_true-y_pred ))
    SS_tot = K.sum(K.square( y_true - K.mean(y_true) ) )
    return ( 1 - SS_res/(SS_tot + K.epsilon()) )

def model_fn():
    # We _must_ create a new model here, and _not_ capture it from an external
    # scope. TFF will call this within different graph contexts.
    keras_model = create_keras_model()
    return tff.learning.from_keras_model(
        keras_model,
        # without region: input_spec=federated_insurance_data[0][0].element_spec,
        input_spec=random_client_ds[0][0].element_spec,
        loss=tf.keras.losses.MeanAbsoluteError(),
        metrics=[
            tf.keras.metrics.MeanAbsoluteError(),
            #tfa.metrics.RSquare()
        ])

In [52]:
iterative_process = tff.learning.algorithms.build_weighted_fed_avg(
    model_fn,
    client_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=0.8),
    server_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=3))

In [53]:
RUNNAME = '0,8-3(150)-5-epochs-10-batch-WithRegion/'

In [54]:
print(iterative_process.initialize.type_signature.formatted_representation())

( -> <
  global_model_weights=<
    trainable=<
      float32[9,16],
      float32[16],
      float32[16,6],
      float32[6],
      float32[6,1],
      float32[1]
    >,
    non_trainable=<>
  >,
  distributor=<>,
  client_work=<>,
  aggregator=<
    value_sum_process=<>,
    weight_sum_process=<>
  >,
  finalizer=<
    int64
  >
>@SERVER)


In [55]:
NUM_ROUNDS = 150

In [56]:
#@test {"skip": true}
logdir = "/tmp/logs/scalars/training/"

In [57]:
summary_writer = tf.summary.create_file_writer(logdir+RUNNAME)
state = iterative_process.initialize()

In [58]:
#@test {"skip": true}
with summary_writer.as_default():
    for round_num in range(1, NUM_ROUNDS):
        result = iterative_process.next(state, [f[0] for f in random_client_ds]) #[f[0] for f in federated_insurance_data])
        state = result.state
        metrics = result.metrics
        for name, value in metrics['client_work']['train'].items():
            tf.summary.scalar(name, value, step=round_num)

In [23]:
#@test {"skip": true}
!ls {logdir}
%tensorboard --logdir {logdir} --port=6006

'0,8-3(150)-5-epochs-10-batch-With1000SynData'


Launching TensorBoard...

In [35]:
X_test = pd.concat([f[1][0] for f in federated_insurance_data])
y_test = pd.concat([f[1][1] for f in federated_insurance_data])
#test_data = tf.data.Dataset.from_tensor_slices((tf.convert_to_tensor(X_test), tf.convert_to_tensor(y_test)))

test_sets = [tf.data.Dataset.from_tensor_slices((tf.convert_to_tensor(np.expand_dims(el[1][0], axis=0)), tf.convert_to_tensor(np.expand_dims(el[1][1], axis=0)))) for el in federated_insurance_data]

In [36]:
evaluation = tff.learning.build_federated_evaluation(model_fn)
#print(evaluation.type_signature.formatted_representation())
model_weights = iterative_process.get_model_weights(state)
train_metrics = evaluation(model_weights, test_sets)
str(train_metrics)


2023-01-11 15:22:56.404703: W tensorflow/core/data/root_dataset.cc:266] Optimization loop failed: CANCELLED: Operation was cancelled
2023-01-11 15:22:56.404741: W tensorflow/core/data/root_dataset.cc:266] Optimization loop failed: CANCELLED: Operation was cancelled


"OrderedDict([('eval', OrderedDict([('mean_absolute_error', 8828.337), ('loss', 8828.338), ('num_examples', 60), ('num_batches', 3)]))])"

In [37]:
from tensorflow_addons.metrics import RSquare

model = create_keras_model()
model_weights.assign_weights_to(model)
model.compile(
    loss=tf.losses.mae,
    optimizer=tf.optimizers.Adam(),
    metrics=["mae", 'mean_squared_error', RSquare()]
)
model.evaluate(X_test, y_test)



[8828.337890625, 8828.337890625, 156564000.0, -0.07493412494659424]

## Random clients (not ordered by region, as a test scenario):

In [None]:
# test: random clients, independent from region:
dataset[['region0', 'region1', 'region2', 'region3']] = pd.get_dummies(dataset['region'])
dataset.pop('region')
dataset[['age', 'bmi', 'children']] = scaler.transform(dataset[['age', 'bmi', 'children']])
dataset



In [None]:
test_size_per_client = 20
size_of_client_ds = int(dataset.shape[0] / 4)

dataset_to_split = dataset.copy()
random_client_ds = []
for i in range(4):
    sampled = dataset_to_split.sample(n=size_of_client_ds)
    dataset_to_split.drop(sampled.index)

    X_test = sampled.head(test_size_per_region)
    y_test = X_test.pop('charges')

    X_train = sampled.tail(size_of_client_ds - test_size_per_region)
    #X_train = pd.concat([X_train, syn_region_ds], sort=False)
    y_train = X_train.pop('charges')

    fed_train_dataset = tf.data.Dataset.from_tensor_slices(
        (tf.convert_to_tensor(X_train), tf.convert_to_tensor(y_train)))

    train_set = fed_train_dataset.repeat(NUM_EPOCHS).shuffle(SHUFFLE_BUFFER, seed=1).batch(BATCH_SIZE).prefetch(
        PREFETCH_BUFFER)
    test_set = (X_test, y_test)

    random_client_ds.append((train_set, test_set))




In [None]:
def create_keras_model():
    return tf.keras.models.Sequential([
        # without region: tf.keras.layers.InputLayer(input_shape=(5,)),
        tf.keras.layers.InputLayer(input_shape=(9,)),
        tf.keras.layers.Dense(16, kernel_initializer='zeros'),
        tf.keras.layers.Dense(6, kernel_initializer='zeros'),
        tf.keras.layers.Dense(1, kernel_initializer='zeros'),
    ])




In [None]:
import tensorflow_addons as tfa
from keras import backend as K


def coeff_determination(y_true, y_pred):
    SS_res = K.sum(K.square(y_true - y_pred))
    SS_tot = K.sum(K.square(y_true - K.mean(y_true)))
    return (1 - SS_res / (SS_tot + K.epsilon()))




In [None]:
def model_fn():
    # We _must_ create a new model here, and _not_ capture it from an external
    # scope. TFF will call this within different graph contexts.
    keras_model = create_keras_model()
    return tff.learning.from_keras_model(
        keras_model,
        # without region: input_spec=federated_insurance_data[0][0].element_spec,
        input_spec=random_client_ds[0][0].element_spec,
        loss=tf.keras.losses.MeanAbsoluteError(),
        metrics=[
            tf.keras.metrics.MeanAbsoluteError(),
            #tfa.metrics.RSquare()
        ])




In [None]:
iterative_process = tff.learning.algorithms.build_weighted_fed_avg(
    model_fn,
    client_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=0.8),
    server_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=3))


In [None]:
RUNNAME = '0,8-3(150)-5-epochs-10-batch-WithRegion/'


In [None]:
print(iterative_process.initialize.type_signature.formatted_representation())
NUM_ROUNDS = 150
#@test {"skip": true}


In [None]:
logdir = "/tmp/logs/scalars/training/"
summary_writer = tf.summary.create_file_writer(logdir + RUNNAME)
state = iterative_process.initialize()



In [None]:
#@test {"skip": true}
with summary_writer.as_default():
    for round_num in range(1, NUM_ROUNDS):
        result = iterative_process.next(state,
                                        [f[0] for f in random_client_ds])  #[f[0] for f in federated_insurance_data])
        state = result.state
        metrics = result.metrics
        for name, value in metrics['client_work']['train'].items():
            tf.summary.scalar(name, value, step=round_num)
#@test {"skip": true}


In [None]:
!ls {logdir}
% tensorboard --logdir {logdir} --port=6006

In [59]:
X_test = pd.concat([f[1][0] for f in random_client_ds])
y_test = pd.concat([f[1][1] for f in random_client_ds])
#test_data = tf.data.Dataset.from_tensor_slices((tf.convert_to_tensor(X_test), tf.convert_to_tensor(y_test)))

test_sets = [tf.data.Dataset.from_tensor_slices(
    (tf.convert_to_tensor(np.expand_dims(el[1][0], axis=0)), tf.convert_to_tensor(np.expand_dims(el[1][1], axis=0))))
    for el in random_client_ds]


In [61]:
evaluation = tff.learning.build_federated_evaluation(model_fn)
#print(evaluation.type_signature.formatted_representation())
model_weights = iterative_process.get_model_weights(state)
train_metrics = evaluation(model_weights, test_sets)
str(train_metrics)

2023-01-11 15:51:09.698035: W tensorflow/core/data/root_dataset.cc:266] Optimization loop failed: CANCELLED: Operation was cancelled


"OrderedDict([('eval', OrderedDict([('mean_absolute_error', 8413.923), ('loss', 8413.923), ('num_examples', 80), ('num_batches', 4)]))])"

In [62]:
from tensorflow_addons.metrics import RSquare

model = create_keras_model()
model_weights.assign_weights_to(model)
model.compile(
    loss=tf.losses.mae,
    optimizer=tf.optimizers.Adam(),
    metrics=["mae", 'mean_squared_error', RSquare()]
)
model.evaluate(X_test, y_test)



[8413.921875, 8413.921875, 170050096.0, -0.10584330558776855]

In [29]:
tf.keras.models.load_model(model_weights)

OSError: Unable to load model. Filepath is not an hdf5 file (or h5py is not available) or SavedModel. Received: filepath=ModelWeights(trainable=[array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
      dtype=float32), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32), array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]], dtype=float32), array([0., 0., 0., 0., 0., 0.], dtype=float32), array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.]], dtype=float32), array([9508.416], dtype=float32)], non_trainable=[])