# federated-ml-health 
**Notatnik przystosowany do zajęć z SI w informatyce biomedycznej**.

Copyright 2020 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

In [None]:
# To install TFF and dependencies
!pip install --quiet --upgrade tensorflow-federated
!pip install --quiet --upgrade nest-asyncio






# Przygotowanie danych

Na początku wykorzystamy zbiór `pima`. W dalszej kolejności będziemy pracować na odpowiednio przygotowanej wersji zbioru MIMIC-III.

In [None]:
import os
# Expose no CUDA devices to this process, so the computations below run on CPU.
# This is set before the cell that imports TensorFlow.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [None]:
import collections
import matplotlib.pyplot as plt
import nest_asyncio
import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
import tensorflow_federated as tff
from collections import defaultdict 
from matplotlib.pyplot import figure
from numpy import loadtxt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Patch asyncio via nest_asyncio (presumably so TFF can run inside the
# notebook's already-running event loop - confirm).
nest_asyncio.apply()
# Make a thread-debugging execution context (50 clients per thread) the default
# context for all TFF computations in this notebook.
tff.framework.set_default_context(tff.backends.native.create_thread_debugging_execution_context(clients_per_thread=50))
# Seed the global NumPy and TensorFlow random generators.
np.random.seed(10)
tf.random.set_seed(10)

def get_diabetes_labels():
  """Return the column names used for the diabetes table.

  Returns:
    A new list of 10 strings: a leading "intercept" entry, the eight feature
    names, and the target name "Diabetic?" as the last element.
  """
  feature_names = [
      "Number of times pregnant",
      "Plasma glucose concentration",
      "Diastolic blood pressure",
      "Triceps skin fold thickness",
      "2-Hour serum insulin (mu U/ml)",
      "BMI",
      "Diabetes pedigree function",
      "Age (years)",
  ]
  return ["intercept", *feature_names, "Diabetic?"]

# Column names; the leading "intercept" entry is skipped when the frame is built.
labels = get_diabetes_labels()

# Read the CSV as a numeric matrix: the last column is the target, the rest are features.
dataset = loadtxt("pima.csv", delimiter=",")
num_col = dataset.shape[1]
X = dataset[:, :num_col - 1]
y = dataset[:, -1]

# Standardise the features.
# NOTE(review): the scaler is fit on all rows, i.e. before the train/test split
# performed in a later cell, so test rows influence the scaling statistics.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

# Re-attach the target and wrap everything in a labelled DataFrame.
D = np.column_stack((X, y))
ds = pd.DataFrame(D, columns=labels[1:])

ds

# Podejście scentralizowane

In [None]:
# Experiment configuration.
TRAIN_PROPORTION = 0.8                                # share of rows used for training
NUM_FEATURES = X.shape[1]                             # number of input columns
NUM_ROUNDS = 8                                        # Keras epochs / federated rounds
n_train = round(TRAIN_PROPORTION * X.shape[0])        # number of training rows
# NOTE(review): NUM_CLIENTS is reassigned to 20 in a later cell, and
# NUM_PARTICIPATING_PER_ROUND does not appear to be used in the cells shown.
NUM_CLIENTS = 6
NUM_PARTICIPATING_PER_ROUND = round(NUM_CLIENTS / 3)

# Positional (unshuffled) split: the first n_train rows train, the rest test.
data_train, data_test = X[:n_train], X[n_train:]
labels_train, labels_test = y[:n_train], y[n_train:]

## Regresja - scikit-learn


In [None]:
def fit_and_score_logreg(solver, train_x, train_y, test_x, test_y):
    """Fit a scikit-learn logistic regression and score it with ROC/AUC.

    Args:
      solver: name of the LogisticRegression solver (e.g. 'liblinear').
      train_x, train_y: training features and labels.
      test_x, test_y: held-out features and labels used for the ROC curve.

    Returns:
      Tuple (model, positive_proba, fpr, tpr, thresholds, auc) where
      positive_proba is column 1 of predict_proba on test_x.
    """
    model = LogisticRegression(random_state=0, solver=solver).fit(train_x, train_y)
    positive_proba = model.predict_proba(test_x)[:, 1]
    fpr, tpr, thresholds = sklearn.metrics.roc_curve(test_y, positive_proba)
    return model, positive_proba, fpr, tpr, thresholds, sklearn.metrics.auc(fpr, tpr)


# The same evaluation is run for three solvers; only the solver name differs.
(sk_model, labels_proba,
 fpr_skl_liblinear, tpr_skl_liblinear, threshold_skl_liblinear,
 roc_auc_skl_liblinear) = fit_and_score_logreg(
     'liblinear', data_train, labels_train, data_test, labels_test)
print(f'AUC-LIN = {roc_auc_skl_liblinear:.4}')

(sk_model, labels_proba,
 fpr_skl_sag, tpr_skl_sag, threshold_skl_sag,
 roc_auc_skl_sag) = fit_and_score_logreg(
     'sag', data_train, labels_train, data_test, labels_test)
print(f'AUC-SAG = {roc_auc_skl_sag:.4}')

(sk_model, labels_proba,
 fpr_skl, tpr_skl, threshold_skl,
 roc_auc_skl) = fit_and_score_logreg(
     'lbfgs', data_train, labels_train, data_test, labels_test)
print(f'AUC-SKL = {roc_auc_skl:.4}')

## Regresja - TF

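Wyjaśnienie dotyczące definicji modelu (oryginalny projekt): *Adam optimization method is used to mimic the sklearn solver as close as possible (leveraging second derivatives of gradient).* Uwaga: Adam nie korzysta z drugich pochodnych, lecz z estymat pierwszego i drugiego momentu gradientu; w poniższym kodzie użyto wariantu Nadam (Adam z momentum Nesterova).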

In [None]:
# Wrap the NumPy splits as tf.data pipelines. Both are batched with n_train,
# so the training split forms a single batch; the test split holds fewer rows
# than n_train (TRAIN_PROPORTION is 0.8) and therefore is a single batch too.
dataset_train = tf.data.Dataset.from_tensor_slices((data_train, labels_train)).batch(n_train)
dataset_test = tf.data.Dataset.from_tensor_slices((data_test, labels_test)).batch(n_train)

def create_keras_model():
  """Build the logistic-regression model as a one-layer Keras network.

  Returns:
    An uncompiled Sequential model with a single sigmoid unit over
    NUM_FEATURES inputs and an L2 kernel penalty of 0.01.
  """
  logistic_unit = tf.keras.layers.Dense(
      1,
      input_shape=(NUM_FEATURES,),
      activation='sigmoid',
      kernel_regularizer=tf.keras.regularizers.l2(0.01),
  )
  return tf.keras.models.Sequential([logistic_unit])

def create_keras_model_deeper():
  """Build a small multi-layer alternative to `create_keras_model`.

  Returns:
    An uncompiled Sequential model over NUM_FEATURES inputs with sigmoid
    Dense layers of 6, 3 and 1 units. All layers share one GlorotNormal
    initializer (seed 10); the output layer also has an L1L2 kernel penalty.
  """
  glorot = tf.keras.initializers.GlorotNormal(seed=10)
  return tf.keras.models.Sequential([
      tf.keras.Input(shape=(NUM_FEATURES,)),
      tf.keras.layers.Dense(6, activation='sigmoid', kernel_initializer=glorot),
      tf.keras.layers.Dense(3, activation='sigmoid', kernel_initializer=glorot),
      tf.keras.layers.Dense(
          1,
          activation='sigmoid',
          kernel_initializer=glorot,
          kernel_regularizer=tf.keras.regularizers.L1L2(l1=0.0001, l2=0.01),
      ),
  ])
  

# Centralized baseline: the same single-unit model, trained on the whole training split.
tf_model = create_keras_model()
tf_model.compile(
    optimizer=tf.keras.optimizers.Nadam(learning_rate=0.5),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.AUC(name='auc'),
    ],
)

# `dataset_train` is an already-batched tf.data.Dataset, so the batch size is
# fixed by its `.batch(...)` call. `batch_size` must not be given to `fit` for
# dataset inputs, and `use_multiprocessing` applies only to generator/Sequence
# inputs, so neither is passed here.
tf_model.fit(dataset_train, validation_data=dataset_test, epochs=NUM_ROUNDS, verbose=1)

# Score the held-out split with the same ROC/AUC routine as the sklearn baselines.
labels_proba = tf_model.predict(dataset_test)
fpr_tf, tpr_tf, threshold = sklearn.metrics.roc_curve(labels_test, labels_proba)
roc_auc_tf = sklearn.metrics.auc(fpr_tf, tpr_tf)
print(f'AUC-TF = {roc_auc_tf:0.4}')

# Regresja - TF Federated

Optimizing the same model architecture as above in the TF case, but here trained in a federated (distributed) way.

In [None]:
"""
    Build the data frames holding the training and test features and labels
"""
# Features: every column except the last one; rows split at n_train.
df_data_train = ds.iloc[:n_train, :-1]
# Labels: the last column, kept as a one-column DataFrame by the `-1:` slice.
df_labels_train = ds.iloc[:n_train, -1:]

df_data_test = ds.iloc[n_train:, :-1]
df_labels_test = ds.iloc[n_train:, -1:]


In [None]:
def assign_samples_to_clients(data, n_clients):
    """Assign sample positions of `data` to clients.

    The positions are partitioned with a shuffled KFold (random_state=42) using
    one fold per client; each client receives the held-out indices of its fold.
    Class proportions are not preserved (the split is not stratified).

    Args:
      data: the samples to partition; passed to `KFold.split`.
      n_clients: number of clients, used as the number of folds.

    Returns:
      A list with one collection of sample indices per client.
    """
    from sklearn.model_selection import KFold
    splitter = KFold(n_splits=n_clients, shuffle=True, random_state=42)
    return [held_out_ids for _, held_out_ids in splitter.split(data)]

In [None]:
# Number of simulated clients for the federated experiment; this overrides the
# NUM_CLIENTS value assigned in the earlier configuration cell.
NUM_CLIENTS = 20
# Clients are identified by consecutive integers 0 .. NUM_CLIENTS - 1.
client_ids = list(range(NUM_CLIENTS))

In [None]:
# One collection of training-row positions per client. NUM_CLIENTS is used
# instead of a literal so the partition always has one entry per client id.
client_sample_ids = assign_samples_to_clients(data_train, NUM_CLIENTS)

In [None]:
def create_client_dataset(data, labels, client_ids, client_sample_ids):
  """Wrap per-client slices of `data`/`labels` in a TFF ClientData object.

  Args:
    data: feature table supporting `.iloc[rows, :].values`.
    labels: label table supporting `.iloc[rows, :].values`.
    client_ids: the client identifiers handed to TFF.
    client_sample_ids: maps a client id to the row positions owned by it.

  Returns:
    The result of `ClientData.from_clients_and_tf_fn`, whose dataset function
    yields (features, labels) slices for the requested client.
  """
  def create_dataset_fn(client_id):
    rows = client_sample_ids[client_id]
    client_features = data.iloc[rows, :].values
    client_labels = labels.iloc[rows, :].values
    return tf.data.Dataset.from_tensor_slices((client_features, client_labels))

  client_data_cls = tff.simulation.datasets.ClientData
  return client_data_cls.from_clients_and_tf_fn(
      client_ids=client_ids,
      serializable_dataset_fn=create_dataset_fn)
  
def preprocess(dataset):
    """Batch a client dataset so that it is split into roughly three batches.

    Args:
      dataset: a `tf.data.Dataset` whose cardinality can be read eagerly
        (via `.numpy()`).

    Returns:
      `dataset` batched with size round(cardinality / 3), but never less
      than 1. If the cardinality is infinite or unknown, the batch size is 1.
    """
    card = int(dataset.cardinality().numpy())
    # The previous condition `card == INFINITE or UNKNOWN_CARDINALITY` was
    # always truthy (the second operand is a non-zero constant), so every
    # dataset ended up with batch size 1. Compare against both sentinels.
    if card in (tf.data.INFINITE_CARDINALITY, tf.data.UNKNOWN_CARDINALITY):
        batch_size = 1
    else:
        # round() can give 0 for very small datasets; batch size must be >= 1.
        batch_size = max(1, round(card / 3))
    return dataset.batch(batch_size)

def make_federated_data(client_data, client_ids):
  """Build the list of preprocessed datasets for the given clients.

  Args:
    client_data: object exposing `create_tf_dataset_for_client(client_id)`.
    client_ids: iterable of client identifiers to include, in output order.

  Returns:
    A list with one `preprocess`-ed dataset per entry of `client_ids`.
  """
  federated = []
  for client_id in client_ids:
    raw_dataset = client_data.create_tf_dataset_for_client(client_id)
    federated.append(preprocess(raw_dataset))
  return federated

In [None]:
# Federated view of the training frames: each client id maps to its own rows.
client_dataset_train = create_client_dataset(df_data_train, df_labels_train, client_ids, client_sample_ids)

In [None]:
# Preprocessed dataset of the first client; model_fn reads its element_spec
# to describe the model's input.
preprocessed_example_dataset = preprocess(client_dataset_train.create_tf_dataset_for_client(client_ids[0]))

def model_fn():
  """Create a fresh TFF model wrapping the Keras logistic-regression network.

  A new Keras model is built on every call.

  Returns:
    The result of `tff.learning.from_keras_model` with binary cross-entropy
    loss, accuracy and AUC metrics, and the input spec taken from
    `preprocessed_example_dataset`.
  """
  logistic_model = create_keras_model()
  return tff.learning.from_keras_model(
      logistic_model,
      loss=tf.keras.losses.BinaryCrossentropy(),
      metrics=[
          tf.keras.metrics.BinaryAccuracy(name='accuracy'),
          tf.keras.metrics.AUC(name='auc'),
      ],
      input_spec=preprocessed_example_dataset.element_spec)
  
# Create the TFF iterative process for federated averaging: clients train with
# plain SGD (learning rate 1.0) and the server applies the aggregated update
# with Nadam (learning rate 0.5).
iterative_process = tff.learning.build_federated_averaging_process(
    model_fn,
    client_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=1.0),
    server_optimizer_fn=lambda: tf.keras.optimizers.Nadam(learning_rate=0.5),
    use_experimental_simulation_loop = True
)

In [None]:
# Initial server state of the federated process.
state = iterative_process.initialize()
# Plain Keras copy of the model; the server weights are assigned to it for evaluation.
tff_model = create_keras_model()
# NOTE(review): tff_auc is created here but not filled in any cell shown - confirm it is still needed.
tff_auc = defaultdict(lambda:0)

In [None]:
# Train with different numbers of clients participating in each round.
# NOTE(review): `state` is not re-initialised between participation sizes, so
# every size continues training from the weights left by the previous one.
for n_clients in range(1, NUM_CLIENTS, 5):
  for i_round in range(NUM_ROUNDS):
    # Draw this round's participants without replacement and run one round.
    federated_train_data = make_federated_data(
        client_dataset_train,
        np.random.choice(range(NUM_CLIENTS), size=n_clients, replace=False))
    state, metrics = iterative_process.next(state, federated_train_data)
    print(n_clients, i_round, metrics)

    # Evaluate the current server weights on the held-out split.
    state.model.assign_weights_to(tff_model)
    labels_proba = tff_model.predict(dataset_test)
    fpr, tpr, _ = sklearn.metrics.roc_curve(labels_test, labels_proba)
    test_auc = sklearn.metrics.auc(fpr, tpr)
    test_loss = tf.keras.losses.binary_crossentropy(labels_test, labels_proba.reshape(-1))
    print(f'AUC = {test_auc:0.4}, Loss={test_loss:0.4}')


In [None]:
# Copy the final server weights into the Keras model and compute the ROC curve
# and AUC on the held-out split; these values feed the comparison plot below.
state.model.assign_weights_to(tff_model)
labels_proba = tff_model.predict(dataset_test)
fpr_tff_sgd, tpr_tff_sgd, threshold_tff_sgd = sklearn.metrics.roc_curve(labels_test, labels_proba)
roc_auc_tff_sgd = sklearn.metrics.auc(fpr_tff_sgd, tpr_tff_sgd)

In [None]:
# Report the test AUC of the final federated model.
print(f'AUC-TFF = {roc_auc_tff_sgd:0.4}')

### Porównanie stworzonych modeli

In [None]:
# ROC curves of the models trained above, drawn on one set of axes.
fig, ax = plt.subplots(figsize=(8, 6), dpi=150, facecolor='w', edgecolor='k')
ax.set_title('ROC')
ax.plot(fpr_skl_liblinear, tpr_skl_liblinear, label='sklearn LR LIN AUC = %0.3f' % roc_auc_skl_liblinear)
ax.plot(fpr_skl_sag, tpr_skl_sag, label='sklearn LR SAG AUC = %0.3f' % roc_auc_skl_sag)
ax.plot(fpr_tf, tpr_tf, label='TF Centralized LR AUC = %0.3f' % roc_auc_tf)
ax.plot(fpr_tff_sgd, tpr_tff_sgd, label='TF Federated LR SGDM AUC = %0.3f' % roc_auc_tff_sgd)
ax.legend(loc='lower right')
# Diagonal reference line; drawn after the legend call and given no label.
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()