In [None]:
import pandas as pd
import numpy as np
import random
import math
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Colab-only: mount Google Drive so the dataset path under /content/drive (read below) resolves.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Dataset link: [subset of the research dataset](https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction/)

In [None]:
# Load the preprocessed dataset from the mounted Drive folder.
data = pd.read_csv('/content/drive/MyDrive/thesis/dataset/preprocessed_data.csv')
# Keep every column except the first one
# (presumably an index column written out with the CSV -- TODO confirm against the file).
data = data[data.columns[1:]]

# Hold-out split, currently disabled: every row is handed to the training setup.
# NOTE(review): if re-enabled as written, row `shape[0]-101` ends up in neither
# `test_data` (last 100 rows) nor `data` (rows before index shape[0]-101).
# # making testing data: last 100 unseen
# test_data = data.loc[data.shape[0]-100:]
# data = data[:data.shape[0]-101]

In [None]:
class DataDistribution:
  """Partition a DataFrame into clusters and clients for a simulated federated setup.

  After `setup` has run, `synthetically_distributed_data[c][k]` is the list
  `[x, y]` for client `k` of cluster `c`, where `x` is the scaled feature
  matrix and `y` is the `HeartDisease` column of that client's rows.
  """

  def __init__(self, showing_intermediate_steps = False):
    # When True, `setup` prints the index ranges it computes at every stage.
    self.showing_intermediate_steps = showing_intermediate_steps
    # Filled in by `setup`; stays None until then.
    self.synthetically_distributed_data = None

  def error_handling(self, splitting, number_of_clusters):
    """Validate the cluster fractions.

    Args:
      splitting: sequence of fractions, one per cluster.
      number_of_clusters: expected number of fractions.

    Returns:
      True when there is exactly one fraction per cluster and the fractions
      sum to 1 (within floating-point tolerance), otherwise False.
    """
    if len(splitting) != number_of_clusters:
      return False
    # Tolerant comparison: e.g. ten fractions of 0.1 do not sum to exactly 1.0.
    return math.isclose(sum(splitting), 1.0, abs_tol=1e-9)

  def scale_training_X(self, x):
    """Standardise `x` with a fresh StandardScaler fitted on `x` itself."""
    scaling = StandardScaler()
    x = scaling.fit_transform(x)
    return x

  def setup(self, data, number_of_clusters, number_of_clients_in_each_cluster, splitting):
    """Split `data` row-wise into clusters, then each cluster into clients.

    Rows are assigned by position, in order: cluster boundaries follow the
    cumulative fractions in `splitting`, and each cluster's rows are divided
    among its clients so that every row belongs to exactly one client (when
    the rows do not divide evenly, the first clients get one extra row).

    Args:
      data: DataFrame containing a `HeartDisease` label column.
      number_of_clusters: number of clusters to create.
      number_of_clients_in_each_cluster: number of clients per cluster.
      splitting: fraction of the rows given to each cluster; must contain
        `number_of_clusters` values summing to 1. It is not modified.

    Raises:
      Exception: if `splitting` fails `error_handling`.
      ValueError: if the client count is not positive, or a client would
        receive no rows.
    """
    if not self.error_handling(splitting, number_of_clusters):
      raise Exception("there is some error in splitting, please follow the standards")
    if number_of_clients_in_each_cluster <= 0:
      raise ValueError("number_of_clients_in_each_cluster must be positive")

    total_data = data.shape[0]

    # Inclusive end position of every cluster. The last cluster always ends on
    # the final row, so the last fraction is implied by the others.
    cluster_ends = []
    cumulative_fraction = 0
    for frac in splitting[:-1]:
      cumulative_fraction += frac
      cluster_ends.append(min(math.floor(cumulative_fraction*total_data), total_data-1))
    cluster_ends.append(total_data-1)

    if self.showing_intermediate_steps:
      print(cluster_ends)

    # Inclusive [start, end] row range of every cluster.
    splitting_index = []
    starting = 0
    for ending in cluster_ends:
      splitting_index.append([starting, ending])
      starting = ending + 1

    if self.showing_intermediate_steps:
      print(splitting_index)

    # Inclusive [start, end] row range of every client, cluster by cluster.
    # The ranges partition the cluster exactly: no row is shared with another
    # client or cluster, and no row is dropped.
    splitting_index_with_cluster_clients = []
    for cluster_start, cluster_end in splitting_index:
      cluster_size = max(cluster_end - cluster_start + 1, 0)
      base_size, leftover = divmod(cluster_size, number_of_clients_in_each_cluster)
      client_start = cluster_start
      for client_position in range(number_of_clients_in_each_cluster):
        client_size = base_size + (1 if client_position < leftover else 0)
        if client_size <= 0:
          raise ValueError("not enough rows to give every client at least one sample")
        client_end = client_start + client_size - 1
        splitting_index_with_cluster_clients.append([client_start, client_end])
        client_start = client_end + 1

    if self.showing_intermediate_steps:
      print(splitting_index_with_cluster_clients)

    client_ranges = iter(splitting_index_with_cluster_clients)
    cluster_client_data = []
    for _ in range(number_of_clusters):
      cluster_data = []
      for _ in range(number_of_clients_in_each_cluster):
        client_start, client_end = next(client_ranges)
        # Positional slicing, so the split does not depend on the index labels.
        curr_client_data = data.iloc[client_start:client_end + 1]
        x = curr_client_data.drop('HeartDisease',axis=1)
        y = curr_client_data.HeartDisease
        # Each client is scaled independently, using only its own rows.
        x = self.scale_training_X(x)
        cluster_data.append([x, y])
      cluster_client_data.append(cluster_data)

    self.synthetically_distributed_data = cluster_client_data
    print("data distributed synthetically!")

In [None]:
from sklearn.linear_model import LogisticRegression, Perceptron, RidgeClassifier, SGDClassifier

class Model1:
  """Thin wrapper around a LogisticRegression estimator with a display name."""

  def __init__(self):
    self.model = LogisticRegression()
    self.name = 'LogisticRegression'

  def fit(self, x_train, y_train):
    """Fit the wrapped estimator on the given features and labels."""
    estimator = self.model
    estimator.fit(x_train, y_train)

  def score(self, x, y):
    """Return the wrapped estimator's `score` for (x, y)."""
    estimator = self.model
    return estimator.score(x, y)

  def get_name(self):
    """Return the display name of the wrapped estimator."""
    label = self.name
    return label

  def get_coef(self):
    """Print the wrapped estimator's `coef_`; nothing is returned."""
    coefficients = self.model.coef_
    print(coefficients)

class Model2:
  """Thin wrapper around a RidgeClassifier estimator with a display name."""

  def __init__(self):
    self.model = RidgeClassifier()
    self.name = 'RidgeClassifier'

  def fit(self, x_train, y_train):
    """Fit the wrapped estimator on the given features and labels."""
    estimator = self.model
    estimator.fit(x_train, y_train)

  def score(self, x, y):
    """Return the wrapped estimator's `score` for (x, y)."""
    estimator = self.model
    return estimator.score(x, y)

  def get_name(self):
    """Return the display name of the wrapped estimator."""
    label = self.name
    return label

  def get_coef(self):
    """Print the wrapped estimator's `coef_`; nothing is returned."""
    coefficients = self.model.coef_
    print(coefficients)

class Model3:
  """Thin wrapper around an SGDClassifier estimator with a display name."""

  def __init__(self):
    self.model = SGDClassifier()
    self.name = 'SGDClassifier'

  def fit(self, x_train, y_train):
    """Fit the wrapped estimator on the given features and labels."""
    estimator = self.model
    estimator.fit(x_train, y_train)

  def score(self, x, y):
    """Return the wrapped estimator's `score` for (x, y)."""
    estimator = self.model
    return estimator.score(x, y)

  def get_name(self):
    """Return the display name of the wrapped estimator."""
    label = self.name
    return label

  def get_coef(self):
    """Print the wrapped estimator's `coef_`; nothing is returned."""
    coefficients = self.model.coef_
    print(coefficients)

In [None]:
class MultiModelCFL:
  """Simulated clustered federated learning over several linear model types.

  The constructor distributes `data` over clusters and clients via
  `DataDistribution`. `execute` then runs training rounds: each round picks
  one model type and a subset of clusters at random, fits that model on every
  client of the chosen clusters, averages the clients' `coef_` per cluster,
  averages those per-cluster results, and stores the outcome as the `coef_`
  of the corresponding global model. `predict` scores a majority vote of the
  global models.
  """

  def __init__(self, data, number_of_clusters, number_of_clients_in_each_cluster, number_of_clusters_every_iteration, splitting):
    """Validate the configuration and distribute `data` over the clients.

    Args:
      data: DataFrame with a `HeartDisease` label column.
      number_of_clusters: total number of clusters.
      number_of_clients_in_each_cluster: clients per cluster.
      number_of_clusters_every_iteration: clusters sampled in each round;
        must not exceed `number_of_clusters`.
      splitting: fraction of the rows assigned to each cluster.

    Raises:
      Exception: if any of the counts is invalid (see `error_handling`), or
        if `DataDistribution.setup` rejects `splitting`.
    """
    self.number_of_clusters = number_of_clusters
    self.number_of_clients_in_each_cluster = number_of_clients_in_each_cluster
    self.number_of_clusters_every_iteration = number_of_clusters_every_iteration
    # Keep a private copy so neither the caller's list nor this attribute is
    # affected if the list is modified elsewhere.
    self.splitting = list(splitting)
    if not self.error_handling():
      raise Exception("something went wrong, please follow the standards")
    data_dist = DataDistribution()
    # A separate copy is passed in, in case `setup` consumes its argument.
    data_dist.setup(data, self.number_of_clusters, self.number_of_clients_in_each_cluster, list(self.splitting))
    # Note: this attribute holds the DataDistribution object itself; the
    # nested per-client data lives on its attribute of the same name.
    self.synthetically_distributed_data = data_dist
    print("setup is ready to be executed!")

  def error_handling(self):
    """Return True when the cluster/client counts are usable, else False.

    All counts must be positive and the number of clusters sampled per round
    must not exceed the total number of clusters.
    """
    if(self.number_of_clusters<=0 or \
       self.number_of_clients_in_each_cluster<=0 or \
       self.number_of_clusters_every_iteration<=0 or \
       self.number_of_clusters_every_iteration>self.number_of_clusters):
      return False
    return True

  def cluster_selection_algo(self):
    """Return `number_of_clusters_every_iteration` distinct random cluster indices."""
    return random.sample(range(self.number_of_clusters), self.number_of_clusters_every_iteration)

  def model_selection_algo(self):
    """Return a random index into `self.global_models` (requires `execute` to have set them up)."""
    current_model_index = random.randint(0, self.number_of_models-1)
    return current_model_index

  def _average(self, models):
    """Element-wise mean of a non-empty sequence of equally shaped arrays."""
    if len(models) == 0:
      raise ValueError("cannot aggregate an empty list of models")
    return np.mean(models, axis=0)

  def client_level_aggregation_algo(self, client_level_models):
    """Average the clients' coefficient arrays of one cluster.

    Every client contributes exactly once. The result is printed and returned.

    Raises:
      ValueError: if `client_level_models` is empty.
    """
    result = self._average(client_level_models)
    print(result)
    return result

  def cluster_level_aggregation_algo(self, cluster_level_models, current_model_index):
    """Average the per-cluster coefficients and store them on a global model.

    The mean (each cluster counted once) becomes the `coef_` of
    `self.global_models[current_model_index].model`.

    Raises:
      ValueError: if `cluster_level_models` is empty.
    """
    result = self._average(cluster_level_models)
    self.global_models[current_model_index].model.coef_ = result

  def execute(self, number_of_iterations):
    """Run `number_of_iterations` training rounds.

    Each round uses a single, randomly chosen model type for all clients of
    the randomly chosen clusters. One progress line is printed per client fit.
    """
    # initiate the global models (number of global models can be 3)
    self.global_models = [Model1(), Model2(), Model3()]
    self.number_of_models = len(self.global_models)

    # strategy: for one iteration the algorithm is using one algorithm only for all of the clients

    # Running count of client fits across all rounds; it is what the progress
    # line prints under the label "iteration".
    count = 0
    for itr in range(number_of_iterations):
      current_model_index = self.model_selection_algo()
      current_model = self.global_models[current_model_index]
      cluster_indices = self.cluster_selection_algo()
      cluster_level_models = []
      for cluster_index in cluster_indices:
        client_level_models = []
        for client_index in range(self.number_of_clients_in_each_cluster):
          current_client_data = self.synthetically_distributed_data.synthetically_distributed_data[cluster_index][client_index]
          current_client_X = current_client_data[0]
          current_client_Y = current_client_data[1]
          # The same global model object is re-fitted on every client.
          # NOTE(review): presumably each fit starts from scratch rather than
          # from the previously aggregated coefficients -- confirm intended.
          current_model.fit(current_client_X, current_client_Y)
          s = current_model.score(current_client_X, current_client_Y)
          print(f"iteration: {count+1} || cluster number: {cluster_index+1} || client number: {client_index+1} || model: {current_model.get_name()} || score: {s}")
          count += 1
          # Snapshot the coefficients so a later fit cannot alter this entry.
          client_level_models.append(np.array(current_model.model.coef_, copy=True))
        coef = self.client_level_aggregation_algo(client_level_models)
        cluster_level_models.append(coef)
      # NOTE(review): only `coef_` is aggregated; any other fitted state on the
      # estimator is whatever the last client fit left behind -- confirm intended.
      self.cluster_level_aggregation_algo(cluster_level_models, current_model_index)
    print("architecture executed successfully!")

  def _scale_training_X(self, x):
    """Standardise `x` with a fresh StandardScaler fitted on `x` itself."""
    scaling = StandardScaler()
    x = scaling.fit_transform(x)
    return x

  def voting(self, predictions):
    """Majority vote over equally shaped 0/1 prediction arrays.

    Returns an int array that is 1 where strictly more than half of the
    predictions are 1, and 0 elsewhere (ties resolve to 0).
    """
    votes = np.sum(predictions, axis=0)
    result = (votes > (len(predictions) / 2)).astype(int)
    return result

  def calculate_score_testing(self, final_prediction, y):
    """Return the fraction of entries in `final_prediction` equal to the Series `y`."""
    y = y.to_numpy()
    matching_elements = np.sum(final_prediction == y)
    score = matching_elements / len(final_prediction)
    return score

  def predict(self, df):
    """Score the majority vote of the global models on `df`.

    Every global model that has coefficients votes once; model types that
    were never selected during `execute` have none and are skipped. The
    features are standardised using statistics of `df` itself.

    Args:
      df: DataFrame with the feature columns and a `HeartDisease` column.

    Returns:
      Fraction of rows whose voted prediction matches `HeartDisease`.

    Raises:
      RuntimeError: if no global model has been trained yet.
    """
    x = df.drop('HeartDisease',axis=1)
    y = df.HeartDisease
    x = self._scale_training_X(x)
    predictions = []
    for index in range(self.number_of_models):
      estimator = self.global_models[index].model
      if not hasattr(estimator, 'coef_'):
        continue
      predictions.append(estimator.predict(x))
    if not predictions:
      raise RuntimeError("no trained global model available; call execute() first")
    final_prediction = self.voting(predictions)
    score = self.calculate_score_testing(final_prediction, y)
    return score

In [None]:
# Build the setup: 3 clusters, 2 clients per cluster, 2 clusters sampled per round,
# with the rows split 50% / 30% / 20% across the clusters.
mmcfl = MultiModelCFL(data, 3, 2, 2, [0.5, 0.3, 0.2])

data distributed synthetically!
setup is ready to be executed!


In [None]:
# Run 10 training rounds; model type and clusters are chosen at random each round
# (no random seed is set, so the output differs between runs).
mmcfl.execute(10)

iteration: 1 || cluster number: 2 || client number: 1 || model: RidgeClassifier || score: 0.9057971014492754
iteration: 2 || cluster number: 2 || client number: 2 || model: RidgeClassifier || score: 0.8102189781021898
[[ 0.03814735 -0.16324741  0.19303315  0.07752907 -0.02382248  0.04393618
   0.04341956 -0.21848717 -0.24971646  0.1889617   0.4075732 ]]
iteration: 3 || cluster number: 3 || client number: 1 || model: RidgeClassifier || score: 0.8804347826086957
iteration: 4 || cluster number: 3 || client number: 2 || model: RidgeClassifier || score: 0.8681318681318682
[[ 0.23000706 -0.43246735  0.29564532  0.04605559  0.02450925 -0.06033144
   0.05371288  0.00523752 -0.40849944  0.25784449  0.30012875]]
iteration: 5 || cluster number: 1 || client number: 1 || model: LogisticRegression || score: 0.9437229437229437
iteration: 6 || cluster number: 1 || client number: 2 || model: LogisticRegression || score: 0.9260869565217391
[[-0.40138565 -0.7696251   0.78717809  0.55613482 -0.2526454   1

In [None]:
# Score the voted prediction on the full dataset.
# NOTE(review): this is the same `data` the clients were trained on (the hold-out
# split in the loading cell is commented out), so this is not a held-out accuracy.
mmcfl.predict(data)

0.8474945533769063