In [None]:
!pip install tenseal
!pip install cryptography

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tenseal
  Downloading tenseal-0.3.14-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tenseal
Successfully installed tenseal-0.3.14
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier

import pickle
import tenseal as ts
import urllib3

from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.asymmetric import padding

import os
import shutil
import time

In [None]:
def get_encryption_keys():
  """Download the PEM key pair from the remote server and parse it.

  Returns:
    A (private_key, public_key) tuple of `cryptography` key objects.

  Raises:
    RuntimeError: if either key file cannot be downloaded (non-200 response).

  SECURITY NOTE: the private key is fetched with an unauthenticated GET and
  loaded with password=None, so anyone able to reach the URL can obtain it.
  """
  http = urllib3.PoolManager()

  def fetch_pem(url):
    # Fail loudly on an HTTP error instead of handing an error page to the PEM parser.
    response = http.request('GET', url)
    if response.status != 200:
      raise RuntimeError(f'Failed to download key from {url}: HTTP {response.status}')
    return response.data

  private_key = serialization.load_pem_private_key(
      fetch_pem('https://personal.utdallas.edu/~pxn210006/keys/private_key.pem'),
      password=None,
      backend=default_backend()
  )

  public_key = serialization.load_pem_public_key(
      fetch_pem('https://personal.utdallas.edu/~pxn210006/keys/public_key.pem'),
      backend=default_backend()
  )

  return private_key, public_key

In [None]:
def decrypt_master_model(file_name, decrypted_model):
  """Decrypt an RSA-OAEP encrypted file block by block.

  Args:
    file_name: path of the encrypted input file (a concatenation of RSA
      ciphertext blocks).
    decrypted_model: path the decrypted bytes are written to. The file is
      overwritten, so repeated runs never append to stale plaintext.
  """
  private_key, _ = get_encryption_keys()

  # An RSA ciphertext block is as long as the key modulus
  # (256 bytes for a 2048-bit key).
  block_size = (private_key.key_size + 7) // 8

  oaep = padding.OAEP(
      mgf=padding.MGF1(algorithm=hashes.SHA256()),
      algorithm=hashes.SHA256(),
      label=None
  )

  # Context managers close both files even if decryption raises.
  with open(file_name, 'rb') as encrypted_file, open(decrypted_model, 'wb') as plain_file:
    while True:
      block = encrypted_file.read(block_size)

      if not block:
        break

      plain_file.write(private_key.decrypt(block, oaep))

In [None]:
#normalization of dataframe data
def normalize_df(df):
  """Min-max scale every column of a DataFrame into the range [0, 1].

  Args:
    df: DataFrame with numeric columns. It is not modified.

  Returns:
    A new DataFrame with the same columns and index. A column whose values
    are all equal (max == min) is set to 0.0 instead of producing NaN from
    a 0/0 division.
  """
  normalized = df.copy()
  for column in normalized.columns:
    col_min = normalized[column].min()
    col_range = normalized[column].max() - col_min
    if col_range == 0:
      # Constant column: every value is the minimum.
      normalized[column] = 0.0
    else:
      normalized[column] = (normalized[column] - col_min) / col_range
  return normalized

from collections import Counter
# handling outlier data in dataframe
def outlier_detection(df, n, columns):
    """Find rows that are IQR outliers in at least `n` of the given columns.

    A value is an outlier when it lies more than 1.5 * IQR below the 25th
    percentile or above the 75th percentile of its column (NaNs are ignored
    when computing the percentiles).

    Returns:
        A list of index labels of `df`, in order of first appearance.
    """
    hits = Counter()
    for name in columns:
        series = df[name]
        lower_quartile = np.nanpercentile(series, 25)
        upper_quartile = np.nanpercentile(series, 75)
        margin = 1.5 * (upper_quartile - lower_quartile)
        is_outlier = (series < lower_quartile - margin) | (series > upper_quartile + margin)
        # Count one hit per column in which the row is an outlier.
        hits.update(df[is_outlier].index)
    return [label for label, count in hits.items() if count >= n]

def preprocess_data(csv_file, predict_flag=False):
  """Load a CSV and turn it into a normalized feature matrix.

  Args:
    csv_file: path or URL accepted by `pd.read_csv`.
    predict_flag: when falsy, the 'smoking' column is returned as the label
      series; when truthy, no labels are returned.

  Returns:
    (x, y): x is a DataFrame of the 12 model features, min-max scaled with
    the min/max of this CSV; y is the 'smoking' Series, or None when
    predict_flag is truthy.

  Note: rows flagged as outliers in 3 or more numeric columns are dropped,
  also when predict_flag is truthy, so x can have fewer rows than the CSV.
  """
  df = pd.read_csv(csv_file)

  # renaming columns
  df = df.rename(columns={'height(cm)':'height', 'weight(kg)':'weight','waist(cm)':'waist',
                          'eyesight(left)':'eyesight_left', 'eyesight(right)':'eyesight_right',
                          'hearing(left)':'hearing_left', 'hearing(right)':'hearing_right',
                          'fasting blood sugar':'fasting_blood_sugar',  'Cholesterol':'cholesterol',
                          'HDL':'hdl','LDL':'ldl','Urine protein':'urine_protein',
                          'serum creatinine':'serum_creatinine', 'AST':'ast','ALT':'alt',
                          'Gtp':'gtp', 'dental caries' : 'dental_caries'})

  # 'ID' carries no signal and 'oral' is removed due to skewed data; dropping
  # them first avoids encoding a column that is discarded anyway
  df = df.drop(columns=['ID', 'oral'])

  # converting non-numeric columns to numeric data type
  encodings = (
      ('gender', (('F', '0'), ('M', '1'))),
      ('tartar', (('N', '0'), ('Y', '1'))),
  )
  for column, replacements in encodings:
    encoded = df[column]
    for letter, digit in replacements:
      encoded = encoded.str.replace(letter, digit)
    df[column] = pd.to_numeric(encoded)

  # handling outliers in df
  outlier_rows = outlier_detection(df, 3, df.select_dtypes(["float", "int"]).columns)
  df = df.drop(index=outlier_rows)

  # creating x and y split where y is the resultant classification data;
  # .copy() makes x independent of df so scaling does not write into a slice
  x = df[['height','weight','waist','hdl','ldl','serum_creatinine','alt','gtp','dental_caries','tartar','triglyceride','hemoglobin']].copy()
  y = None if predict_flag else df['smoking']

  # normalizing x data to maintain the scale necessary for creation of model
  x = normalize_df(x)

  return x, y

In [None]:
def train_model(sgd_model, x_train, y_train, print_flag=False):
  """Incrementally train a model on new data with `partial_fit`.

  Args:
    sgd_model: estimator exposing `partial_fit`, `predict` and `score`.
      It is updated in place.
    x_train: training features.
    y_train: training labels.
    print_flag: when True, print the accuracy on the training data and on
      the remote test dataset.

  Returns:
    The same `sgd_model` object, so callers can use the return value.
  """
  # An already fitted model must be given the same class list on every
  # partial_fit call; a batch that lacks one class would otherwise raise.
  known_classes = getattr(sgd_model, 'classes_', None)
  if known_classes is None:
    known_classes = np.unique(y_train)

  sgd_model.partial_fit(x_train, y_train, classes=known_classes)

  if print_flag:
    train_predictions = sgd_model.predict(x_train)
    # accuracy_score takes (y_true, y_pred)
    training_data_accuracy = accuracy_score(y_train, train_predictions)
    print('Training data accuracy: ',training_data_accuracy)

    x_test, y_test = preprocess_data('https://personal.utdallas.edu/~pxn210006/dataset/dataset_test.csv')
    score = sgd_model.score(x_test, y_test)
    print('New model accuracy on test data', score)

  return sgd_model

def print_test_accuracy(sgd_model, input_csv_data):
  """Print the model's predictions for the rows of a CSV file.

  Args:
    sgd_model: fitted estimator exposing `predict` and `score`.
    input_csv_data: path or URL of the CSV to predict on.

  The data is preprocessed for prediction, so no ground-truth labels are
  returned. Scoring the model against its own predictions would always
  report 1.0, so accuracy is printed only when labels are available.
  """
  x_in, y_in = preprocess_data(input_csv_data, predict_flag=True) #y_in is None since we are pre-processing for prediction
  result = sgd_model.predict(x_in)
  print('Result for input data ', result)

  if y_in is not None:
    score = sgd_model.score(x_in, y_in)
    print('Model accuracy on input data ', score)
  else:
    print('Model accuracy on input data cannot be computed: no ground-truth labels available')



In [None]:
def encrypt_model_parameters(sgd_model, encrypted_model, chunk_size=100):
  """Encrypt a linear model's parameters and write them to a file.

  The intercept and coefficients are stacked into one array (intercept in
  column 0), encrypted as a CKKS tensor, serialized, and the serialized
  bytes are then RSA-OAEP encrypted chunk by chunk with the public key.

  Args:
    sgd_model: fitted model exposing `intercept_` and `coef_`.
    encrypted_model: output path for the RSA-encrypted bytes. The file is
      overwritten, so repeated runs never append to an older ciphertext.
    chunk_size: plaintext bytes per RSA block; must not exceed what the
      key and OAEP padding allow.

  Side effect: the serialized CKKS tensor is also written to the file
  'encrypted_model' in the working directory.

  NOTE(review): the CKKS context is created here and neither saved nor
  returned; confirm how the receiver is expected to use the tensor.
  """
  sgd_params = np.hstack((sgd_model.intercept_[:,None], sgd_model.coef_))

  ckks_context = ts.context(ts.SCHEME_TYPE.CKKS, 8192, coeff_mod_bit_sizes=[60, 40, 40, 60])
  ckks_context.global_scale = pow(2, 40)
  ckks_context.generate_galois_keys()

  params_encrypted = ts.ckks_tensor(ckks_context, sgd_params).serialize()

  with open('encrypted_model', 'wb') as file:
    file.write(params_encrypted)

  # get keys
  _, public_key = get_encryption_keys()

  oaep = padding.OAEP(
      mgf=padding.MGF1(algorithm=hashes.SHA256()),
      algorithm=hashes.SHA256(),
      label=None
  )

  # Chunk the in-memory bytes instead of re-reading the file just written;
  # the context manager closes the output even if encryption raises.
  with open(encrypted_model, 'wb') as output:
    for start in range(0, len(params_encrypted), chunk_size):
      output.write(public_key.encrypt(params_encrypted[start:start + chunk_size], oaep))

In [None]:
def get_consent():
  """Ask the user for consent until a valid answer is given.

  Returns:
    True for 'y' and False for 'n' (case-insensitive). Any other answer
    prints a hint and asks again.
  """
  while True:
    answer = input('Do you consent to use this data for improving the model? (Y/N) ').lower()
    if answer in ("y", "n"):
      return answer == "y"
    print("Invalid option selected. Please try again and choose 'y' or 'n' ")

In [None]:
def main():
  """Run one worker round: decrypt the master model, train it, encrypt it.

  Reads the encrypted file 'master_model', writes the plaintext pickle to
  'decrypted_master_model', trains on the remote dataset and writes the
  encrypted parameters to 'worker_model'.
  """
  # after receiving model from master
  # decrypt the model
  decrypt_master_model('master_model', 'decrypted_master_model')

  # get original model back
  # NOTE: pickle.load can execute arbitrary code; only load files that come
  # from a trusted master node.
  with open('decrypted_master_model', 'rb') as model_file:
    sgd_model = pickle.load(model_file)

  # load and process data
  dataset_link = 'https://personal.utdallas.edu/~pxn210006/dataset/dataset3.csv'
  x_train, y_train = preprocess_data(dataset_link)

  # train the model
  train_model(sgd_model, x_train, y_train, True)

  # encrypt the model
  encrypt_model_parameters(sgd_model, 'worker_model')

# main()

In [None]:
#Worker node pickle file handling

import os
import shutil  # refer - https://www.geeksforgeeks.org/how-to-create-a-duplicate-file-of-an-existing-file-using-python/

#active file: the encrypted model this worker currently uses
act_file = "active_worker_model"

#passive file: a newer encrypted model that replaces the active file when present
pass_file = "passive_master_model"

#file received from Master Node
updated_file = "master_model"

#scratch file for the decrypted (plaintext) pickle of the active model
decrypted_file = "decrypted_active_worker_model"

#seconds to wait before checking again for the initial model
poll_interval_seconds = 10

while True:
  if os.path.isfile(act_file):

    # after receiving model from master
    # decrypt the model; start from a clean scratch file so plaintext left
    # over from an earlier run is never loaded by mistake
    if os.path.exists(decrypted_file):
      os.remove(decrypted_file)
    decrypt_master_model(act_file, decrypted_file)

    #Using active file to get the original model back
    # NOTE: pickle.load can execute arbitrary code; only load files that come
    # from a trusted master node.
    with open(decrypted_file, 'rb') as model_file:
      m = pickle.load(model_file)
    os.remove(decrypted_file) # do not leave the plaintext model on disk

    #ask for input data file from user in csv format
    in_file_csv = input("Provide input csv filepath: ")

    #get user consent using input prompt
    if get_consent():
      # pre-process data
      x, y = preprocess_data(in_file_csv)
      #train model using partial_fit on new data and print accuracy for this trained model;
      #partial_fit updates m in place, so m is the trained model
      train_model(m, x, y, print_flag=True)

      # encrypt the model
      encrypt_model_parameters(m, 'worker_model')

      #send model to Master
      # NOTE(review): send_model is not defined in the visible part of this
      # notebook; confirm it exists and what it expects to receive.
      send_model(m)
    else:
      print_test_accuracy(m, in_file_csv) #print predictions of the current model

    #check for updated model files from master
    if os.path.isfile(pass_file):
      # replaces the active file with the passive one in a single step
      os.replace(pass_file, act_file)
      print("Model update received from Master Node.")

  else:
    print("Waiting for initial model...")
    time.sleep(poll_interval_seconds) # check again after the polling interval