In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns #modern plotting
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [None]:
# Remote location of the smoking dataset (CSV).
DATA_URL = 'https://personal.utdallas.edu/~pxn210006/smoking.csv'
df = pd.read_csv(DATA_URL)

In [None]:
# Rename columns to lowercase snake_case names without units or spaces.
column_names = {
    'height(cm)': 'height',
    'weight(kg)': 'weight',
    'waist(cm)': 'waist',
    'eyesight(left)': 'eyesight_left',
    'eyesight(right)': 'eyesight_right',
    'hearing(left)': 'hearing_left',
    'hearing(right)': 'hearing_right',
    'fasting blood sugar': 'fasting_blood_sugar',
    'Cholesterol': 'cholesterol',
    'HDL': 'hdl',
    'LDL': 'ldl',
    'Urine protein': 'urine_protein',
    'serum creatinine': 'serum_creatinine',
    'AST': 'ast',
    'ALT': 'alt',
    'Gtp': 'gtp',
    'dental caries': 'dental_caries',
}
df = df.rename(columns=column_names)

In [None]:
# Encode the two-valued text columns as integers: the first label of each pair
# becomes 0, the second becomes 1.
binary_labels = {
    'gender': ('F', 'M'),
    'tartar': ('N', 'Y'),
    'oral': ('N', 'Y'),
}
for column, (zero_label, one_label) in binary_labels.items():
    as_digits = df[column].str.replace(zero_label, '0').str.replace(one_label, '1')
    df[column] = pd.to_numeric(as_digits)

In [None]:
# Cleaning data by observation: the ID column is removed.
df = df.drop(columns=['ID'])

In [None]:
# The oral column is removed because its data is skewed.
df = df.drop(columns="oral")

In [None]:
#normalization
def normalize_df(df):
  """Min-max scale every column of ``df`` and return the scaled frame.

  Each column is mapped with ``(value - min) / (max - min)`` using that
  column's own minimum and maximum.

  The input frame is left untouched; a scaled copy is returned. This avoids
  silently rewriting a frame that other variables still refer to (and the
  SettingWithCopyWarning raised when the input is a slice of another frame).

  A column whose values are all equal has a zero range; it is returned as
  zeros instead of the NaN column that dividing by zero would produce.
  Missing values stay missing.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame whose columns all support subtraction and division.

  Returns
  -------
  pandas.DataFrame
      A new frame with the same index and columns as ``df``.
  """
  normalized = df.copy()
  for column in normalized.columns:
    col_min = normalized[column].min()
    col_range = normalized[column].max() - col_min
    if col_range == 0:
      # Constant column: every value equals the minimum, so the scaled value is 0.
      normalized[column] = (normalized[column] - col_min).astype(float)
    else:
      normalized[column] = (normalized[column] - col_min) / col_range
  return normalized

In [None]:
# Separate the label from the features, then hold out a stratified 20% of the rows.
x = df.drop(columns="smoking")
y = df['smoking']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=2, stratify=y
)

In [None]:
#x_train = normalize_df(x_train)

In [None]:
# Re-attach the label to the training features; from here on `df` refers to
# the training split only.
df = df_new = x_train.join(y_train)

In [None]:
from collections import Counter
def outlier_detection(df, n, columns, iqr_factor=1.5):
    """Return the index labels of rows that are outliers in at least ``n`` columns.

    A value is an outlier for its column when it lies below
    ``Q1 - iqr_factor * IQR`` or above ``Q3 + iqr_factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of the column (NaNs ignored) and
    ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    n : int
        Minimum number of columns in which a row must be an outlier to be
        reported.
    columns : iterable of column labels
        Columns of ``df`` to examine.
    iqr_factor : float, default 1.5
        Multiplier applied to the IQR to place the outlier fences.

    Returns
    -------
    list
        Index labels of the offending rows, in order of first detection.
    """
    outlier_counts = Counter()
    for col in columns:
        values = df[col]
        q1 = np.nanpercentile(values, 25)
        q3 = np.nanpercentile(values, 75)
        margin = iqr_factor * (q3 - q1)
        is_outlier = (values < q1 - margin) | (values > q3 + margin)
        # One vote per column in which the row falls outside the fences.
        outlier_counts.update(values[is_outlier].index)
    return [row for row, hits in outlier_counts.items() if hits >= n]

# Flag training rows that are outliers in at least 3 of the float/int columns.
numeric_columns = df.select_dtypes(["float", "int"]).columns
will_drop_train = outlier_detection(df, 3, numeric_columns)

In [None]:
# Remove the flagged rows from the training frame in place.
df.drop(index=will_drop_train, inplace=True)

# Data clean-up is complete at this point.

In [None]:
# split dataset into age groups
# Partition the training rows into three age groups: under 40, 40 to 50
# (both inclusive), and over 50.
age = df['age']
df1 = df[age < 40]
df2 = df[(age >= 40) & (age <= 50)]
df3 = df[age > 50]

In [None]:
# Row count of each age group, one per line.
for age_group in (df1, df2, df3):
  print(len(age_group))

10082
20626
10228


In [None]:
# Write each age group to its own CSV file (the index is written as the first column).
for csv_path, age_group in (("dataset1.csv", df1), ("dataset2.csv", df2), ("dataset3.csv", df3)):
  age_group.to_csv(csv_path)

In [None]:
# Re-attach the label to the held-out features, then separate them again.
test_data = x_test.join(y_test)
x_testdf = test_data.drop(columns="smoking")
y_testdf = test_data['smoking']

In [None]:
# The held-out 20% (x_test / y_test) is split again into two equal, stratified
# halves, each 10% of the total data:
#   * one half becomes the initial dataset for the base model;
#   * the other half is kept as the test dataset used to check model accuracy later on.
# The remaining 80%, already split into 3 portions by the age attribute, is used to
# train the base models of federated learning, to show the significance of its
# training using Homomorphic Encryption.
x_init_model, x_test, y_init_model, y_test = train_test_split(
    x_testdf, y_testdf, test_size=0.5, random_state=5, stratify=y_testdf
)
#x_init_model = normalize_df(x_init_model)
df_init_data = x_init_model.join(y_init_model)
df_init_data.to_csv("dataset_initial_model.csv")

In [None]:
# Save the test dataset (10% of the total dataset) to disk.
df_test = x_test.join(y_test)
df_test.to_csv(path_or_buf="dataset_test.csv")

In [None]:
# Shape of every dataset produced so far: the three age groups, the initial-model
# set, and the test set.
for dataset in (df1, df2, df3, df_init_data, df_test):
  print(dataset.shape)

(10082, 27)
(20626, 27)
(10228, 27)
(5569, 27)
(5570, 27)


Now organizing `df`, which holds the training dataset.

In [None]:
# Target vector, plus the feature matrix without the label and the columns the
# model does not use.
excluded_columns = [
    'smoking', 'urine_protein', 'ast', 'cholesterol', 'fasting_blood_sugar',
    'eyesight_left', 'eyesight_right', 'hearing_left', 'hearing_right',
    'relaxation', 'systolic',
]
y = df['smoking']
x = df.drop(columns=excluded_columns)


((40905, 14), (40905,))

In [None]:
x_train = x
y_train = y

# Keep the same feature columns in the test split as in the training features.
unused_features = ['urine_protein', 'ast', 'cholesterol', 'fasting_blood_sugar',
                   'eyesight_left', 'eyesight_right', 'hearing_left', 'hearing_right',
                   'relaxation', 'systolic']
x_test = x_test.drop(unused_features, axis='columns')

# Min-max scale BOTH splits with the minimum and range of the training split.
# Previously only x_test was scaled (with its own min/max) while x_train was
# left in raw units, so the model was scored on features on a different scale
# from the one it was fitted on.
train_min = x_train.min()
# A constant training column has zero range; divide by 1 so it becomes 0, not NaN.
train_range = (x_train.max() - train_min).replace(0, 1)
x_train = (x_train - train_min) / train_range
x_test = (x_test - train_min) / train_range


In [None]:
# Fit the baseline logistic-regression classifier on the training split.
lr = LogisticRegression(max_iter=200, warm_start=True)
lr.fit(X=x_train, y=y_train)

In [None]:
# Accuracy of the baseline model on the data it was trained on and on the test split.
x_train_prediction = lr.predict(x_train)
# accuracy_score takes (y_true, y_pred): the true labels go first.
training_data_accuracy = accuracy_score(y_train, x_train_prediction)
print('Training data accuracy: ',training_data_accuracy)
print('Test data accuracy: ',lr.score(x_test, y_test))

Training data accuracy:  0.7448478181151449
Test data accuracy:  0.7391148217972888


In [None]:
import joblib
# Persist the fitted baseline model to disk.
filename = 'finalized_model.sav'
joblib.dump(value=lr, filename=filename)

['finalized_model.sav']

In [None]:
# Read the persisted model back from `filename`.
# NOTE(review): joblib files are pickle-based, so only load files from a trusted source.
loaded_model = joblib.load(filename)

In [None]:
def create_model(df, print_flag=False):
  """Fit a logistic-regression classifier for ``smoking`` on one dataset.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the ``smoking`` label column and every column listed in
      ``unused_columns`` below; all remaining columns are used as features.
  print_flag : bool, default False
      When true, print the accuracy of the fitted model on its own training
      data.

  Returns
  -------
  LogisticRegression
      The fitted estimator.
  """
  unused_columns = ['urine_protein', 'ast', 'cholesterol', 'fasting_blood_sugar',
                    'eyesight_left', 'eyesight_right', 'hearing_left', 'hearing_right',
                    'relaxation', 'systolic']
  y_train = df['smoking']
  x_train = df.drop(['smoking'] + unused_columns, axis='columns')
  lr1 = LogisticRegression(warm_start=True, max_iter=200)
  lr1.fit(x_train,y_train)

  if print_flag:
    x_train_prediction = lr1.predict(x_train)
    # accuracy_score takes (y_true, y_pred): the true labels go first.
    training_data_accuracy = accuracy_score(y_train, x_train_prediction)
    print('Training data accuracy: ',training_data_accuracy)
  return lr1


In [None]:
# Train one model per age-group CSV (dataset1.csv .. dataset3.csv) and collect them in `lst`.
lst = []
for i in range(1, 4):
  filename = "dataset"+str(i)+".csv"
  # The first column is the unnamed index column from the file; it is discarded.
  df = pd.read_csv(filename).iloc[:, 1:]
  model = create_model(df)
  lst.append(model)

In [None]:
# Show the fitted coefficients and intercept of each of the three models.
for i in range(3):
  fitted = lst[i]
  print("Model coef ",str(i)," = ",fitted.coef_)
  print("Model intercept ",str(i)," = ",fitted.intercept_)

Model coef  0  =  [[ 2.41957102e-01 -2.99395594e-03  1.00771038e+00 -5.84837078e-01
  -5.60283407e-01  3.23377346e+00 -4.74306703e-01 -3.51080568e+00
   1.81395416e+00 -3.58312404e+00 -8.44441907e-01  7.39876374e+00
   4.59710017e-01  3.42654208e-01]]
Model intercept  0  =  [-2.12618245]
Model coef  1  =  [[ 2.9589353  -0.56347476  1.37757607 -1.73961334  0.02508684  4.05575906
   0.71388809 -3.04694471  2.21300561 -3.99621233 -1.32273699  6.65717003
   0.35610032  0.34599789]]
Model intercept  1  =  [-4.53919446]
Model coef  2  =  [[ 3.12540635 -3.37297001  1.88186766 -3.40778113  0.85923322  4.27558984
  -1.11847022 -1.35987345  1.24812573 -1.19353453 -1.08858923  4.4261181
   0.17210912  0.25063433]]
Model intercept  2  =  [-2.84469932]


In [None]:
# Manually overwrite the first coefficient of the first model.
lst[0].coef_[0, 0] = 0.44195710194803882

In [None]:
# Read the coefficient back to confirm the assignment took effect.
lst[0].coef_[0, 0]

0.44195710194803883

In [None]:
# Display the class labels of the first model (the one trained on dataset1.csv).
lst[0].classes_

array([0, 1])