In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
def test_train_split(db) -> tuple:
    random_suffled = db.iloc[np.random.permutation(len(db))]
    split_point = int(len(db)*0.3)
    return random_suffled[:split_point].reset_index(drop=True),random_suffled[split_point:].reset_index(drop=True)

In [3]:
dataset = pd.read_csv("Train_B_Bayesian.csv")
dataset=dataset.dropna().reset_index(drop=True)
dataset

Unnamed: 0,age,gender,tot_bilirubin,direct_bilirubin,tot_proteins,albumin,ag_ratio,sgpt,sgot,alkphos,is_patient
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.90,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.00,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.40,1
...,...,...,...,...,...,...,...,...,...,...,...
574,60,Male,0.5,0.1,500,20,34,5.9,1.6,0.37,2
575,40,Male,0.6,0.1,98,35,31,6.0,3.2,1.10,1
576,52,Male,0.8,0.2,245,48,49,6.4,3.2,1.00,1
577,31,Male,1.3,0.5,184,29,32,6.8,3.4,1.00,1


In [4]:
dataset.gender.unique()

array(['Female', 'Male'], dtype=object)

In [5]:
static=pd.get_dummies(dataset['gender'],drop_first=True)
static
# 0 indicates Female and 1 indicates Male

Unnamed: 0,Male
0,0
1,1
2,1
3,1
4,1
...,...
574,1
575,1
576,1
577,1


In [6]:
dataset=pd.concat([dataset,static],axis=1)
dataset=dataset.drop('gender',axis=1)
dataset=dataset.rename(columns = {'Male':'gender'})
col=dataset.pop('gender')
dataset.insert(0,'gender', col)
dataset

Unnamed: 0,gender,age,tot_bilirubin,direct_bilirubin,tot_proteins,albumin,ag_ratio,sgpt,sgot,alkphos,is_patient
0,0,65,0.7,0.1,187,16,18,6.8,3.3,0.90,1
1,1,62,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,1,62,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,1,58,1.0,0.4,182,14,20,6.8,3.4,1.00,1
4,1,72,3.9,2.0,195,27,59,7.3,2.4,0.40,1
...,...,...,...,...,...,...,...,...,...,...,...
574,1,60,0.5,0.1,500,20,34,5.9,1.6,0.37,2
575,1,40,0.6,0.1,98,35,31,6.0,3.2,1.10,1
576,1,52,0.8,0.2,245,48,49,6.4,3.2,1.00,1
577,1,31,1.3,0.5,184,29,32,6.8,3.4,1.00,1


In [7]:
MEAN=dataset.mean()
MEAN

gender                0.758204
age                  44.782383
tot_bilirubin         3.315371
direct_bilirubin      1.494128
tot_proteins        291.366149
albumin              81.126079
ag_ratio            110.414508
sgpt                  6.481693
sgot                  3.138515
alkphos               0.947064
is_patient            1.284974
dtype: float64

In [8]:
STD=dataset.std()
STD

gender                0.428542
age                  16.221786
tot_bilirubin         6.227716
direct_bilirubin      2.816499
tot_proteins        243.561863
albumin             183.182845
ag_ratio            289.850034
sgpt                  1.084641
sgot                  0.794435
alkphos               0.319592
is_patient            0.451792
dtype: float64

In [9]:
THRESHOLD=2*MEAN+5*STD
THRESHOLD

gender                 3.659116
age                  170.673696
tot_bilirubin         37.769325
direct_bilirubin      17.070749
tot_proteins        1800.541614
albumin             1078.166383
ag_ratio            1670.079187
sgpt                  18.386591
sgot                  10.249203
alkphos                3.492088
is_patient             4.828910
dtype: float64

In [10]:
#remove outliers
def remove_outlier(df_in, col_name, THRESHOLD):
  df_out = df_in[df_in[col_name]<=THRESHOLD[col_name]]
  return df_out

dataset_filtered = remove_outlier(dataset, dataset.columns,THRESHOLD)
dataset_filtered = dataset_filtered.dropna().reset_index(drop=True)
dataset_filtered

Unnamed: 0,gender,age,tot_bilirubin,direct_bilirubin,tot_proteins,albumin,ag_ratio,sgpt,sgot,alkphos,is_patient
0,0,65,0.7,0.1,187.0,16.0,18.0,6.8,3.3,0.90,1
1,1,62,10.9,5.5,699.0,64.0,100.0,7.5,3.2,0.74,1
2,1,62,7.3,4.1,490.0,60.0,68.0,7.0,3.3,0.89,1
3,1,58,1.0,0.4,182.0,14.0,20.0,6.8,3.4,1.00,1
4,1,72,3.9,2.0,195.0,27.0,59.0,7.3,2.4,0.40,1
...,...,...,...,...,...,...,...,...,...,...,...
562,1,60,0.5,0.1,500.0,20.0,34.0,5.9,1.6,0.37,2
563,1,40,0.6,0.1,98.0,35.0,31.0,6.0,3.2,1.10,1
564,1,52,0.8,0.2,245.0,48.0,49.0,6.4,3.2,1.00,1
565,1,31,1.3,0.5,184.0,29.0,32.0,6.8,3.4,1.00,1


In [11]:
def normalize(dataset):
  normalized_dataset=(dataset-dataset.mean())/dataset.std()
  normalized_dataset['is_patient']=dataset['is_patient']
  return normalized_dataset

#dataset_filtered=normalize(dataset_filtered) 
#print(dataset_filtered)   

In [12]:
test ,train = test_train_split(dataset_filtered)
test

Unnamed: 0,gender,age,tot_bilirubin,direct_bilirubin,tot_proteins,albumin,ag_ratio,sgpt,sgot,alkphos,is_patient
0,1,18,1.8,0.7,178.0,35.0,36.0,6.8,3.6,1.10,1
1,1,30,1.6,0.4,332.0,84.0,139.0,5.6,2.7,0.90,1
2,0,51,0.9,0.2,280.0,21.0,30.0,6.7,3.2,0.80,1
3,1,55,0.6,0.2,220.0,24.0,32.0,5.1,2.4,0.88,1
4,0,45,0.8,0.2,165.0,22.0,18.0,8.2,4.1,1.00,1
...,...,...,...,...,...,...,...,...,...,...,...
165,1,54,0.8,0.2,181.0,35.0,20.0,5.5,2.7,0.96,1
166,1,42,8.9,4.5,272.0,31.0,61.0,5.8,2.0,0.50,1
167,1,4,0.9,0.2,348.0,30.0,34.0,8.0,4.0,1.00,2
168,0,64,0.8,0.2,178.0,17.0,18.0,6.3,3.1,0.90,1


In [13]:
train

Unnamed: 0,gender,age,tot_bilirubin,direct_bilirubin,tot_proteins,albumin,ag_ratio,sgpt,sgot,alkphos,is_patient
0,0,42,0.5,0.1,162.0,155.0,108.0,8.1,4.0,0.9,1
1,1,58,0.9,0.2,1100.0,25.0,36.0,7.1,3.5,0.9,1
2,1,40,1.9,1.0,231.0,16.0,55.0,4.3,1.6,0.6,1
3,0,40,0.9,0.3,293.0,232.0,245.0,6.8,3.1,0.8,1
4,1,50,0.9,0.3,194.0,190.0,73.0,7.5,3.9,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...
392,1,72,0.7,0.1,196.0,20.0,35.0,5.8,2.0,0.5,1
393,0,40,0.9,0.3,293.0,232.0,245.0,6.8,3.1,0.8,1
394,1,64,1.1,0.4,201.0,18.0,19.0,6.9,4.1,1.4,1
395,0,36,0.8,0.2,650.0,70.0,138.0,6.6,3.1,0.8,1


In [14]:
#training

#calculate mean and STD of each column
#split train into train_positive and train_negative
#1 is liver patient and 2 not
train_positive=train[train['is_patient']==1]
train_negative=train[train['is_patient']==2]

MEAN_1=train_positive.mean()
STD_1=train_positive.std()
train_positive_gender_prob=[]
x = len(train_positive[train_positive['gender'] == 0])
y = len(train_positive[train_positive['gender'] == 1])

train_positive_gender_prob.append(x/(x+y))
train_positive_gender_prob.append(y/(x+y))

train_negative_gender_prob=[]
x = len(train_negative[train_negative['gender'] == 0])
y = len(train_negative[train_negative['gender'] == 1])

train_negative_gender_prob.append(x/(x+y))
train_negative_gender_prob.append(y/(x+y))

MEAN_2=train_negative.mean()
STD_2=train_negative.std()


# MEAN_1
# MEAN_2
# STD_1
# STD_2

In [15]:
def normal_pdf(x, mean, sd):
    # print(mean)
    # print(sd)
    var = float(sd)**2
    denom = (2*math.pi*var)**.5
    num = math.exp(-(float(x)-float(mean))**2/(2*var))
    return num/denom

In [16]:
def predict(test_data,column_headers,MEAN_1,STD_1,MEAN_2,STD_2,train_positive_gender_prob,train_negative_gender_prob,cnt_is_patient,cnt_is_not_patient):
  prob_1=cnt_is_patient
  prob_2=cnt_is_not_patient
  
  # print(test_data)
  for feature in column_headers:
      if feature == "gender":
        prob_1=prob_1*train_positive_gender_prob[int(test_data["gender"])]
        prob_2=prob_2*train_negative_gender_prob[int(test_data["gender"])]
      if feature == "is_patient":
        break
      prob_1=prob_1*normal_pdf(test_data[feature],MEAN_1[feature],STD_1[feature])
      prob_2=prob_2*normal_pdf(test_data[feature],MEAN_2[feature],STD_2[feature])
  
  prediction = 1 if prob_1>prob_2 else 2
  if test_data["is_patient"]==prediction:
    return True
  else:
    return False

In [17]:
def find_accuracy(test,MEAN_1,MEAN_2,STD_1,STD_2,train_positive_gender_prob,train_negative_gender_prob,cnt_is_patient,cnt_is_not_patient):
  column_headers=list(test.columns)
  test_positive=test[test['is_patient']==1]
  test_negative=test[test['is_patient']==2]
  tp=0      # true positive
  tn=0      # true negative
  fp=0      # false positive
  fn=0      # false negative

  test_positive = test_positive.dropna().reset_index(drop=True)
  test_negative = test_negative.dropna().reset_index(drop=True)

  # print(test_positive)
  
  for row in test_positive.iterrows():
      # print(row[1]['age'])
      ret = predict(row[1],column_headers,MEAN_1,STD_1,MEAN_2,STD_2,train_positive_gender_prob,train_negative_gender_prob,cnt_is_patient,cnt_is_not_patient)
      if ret == True:
        tp=tp+1
      else: 
        fn=fn+1

  for row in test_negative.iterrows():
      ret = predict(row[1],column_headers,MEAN_1,STD_1,MEAN_2,STD_2,train_positive_gender_prob,train_negative_gender_prob,cnt_is_patient,cnt_is_not_patient)
      if ret == True:
        tn=tn+1
      else: 
        fp=fp+1

  accuracy = (tp+tn)/(tp+tn+fp+fn)

  return accuracy*100

In [18]:
cnt_is_patient = len(dataset[dataset["is_patient"]==1])
cnt_is_not_patient = len(dataset[dataset["is_patient"]==2])
print(cnt_is_patient)
print(cnt_is_not_patient)
find_accuracy(test,MEAN_1,MEAN_2,STD_1,STD_2,train_positive_gender_prob,train_negative_gender_prob,cnt_is_patient,cnt_is_not_patient)

414
165


55.88235294117647

In [19]:
# five fold cross validation
dataset_filtered.sample(frac=1,random_state=200)
column_list=dataset_filtered.columns                # obtaining list of column names
dataset_arr=dataset_filtered.values[:,0:11]         # convert pandas dataframe to numpy array
sets = np.array_split(dataset_arr,5)                # obtain 5 almost equal sized arrays

df_set=[]   # pandas dataframe 5 equal sized sets

for ind,ele in enumerate(sets):
  df_set.append(pd.DataFrame(ele, columns = column_list )) # convert numpy arrays to pandas dataframe
  df_set[ind].reset_index(drop=True)              # re -indexing
  print(df_set[ind])                              # debug



     gender   age  tot_bilirubin  direct_bilirubin  tot_proteins  albumin  \
0       0.0  65.0            0.7               0.1         187.0     16.0   
1       1.0  62.0           10.9               5.5         699.0     64.0   
2       1.0  62.0            7.3               4.1         490.0     60.0   
3       1.0  58.0            1.0               0.4         182.0     14.0   
4       1.0  72.0            3.9               2.0         195.0     27.0   
..      ...   ...            ...               ...           ...      ...   
109     1.0  48.0            3.2               1.6         257.0     33.0   
110     1.0  27.0            1.2               0.4         179.0     63.0   
111     1.0  74.0            0.6               0.1         272.0     24.0   
112     1.0  50.0            5.8               3.0         661.0    181.0   
113     1.0  50.0            7.3               3.6        1580.0     88.0   

     ag_ratio  sgpt  sgot  alkphos  is_patient  
0        18.0   6.8   3.3 

In [20]:
# considering test sets each of 5 sets one by one

acc = []

for ind,ele in enumerate(df_set):
  series=[(ser) for index,ser in enumerate(df_set) if index!=ind] 
  test_new=df_set[ind]
  train_new=pd.concat(series,axis=0)
  train_positive=train_new[train_new['is_patient']==1]
  train_negative=train_new[train_new['is_patient']==2]

  MEAN_1=train_positive.mean()
  STD_1=train_positive.std()
  train_positive_gender_prob=[]
  x = len(train_positive[train_positive['gender'] == 0])
  y = len(train_positive[train_positive['gender'] == 1])

  train_positive_gender_prob.append(x/(x+y))
  train_positive_gender_prob.append(y/(x+y))

  train_negative_gender_prob=[]
  x = len(train_negative[train_negative['gender'] == 0])
  y = len(train_negative[train_negative['gender'] == 1])

  train_negative_gender_prob.append(x/(x+y))
  train_negative_gender_prob.append(y/(x+y))

  MEAN_2=train_negative.mean()
  STD_2=train_negative.std()

  cnt_new=len(train_new[train_new["is_patient"]==1])
  cnt_not_new=len(train_new[train_new["is_patient"]==2])

  acc.append(find_accuracy(test_new,MEAN_1,MEAN_2,STD_1,STD_2,train_positive_gender_prob,train_negative_gender_prob,cnt_new,cnt_not_new))
  print(acc[ind])

print(np.mean(acc))

61.40350877192983
51.75438596491229
45.13274336283185
56.63716814159292
68.14159292035397
56.61387983232417


In [21]:
# Laplace correction

train_positive=train[train['is_patient']==1]
train_negative=train[train['is_patient']==2]

MEAN_1=train_positive.mean()
STD_1=train_positive.std()
train_positive_gender_prob=[]
x = len(train_positive[train_positive['gender'] == 0])
y = len(train_positive[train_positive['gender'] == 1])

flag = 0

if x==0:
  x=1
  flag=1
if y==0:
  y=1
  flag=1
train_positive_gender_prob.append(x/(x+y))
train_positive_gender_prob.append(y/(x+y))

if flag==1:
  print('Laplace needed in train positive')

flag = 0

train_negative_gender_prob=[]
x = len(train_negative[train_negative['gender'] == 0])
y = len(train_negative[train_negative['gender'] == 1])
if x==0:
  x=1
  flag=1
if y==0:
  y=1
  flag=1
train_negative_gender_prob.append(x/(x+y))
train_negative_gender_prob.append(y/(x+y))

if flag==1:
  print('Laplace needed in train negative')

MEAN_2=train_negative.mean()
STD_2=train_negative.std()
find_accuracy(test,MEAN_1,MEAN_2,STD_1,STD_2,train_positive_gender_prob,train_negative_gender_prob,cnt_is_patient,cnt_is_not_patient)

55.88235294117647

In [22]:
# ignore cell
# x_train=train.values[:,0:11]
# x_test=test.values[:,0:11]
# print(x_test)
# x_train_sk=x_train[:,:-1]
# y_train_sk=x_train[:,-1]
# x_test_sk=x_test[:,:-1]
# y_test_sk=x_test[:,-1]
# from sklearn.model_selection import train_test_split
# from sklearn.naive_bayes import GaussianNB
# gnb = GaussianNB()
# y_pred = gnb.fit(x_train_sk, y_train_sk).predict(x_test_sk)
# print("Number of mislabeled points out of a total %d points : %d"
#       % (x_test_sk.shape[0], (y_test_sk != y_pred).sum()))