### Importing libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

### Importing dataset

In [None]:
data = pd.read_csv('https://raw.githubusercontent.com/tanvipenumudy/ML-Stream-Interns-Summer-21/main/Khushi%20Jain/indian_liver_patient.csv')
data.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         583 non-null    int64  
 1   Gender                      583 non-null    object 
 2   Total_Bilirubin             583 non-null    float64
 3   Direct_Bilirubin            583 non-null    float64
 4   Alkaline_Phosphotase        583 non-null    int64  
 5   Alamine_Aminotransferase    583 non-null    int64  
 6   Aspartate_Aminotransferase  583 non-null    int64  
 7   Total_Protiens              583 non-null    float64
 8   Albumin                     583 non-null    float64
 9   Albumin_and_Globulin_Ratio  579 non-null    float64
 10  Dataset                     583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


In [None]:
data.describe()

Unnamed: 0,Age,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
count,583.0,583.0,583.0,583.0,583.0,583.0,583.0,583.0,579.0,583.0
mean,44.746141,3.298799,1.486106,290.576329,80.713551,109.910806,6.48319,3.141852,0.947064,1.286449
std,16.189833,6.209522,2.808498,242.937989,182.620356,288.918529,1.085451,0.795519,0.319592,0.45249
min,4.0,0.4,0.1,63.0,10.0,10.0,2.7,0.9,0.3,1.0
25%,33.0,0.8,0.2,175.5,23.0,25.0,5.8,2.6,0.7,1.0
50%,45.0,1.0,0.3,208.0,35.0,42.0,6.6,3.1,0.93,1.0
75%,58.0,2.6,1.3,298.0,60.5,87.0,7.2,3.8,1.1,2.0
max,90.0,75.0,19.7,2110.0,2000.0,4929.0,9.6,5.5,2.8,2.0


In [None]:
data['Dataset'].value_counts()

1    416
2    167
Name: Dataset, dtype: int64

### Data Preprocessing

### Handling Null values

In [None]:
data.isna().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    4
Dataset                       0
dtype: int64

In [None]:
data['Albumin_and_Globulin_Ratio'].fillna(data['Albumin_and_Globulin_Ratio'].mean(), inplace=True)

In [None]:
data.isna().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    0
Dataset                       0
dtype: int64

### Rename 'Dataset' feature to target 

In [None]:
data.rename(columns={'Dataset': 'target'}, inplace = True)

### Encoding categorical values

In [None]:
gender_dict = {
    'Male': 0,
    'Female': 1
}

In [None]:
data['Gender'] = data['Gender'].map(gender_dict)

In [None]:
data.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,target
0,65,1,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,0,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,0,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,0,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,0,3.9,2.0,195,27,59,7.3,2.4,0.4,1


### Feature Scaling
#### Normalization

In [None]:
for col in data.columns[:-1]:
  data[col] = ( data[col] - np.min(data[col]) ) / ( np.max(data[col]) - np.min(data[col]) )

In [None]:
data.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,target
0,0.709302,1.0,0.004021,0.0,0.060576,0.003015,0.001626,0.594203,0.521739,0.24,1
1,0.674419,0.0,0.140751,0.27551,0.310699,0.027136,0.018296,0.695652,0.5,0.176,1
2,0.674419,0.0,0.092493,0.204082,0.208598,0.025126,0.011791,0.623188,0.521739,0.236,1
3,0.627907,0.0,0.008043,0.015306,0.058134,0.00201,0.002033,0.594203,0.543478,0.28,1
4,0.790698,0.0,0.046917,0.096939,0.064485,0.008543,0.009961,0.666667,0.326087,0.04,1


### Splitting dependent and independent features

In [None]:
m = data.shape[0] # rows
n = data.shape[1] # cols
m, n

(583, 11)

In [None]:
X = data.iloc[:, :-1].values
Y = data.iloc[:, -1:].values

In [None]:
x_ones = np.ones((m, 1))
X = np.concatenate((x_ones, X), axis=1)
print(X[:5])

[[1.         0.70930233 1.         0.00402145 0.         0.06057645
  0.00301508 0.00162635 0.5942029  0.52173913 0.24      ]
 [1.         0.6744186  0.         0.14075067 0.2755102  0.31069858
  0.02713568 0.0182964  0.69565217 0.5        0.176     ]
 [1.         0.6744186  0.         0.0924933  0.20408163 0.20859795
  0.02512563 0.01179101 0.62318841 0.52173913 0.236     ]
 [1.         0.62790698 0.         0.0080429  0.01530612 0.05813385
  0.00201005 0.00203293 0.5942029  0.54347826 0.28      ]
 [1.         0.79069767 0.         0.04691689 0.09693878 0.06448461
  0.00854271 0.00996137 0.66666667 0.32608696 0.04      ]]


In [None]:
theta = np.zeros((1, n))
theta

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [None]:
theta.T

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.]])

In [None]:
print(X.shape)
print(Y.shape)
print(theta.shape)
print(theta.T.shape)

(583, 11)
(583, 1)
(1, 11)
(11, 1)


In [None]:
alpha = 0.01
iterations = 128

#### Define sigmoid / logistic function

In [None]:
def sigmoid_fn(z):
  return 1.0 / (1 + np.exp(-z))

#### Cost fn

In [None]:
def compute_cost(y, y_hat):
  return -np.mean( y * np.log(y_hat) - (1-y) * np.log(1-y_hat))

#### Define gradient descent

In [None]:
def gradient_descent(Y, alpha, theta):
  
  cost = np.zeros(iterations)
  for i in range(iterations):
    Y_pred = (X @ theta.T)
    Y_hat = sigmoid_fn(Y_pred)
    theta = theta - (alpha/m) * np.sum((Y_hat - Y) * X, axis = 0)
    cost[i] = compute_cost(Y, Y_hat)
  return (theta, cost)

In [None]:
theta, cost = gradient_descent(Y, alpha, theta)

In [None]:
theta

array([[0.77352135, 0.35089307, 0.20357636, 0.01966934, 0.0346846 ,
        0.07332378, 0.01896754, 0.01064493, 0.42477425, 0.39024835,
        0.2109673 ]])

In [None]:
cost
# min: 0.75343816

array([1.09025037, 1.0827604 , 1.07539729, 1.06815953, 1.06104561,
       1.05405403, 1.04718329, 1.04043189, 1.03379835, 1.02728116,
       1.02087886, 1.01458997, 1.00841302, 1.00234654, 0.99638909,
       0.99053922, 0.9847955 , 0.9791565 , 0.9736208 , 0.96818699,
       0.96285369, 0.9576195 , 0.95248306, 0.94744299, 0.94249795,
       0.9376466 , 0.93288761, 0.92821968, 0.92364149, 0.91915176,
       0.91474922, 0.9104326 , 0.90620065, 0.90205214, 0.89798586,
       0.89400058, 0.89009512, 0.8862683 , 0.88251895, 0.87884592,
       0.87524807, 0.87172427, 0.86827343, 0.86489443, 0.86158621,
       0.85834769, 0.85517782, 0.85207555, 0.84903987, 0.84606977,
       0.84316423, 0.84032228, 0.83754295, 0.83482528, 0.83216832,
       0.82957116, 0.82703286, 0.82455253, 0.82212927, 0.81976221,
       0.81745048, 0.81519323, 0.81298963, 0.81083884, 0.80874005,
       0.80669245, 0.80469527, 0.80274772, 0.80084903, 0.79899845,
       0.79719524, 0.79543867, 0.79372802, 0.79206257, 0.79044

In [None]:
# np.argmin(cost)

In [None]:
# cost[127]

In [None]:
Y_pred = X @ theta.T

In [None]:
Y_prob = sigmoid_fn(Y_pred)

In [None]:
Y_prob[:4]

array([[0.85033416],
       [0.82831544],
       [0.82535752],
       [0.82089536]])

In [None]:
Y_prob = Y_prob.reshape((m,))
Y_prob = pd.Series(Y_prob)
# multiclass classification left

In [None]:
def class_label(x):
  if x > 0.5:
    return 1
  return 0

In [None]:
Y_classlabel = Y_prob.apply(class_label)
Y_classlabel

0      1
1      1
2      1
3      1
4      1
      ..
578    1
579    1
580    1
581    1
582    1
Length: 583, dtype: int64

In [None]:
Y_classlabel.value_counts()

1    583
dtype: int64

In [None]:
c = 0
c2 = 0
for i in range(m):
  if Y_classlabel[i] == Y[i]:
    c += 1
  else:
    c2 += 1
c, c2

(416, 167)

#### accuracy

In [None]:
c/m*100

71.35506003430532