In [1]:
import numpy as np

class NaiveBayes:

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)

        # calculate mean, var, and prior for each class
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors =  np.zeros(n_classes, dtype=np.float64)

        for idx, c in enumerate(self._classes):
            X_c = X[y==c]
            self._mean[idx, :] = X_c.mean(axis=0)
            self._var[idx, :] = X_c.var(axis=0)
            self._priors[idx] = X_c.shape[0] / float(n_samples)

    def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        posteriors = []

        # calculate posterior probability for each class
        for idx, c in enumerate(self._classes):
            prior = np.log(self._priors[idx])
            posterior = np.sum(np.log(self._pdf(idx, x)))
            posterior = prior + posterior
            posteriors.append(posterior)
            
        # return class with highest posterior probability
        return self._classes[np.argmax(posteriors)]
            

    def _pdf(self, class_idx, x):
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        numerator = np.exp(- (x-mean)**2 / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator

In [2]:
from sklearn.model_selection import train_test_split
from sklearn import datasets

In [3]:
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

In [4]:
X, y = datasets.make_classification(n_samples=10000, n_features=10, n_classes=2, random_state=123)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

nb = NaiveBayes()
nb.fit(X_train, y_train)
predictions = nb.predict(X_train)

In [5]:
accuracy(y_train, predictions)

0.92025

In [6]:
predictions = nb.predict(X_test)
accuracy(y_test, predictions)

0.921

In [22]:
from google.colab import files
files.upload()

Saving kidney_disease.csv to kidney_disease (1).csv


{'kidney_disease.csv': b'id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification\n0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd\n1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38,6000,,no,no,no,good,no,no,ckd\n2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31,7500,,no,yes,no,poor,no,yes,ckd\n3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd\n4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35,7300,4.6,no,no,no,good,no,no,ckd\n5,60.0,90.0,1.015,3.0,0.0,,,notpresent,notpresent,74.0,25.0,1.1,142.0,3.2,12.2,39,7800,4.4,yes,yes,no,good,yes,no,ckd\n6,68.0,70.0,1.01,0.0,0.0,,normal,notpresent,notpresent,100.0,54.0,24.0,104.0,4.0,12.4,36,,,no,no,no,good,no,no,ckd\n7,24.0,,1.015,2.0,4.0,normal,abnormal,notpre

In [23]:
import pandas as pd

In [39]:
data = pd.read_csv('kidney_disease.csv')
data.head(30)

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38.0,6000.0,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31.0,7500.0,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35.0,7300.0,4.6,no,no,no,good,no,no,ckd
5,5,60.0,90.0,1.015,3.0,0.0,,,notpresent,notpresent,74.0,25.0,1.1,142.0,3.2,12.2,39.0,7800.0,4.4,yes,yes,no,good,yes,no,ckd
6,6,68.0,70.0,1.01,0.0,0.0,,normal,notpresent,notpresent,100.0,54.0,24.0,104.0,4.0,12.4,36.0,,,no,no,no,good,no,no,ckd
7,7,24.0,,1.015,2.0,4.0,normal,abnormal,notpresent,notpresent,410.0,31.0,1.1,,,12.4,44.0,6900.0,5.0,no,yes,no,good,yes,no,ckd
8,8,52.0,100.0,1.015,3.0,0.0,normal,abnormal,present,notpresent,138.0,60.0,1.9,,,10.8,33.0,9600.0,4.0,yes,yes,no,good,no,yes,ckd
9,9,53.0,90.0,1.02,2.0,0.0,abnormal,abnormal,present,notpresent,70.0,107.0,7.2,114.0,3.7,9.5,29.0,12100.0,3.7,yes,yes,no,poor,no,yes,ckd


In [40]:
data['classification'].value_counts()

ckd       248
notckd    150
ckd\t       2
Name: classification, dtype: int64

In [41]:
data['classification'].replace('ckd\t','ckd',inplace=True)

In [42]:
data['classification'].value_counts()

ckd       250
notckd    150
Name: classification, dtype: int64

In [43]:
data = data.sample(frac=1).reset_index(drop=True)

In [44]:
data.shape

(400, 26)

In [45]:
data.isna().sum()

id                  0
age                 9
bp                 12
sg                 47
al                 46
su                 49
rbc               152
pc                 65
pcc                 4
ba                  4
bgr                44
bu                 19
sc                 17
sod                87
pot                88
hemo               52
pcv                70
wc                105
rc                130
htn                 2
dm                  2
cad                 2
appet               1
pe                  1
ane                 1
classification      0
dtype: int64

In [46]:
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.

        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

In [47]:
X = pd.DataFrame(data)
data = DataFrameImputer().fit_transform(X)

In [48]:
data.isna().sum()

id                0
age               0
bp                0
sg                0
al                0
su                0
rbc               0
pc                0
pcc               0
ba                0
bgr               0
bu                0
sc                0
sod               0
pot               0
hemo              0
pcv               0
wc                0
rc                0
htn               0
dm                0
cad               0
appet             0
pe                0
ane               0
classification    0
dtype: int64

In [49]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

In [50]:
LE = LabelEncoder()
CateList = data.select_dtypes(include="object").columns
print(CateList)

Index(['rbc', 'pc', 'pcc', 'ba', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')


In [51]:
for i in CateList:
    data[i] = LE.fit_transform(data[i])

In [52]:
data.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,220,36.0,80.0,1.01,0.0,0.0,1,1,0,0,103.0,57.425722,3.072454,137.528754,4.627244,11.9,24,81,34,0,3,1,0,0,0,0
1,187,3.0,76.469072,1.01,2.0,0.0,1,1,0,0,148.036517,22.0,0.7,137.528754,4.627244,10.7,22,20,34,0,3,1,0,0,0,0
2,135,48.0,80.0,1.015,0.0,2.0,1,1,0,0,214.0,24.0,1.3,140.0,4.0,13.2,27,90,34,0,4,1,1,0,0,0
3,370,69.0,70.0,1.02,0.0,0.0,1,1,0,0,83.0,42.0,1.2,139.0,3.7,16.2,38,85,36,0,3,1,0,0,0,1
4,377,64.0,70.0,1.02,0.0,0.0,1,1,0,0,97.0,27.0,0.7,145.0,4.8,13.8,37,59,29,0,3,1,0,0,0,1


In [53]:
data.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,220,36.0,80.0,1.01,0.0,0.0,1,1,0,0,103.0,57.425722,3.072454,137.528754,4.627244,11.9,24,81,34,0,3,1,0,0,0,0
1,187,3.0,76.469072,1.01,2.0,0.0,1,1,0,0,148.036517,22.0,0.7,137.528754,4.627244,10.7,22,20,34,0,3,1,0,0,0,0
2,135,48.0,80.0,1.015,0.0,2.0,1,1,0,0,214.0,24.0,1.3,140.0,4.0,13.2,27,90,34,0,4,1,1,0,0,0
3,370,69.0,70.0,1.02,0.0,0.0,1,1,0,0,83.0,42.0,1.2,139.0,3.7,16.2,38,85,36,0,3,1,0,0,0,1
4,377,64.0,70.0,1.02,0.0,0.0,1,1,0,0,97.0,27.0,0.7,145.0,4.8,13.8,37,59,29,0,3,1,0,0,0,1


In [54]:
df = data.iloc[:,:-1]
mm = MinMaxScaler()
df[:]= mm.fit_transform(df[:])

In [56]:
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane
0,0.551378,0.386364,0.230769,0.25,0.0,0.0,1.0,1.0,0.0,0.0,0.173077,0.143583,0.03535,0.839298,0.047803,0.598639,0.55814,0.89011,0.708333,0.0,0.75,0.5,0.0,0.0,0.0
1,0.468672,0.011364,0.203608,0.25,0.4,0.0,1.0,1.0,0.0,0.0,0.269309,0.052632,0.003968,0.839298,0.047803,0.517007,0.511628,0.21978,0.708333,0.0,0.75,0.5,0.0,0.0,0.0
2,0.338346,0.522727,0.230769,0.5,0.0,0.4,1.0,1.0,0.0,0.0,0.410256,0.057766,0.011905,0.85489,0.033708,0.687075,0.627907,0.989011,0.708333,0.0,1.0,0.5,1.0,0.0,0.0
3,0.927318,0.761364,0.153846,0.75,0.0,0.0,1.0,1.0,0.0,0.0,0.130342,0.103979,0.010582,0.84858,0.026966,0.891156,0.883721,0.934066,0.75,0.0,0.75,0.5,0.0,0.0,0.0
4,0.944862,0.704545,0.153846,0.75,0.0,0.0,1.0,1.0,0.0,0.0,0.160256,0.065469,0.003968,0.886435,0.051685,0.727891,0.860465,0.648352,0.604167,0.0,0.75,0.5,0.0,0.0,0.0


In [57]:
X = df.values
y = data['classification'].values
print(X.shape, y.shape)

(400, 25) (400,)


In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 123)

In [59]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(300, 25) (300,)
(100, 25) (100,)


In [61]:
nb = NaiveBayes()
nb.fit(X_train, y_train)
predictions = nb.predict(X_train)

In [62]:
accuracy(y_train, predictions)

0.87333333333333335

In [63]:
predictions = nb.predict(X_test)
accuracy(y_test, predictions)

In [65]:
print(accuracy(y_test, predictions))

0.7965209187230954
