In [7]:
import pandas as pd
import numpy as np
from scipy.stats import norm

In [2]:
# 입력으로 주어지는 데이터는 모두 numpy array라고 가정하고 구현 
# 입력 특성의 개수에 관계없이 동작하도록 구현 

class MyGaussianNB:
    """Binary Gaussian naive Bayes classifier operating on NumPy arrays.

    Labels are expected to be 0 or 1. For each class, every feature column is
    modelled by its own univariate normal distribution, so the classifier
    works for any number of input features.
    """

    def __init__(self):
        self.distXy0 = []  # per-feature normal distributions fitted on label-0 samples
        self.distXy1 = []  # per-feature normal distributions fitted on label-1 samples
        self.prior0 = 0    # fraction of training samples with label 0
        self.prior1 = 0    # fraction of training samples with label 1
        self.m0 = 0        # number of training samples with label 0
        self.m1 = 0        # number of training samples with label 1
        self.num_col = 0   # dimensionality of the feature vector

    def fit(self, X_train, y_train):
        """Estimate class priors and per-feature normal distributions.

        Parameters
        ----------
        X_train : np.ndarray, shape (n_samples, n_features)
            Training feature matrix.
        y_train : np.ndarray, shape (n_samples,)
            Training labels; samples are split on ``== 0`` and ``== 1``.
        """
        self.num_col = X_train.shape[1]

        is_class0 = y_train == 0
        is_class1 = y_train == 1
        X_train0 = X_train[is_class0]
        X_train1 = X_train[is_class1]
        self.m0 = is_class0.sum()
        self.m1 = is_class1.sum()

        # Rebuild the lists from scratch instead of appending: otherwise a
        # second fit() would leave the previous distributions at the front of
        # the lists, and predict_proba() would keep using those stale ones.
        self.distXy0 = [self.fit_dist(X_train0, i) for i in range(self.num_col)]
        self.distXy1 = [self.fit_dist(X_train1, i) for i in range(self.num_col)]

        total = self.m0 + self.m1
        self.prior0 = self.m0 / total
        self.prior1 = self.m1 / total

    def predict_proba(self, X):
        """Return posterior class probabilities for every row of ``X``.

        Parameters
        ----------
        X : np.ndarray, shape (n_samples, n_features)

        Returns
        -------
        np.ndarray, shape (n_samples, 2)
            Column 0 is P(y=0 | x), column 1 is P(y=1 | x); each row sums to 1.
        """
        # Accumulate log(prior) + sum(log pdf) rather than multiplying raw pdf
        # values: the plain product can underflow to 0 for both classes and
        # turn the normalisation below into 0/0 = NaN.
        log_joint0 = np.full(X.shape[0], np.log(self.prior0))
        log_joint1 = np.full(X.shape[0], np.log(self.prior1))
        for i in range(self.num_col):
            column = X[:, i]
            log_joint0 = log_joint0 + self.distXy0[i].logpdf(column)
            log_joint1 = log_joint1 + self.distXy1[i].logpdf(column)

        log_joint = np.stack([log_joint0, log_joint1], axis=1)
        # Subtract the row-wise maximum before exponentiating so the larger
        # term becomes exp(0) = 1; the shift cancels out in the ratio.
        log_joint = log_joint - log_joint.max(axis=1, keepdims=True)
        joint = np.exp(log_joint)
        return joint / joint.sum(axis=1, keepdims=True)

    def predict(self, X):
        """Return the most probable label (0 or 1) for every row of ``X``."""
        return np.argmax(self.predict_proba(X), axis=1)

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against the true labels ``y``."""
        y_hat = self.predict(X)
        return (y == y_hat).sum() / len(y)

    def fit_dist(self, data, col_idx):
        """Fit a normal distribution to one column of ``data``.

        Uses the column mean and the population standard deviation (ddof=0)
        and returns the corresponding frozen ``scipy.stats.norm`` object.
        """
        column = data[:, col_idx]
        mu = column.mean()
        sigma = column.std(ddof=0)

        dist = norm(mu, sigma)
        return dist

In [58]:
# Read the diabetes CSV and keep the two feature columns plus the label.
data = pd.read_csv('./dataset/PimaIndiansDiabetes.csv')
selected_columns = ['Blood Glucose', 'BMI', 'Class']
# NOTE(review): `.loc[1:]` is label-based, so with the default integer index
# the row labelled 0 is left out — confirm that skipping it is intended.
data_subset = data.loc[1:, selected_columns]

In [59]:
# Count missing values per column. `isna()` gives a boolean frame and summing
# it counts the True cells. (Indexing the frame with that boolean mask and then
# summing always yields 0, because every kept cell is itself NaN and NaN is
# skipped by sum().)
data_subset.isna().sum()

Blood Glucose    0.0
BMI              0.0
Class            0.0
dtype: float64

In [60]:
# Keep only the rows where both Blood Glucose and BMI are non-zero
# (presumably a 0 encodes a missing measurement — confirm against the dataset docs).
bg_mask = data_subset["Blood Glucose"].ne(0)
bmi_mask = data_subset["BMI"].ne(0)
clean_data_subset = data_subset.loc[bg_mask & bmi_mask]

# Feature matrix and label vector.
X = clean_data_subset[['Blood Glucose', 'BMI']]
y = clean_data_subset['Class']

# 80 % of the rows go to the training split, the remainder to the test split.
ratio = 0.8
total_num = clean_data_subset.shape[0]
train_num = int(ratio * total_num)

# Shuffle the positional indices with a fixed seed so the split is reproducible.
np.random.seed(42)
shuffled_idx = np.arange(total_num)
np.random.shuffle(shuffled_idx)

train_idx, test_idx = shuffled_idx[:train_num], shuffled_idx[train_num:]
X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

In [61]:
# The prepared splits are pandas objects; convert them with to_numpy() before
# passing them to the classifier, which works on NumPy arrays.
X_train_np, y_train_np = X_train.to_numpy(), y_train.to_numpy()

ClassifierNB = MyGaussianNB()
ClassifierNB.fit(X_train_np, y_train_np)

# Show the predicted labels for the first 22 test samples.
test_predictions = ClassifierNB.predict(X_test.to_numpy())
test_predictions[:22]

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

In [62]:
# Posterior probabilities [P(y=0|x), P(y=1|x)] for the first five test samples.
test_probabilities = ClassifierNB.predict_proba(X_test.to_numpy())
test_probabilities[:5]

array([[0.59372213, 0.40627787],
       [0.34812832, 0.65187168],
       [0.58593987, 0.41406013],
       [0.94950911, 0.05049089],
       [0.89883061, 0.10116939]])

In [63]:
# Accuracy on the training split.
train_accuracy = ClassifierNB.score(X_train.to_numpy(), y_train.to_numpy())
train_accuracy

0.7566666666666667

In [64]:
# Accuracy on the held-out test split.
test_accuracy = ClassifierNB.score(X_test.to_numpy(), y_test.to_numpy())
test_accuracy

0.7880794701986755