## Naive Bayes Algorithm with Scikit-Learn

In [19]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Load the training split and keep the five feature columns plus the
# 'price_range' label.
data_train = pd.read_csv('../../data/data_train.csv')[
    ["battery_power", "mobile_wt", "px_height", "px_width", "ram", "price_range"]
]
# Load the validation split and keep the same columns.
data_validation = pd.read_csv('../../data/data_validation.csv')[
    ["battery_power", "mobile_wt", "px_height", "px_width", "ram", "price_range"]
]
def naive_bayes_sklearn(data_train, data_validation):
    """Fit a Gaussian Naive Bayes model and score it on validation data.

    Args:
        data_train: DataFrame holding the feature columns and a
            'price_range' label column; used to fit the model.
        data_validation: DataFrame with the same columns; used for scoring.

    Returns:
        The value of ``accuracy_score`` for the validation labels against
        the model's predictions.
    """
    target = 'price_range'

    # Every column except the label is treated as a feature.
    classifier = GaussianNB()
    classifier.fit(data_train.drop(columns=[target]), data_train[target])

    predicted = classifier.predict(data_validation.drop(columns=[target]))
    return accuracy_score(data_validation[target], predicted)


# Score the scikit-learn baseline on the validation split and report it.
accuracy = naive_bayes_sklearn(
    data_train=data_train,
    data_validation=data_validation,
)
print('Accuracy: ', accuracy)

Accuracy:  0.7933333333333333


## Naive Bayes Algorithm Model

In [20]:
import pandas as pd
import numpy as np
import pickle

class NaiveBayes:
    """Categorical Naive Bayes classifier built on fixed-threshold binning.

    Five numeric feature columns are binned into four ordinal categories
    (0-3) using the thresholds in ``BOUNDS``. Class priors and per-feature
    likelihoods are estimated by counting rows of the binned training data
    that ``__init__`` loads from disk.

    Predicted values are positions into the sorted unique training labels;
    they equal the labels themselves when the labels are 0..k-1.
    """

    # Feature columns used by the model, in the same order as BOUNDS.
    FEATURES = ["battery_power", "mobile_wt", "px_height", "px_width", "ram"]
    # Inclusive upper bounds of categories 0, 1 and 2 for each feature;
    # anything above the last bound falls in category 3.
    BOUNDS = [
        [864.75, 1219.9, 1655.75],
        [111.0, 143.0, 172.0],
        [307.0, 576.5, 928.5],
        [874.0, 1249.0, 1639.75],
        [1220.75, 2286.0, 3114.5],
    ]
    # Name of the label column.
    TARGET = "price_range"

    def __init__(self):
        """Load the training CSV and bin its feature columns."""
        raw = pd.read_csv("../../data/data_train.csv")
        # Select the columns that most influence the price rating.
        selected = raw[self.FEATURES + [self.TARGET]]
        # Convert the numeric feature columns into categorical codes.
        self.data_train = self.categorical(selected)
        # Both attributes refer to the same binned frame.
        self.categorical_data_train = self.data_train

    def categorical(self, dataset):
        """Return a copy of ``dataset`` with the feature columns binned to 0-3.

        The input frame is left untouched, so the same raw frame can be
        passed to ``predict``/``get_result`` repeatedly. Binning a frame that
        already holds codes would send every value to category 0, because all
        codes lie below the first threshold.

        Args:
            dataset: DataFrame containing every column named in ``FEATURES``.

        Returns:
            A new DataFrame; columns not in ``FEATURES`` are copied unchanged.
        """
        binned = dataset.copy()
        for name, (low, mid, high) in zip(self.FEATURES, self.BOUNDS):
            # Masks are taken from the raw values, never from partially
            # rewritten ones, so an assigned code can not be re-binned.
            raw = dataset[name]
            binned.loc[raw <= low, name] = 0
            binned.loc[(raw > low) & (raw <= mid), name] = 1
            binned.loc[(raw > mid) & (raw <= high), name] = 2
            binned.loc[raw > high, name] = 3
        return binned

    def p_vj(self):
        """Return the prior of each class, ordered by sorted unique label."""
        labels = self.data_train[self.TARGET]
        total = len(labels)
        return [(labels == label).sum() / total for label in np.unique(labels)]

    def p_ai_vj(self):
        """Return the likelihood table P(feature == category | class).

        Returns:
            A nested list indexed as ``[feature][class][category]``, with
            features ordered as in ``FEATURES``, classes ordered by sorted
            unique label and categories 0-3.
        """
        labels = self.data_train[self.TARGET]
        classes = np.unique(labels)
        table = []
        for name, bounds in zip(self.FEATURES, self.BOUNDS):
            column = self.data_train[name]
            per_class = []
            for label in classes:
                in_class = labels == label
                # Never zero: every label in `classes` occurs in the data.
                class_size = in_class.sum()
                per_class.append([
                    (in_class & (column == category)).sum() / class_size
                    for category in range(len(bounds) + 1)
                ])
            table.append(per_class)
        return table

    def posterior(self, data_validation):
        """Return the unnormalised posterior of every class for every row.

        Args:
            data_validation: DataFrame with raw (un-binned) feature columns.
                It is not modified.

        Returns:
            A list with one entry per row, each a list holding
            ``prior * product(likelihoods)`` per class.
        """
        prior = self.p_vj()
        likelihood = self.p_ai_vj()
        # Rows are read by position so any index on the frame is accepted.
        codes = self.categorical(data_validation)[self.FEATURES].to_numpy()
        scores = []
        for row in codes:
            row_scores = []
            for class_pos, class_prior in enumerate(prior):
                score = class_prior
                for feature_pos, code in enumerate(row):
                    # int() also accepts codes stored in a float column.
                    score = score * likelihood[feature_pos][class_pos][int(code)]
                row_scores.append(score)
            scores.append(row_scores)
        return scores

    def _predicted_classes(self, data_validation):
        """Return the position of the highest-scoring class for each row."""
        return [int(np.argmax(row)) for row in self.posterior(data_validation)]

    def predict(self, data_validation):
        """Predict every row and measure accuracy against 'price_range'.

        Args:
            data_validation: DataFrame with raw feature columns and a
                'price_range' column. It is not modified.

        Returns:
            A tuple ``(predictions, accuracy)`` where ``predictions`` is a
            list of predicted classes and ``accuracy`` is the fraction of
            rows whose prediction equals the 'price_range' value.

        Raises:
            ZeroDivisionError: if ``data_validation`` has no rows.
        """
        predictions = self._predicted_classes(data_validation)
        actual = data_validation[self.TARGET].to_numpy()
        correct = sum(
            1 for guess, truth in zip(predictions, actual) if guess == truth
        )
        return predictions, correct / len(data_validation)

    def get_result(self, data_val, index):
        """Return the predicted class of the row at position ``index``.

        Args:
            data_val: DataFrame with raw feature columns. It is not modified.
            index: Position of the row within ``data_val``.
        """
        return self._predicted_classes(data_val)[index]

    def save_model(self, filename):
        """Pickle this instance to ``../model/<filename>``."""
        # The context manager closes the file even if pickling fails.
        with open("../model/" + filename, 'wb') as handle:
            pickle.dump(self, handle)

# Train the from-scratch model, evaluate it on the validation split, and
# persist it for the loading cell.
categorical_data_validation = pd.read_csv("../../data/data_validation.csv")
model = NaiveBayes()
# Each call gets its own copy of the raw frame, so any in-place binning done
# while scoring cannot leak into the next call and be applied a second time.
hasil, persentase = model.predict(categorical_data_validation.copy())
# Predicted class for the validation row at position 2.
getResult = model.get_result(categorical_data_validation.copy(), 2)
model.save_model("NB_model.pkl")
print('Accuracy: ', persentase)
print('Hasil: ', hasil)


Accuracy:  0.7333333333333333
Hasil:  [2, 2, 3, 0, 3, 1, 3, 0, 3, 1, 2, 2, 3, 0, 3, 0, 2, 1, 0, 2, 2, 1, 0, 1, 1, 1, 2, 2, 0, 2, 1, 3, 3, 1, 2, 3, 1, 3, 1, 1, 0, 2, 0, 2, 2, 1, 1, 3, 1, 3, 1, 2, 3, 0, 1, 3, 2, 3, 3, 1, 1, 3, 3, 0, 3, 2, 2, 2, 2, 2, 2, 3, 1, 0, 1, 2, 0, 2, 1, 0, 3, 3, 0, 2, 3, 0, 3, 3, 0, 3, 1, 1, 0, 2, 1, 0, 3, 2, 1, 2, 3, 2, 1, 2, 3, 2, 3, 2, 3, 3, 3, 3, 2, 2, 3, 1, 0, 3, 1, 3, 0, 2, 2, 3, 2, 1, 2, 2, 1, 3, 3, 1, 0, 0, 3, 0, 0, 1, 3, 0, 1, 3, 3, 1, 2, 2, 1, 2, 1, 2, 3, 1, 0, 2, 1, 0, 3, 0, 0, 3, 3, 3, 0, 2, 3, 0, 0, 0, 0, 2, 3, 1, 0, 1, 1, 1, 3, 0, 2, 0, 3, 1, 1, 2, 0, 2, 0, 3, 1, 0, 3, 2, 0, 1, 2, 0, 1, 3, 3, 1, 1, 2, 2, 3, 2, 3, 3, 3, 0, 3, 0, 2, 3, 1, 2, 3, 2, 0, 0, 2, 2, 1, 3, 3, 0, 2, 0, 3, 0, 2, 1, 2, 2, 1, 3, 0, 1, 3, 3, 3, 0, 0, 1, 1, 3, 3, 1, 2, 0, 2, 1, 2, 3, 3, 0, 2, 1, 1, 1, 2, 0, 2, 2, 3, 3, 0, 1, 3, 3, 0, 1, 2, 1, 1, 1, 0, 3, 3, 2, 2, 0, 1, 0, 1, 1, 1, 1, 0, 3, 0, 3, 0, 1, 0, 1, 3, 2, 2, 0, 2, 1, 0, 1, 1, 1, 2, 0, 3, 1, 1, 0, 2, 3, 1, 3, 1, 3, 1, 2, 2, 0

## Load Model

In [21]:
# NOTE: unpickling can execute arbitrary code; only load model files written
# by this notebook or obtained from a trusted source.
with open("../model/NB_model.pkl", 'rb') as model_file:
    loaded_model = pickle.load(model_file)
data_validation = pd.read_csv("../../data/data_validation.csv")
# Each call gets its own copy of the raw frame, so any in-place binning done
# while scoring cannot leak into the next call and be applied a second time.
hasil, persentase = loaded_model.predict(data_validation.copy())
# Despite its name, this holds the predicted class for the validation row at
# position 2, not an accuracy figure.
getAccuracy = loaded_model.get_result(data_validation.copy(), 2)
print('Accuracy: ', persentase)
print('Hasil: ', hasil)

Accuracy:  0.7333333333333333
Hasil:  [2, 2, 3, 0, 3, 1, 3, 0, 3, 1, 2, 2, 3, 0, 3, 0, 2, 1, 0, 2, 2, 1, 0, 1, 1, 1, 2, 2, 0, 2, 1, 3, 3, 1, 2, 3, 1, 3, 1, 1, 0, 2, 0, 2, 2, 1, 1, 3, 1, 3, 1, 2, 3, 0, 1, 3, 2, 3, 3, 1, 1, 3, 3, 0, 3, 2, 2, 2, 2, 2, 2, 3, 1, 0, 1, 2, 0, 2, 1, 0, 3, 3, 0, 2, 3, 0, 3, 3, 0, 3, 1, 1, 0, 2, 1, 0, 3, 2, 1, 2, 3, 2, 1, 2, 3, 2, 3, 2, 3, 3, 3, 3, 2, 2, 3, 1, 0, 3, 1, 3, 0, 2, 2, 3, 2, 1, 2, 2, 1, 3, 3, 1, 0, 0, 3, 0, 0, 1, 3, 0, 1, 3, 3, 1, 2, 2, 1, 2, 1, 2, 3, 1, 0, 2, 1, 0, 3, 0, 0, 3, 3, 3, 0, 2, 3, 0, 0, 0, 0, 2, 3, 1, 0, 1, 1, 1, 3, 0, 2, 0, 3, 1, 1, 2, 0, 2, 0, 3, 1, 0, 3, 2, 0, 1, 2, 0, 1, 3, 3, 1, 1, 2, 2, 3, 2, 3, 3, 3, 0, 3, 0, 2, 3, 1, 2, 3, 2, 0, 0, 2, 2, 1, 3, 3, 0, 2, 0, 3, 0, 2, 1, 2, 2, 1, 3, 0, 1, 3, 3, 3, 0, 0, 1, 1, 3, 3, 1, 2, 0, 2, 1, 2, 3, 3, 0, 2, 1, 1, 1, 2, 0, 2, 2, 3, 3, 0, 1, 3, 3, 0, 1, 2, 1, 1, 1, 0, 3, 3, 2, 2, 0, 1, 0, 1, 1, 1, 1, 0, 3, 0, 3, 0, 1, 0, 1, 3, 2, 2, 0, 2, 1, 0, 1, 1, 1, 2, 0, 3, 1, 1, 0, 2, 3, 1, 3, 1, 3, 1, 2, 2, 0