In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
data = pd.read_csv('/content/drive/MyDrive/machine-learning/apple_quality.csv')
# Menampilkan data
data.head()

Unnamed: 0,A_id,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
0,0.0,-3.970049,-2.512336,5.34633,-1.012009,1.8449,0.32984,-0.491590483,good
1,1.0,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.86753,-0.722809367,good
2,2.0,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636473,bad
3,3.0,-0.657196,-2.271627,1.324874,-0.097875,3.63797,-3.413761,0.790723217,good
4,4.0,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984036,good


In [None]:
# menampilkan informasi data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4001 entries, 0 to 4000
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   A_id         4000 non-null   float64
 1   Size         4000 non-null   float64
 2   Weight       4000 non-null   float64
 3   Sweetness    4000 non-null   float64
 4   Crunchiness  4000 non-null   float64
 5   Juiciness    4000 non-null   float64
 6   Ripeness     4000 non-null   float64
 7   Acidity      4001 non-null   object 
 8   Quality      4000 non-null   object 
dtypes: float64(7), object(2)
memory usage: 281.4+ KB


In [None]:
data.isnull().sum()

A_id           1
Size           1
Weight         1
Sweetness      1
Crunchiness    1
Juiciness      1
Ripeness       1
Acidity        0
Quality        1
dtype: int64

In [None]:
# Menghapus baris yang mengandung nilai yang hilang
data_cleaned = data.dropna()
print(data_cleaned.shape)

(4000, 9)


In [None]:
# Misalkan 'Quality' adalah kolom target dan sisanya adalah fitur
# Menggunakan One-Hot Encoding untuk kolom-kolom kategorikal
X = pd.get_dummies(data_cleaned.drop('Quality', axis=1))
y = data_cleaned['Quality']

# Membagi data menjadi training dan testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [None]:
# Inisialisasi model
modelnb = GaussianNB()
# Melatih model
modelnb.fit(X_train, y_train)


In [None]:
# Prediksi menggunakan model yang sudah dilatih
y_pred = modelnb.predict(X_test)
y_pred

array(['bad', 'good', 'bad', 'good', 'bad', 'good', 'bad', 'bad', 'good',
       'good', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'good', 'bad',
       'good', 'bad', 'bad', 'good', 'bad', 'bad', 'good', 'good', 'bad',
       'bad', 'bad', 'bad', 'good', 'bad', 'good', 'bad', 'bad', 'bad',
       'good', 'bad', 'good', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad',
       'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'good', 'good', 'good',
       'bad', 'bad', 'bad', 'bad', 'good', 'bad', 'good', 'good', 'bad',
       'bad', 'bad', 'bad', 'good', 'good', 'good', 'bad', 'bad', 'good',
       'good', 'good', 'bad', 'bad', 'bad', 'bad', 'bad', 'good', 'bad',
       'good', 'bad', 'good', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad',
       'good', 'bad', 'bad', 'good', 'bad', 'bad', 'good', 'good', 'good',
       'bad', 'good', 'bad', 'bad', 'good', 'bad', 'bad', 'bad', 'bad',
       'bad', 'bad', 'good', 'bad', 'bad', 'bad', 'bad', 'bad', 'good',
       'bad', 'good', 'good', 'bad', 'bad', 'good', 

In [None]:
# Menentukan nilai probabilitas dari x_test
proba = modelnb.predict_proba(X_test)
proba

array([[0.67573838, 0.32426162],
       [0.47286011, 0.52713989],
       [0.97030917, 0.02969083],
       ...,
       [0.28702581, 0.71297419],
       [0.87050363, 0.12949637],
       [0.28460177, 0.71539823]])

In [None]:
# Evaluasi model dengan confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix

array([[341,  63],
       [182, 214]])

In [None]:
# Merapikan hasil confusion matrix
df_confusion = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
df_confusion

Predicted,bad,good
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
bad,341,63
good,182,214


In [None]:
# Menghitung nilai akurasi dari klasifikasi naive bayes
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Akurasi: {accuracy}')
print(f'Laporan Klasifikasi:\n{report}')

Akurasi: 0.69375
Laporan Klasifikasi:
              precision    recall  f1-score   support

         bad       0.65      0.84      0.74       404
        good       0.77      0.54      0.64       396

    accuracy                           0.69       800
   macro avg       0.71      0.69      0.69       800
weighted avg       0.71      0.69      0.69       800

