In [1]:
# Import Library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Load the raw records from the Excel workbook and preview up to the first 30 rows.
# NOTE(review): the preview shows personal names and addresses; clear or redact
# this cell's output before sharing the notebook.
dataset = pd.read_excel('data_pkl.xlsx')
dataset.head(30)

Unnamed: 0,No,Nama,JK,Alamat,Usia (Tahun),Berat,Tinggi,Status Gizi
0,1,GIVAN ALVARO,L,sijunjung,4,13.5,102.0,Gizi Kurang
1,2,ALKHANZA QIRANIAFI,L,SIJUNJUNG,4,15.0,101.0,Gizi Baik
2,3,AZHIO ARZERO AFDOL,L,JR. GANTING,4,19.0,107.0,Gizi Baik
3,4,RIFDATUL BASYARIAH,P,SIJUNJUNG,4,16.0,100.0,Gizi Baik
4,5,AFIKA ALFIANO,P,SIJUNJUNG,3,15.2,95.0,Gizi Baik
5,6,VERONIKA BILQIS,P,SIJUNJUNG,3,15.5,97.0,Gizi Baik
6,7,RISKI YANDRI CANIAGO,L,JR GANTING,4,16.0,101.0,Gizi Baik
7,8,ADEVA,L,SIJUNJUNG,3,15.2,94.0,Gizi Lebih
8,9,ALMAHIRA MISQOL QIRANI,P,SIJUNJUNG,4,18.2,101.0,Gizi Lebih
9,10,QIANDRA HAIWATU ZAHRA,P,SIJUNJUNG,4,16.2,101.0,Gizi Baik


Exploratory Data Analysis

In [3]:
# Summarise the frame: column names, non-null counts and dtypes.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   No            30 non-null     int64  
 1   Nama          30 non-null     object 
 2   JK            30 non-null     object 
 3   Alamat        30 non-null     object 
 4   Usia (Tahun)  30 non-null     int64  
 5   Berat         30 non-null     float64
 6   Tinggi        30 non-null     float64
 7   Status Gizi   30 non-null     object 
dtypes: float64(2), int64(2), object(4)
memory usage: 2.0+ KB


In [4]:
# Count missing values per column.
dataset.isnull().sum()

No              0
Nama            0
JK              0
Alamat          0
Usia (Tahun)    0
Berat           0
Tinggi          0
Status Gizi     0
dtype: int64

In [6]:
# Count rows that are exact duplicates of an earlier row.
dataset.duplicated().sum()

0

In [7]:
# Class balance of the 'Status Gizi' column, which is later used as the prediction target.
dataset['Status Gizi'].value_counts()

Gizi Baik      12
Gizi Lebih     11
Gizi Kurang     7
Name: Status Gizi, dtype: int64

Feature Selection

In [8]:
# Discard the identifier and demographic columns; what remains are the two
# measurements (Berat, Tinggi) followed by the label column (Status Gizi).
df = dataset.drop(
    columns=['No', 'Nama', 'JK', 'Alamat', 'Usia (Tahun)'],
)
df

Unnamed: 0,Berat,Tinggi,Status Gizi
0,13.5,102.0,Gizi Kurang
1,15.0,101.0,Gizi Baik
2,19.0,107.0,Gizi Baik
3,16.0,100.0,Gizi Baik
4,15.2,95.0,Gizi Baik
5,15.5,97.0,Gizi Baik
6,16.0,101.0,Gizi Baik
7,15.2,94.0,Gizi Lebih
8,18.2,101.0,Gizi Lebih
9,16.2,101.0,Gizi Baik


In [9]:
# Features: every column except the last (here Berat and Tinggi), as a NumPy array.
X = df.iloc[:, :-1].values
# Target: the last column (Status Gizi). This stays a pandas Series, so it keeps
# the original row index rather than being a plain positional array.
y = df.iloc[:, -1]

Split Dataset

In [10]:
# NOTE(review): train_test_split is already imported in the first cell; this import is a duplicate.
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
# y_train / y_test are Series that keep the shuffled original row labels as their index.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [12]:
# Inspect the training feature matrix (one row per sample: Berat, Tinggi).
print(X_train)

[[  8.1  77. ]
 [ 15.3  92. ]
 [ 15.4 108. ]
 [  8.4  80. ]
 [ 15.5  97. ]
 [ 13.6  92. ]
 [ 18.2 101. ]
 [ 15.   92. ]
 [  8.3  76. ]
 [ 12.   83. ]
 [ 15.  101. ]
 [  9.6  77. ]
 [ 16.  101. ]
 [ 15.2  95. ]
 [ 13.   85. ]
 [  9.   85. ]
 [ 16.2 101. ]
 [ 15.2  94. ]
 [ 11.   77. ]
 [ 16.  100. ]
 [ 13.5 102. ]
 [ 12.   80. ]
 [ 18.  106. ]
 [ 18.  102. ]]


In [13]:
# Inspect the training labels; the left-hand numbers are the original row labels, not positions.
print(y_train)

27    Gizi Kurang
11     Gizi Lebih
17      Gizi Baik
22    Gizi Kurang
5       Gizi Baik
16      Gizi Baik
8      Gizi Lebih
14     Gizi Lebih
23    Gizi Kurang
20     Gizi Lebih
1       Gizi Baik
29      Gizi Baik
6       Gizi Baik
4       Gizi Baik
18     Gizi Lebih
19    Gizi Kurang
9       Gizi Baik
7      Gizi Lebih
25     Gizi Lebih
3       Gizi Baik
0     Gizi Kurang
21     Gizi Lebih
15      Gizi Baik
12     Gizi Lebih
Name: Status Gizi, dtype: object


In [14]:
# Inspect the held-out feature matrix.
print(X_test)

[[ 19.  107. ]
 [  7.9  74. ]
 [ 18.6 102. ]
 [ 16.7  93. ]
 [  8.3  77.8]
 [  9.8  76. ]]


In [15]:
# Inspect the held-out labels.
print(y_test)

2       Gizi Baik
28    Gizi Kurang
13     Gizi Lebih
10     Gizi Lebih
26    Gizi Kurang
24      Gizi Baik
Name: Status Gizi, dtype: object


Modelling

In [16]:
from math import sqrt
class KNN():
  """Minimal k-nearest-neighbours classifier using Euclidean distance.

  Training samples are kept as given; labels are stored positionally so that
  label ``i`` always belongs to training sample ``i``, whatever container
  (list, NumPy array, pandas Series with an arbitrary index) they arrive in.
  """
  def __init__(self,k):
    """Remember how many neighbours take part in the vote.

    Args:
      k: number of nearest neighbours to consult; must be at least 1.

    Raises:
      ValueError: if ``k`` is smaller than 1.
    """
    if k<1:
      raise ValueError("k must be at least 1, got %r" % (k,))
    self.k=k
    # Echo the chosen k; kept because the notebook's recorded output shows it.
    print(self.k)
  def fit(self,X_train,y_train):
    """Store the training samples and their labels.

    Args:
      X_train: sequence of feature vectors.
      y_train: iterable of labels, one per feature vector, in the same order.

    Raises:
      ValueError: if the number of samples and labels differ.
    """
    if len(X_train)!=len(y_train):
      raise ValueError("X_train and y_train must have the same length")
    self.x_train=X_train
    # Materialise the labels in iteration order. Indexing a pandas Series with
    # an integer looks the value up by index label, which after a shuffled
    # split returns the wrong label or raises KeyError.
    self.y_train=list(y_train)
  def calculate_euclidean(self,sample1,sample2):
    """Return the Euclidean distance sqrt(sum_i (sample1[i] - sample2[i])^2)."""
    distance=0.0
    for i in range(len(sample1)):
      distance+=(sample1[i]-sample2[i])**2
    return sqrt(distance)
  def nearest_neighbors(self,test_sample):
    """Return the labels of the training samples closest to ``test_sample``.

    At most ``k`` labels are returned, ordered from nearest to farthest; when
    fewer than ``k`` training samples exist, all of them are returned.
    """
    # Pair every training label with its distance to the test sample.
    distances=[
      (label,self.calculate_euclidean(train_sample,test_sample))
      for train_sample,label in zip(self.x_train,self.y_train)
    ]
    # Ascending by distance; the sort is stable, so ties keep training order.
    distances.sort(key=lambda pair:pair[1])
    # Slicing (unlike indexing 0..k-1) tolerates k larger than the training set.
    return [label for label,_ in distances[:self.k]]
  def predict(self,test_set):
    """Classify every sample in ``test_set`` by majority vote of its neighbours.

    Ties are resolved in favour of the tied label whose neighbour is nearest.

    Returns:
      list of predicted labels, one per test sample, in input order.

    Raises:
      ValueError: if no training samples were stored (there is nothing to vote).
    """
    predictions=[]
    for test_sample in test_set:
      labels=self.nearest_neighbors(test_sample)
      # max() returns the first label reaching the highest count, i.e. the
      # nearest one among tied labels.
      predictions.append(max(labels,key=labels.count))
    return predictions

In [17]:
# Hand-written classifier: vote among the 3 nearest neighbours, trained on the training split.
model = KNN(k=3)
model.fit(X_train, y_train)

3


In [18]:
# NOTE(review): KNeighborsClassifier is already imported in the first cell; this import is a duplicate.
from sklearn.neighbors import KNeighborsClassifier
# Minkowski metric with p = 1 is the Manhattan (L1) distance. The standard
# Euclidean distance used by the hand-written KNN class corresponds to p = 2.
# NOTE(review): confirm that p = 1 is intended here.
knn_model = KNeighborsClassifier(n_neighbors = 3, metric = 'minkowski', p = 1)
knn_model.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3, p=1)

In [19]:
# Predict the test-set labels with the fitted scikit-learn classifier.
y_pred = knn_model.predict(X_test)

In [20]:
# NOTE(review): this calls the scikit-learn `knn_model`, not the hand-written
# `model` built from the KNN class, so these are not the custom model's predictions.
predictions = knn_model.predict(X_test)

In [21]:
# NOTE(review): both imports below repeat names already imported in the first cell.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
# Re-create the classifier with n_neighbors=3 and no explicit metric/p arguments;
# this rebinds knn_model, replacing the instance fitted earlier with p = 1.
knn_model = KNeighborsClassifier(n_neighbors=3)
# Fit the model on the training data
knn_model.fit(X_train, y_train)
# Predict the labels of the test data
y_pred = knn_model.predict(X_test)
# Show the confusion matrix and the per-class precision/recall report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# Overall accuracy, shown as the cell's output (accuracy_score comes from the first-cell imports)
accuracy_score(y_test, y_pred)

[[2 0 0]
 [0 2 0]
 [0 0 2]]
              precision    recall  f1-score   support

   Gizi Baik       1.00      1.00      1.00         2
 Gizi Kurang       1.00      1.00      1.00         2
  Gizi Lebih       1.00      1.00      1.00         2

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6



1.0

In [22]:
# Classify one hand-entered sample; values follow the training column order: Berat = 16, Tinggi = 101.
print(knn_model.predict([[16, 101]]))

['Gizi Baik']


In [24]:
import pickle

# Persist the fitted scikit-learn classifier to disk.
# The context manager guarantees the file handle is flushed and closed even if
# pickling fails; the original left the handle returned by open() unclosed.
# NOTE: only unpickle this file from a trusted location, since pickle.load can
# execute arbitrary code.
filename = 'knn_model.sav'
with open(filename, 'wb') as model_file:
  pickle.dump(knn_model, model_file)