In [1]:
!pip install skrebate



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from skrebate import ReliefF
from sklearn import preprocessing
from sklearn.preprocessing import (StandardScaler, LabelEncoder, OneHotEncoder)
from sklearn import metrics
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix
from sklearn.metrics import accuracy_score # for calculating accuracy of model
from sklearn.model_selection import train_test_split # for splitting the dataset for training and testing
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report # for generating a classification report of model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#Load Data KDDTrain+ dan KDDTest+

In [4]:
#fungsi untuk memberikan feature pada dataset
def column_name(data):
  data.columns = [
      'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
      'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
      'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
      'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
      'dst_host_srv_count', 'dst_host_same_srv_rate','dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
      'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
      'dst_host_srv_rerror_rate', 'label', 'difficulty_level'
  ]
  return data

In [5]:
#fungsi untuk load dataset dari drive berformat csv
def load_data(url):
  return pd.read_csv(url, header=None)

In [6]:
#load data train
kddtrain = load_data('/content/drive/MyDrive/Persiapan Skripsi/Bismillah/IDS/dataset/NSL-KDD/KDDTrain+ (1).csv')
kddtrain = column_name(kddtrain)
kddtrain.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label,difficulty_level
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [7]:
#load data test
kddtest = load_data('/content/drive/MyDrive/Persiapan Skripsi/Bismillah/IDS/dataset/NSL-KDD/KDDTest+ (1).csv')
kddtest = column_name(kddtest)
kddtest.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label,difficulty_level
0,0,tcp,private,REJ,0,0,0,0,0,0,...,0.04,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21
1,0,tcp,private,REJ,0,0,0,0,0,0,...,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21
2,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0,normal,21
3,0,icmp,eco_i,SF,20,0,0,0,0,0,...,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0,saint,15
4,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,0.31,0.17,0.03,0.02,0.0,0.0,0.83,0.71,mscan,11


In [8]:
df = pd.concat([kddtrain, kddtest], ignore_index=True)

In [9]:
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label,difficulty_level
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [10]:
df.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,difficulty_level
count,148517.0,148517.0,148517.0,148517.0,148517.0,148517.0,148517.0,148517.0,148517.0,148517.0,...,148517.0,148517.0,148517.0,148517.0,148517.0,148517.0,148517.0,148517.0,148517.0,148517.0
mean,276.779305,40227.95,17088.85,0.000215,0.020523,0.000202,0.189379,0.004323,0.402789,0.255062,...,119.462661,0.534521,0.084103,0.145932,0.030584,0.256122,0.251304,0.13622,0.136397,19.27848
std,2460.683131,5409612.0,3703525.0,0.014677,0.240069,0.019417,2.01316,0.072248,0.490461,22.231375,...,111.232318,0.448061,0.194102,0.308638,0.108975,0.4285,0.429719,0.322741,0.335282,2.739757
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,11.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0
50%,0.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,72.0,0.6,0.02,0.0,0.0,0.0,0.0,0.0,0.0,20.0
75%,0.0,278.0,571.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,255.0,1.0,0.07,0.05,0.01,0.6,0.5,0.0,0.0,21.0
max,57715.0,1379964000.0,1309937000.0,1.0,3.0,3.0,101.0,5.0,1.0,7479.0,...,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,21.0


In [11]:
df.describe(include='object')

Unnamed: 0,protocol_type,service,flag,label
count,148517,148517,148517,148517
unique,3,70,11,40
top,tcp,http,SF,normal
freq,121569,48191,89820,77054


In [12]:
df.shape

(148517, 43)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148517 entries, 0 to 148516
Data columns (total 43 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     148517 non-null  int64  
 1   protocol_type                148517 non-null  object 
 2   service                      148517 non-null  object 
 3   flag                         148517 non-null  object 
 4   src_bytes                    148517 non-null  int64  
 5   dst_bytes                    148517 non-null  int64  
 6   land                         148517 non-null  int64  
 7   wrong_fragment               148517 non-null  int64  
 8   urgent                       148517 non-null  int64  
 9   hot                          148517 non-null  int64  
 10  num_failed_logins            148517 non-null  int64  
 11  logged_in                    148517 non-null  int64  
 12  num_compromised              148517 non-null  int64  
 13 

##Preprocessing Data

In [14]:
#menghapus kolom yg tidak digunakan
df.drop('difficulty_level', axis='columns', inplace=True)

In [15]:
df.shape

(148517, 42)

In [16]:
#cek data duplikat
data_duplikat = df[df.duplicated()]
print("Data Duplikat:")
data_duplikat

Data Duplikat:


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
126081,0,tcp,private,REJ,0,0,0,0,0,0,...,10,0.04,0.05,0.00,0.00,0.0,0.0,1.0,1.0,neptune
126139,0,tcp,private,S0,0,0,0,0,0,0,...,7,0.03,0.07,0.00,0.00,1.0,1.0,0.0,0.0,neptune
126154,0,udp,domain_u,SF,44,44,0,0,0,0,...,255,1.00,0.00,0.01,0.00,0.0,0.0,0.0,0.0,normal
126208,0,icmp,ecr_i,SF,1480,0,0,1,0,0,...,39,1.00,0.00,1.00,0.51,0.0,0.0,0.0,0.0,pod
126214,0,udp,domain_u,SF,44,44,0,0,0,0,...,254,1.00,0.01,0.01,0.00,0.0,0.0,0.0,0.0,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148440,0,tcp,private,S0,0,0,0,0,0,0,...,12,0.05,0.08,0.00,0.00,1.0,1.0,0.0,0.0,neptune
148458,0,icmp,ecr_i,SF,1032,0,0,0,0,0,...,255,1.00,0.00,1.00,0.00,0.0,0.0,0.0,0.0,smurf
148501,0,icmp,ecr_i,SF,1032,0,0,0,0,0,...,255,1.00,0.00,1.00,0.00,0.0,0.0,0.0,0.0,smurf
148511,0,icmp,ecr_i,SF,1032,0,0,0,0,0,...,255,1.00,0.00,1.00,0.00,0.0,0.0,0.0,0.0,smurf


In [17]:
#menghapus data duplikat
df = df.drop_duplicates()

In [18]:
df.reset_index(drop=True)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147902,1,tcp,smtp,SF,2599,293,0,0,0,0,...,186,0.73,0.13,0.00,0.00,0.00,0.00,0.26,0.00,mailbomb
147903,0,tcp,smtp,SF,794,333,0,0,0,0,...,141,0.72,0.06,0.01,0.01,0.01,0.00,0.00,0.00,normal
147904,0,tcp,http,SF,317,938,0,0,0,0,...,255,1.00,0.00,0.01,0.01,0.01,0.00,0.00,0.00,normal
147905,0,tcp,http,SF,54540,8314,0,0,0,2,...,255,1.00,0.00,0.00,0.00,0.00,0.00,0.07,0.07,back


In [19]:
#cek ulang data duplikat
#cek data duplikat
data_duplikat = df[df.duplicated()]
print("Data Duplikat:")
data_duplikat

Data Duplikat:


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label


In [20]:
#cek missing value dalam data
df.isnull().sum()

duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      0
dst_bytes                      0
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
num_compromised                0
root_shell                     0
su_attempted                   0
num_root                       0
num_file_creations             0
num_shells                     0
num_access_files               0
num_outbound_cmds              0
is_host_login                  0
is_guest_login                 0
count                          0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_rate                  0
srv_diff_h

##Transformasi Data

In [21]:
#Fungsi mengubah jenis serangan pada label kedalam kategori anomali
def change_label(data):
  data.label.replace(['apache2','back','land','neptune','mailbomb','pod','processtable','smurf','teardrop','udpstorm','worm'],'anomali',inplace=True)
  data.label.replace(['ftp_write','guess_passwd','httptunnel','imap','multihop','named','phf','sendmail','snmpgetattack','snmpguess','spy','warezclient','warezmaster','xlock','xsnoop'],'anomali',inplace=True)
  data.label.replace(['ipsweep','mscan','nmap','portsweep','saint','satan'],'anomali',inplace=True)
  data.label.replace(['buffer_overflow','loadmodule','perl','ps','rootkit','sqlattack','xterm'],'anomali',inplace=True)

In [22]:
change_label(df)

In [23]:
df.label.value_counts()

normal     76967
anomali    70940
Name: label, dtype: int64

###Label Encode

In [24]:
#Membuat sebuah dataframe dengan berisi label (Dos, Probe, R2L, U2R, normal) untuk nantinya di encode
multi_label = pd.DataFrame(df.label)

In [25]:
# label di encode menjadi 1 = normal , 0 = anomali
le2 = preprocessing.LabelEncoder()
enc_label = multi_label.apply(le2.fit_transform)
df['label'] = enc_label

###One Hot

In [32]:
#one-hot encoding
df = pd.get_dummies(df, columns=['protocol_type', 'service', 'flag'], prefix="", prefix_sep="")

In [33]:
#cek data setelah di encode
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147907 entries, 0 to 148516
Columns: 123 entries, duration to SH
dtypes: float64(15), int64(24), uint8(84)
memory usage: 57.0 MB


##Seleksi Fitur Numerik dan Kategorik

In [34]:
# Pisahkan fitur dan target
X = df.drop('label', axis=1)  #X berisikan fitur-fitur
y = df['label'] #y berisikan fitur target

In [35]:
num_features_to_select = 20  # jumlah fitur yang diinginkan
rfe = RFE(estimator=LogisticRegression(), n_features_to_select=num_features_to_select)

In [36]:
rfe.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [37]:
print("Selected features:")
print(rfe.support_)
print("Feature ranking:")
print(rfe.ranking_)

Selected features:
[False False False False False False False False False False False  True
 False False False False False False False False False False  True False
  True False False False False False False False False False False  True
  True False False False False  True False False False False False False
 False False False False False  True False  True  True False False False
 False False False False False  True False False False  True False False
 False False False False False False False False False False False False
  True False  True False False False False False False False False  True
 False False  True False False False False False False False  True False
 False False False False False False False  True  True  True False False
  True False]
Feature ranking:
[ 93  94  96  88  19   7  72   2  69   9   6   1   8  41  95  43 103   5
  13  47  48  32   1  44   1  12  16  34  73  82  18  17  27  39  46   1
   1  37  42  76  15   1  30  53 100  26  57  21  84  56  62  64  80   1
 

In [38]:
# Mendapatkan indeks fitur yang berhasil dipilih
selected_feature_indices = rfe.support_

In [39]:
# Mendapatkan nama kolom fitur yang berhasil dipilih
selected_feature_names = X.columns[selected_feature_indices]

In [40]:
selected_feature_names

Index(['su_attempted', 'srv_serror_rate', 'srv_rerror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'IRC', 'domain_u',
       'eco_i', 'ecr_i', 'http', 'imap4', 'ntp_u', 'pm_dump', 'smtp', 'sunrpc',
       'urp_i', 'RSTR', 'S0', 'S1', 'SF'],
      dtype='object')

In [41]:
# Cetak nama kolom fitur yang berhasil dipilih
print("Nama kolom yang berhasil dipilih:")
for feature in selected_feature_names:
    print(feature)

Nama kolom yang berhasil dipilih:
su_attempted
srv_serror_rate
srv_rerror_rate
dst_host_srv_serror_rate
dst_host_rerror_rate
IRC
domain_u
eco_i
ecr_i
http
imap4
ntp_u
pm_dump
smtp
sunrpc
urp_i
RSTR
S0
S1
SF


Nama kolom yang berhasil dipilih:
su_attempted
srv_serror_rate
srv_rerror_rate
dst_host_srv_serror_rate
dst_host_rerror_rate
IRC
domain_u
eco_i
ecr_i
http
imap4
ntp_u
pm_dump
smtp
sunrpc
urp_i
RSTR
S0
S1
SF

In [42]:
# Mengambil hanya fitur-fitur yang berhasil diseleksi
selected_features = X.iloc[:, selected_feature_indices]

In [43]:
# 1, 7, 10, 21, 27, 28, 29, 34, 35, 38

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147907 entries, 0 to 148516
Columns: 123 entries, duration to SH
dtypes: float64(15), int64(24), uint8(84)
memory usage: 57.0 MB


In [45]:
selected_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147907 entries, 0 to 148516
Data columns (total 20 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   su_attempted              147907 non-null  int64  
 1   srv_serror_rate           147907 non-null  float64
 2   srv_rerror_rate           147907 non-null  float64
 3   dst_host_srv_serror_rate  147907 non-null  float64
 4   dst_host_rerror_rate      147907 non-null  float64
 5   IRC                       147907 non-null  uint8  
 6   domain_u                  147907 non-null  uint8  
 7   eco_i                     147907 non-null  uint8  
 8   ecr_i                     147907 non-null  uint8  
 9   http                      147907 non-null  uint8  
 10  imap4                     147907 non-null  uint8  
 11  ntp_u                     147907 non-null  uint8  
 12  pm_dump                   147907 non-null  uint8  
 13  smtp                      147907 non-null  u

In [46]:
y.value_counts()

1    76967
0    70940
Name: label, dtype: int64

##Pembagian Data Train & Test (80:20)

In [47]:
# Pisahkan data menjadi data pelatihan dan pengujian
X_train, X_test, y_train, y_test = train_test_split(selected_features, y, test_size=0.2, random_state=42)

##Model KNN

In [48]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Normalisasi data
    ('knn', KNeighborsClassifier())  # Model KNN dengan K=3
])

In [49]:
# Latih model KNN dan normalisasi data sekaligus
pipeline.fit(X_train, y_train)

In [50]:
#cek score model
pipeline.score(X_train, y_train), pipeline.score(X_test, y_test)

(0.9489372491020495, 0.9463863160029747)

In [51]:
# Lakukan prediksi pada data pengujian
y_pred = pipeline.predict(X_test)

In [52]:
# Evaluasi performa model
accuracy = accuracy_score(y_test, y_pred)
print(f"Akurasi Model KNN: {accuracy}")

Akurasi Model KNN: 0.9463863160029747


In [53]:
# Tampilkan laporan klasifikasi yang mencakup metrik evaluasi yang lebih rinci
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.92      0.94     14273
           1       0.93      0.97      0.95     15309

    accuracy                           0.95     29582
   macro avg       0.95      0.95      0.95     29582
weighted avg       0.95      0.95      0.95     29582



##Optimasi Model KNN dengan GridSearchCV

In [66]:
parameter = {
    'knn__n_neighbors': [5],  # Coba beberapa nilai K yang berbeda
    'knn__weights': ['uniform', 'distance'],  # Coba dua jenis bobot
    'knn__metric': ['euclidean', 'manhattan']  # Coba dua jenis metrik jarak
}

In [67]:
model = GridSearchCV(pipeline, param_grid=parameter, cv=3, n_jobs=-1, verbose=1)

In [68]:
model.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


In [69]:
#cek model selama proses tuning
pd.DataFrame(model.cv_results_).sort_values("rank_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_knn__metric,param_knn__n_neighbors,param_knn__weights,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
2,0.151236,0.053072,213.567584,0.536495,manhattan,5,uniform,"{'knn__metric': 'manhattan', 'knn__n_neighbors...",0.947239,0.948709,0.94668,0.947543,0.000856,1
0,0.091204,0.011751,40.577518,1.139406,euclidean,5,uniform,"{'knn__metric': 'euclidean', 'knn__n_neighbors...",0.947087,0.948735,0.946452,0.947424,0.000962,2
3,0.129283,0.038195,209.708403,0.421822,manhattan,5,distance,"{'knn__metric': 'manhattan', 'knn__n_neighbors...",0.946859,0.948659,0.946274,0.947264,0.001015,3
1,0.13221,0.064184,36.873983,1.067951,euclidean,5,distance,"{'knn__metric': 'euclidean', 'knn__n_neighbors...",0.946656,0.948735,0.946274,0.947222,0.001081,4


In [70]:
#cek parameter model terbaik
model.best_params_

{'knn__metric': 'manhattan', 'knn__n_neighbors': 5, 'knn__weights': 'uniform'}

In [71]:
#cek score model dengan parameter terbaik
model.score(X_train, y_train), model.score(X_test, y_test)

(0.9490048594971476, 0.9465891420458387)

In [72]:
# Lakukan prediksi pada data pengujian
y_pred = model.predict(X_test)

In [73]:
# Evaluasi performa model
accuracy = accuracy_score(y_test, y_pred)
print("Akurasi Model KNN dengan Parameter Terbaik:", accuracy)

Akurasi Model KNN dengan Parameter Terbaik: 0.9465891420458387


In [74]:
# Tampilkan laporan klasifikasi yang mencakup metrik evaluasi yang lebih rinci
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.92      0.94     14273
           1       0.93      0.97      0.95     15309

    accuracy                           0.95     29582
   macro avg       0.95      0.95      0.95     29582
weighted avg       0.95      0.95      0.95     29582



##Save Model

In [75]:
import pickle

In [77]:
filename = '/content/drive/MyDrive/Persiapan Skripsi/Bismillah/IDS/OTW SEMPRO/model_knn_opt.sav'
pickle.dump(model, open(filename,'wb'))

##Tes Model Prediksi

In [78]:
scaler = StandardScaler()

In [79]:
selected_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147907 entries, 0 to 148516
Data columns (total 20 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   su_attempted              147907 non-null  int64  
 1   srv_serror_rate           147907 non-null  float64
 2   srv_rerror_rate           147907 non-null  float64
 3   dst_host_srv_serror_rate  147907 non-null  float64
 4   dst_host_rerror_rate      147907 non-null  float64
 5   IRC                       147907 non-null  uint8  
 6   domain_u                  147907 non-null  uint8  
 7   eco_i                     147907 non-null  uint8  
 8   ecr_i                     147907 non-null  uint8  
 9   http                      147907 non-null  uint8  
 10  imap4                     147907 non-null  uint8  
 11  ntp_u                     147907 non-null  uint8  
 12  pm_dump                   147907 non-null  uint8  
 13  smtp                      147907 non-null  u

In [84]:
input_data = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1)

input_data_as_numpy_array = np.array(input_data)

input_data_reshape = input_data_as_numpy_array.reshape(1,-1)

std_data = input_data_reshape
print(std_data)

prediction = model.predict(std_data)
print(prediction)

if (prediction[0] == 0):
    print('Anomali')
else :
    print('Normal')

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]]
[1]
Normal


