In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler

# โหลดข้อมูลจากไฟล์ CSV
df = pd.read_csv('UNSW_NB15_testing-set.csv')

In [2]:
# Filter 'attack_cat' column for 'Dos' and 'Normal' values
filtered_df = df[df['attack_cat'].isin(['DoS', 'Normal'])]

In [3]:
attack_cat_mapping = {'DoS': 1, 'Normal': 0}
filtered_df['attack_cat_encoded'] = filtered_df['attack_cat'].map(attack_cat_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['attack_cat_encoded'] = filtered_df['attack_cat'].map(attack_cat_mapping)


In [4]:
print(filtered_df[['attack_cat', 'attack_cat_encoded']].head())

  attack_cat  attack_cat_encoded
0     Normal                   0
1     Normal                   0
2     Normal                   0
3     Normal                   0
4     Normal                   0


In [5]:
X = df[['spkts', 'dpkts', 'sbytes', 'dbytes']]
y = filtered_df.loc[:, 'attack_cat_encoded']


In [6]:
from sklearn.utils import resample

# รวม X และ y เข้าด้วยกัน
combined_data = pd.concat([X, y], axis=1)

# หาจำนวนตัวอย่างที่น้อยที่สุด
min_samples = combined_data['attack_cat_encoded'].value_counts().min()

# สร้างข้อมูลที่ undersampled จากแต่ละกลุ่ม
undersampled_data = combined_data.groupby('attack_cat_encoded').apply(lambda x: resample(x, n_samples=min_samples, random_state=42))

# แยก X และ y ออกจาก undersampled_data
X_resampled = undersampled_data[['spkts', 'dpkts', 'sbytes', 'dbytes']]
y_resampled = undersampled_data['attack_cat_encoded']


In [7]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

# หา Information Gain ของแต่ละ feature
ig_scores = mutual_info_classif(X_resampled, y_resampled, discrete_features=False)

# แสดง Information Gain ของแต่ละ feature
for feature, ig_score in zip(X_resampled.columns, ig_scores):
    print(f'Information Gain for {feature}: {ig_score}')

# ใช้ MLPClassifier (Artificial Neural Network)
model = MLPClassifier()

# วนซ้ำโดยใช้ค่า random seed แต่ละครั้ง
for seed in [1, 20, 100, 200, 1000]:
    # แบ่งข้อมูลเป็นชุดฝึกและชุดทดสอบ
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=seed)

    # ใช้ MLPClassifier
    model.fit(X_train, y_train)

    # ทำนายค่า 'label' ด้วยชุดทดสอบ
    y_pred = model.predict(X_test)

    # ประเมินประสิทธิภาพ
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print(f'Seed = {seed}, Accuracy: {accuracy}')

    # แสดง Confusion Matrix
    conf_matrix = metrics.confusion_matrix(y_test, y_pred)
    print('Confusion Matrix:')
    print(conf_matrix)

    # แสดง Classification Report
    class_report = metrics.classification_report(y_test, y_pred)
    print('Classification Report:')
    print(class_report)


Information Gain for spkts: 0.21991764777792477
Information Gain for dpkts: 0.3301458638907455
Information Gain for sbytes: 0.5659586151161409
Information Gain for dbytes: 0.4256870684320475
Seed = 1, Accuracy: 0.8483489604565838
Confusion Matrix:
[[2133  349]
 [ 395 2029]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.84      0.86      0.85      2482
         1.0       0.85      0.84      0.85      2424

    accuracy                           0.85      4906
   macro avg       0.85      0.85      0.85      4906
weighted avg       0.85      0.85      0.85      4906

Seed = 20, Accuracy: 0.8171626579698329
Confusion Matrix:
[[1939  524]
 [ 373 2070]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.84      0.79      0.81      2463
         1.0       0.80      0.85      0.82      2443

    accuracy                           0.82      4906
   macro avg       0.82      0.82      0.82      