In [2]:
# <-- import library -->
import seaborn as sns
import pandas as pd
import numpy as np
import datetime
import os
%matplotlib inline
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# <-- config -->
_TODAY = datetime.datetime.today().date()
dataset_dic = "./dataset/"
dataset_list = ["./dataset_test_case_01.csv", "./dataset_test_case_02.csv", "dataset_test_case_03_Digital.csv", "dataset_test_case_03_Analog.csv", "dataset_test_case_04.csv"]
IoT_list = ['Heart_Rate', 'Soil_Moisture', 'Sound_Sensor', 'Temperature_and_Humidity', 'Water_Level', 'phValue']
test_list = ['Digital_Output', 'Analog_Output']
features = ['src.port', 'flow_duration', 'mqtt_duration', 'mqtt_connection_duration', 'mqtt_connection_ack_duration', 'mqtt_disconnection_duration', 'IoT_label', 'test_label']
target_dict = {dataset_list[0] : features[-2], 
          dataset_list[1] : features[-1],
          dataset_list[2] : features[-2], 
          dataset_list[3] : features[-2],
          dataset_list[4] : features[-2]}

# <-- train -->
for dataset in tqdm(dataset_list, desc='XGBoost', position=0):
    print('[*] load dataset : ' + dataset)
    df = pd.read_csv(dataset_dic + dataset)
    # <-- train set split -->
    print('[*] train set split')
    train_features = features[1:-2]
    target = target_dict[dataset]

    label = df[target].unique()
    class_mapping = {cls: i for i, cls in enumerate(label)}
    df[target] = df[target].map(class_mapping)

    X = df[train_features]
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=43)

    # <-- train XGBoost -->
    print('[*] train XGBoost')
    xgb_model = xgb.XGBClassifier()
    xgb_model.fit(X_train, y_train)

    y_pred = xgb_model.predict(X_test)
    print('[*] train result : ')
    accuracy = accuracy_score(y_test, y_pred)
    print('\tAccuracy \t: ', accuracy)
    percision = precision_score(y_test, y_pred, average='weighted')
    print('\tPercision \t: ', percision)
    recall = recall_score(y_test, y_pred, average='weighted')
    print('\tRecall \t\t: ', recall)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print('\tF1-Score \t: ', f1)

    print('[*] Confusion matrix : ')
    matrix = confusion_matrix(y_test, y_pred, labels=class_mapping, normalize='true')
    sns.heatmap(matrix, annot=True, xticklabels=label, yticklabels=label, cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

    print("[*] Feature Importances: \n{0}\n".format(xgb_model.feature_importances_))
    for name, value in zip(train_features, xgb_model.feature_importances_):
        print('{0}: {1:.3f}'.format(name, value))
    sns.barplot(x=xgb_model.feature_importances_, y=train_features)
    plt.show()


XGBoost:   0%|          | 0/5 [00:00<?, ?it/s]

[*] load dataset : ./dataset_test_case_01.csv
[*] train set split
[*] train XGBoost


XGBoost:   0%|          | 0/5 [00:02<?, ?it/s]

[*] train result : 
	Accuracy 	:  0.9311506332524926
	Percision 	:  0.9330167542116381
	Recall 		:  0.9311506332524926
	F1-Score 	:  0.931561107413692
[*] Confusion matrix : 





TypeError: '<' not supported between instances of 'str' and 'int'