In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.neural_network import MLPClassifier
from matplotlib import pyplot as pyplot
from warnings import simplefilter
import joblib
import time
import warnings
warnings.filterwarnings('ignore')
#显示所有的列
pd.set_option('display.max_columns', None)

#显示所有的行
pd.set_option('display.max_rows', None)

#设置value的显示长度为100，默认为50
pd.set_option('max_colwidth',100)


In [5]:
# 加载训练数据和测试数据
dftrain = pd.read_csv("./train70_reduced.csv")

simplefilter(action='ignore', category=FutureWarning)
seed = 7

# 获取和打印数据总的类别数
class_names = dftrain.target.unique()
print(class_names)

# 转换为分类数据,即将数据中标签形式数据转换为可编码类型
dftrain = dftrain.astype('category')  # 可以指定特定的列转为分类数据 df['col1'] = df['col1'].astype('category')

# 找出和打印分类标签列
cat_columns = dftrain.select_dtypes(['category']).columns
print('cat_columns ------------', cat_columns)

# 将标签列转换为编码数字格式，以方便输入模型
dftrain[cat_columns] = dftrain[cat_columns].apply(lambda x: x.cat.codes)

# 分离目标y和特征列x
x_columns = dftrain.columns.drop('target')
x_train = dftrain[x_columns].values
y_train = dftrain['target'].values





['legitimate' 'dos' 'malformed' 'bruteforce' 'slowite' 'flood']
cat_columns ------------ Index(['tcp.flags', 'tcp.time_delta', 'tcp.len', 'mqtt.conack.flags',
       'mqtt.conack.flags.reserved', 'mqtt.conack.flags.sp', 'mqtt.conack.val',
       'mqtt.conflag.cleansess', 'mqtt.conflag.passwd', 'mqtt.conflag.qos',
       'mqtt.conflag.reserved', 'mqtt.conflag.retain', 'mqtt.conflag.uname',
       'mqtt.conflag.willflag', 'mqtt.conflags', 'mqtt.dupflag',
       'mqtt.hdrflags', 'mqtt.kalive', 'mqtt.len', 'mqtt.msg', 'mqtt.msgid',
       'mqtt.msgtype', 'mqtt.proto_len', 'mqtt.protoname', 'mqtt.qos',
       'mqtt.retain', 'mqtt.sub.qos', 'mqtt.suback.qos', 'mqtt.ver',
       'mqtt.willmsg', 'mqtt.willmsg_len', 'mqtt.willtopic',
       'mqtt.willtopic_len', 'target'],
      dtype='object')


In [8]:
import optuna
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

# 定义要优化的超参数空间
def objective(trial):
    hidden_layer_sizes = tuple(
        trial.suggest_categorical('hidden_layer_sizes', [(10,), (50,), (50,50)])
    )
    activation = trial.suggest_categorical('activation', ['relu', 'logistic', 'tanh'])
    alpha = trial.suggest_loguniform('alpha', 1e-6, 1e1)
    learning_rate_init = trial.suggest_loguniform('learning_rate_init', 1e-2,1e1)
    max_iter = trial.suggest_categorical('max_iter', [100, 200])

    # 创建MLP分类器对象
    mlp = MLPClassifier(
        hidden_layer_sizes=hidden_layer_sizes,
        activation=activation,
        alpha=alpha,
        learning_rate_init=learning_rate_init,
        max_iter=max_iter
    )

    # 计算交叉验证分数
    score = cross_val_score(mlp, x_train, y_train, cv=3).mean()

    return score

# 创建Optuna优化对象
study = optuna.create_study(direction='minimize')

# 开始优化
study.optimize(objective, n_trials=100,n_jobs=12)

# 输出最优参数和得分
print("Best parameters:", study.best_params)
print("Best score:", study.best_value)


[32m[I 2023-04-12 11:54:57,221][0m A new study created in memory with name: no-name-0c409947-4bbd-4d10-a6af-179d547706bf[0m
[32m[I 2023-04-12 11:59:25,456][0m Trial 9 finished with value: 0.5962846638110388 and parameters: {'hidden_layer_sizes': (10,), 'activation': 'relu', 'alpha': 0.2900674624154826, 'learning_rate_init': 0.15267979821793826, 'max_iter': 100}. Best is trial 9 with value: 0.5962846638110388.[0m
[32m[I 2023-04-12 12:00:01,573][0m Trial 11 finished with value: 0.5000043169505061 and parameters: {'hidden_layer_sizes': (10,), 'activation': 'relu', 'alpha': 5.97021998869405, 'learning_rate_init': 7.805492021829246, 'max_iter': 100}. Best is trial 11 with value: 0.5000043169505061.[0m
[32m[I 2023-04-12 12:01:10,227][0m Trial 4 finished with value: 0.7096085454513837 and parameters: {'hidden_layer_sizes': (10,), 'activation': 'logistic', 'alpha': 0.3152206015553121, 'learning_rate_init': 0.6077835056191088, 'max_iter': 200}. Best is trial 11 with value: 0.50000431

Best parameters: {'hidden_layer_sizes': (50, 50), 'activation': 'tanh', 'alpha': 0.00023327600952655704, 'learning_rate_init': 2.5832194615396338, 'max_iter': 200}
Best score: 0.03848110762108413
