In [1]:
import pandas as pd
# Load the training table and print its column dtypes and non-null counts.
df = pd.read_csv('train_data.csv')
df.info()

In [2]:
# Summary statistics for every column of df, not only the numeric ones.
df.describe(include='all')

In [5]:
# Keep only the object-dtype columns of df so their values can be inspected.
object1=df.select_dtypes(include=['object'])

In [6]:
# For each object-dtype column, print its name followed by its distinct values.
for name, values in object1.items():
    print(name)
    print(values.unique())

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

# Load the raw training table.
df = pd.read_csv('train_data.csv')

# Integer-encode the string-valued feature columns. Each fitted encoder is
# kept in `label_encoders` so the same mapping can be reused later.
cat_cols = ['proto', 'service', 'state']
label_encoders = {col: LabelEncoder().fit(df[col]) for col in cat_cols}
for col, encoder in label_encoders.items():
    df[col] = encoder.transform(df[col])

# Integer-encode the target column.
label_le = LabelEncoder()
df['attack_cat'] = label_le.fit_transform(df['attack_cat'])

# Every column other than the row id and the target is used as a feature.
features = df.columns.difference(['id', 'attack_cat'])

# Stratified 70/30 train/validation split.
X_train, X_val, y_train, y_val = train_test_split(
    df[features],
    df['attack_cat'],
    test_size=0.3,
    random_state=42,
    stratify=df['attack_cat'],
)

# Multiclass LightGBM classifier.
model = LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    num_leaves=64,
    max_depth=-1,
    objective='multiclass',
    num_class=len(label_le.classes_),
    random_state=42,
)

# Early stopping is configured through callbacks (not `early_stopping_rounds`);
# log_evaluation(0) silences the per-iteration evaluation log.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[early_stopping(stopping_rounds=10), log_evaluation(0)],
)

# Validation accuracy.
y_pred = model.predict(X_val)
acc = accuracy_score(y_val, y_pred)
print(f'验证集准确率: {acc:.4f}')

# Dump the estimator's parameters for reference.
print('模型参数:')
print(model.get_params())


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

# === 1. Load the training data ===
df = pd.read_csv('train_data.csv')

# String-valued feature columns that have to be integer-encoded.
cat_cols = ['proto', 'service', 'state']

# Fit one encoder per categorical column and keep it for the test set.
label_encoders = {col: LabelEncoder().fit(df[col]) for col in cat_cols}
for col, encoder in label_encoders.items():
    df[col] = encoder.transform(df[col])

# Encode the target column.
label_le = LabelEncoder()
df['attack_cat'] = label_le.fit_transform(df['attack_cat'])

# Feature columns: everything except the row id and the target.
features = df.columns.difference(['id', 'attack_cat'])

# === 2. Stratified 70/30 train/validation split ===
target = df['attack_cat']
X_train, X_val, y_train, y_val = train_test_split(
    df[features], target, test_size=0.3, random_state=42, stratify=target
)

# === 3. Define and train the LightGBM model ===
model = LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    num_leaves=64,
    max_depth=-1,
    objective='multiclass',
    num_class=len(label_le.classes_),
    random_state=42,
)
fit_callbacks = [early_stopping(stopping_rounds=10), log_evaluation(0)]
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=fit_callbacks)

# === 4. Evaluate on the validation split ===
y_pred_val = model.predict(X_val)
acc = accuracy_score(y_val, y_pred_val)
print(f'验证集准确率: {acc:.4f}')
print('模型参数:')
print(model.get_params())

# === 5. Load the test set and apply the same preprocessing ===
df_test = pd.read_csv('test_data.csv')

# Re-use the training encoders. A value that was not seen during training
# has no entry in the mapping and is encoded as -1.
for col in cat_cols:
    known_values = label_encoders[col].classes_
    mapping = {value: code for code, value in enumerate(known_values)}
    df_test[col] = df_test[col].map(mapping).fillna(-1).astype(int)

# Same feature columns as used for training.
X_test = df_test[features]

# === 6. Predict the test set ===
y_test_pred = model.predict(X_test)
y_test_labels = label_le.inverse_transform(y_test_pred)  # back to the original label strings

# === 7. Save the predictions ===
df_test['predicted_attack_cat'] = y_test_labels
df_test[['id', 'predicted_attack_cat']].to_csv('test_predictions.csv', index=False)
print("测试集预测完成，结果已保存到 test_predictions.csv")


In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

# === 1. Load the training data ===
df = pd.read_csv('train_data.csv')

# Integer-encode the categorical feature columns; keep each fitted encoder so
# the test set can be mapped with exactly the same codes.
cat_cols = ['proto', 'service', 'state']
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Encode the target column.
label_le = LabelEncoder()
df['attack_cat'] = label_le.fit_transform(df['attack_cat'])

# Feature columns: drop the row id, the target, and six hand-picked columns.
features = df.columns.difference(['id', 'attack_cat','ct_ftp_cmd','is_sm_ips_ports','trans_depth','swin','is_ftp_login','dwin'])

# === 2. Stratified 70/30 train/validation split ===
# The split happens BEFORE scaling so that the scaler never sees validation
# rows; fitting it on the full table would leak validation statistics into
# the preprocessing step.
X_train, X_val, y_train, y_val = train_test_split(
    df[features], df['attack_cat'], test_size=0.3, random_state=42, stratify=df['attack_cat']
)

# === 3. Standardise the features (scaler fitted on the training rows only) ===
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_val = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns, index=X_val.index)

# === 4. Define and train the LightGBM model ===
model = LGBMClassifier(
    objective='multiclass',
    num_class=len(label_le.classes_),
    learning_rate=0.2,
    num_leaves=64,
    max_depth=-1,
    random_state=42,
    n_estimators=100,
    reg_alpha=0.5,   # L1 regularisation
    reg_lambda=2.0   # L2 regularisation
)

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[early_stopping(stopping_rounds=10), log_evaluation(0)]
)

# === 5. Evaluate on the validation split ===
y_pred_val = model.predict(X_val)
acc = accuracy_score(y_val, y_pred_val)
print(f'验证集准确率: {acc:.4f}')
print('模型参数:')
print(model.get_params())

# === 6. Load and preprocess the test set ===
df_test = pd.read_csv('test_data.csv')

# Categorical encoding with the training encoders; values not seen during
# training have no entry in the mapping and are encoded as -1.
for col in cat_cols:
    le = label_encoders[col]
    mapping = dict(zip(le.classes_, range(len(le.classes_))))
    df_test[col] = df_test[col].map(mapping).fillna(-1).astype(int)

# Standardise with the scaler fitted on the training rows.
X_test = pd.DataFrame(scaler.transform(df_test[features]), columns=features, index=df_test.index)

# === 7. Predict the test set ===
y_test_pred = model.predict(X_test)
y_test_labels = label_le.inverse_transform(y_test_pred)

# === 8. Save the predictions ===
df_test['attack_cat'] = y_test_labels
df_test[['id', 'attack_cat']].to_csv('test_predictions.csv', index=False)
print("测试集预测完成，结果已保存到 test_predictions.csv")


In [14]:
import matplotlib.pyplot as plt

# === 4.1 Print feature importances ===
# NOTE: relies on `model` and `features` left in the kernel by whichever
# training cell was executed last.
feature_importance = pd.Series(model.feature_importances_, index=features).sort_values(ascending=False)

print("\n各特征的重要性（按降序排列）：")
print(feature_importance)

# Bar chart of the most important features. The title is built from the number
# of bars actually drawn, so it cannot drift from the plotted data (the title
# previously said "Top 20" while 35 bars were shown).
top_n = min(35, len(feature_importance))
fig, ax = plt.subplots(figsize=(10, 6))
feature_importance.head(top_n).plot(kind='bar', ax=ax)
ax.set_title(f"Top {top_n} Feature Importances")
ax.set_ylabel("Importance")
fig.tight_layout()
plt.show()


In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
import joblib

# ================== 数据加载与特征工程 ==================

def feature_engineering(df):
    """Return a copy of ``df`` with nine derived traffic columns appended.

    Reads the ``sbytes``, ``dbytes``, ``spkts``, ``dpkts`` and ``dur`` columns.
    The input frame is not modified. Every denominator has 1e-6 added to it so
    that a zero value cannot cause a division by zero.

    Added columns, in order: ``bytes_ratio``, ``pkts_ratio``, ``flow_rate``,
    ``pkt_size_diff``, ``byte_per_pkt``, ``total_bytes``, ``total_pkts``,
    ``bytes_per_sec``, ``pkts_per_sec``. ``flow_rate`` and ``bytes_per_sec``
    are computed from the same expression.
    """
    eps = 1e-6
    byte_sum = df['sbytes'] + df['dbytes']
    pkt_sum = df['spkts'] + df['dpkts']
    seconds = df['dur'] + eps
    return df.assign(
        bytes_ratio=df['sbytes'] / (df['dbytes'] + eps),
        pkts_ratio=df['spkts'] / (df['dpkts'] + eps),
        flow_rate=byte_sum / seconds,
        pkt_size_diff=df['spkts'] - df['dpkts'],
        byte_per_pkt=byte_sum / (pkt_sum + eps),
        total_bytes=byte_sum,
        total_pkts=pkt_sum,
        bytes_per_sec=byte_sum / seconds,
        pkts_per_sec=pkt_sum / seconds,
    )

# Load both splits and append the derived traffic features.
train = feature_engineering(pd.read_csv('train_data.csv'))
test = feature_engineering(pd.read_csv('test_data.csv'))

# Categorical features: the '-' placeholder becomes an explicit 'unknown' level.
categorical_cols = ['proto', 'service', 'state']
for frame in (train, test):
    frame[categorical_cols] = frame[categorical_cols].replace('-', 'unknown')

# Label encoding. Any label outside the expected list is treated as 'Normal'.
classes = [
    'Generic', 'Reconnaissance', 'Normal', 'DoS', 'Fuzzers',
    'Worms', 'Backdoor', 'Analysis', 'Shellcode', 'Exploits',
]
le = LabelEncoder()
train['attack_cat'] = train['attack_cat'].where(train['attack_cat'].isin(classes), 'Normal')
y = le.fit_transform(train['attack_cat'])

# Design matrices: one-hot encode the categorical columns and pass every other
# column through unchanged. Categories unseen at fit time are ignored.
X = train.drop(columns=['id', 'attack_cat'])
X_test = test.drop(columns='id')
preprocessor = ColumnTransformer(
    transformers=[('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_cols)],
    remainder='passthrough',
)
X = preprocessor.fit_transform(X)
X_test = preprocessor.transform(X_test)

# Balanced class weights expanded to one weight per row, then a stratified
# 80/20 split that keeps features, labels and weights aligned.
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
sample_weights = class_weights[y]
X_train, X_val, y_train, y_val, w_train, w_val = train_test_split(
    X, y, sample_weights, test_size=0.2, stratify=y, random_state=42
)

# Weighted DMatrix objects for the native XGBoost API.
dtrain = xgb.DMatrix(X_train, label=y_train, weight=w_train)
dval = xgb.DMatrix(X_val, label=y_val, weight=w_val)

def train_model(params, dtrain, dval, num_round=2000, early_stopping=50):
    """Train a single booster with ``xgb.train`` and return it.

    Args:
        params: parameter dict forwarded to ``xgb.train``.
        dtrain: training ``DMatrix``.
        dval: validation ``DMatrix``; listed last in ``evals``.
        num_round: value passed as ``num_boost_round``.
        early_stopping: value passed as ``early_stopping_rounds``.

    Returns:
        The booster returned by ``xgb.train``.
    """
    watchlist = [(dtrain, 'train'), (dval, 'valid')]
    return xgb.train(
        params,
        dtrain,
        num_boost_round=num_round,
        evals=watchlist,
        early_stopping_rounds=early_stopping,
        verbose_eval=50,
    )

# Settings shared by every ensemble member.
base_params = {
    'objective': 'multi:softprob', 'num_class': len(classes), 'tree_method': 'hist',
    'eval_metric': 'mlogloss', 'n_jobs': -1, 'random_state': 42, 'verbosity': 0,
}

# One parameter dict per ensemble member: the shared settings plus that
# member's learning rate, tree shape, regularisation and sampling overrides.
param_list = [
    {**base_params,
     'learning_rate': 0.05, 'max_depth': 8, 'min_child_weight': 3, 'gamma': 0.3,
     'reg_alpha': 1.0, 'reg_lambda': 2.0, 'subsample': 0.7, 'colsample_bytree': 0.6},
    {**base_params,
     'learning_rate': 0.03, 'max_depth': 10, 'min_child_weight': 2, 'gamma': 0.2,
     'reg_alpha': 0.5, 'reg_lambda': 1.5, 'subsample': 0.8, 'colsample_bytree': 0.7},
    {**base_params,
     'learning_rate': 0.07, 'max_depth': 6, 'min_child_weight': 4, 'gamma': 0.4,
     'reg_alpha': 1.5, 'reg_lambda': 2.5, 'subsample': 0.6, 'colsample_bytree': 0.5},
]

# Train one booster per parameter set.
models = []
for params in param_list:
    models.append(train_model(params, dtrain, dval))


def predict_best(booster, dmatrix):
    """Class probabilities using only the rounds up to the best validation score.

    ``xgb.train`` returns the booster from the last round, and the native
    ``Booster.predict`` uses every trained round by default. Without
    ``iteration_range`` the rounds trained after the best score (the
    early-stopping patience window) would also be included.
    """
    return booster.predict(dmatrix, iteration_range=(0, booster.best_iteration + 1))


# Soft vote on the validation set: average the members' class probabilities.
proba_val = sum(predict_best(m, dval) for m in models) / len(models)
pred_val = np.argmax(proba_val, axis=1)
f1 = f1_score(y_val, pred_val, average='macro')
print(f"Ensemble Validation Macro F1: {f1:.4f}")

# Soft vote on the test set, then map class indices back to label strings.
dtest = xgb.DMatrix(X_test)
proba_test = sum(predict_best(m, dtest) for m in models) / len(models)
pred_test = np.argmax(proba_test, axis=1)
pred_cat = le.inverse_transform(pred_test)

# Save the predictions, the fitted preprocessor and every booster.
# NOTE(review): the saved boosters still contain all trained rounds; apply the
# same iteration_range when predicting with a reloaded model.
pd.DataFrame({'id': test['id'], 'attack_cat': pred_cat})\
    .to_csv('ensemble_xgb_predictions.csv', index=False)
joblib.dump(preprocessor, 'ensemble_xgb_preprocessor.pkl')
for idx, m in enumerate(models):
    m.save_model(f'xgb_model_{idx}.model')
print("Prediction and models saved.")


In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import joblib

# ================== 数据加载与特征工程 ==================
def feature_engineering(df):
    """Return a copy of ``df`` with nine derived traffic columns appended.

    Reads the ``sbytes``, ``dbytes``, ``spkts``, ``dpkts`` and ``dur`` columns.
    The input frame is not modified. Every denominator has 1e-6 added to it so
    that a zero value cannot cause a division by zero.

    Added columns, in order: ``bytes_ratio``, ``pkts_ratio``, ``flow_rate``,
    ``pkt_size_diff``, ``byte_per_pkt``, ``total_bytes``, ``total_pkts``,
    ``bytes_per_sec``, ``pkts_per_sec``. ``flow_rate`` and ``bytes_per_sec``
    are computed from the same expression.
    """
    eps = 1e-6
    byte_sum = df['sbytes'] + df['dbytes']
    pkt_sum = df['spkts'] + df['dpkts']
    seconds = df['dur'] + eps
    return df.assign(
        bytes_ratio=df['sbytes'] / (df['dbytes'] + eps),
        pkts_ratio=df['spkts'] / (df['dpkts'] + eps),
        flow_rate=byte_sum / seconds,
        pkt_size_diff=df['spkts'] - df['dpkts'],
        byte_per_pkt=byte_sum / (pkt_sum + eps),
        total_bytes=byte_sum,
        total_pkts=pkt_sum,
        bytes_per_sec=byte_sum / seconds,
        pkts_per_sec=pkt_sum / seconds,
    )

# Load both splits and append the derived traffic features.
train = pd.read_csv('train_data.csv')
test  = pd.read_csv('test_data.csv')
train = feature_engineering(train)
test  = feature_engineering(test)

# Categorical features: the '-' placeholder becomes an explicit 'unknown' level.
categorical_cols = ['proto','service','state']
for df in (train,test):
    df[categorical_cols] = df[categorical_cols].replace('-', 'unknown')

# Labels and sample weights. Any label outside the expected list is treated as
# 'Normal'; balanced class weights are expanded to one weight per row.
classes = ['Generic','Reconnaissance','Normal','DoS','Fuzzers','Worms','Backdoor','Analysis','Shellcode','Exploits']
le = LabelEncoder()
train['attack_cat'] = train['attack_cat'].map(lambda x: x if x in classes else 'Normal')
y = le.fit_transform(train['attack_cat'])
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
sample_weights = class_weights[y]

# Feature encoding: one-hot encode the categorical columns and pass every
# other column through unchanged.
X = train.drop(['id','attack_cat'], axis=1)
X_test = test.drop('id', axis=1)
preprocessor = ColumnTransformer([('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_cols)], remainder='passthrough')
X = preprocessor.fit_transform(X)
X_test = preprocessor.transform(X_test)

# Stratified 80/20 split that keeps features, labels and weights aligned.
X_train, X_val, y_train, y_val, w_train, w_val = train_test_split(
    X, y, sample_weights, test_size=0.2, stratify=y, random_state=42
)

# Define the three ensemble members.
xgb_clf = XGBClassifier(
    objective='multi:softprob', num_class=len(classes),
    learning_rate=0.02, max_depth=6,
    subsample=0.6, colsample_bytree=0.5,
    reg_alpha=5, reg_lambda=10,
    n_estimators=1000, use_label_encoder=False,
    eval_metric='mlogloss', random_state=42, n_jobs=-1
)

# LightGBM only performs row subsampling when subsample_freq (bagging_freq) is
# non-zero. With the default of 0 the subsample=0.6 setting is silently
# ignored, so the frequency is set explicitly to resample at every iteration.
lgb_clf = LGBMClassifier(
    objective='multiclass', num_class=len(classes),
    learning_rate=0.02, max_depth=6,
    subsample=0.6, subsample_freq=1, colsample_bytree=0.5,
    reg_alpha=5, reg_lambda=10,
    n_estimators=1000, random_state=42, n_jobs=-1
)

cat_clf = CatBoostClassifier(
    loss_function='MultiClass', learning_rate=0.02, depth=6,
    bootstrap_type='Bernoulli', subsample=0.6, reg_lambda=10,
    iterations=1000, random_seed=42, verbose=100
)

# Train each member with the per-row class-balancing weights.
xgb_clf.fit(X_train, y_train, sample_weight=w_train)
lgb_clf.fit(X_train, y_train, sample_weight=w_train)
cat_clf.fit(X_train, y_train, sample_weight=w_train)


def blend_proba(features):
    """Weighted soft vote: XGBoost counts twice, LightGBM and CatBoost once each."""
    return (2 * xgb_clf.predict_proba(features)
            + 1 * lgb_clf.predict_proba(features)
            + 1 * cat_clf.predict_proba(features)) / 4


# Validation: blended probabilities -> class indices -> report.
proba_val = blend_proba(X_val)
pred_val = np.argmax(proba_val, axis=1)
print(classification_report(y_val, pred_val, target_names=le.classes_))
f1 = f1_score(y_val, pred_val, average='macro')
print(f"Ensemble Validation Macro F1: {f1:.4f}")

# Test-set prediction, mapped back to label strings and written to CSV.
proba_test = blend_proba(X_test)
pred_test = np.argmax(proba_test, axis=1)
pred_cat = le.inverse_transform(pred_test)
pd.DataFrame({'id': test['id'], 'attack_cat': pred_cat}).to_csv(
    'hetero_ensemble_predictions.csv', index=False
)

# Persist the fitted preprocessor and the three models.
joblib.dump(preprocessor, 'hetero_preprocessor.pkl')
joblib.dump({'xgb': xgb_clf, 'lgb': lgb_clf, 'cat': cat_clf}, 'hetero_models.pkl')
print("Heterogeneous ensemble results saved.")


In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.utils.class_weight import compute_class_weight
import xgboost as xgb
import joblib

# ================== Data preprocessing ==================
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')

# Label names the model is expected to predict.
classes = [
    'Generic', 'Reconnaissance', 'Normal', 'DoS', 'Fuzzers',
    'Worms', 'Backdoor', 'Analysis', 'Shellcode', 'Exploits',
]

# Feature-engineering helper
def feature_engineering(df):
    """Return a copy of ``df`` with nine derived traffic columns appended.

    Reads the ``sbytes``, ``dbytes``, ``spkts``, ``dpkts`` and ``dur`` columns.
    The input frame is not modified. Every denominator has 1e-6 added to it so
    that a zero value cannot cause a division by zero.

    Added columns, in order: ``bytes_ratio``, ``pkts_ratio``, ``flow_rate``,
    ``pkt_size_diff``, ``byte_per_pkt``, ``total_bytes``, ``total_pkts``,
    ``bytes_per_sec``, ``pkts_per_sec``. ``flow_rate`` and ``bytes_per_sec``
    are computed from the same expression.
    """
    eps = 1e-6
    byte_sum = df['sbytes'] + df['dbytes']
    pkt_sum = df['spkts'] + df['dpkts']
    seconds = df['dur'] + eps
    return df.assign(
        bytes_ratio=df['sbytes'] / (df['dbytes'] + eps),
        pkts_ratio=df['spkts'] / (df['dpkts'] + eps),
        flow_rate=byte_sum / seconds,
        pkt_size_diff=df['spkts'] - df['dpkts'],
        byte_per_pkt=byte_sum / (pkt_sum + eps),
        total_bytes=byte_sum,
        total_pkts=pkt_sum,
        bytes_per_sec=byte_sum / seconds,
        pkts_per_sec=pkt_sum / seconds,
    )

# Append the derived traffic features to both splits.
train_data = feature_engineering(train_data)
test_data  = feature_engineering(test_data)

# Categorical features: the '-' placeholder becomes an explicit 'unknown' level.
categorical_cols = ['proto','service','state']
for df in (train_data, test_data):
    df[categorical_cols] = df[categorical_cols].replace('-', 'unknown')

# Separate features, labels and the test ids.
X = train_data.drop(['id','attack_cat'], axis=1)
y_raw = train_data['attack_cat']
X_test = test_data.drop('id', axis=1)
test_ids = test_data['id']

# Label encoding. Any label outside `classes` is treated as 'Normal'.
le = LabelEncoder()
y_mapped = y_raw.map(lambda x: x if x in classes else 'Normal')
y = le.fit_transform(y_mapped)

# Feature encoding: one-hot encode the categorical columns and pass every
# other column through unchanged.
preprocessor = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
], remainder='passthrough')
X = preprocessor.fit_transform(X)
X_test = preprocessor.transform(X_test)

# Balanced class weights expanded to one weight per row.
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
sample_weights = class_weights[y]

# Stratified 80/20 split that keeps features, labels and weights aligned.
X_train, X_val, y_train, y_val, w_train, w_val = train_test_split(
    X, y, sample_weights, test_size=0.2, stratify=y, random_state=42
)

# ================== Single XGBoost model ("model 2") with stronger L1/L2 regularisation ==================
params2 = {
    'objective': 'multi:softmax',
    'num_class': len(classes),
    'tree_method': 'hist',
    'learning_rate': 0.03,
    'max_depth': 10,
    'min_child_weight': 2,
    'gamma': 0.2,
    # Strengthened regularisation
    'reg_alpha': 2.0,   # L1 regularisation
    'reg_lambda': 5.0,  # L2 regularisation
    'subsample': 0.8,
    'colsample_bytree': 0.7,
    'eval_metric': 'mlogloss',
    'n_jobs': -1,
    'random_state': 42,
    'verbosity': 0
}

# Weighted DMatrix objects for the native XGBoost API.
dtrain = xgb.DMatrix(X_train, label=y_train, weight=w_train)
dval   = xgb.DMatrix(X_val,   label=y_val,   weight=w_val)

# Train with early stopping monitored on the last entry of `evals` ('valid').
model2 = xgb.train(
    params2,
    dtrain,
    num_boost_round=1000,
    evals=[(dtrain, 'train'), (dval, 'valid')],
    early_stopping_rounds=50,
    verbose_eval=50
)

# xgb.train returns the booster from the LAST round and the native
# Booster.predict uses every trained round by default. Restrict prediction to
# the rounds up to the best validation score; otherwise the rounds trained
# during the early-stopping patience window are included as well.
best_rounds = (0, model2.best_iteration + 1)

# Validation report. With multi:softmax, predict returns class indices as
# floats, hence the cast to int.
pred_val = model2.predict(dval, iteration_range=best_rounds).astype(int)
print(classification_report(y_val, pred_val, target_names=le.classes_))
f1 = f1_score(y_val, pred_val, average='macro')
print(f"Model2 XGBoost Validation Macro F1: {f1:.4f}")

# Test-set prediction, mapped back to label strings and written to CSV.
dtest = xgb.DMatrix(X_test)
pred_test = model2.predict(dtest, iteration_range=best_rounds).astype(int)
pred_cat = le.inverse_transform(pred_test)
result = pd.DataFrame({'id': test_ids, 'attack_cat': pred_cat})
result.to_csv('model2_xgb_with_reg.csv', index=False)

# Persist the fitted preprocessor and the booster.
# NOTE(review): the saved booster still contains all trained rounds; apply the
# same iteration_range when predicting with a reloaded model.
joblib.dump(preprocessor, 'model2_preprocessor.pkl')
model2.save_model('model2_xgb_with_reg.model')
print("Model 2 XGBoost with L1 & L2 regularization saved.")
