In [1]:
import pandas as pd
# Load the training set, then print its structural summary via DataFrame.info()
train_path = 'train_data.csv'
df = pd.read_csv(train_path)
df.info()

In [2]:
# Summary statistics for every column (include='all' covers non-numeric columns too)
df.describe(include='all')

In [5]:
# Keep only the object-dtype columns of the training frame for inspection
object1 = df.select_dtypes(include='object')

In [6]:
# For each object-dtype column, print its name followed by its distinct values
for column, values in object1.items():
    print(column)
    print(values.unique())

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

# Load the training data
df = pd.read_csv('train_data.csv')

# Label-encode the categorical feature columns, keeping each fitted encoder
# in `label_encoders` keyed by column name.
cat_cols = ['proto', 'service', 'state']
label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    encoded_column = le.fit_transform(df[col])
    df[col] = encoded_column
    label_encoders[col] = le

# Label-encode the target column
label_le = LabelEncoder()
df['attack_cat'] = label_le.fit_transform(df['attack_cat'])

# Feature columns: every column except the row id and the target
features = df.columns.difference(['id', 'attack_cat'])

# Stratified train/validation split (30% held out for validation)
feature_frame = df[features]
target = df['attack_cat']
X_train, X_val, y_train, y_val = train_test_split(
    feature_frame,
    target,
    test_size=0.3,
    random_state=42,
    stratify=target,
)

# Configure the LightGBM multiclass classifier
lgbm_params = dict(
    objective='multiclass',
    num_class=len(label_le.classes_),
    learning_rate=0.1,
    num_leaves=64,
    max_depth=-1,
    random_state=42,
    n_estimators=100,
)
model = LGBMClassifier(**lgbm_params)

# Train; early stopping is passed as a callback rather than through the
# early_stopping_rounds argument, and is monitored on the validation split.
fit_callbacks = [early_stopping(stopping_rounds=10), log_evaluation(0)]
model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    callbacks=fit_callbacks,
)

# Predict on the validation split and report accuracy
y_pred = model.predict(X_val)
acc = accuracy_score(y_val, y_pred)
print(f'验证集准确率: {acc:.4f}')

# Print the model's parameters
print('模型参数:')
print(model.get_params())


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

# === 1. Load the training data ===
df = pd.read_csv('train_data.csv')

# Categorical feature columns that need label encoding; the fitted encoders
# are kept in `label_encoders` so the test set can reuse the same codes.
cat_cols = ['proto', 'service', 'state']
label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    encoded_column = le.fit_transform(df[col])
    df[col] = encoded_column
    label_encoders[col] = le

# Label-encode the target column
label_le = LabelEncoder()
df['attack_cat'] = label_le.fit_transform(df['attack_cat'])

# Feature columns: every column except the row id and the target
features = df.columns.difference(['id', 'attack_cat'])

# === 2. Stratified train/validation split (30% held out) ===
feature_frame = df[features]
target = df['attack_cat']
X_train, X_val, y_train, y_val = train_test_split(
    feature_frame,
    target,
    test_size=0.3,
    random_state=42,
    stratify=target,
)

# === 3. Define and train the LightGBM model ===
lgbm_params = dict(
    objective='multiclass',
    num_class=len(label_le.classes_),
    learning_rate=0.1,
    num_leaves=64,
    max_depth=-1,
    random_state=42,
    n_estimators=100,
)
model = LGBMClassifier(**lgbm_params)

# Early stopping is monitored on the validation split
fit_callbacks = [early_stopping(stopping_rounds=10), log_evaluation(0)]
model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    callbacks=fit_callbacks,
)

# === 4. Evaluate on the validation split ===
y_pred_val = model.predict(X_val)
acc = accuracy_score(y_val, y_pred_val)
print(f'验证集准确率: {acc:.4f}')
print('模型参数:')
print(model.get_params())

# === 5. Load the test set and apply the same preprocessing ===
df_test = pd.read_csv('test_data.csv')

# Encode the test set's categorical columns with the encoders fitted on the
# training data; values not seen during training become -1.
for col in cat_cols:
    le = label_encoders[col]
    mapping = {category: code for code, category in enumerate(le.classes_)}
    codes = df_test[col].map(mapping)
    df_test[col] = codes.fillna(-1).astype(int)

# Test feature matrix, using the same columns as training
X_test = df_test[features]

# === 6. Predict the test set ===
y_test_pred = model.predict(X_test)
# Map the integer predictions back to the original label values
y_test_labels = label_le.inverse_transform(y_test_pred)

# === 7. Save the predictions ===
df_test['predicted_attack_cat'] = y_test_labels
submission = df_test[['id', 'predicted_attack_cat']]
submission.to_csv('test_predictions.csv', index=False)
print("测试集预测完成，结果已保存到 test_predictions.csv")


In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

# === 1. Load the training data ===
df = pd.read_csv('train_data.csv')

# Label-encode the categorical feature columns, keeping each fitted encoder
# so the test set can be mapped with the same codes later.
cat_cols = ['proto', 'service', 'state']
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Label-encode the target column
label_le = LabelEncoder()
df['attack_cat'] = label_le.fit_transform(df['attack_cat'])

# Feature columns: everything except the row id, the target, and six
# additional columns that are excluded from this experiment.
features = df.columns.difference(['id', 'attack_cat','ct_ftp_cmd','is_sm_ips_ports','trans_depth','swin','is_ftp_login','dwin'])

# === 2. Stratified train/validation split (30% held out) ===
# The split happens BEFORE scaling so that the scaler's mean/variance are
# estimated from training rows only and no validation statistics leak into
# the preprocessing.
X_train, X_val, y_train, y_val = train_test_split(
    df[features], df['attack_cat'], test_size=0.3, random_state=42, stratify=df['attack_cat']
)

# === 3. Standardize features ===
# Fit on the training split only, then apply the same transform to the
# validation split. New frames are built instead of overwriting `df`, so the
# cell can be re-run without compounding the transformation.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=features, index=X_train.index)
X_val = pd.DataFrame(scaler.transform(X_val), columns=features, index=X_val.index)

# === 4. Define and train the LightGBM model ===
model = LGBMClassifier(
    objective='multiclass',
    num_class=len(label_le.classes_),
    learning_rate=0.2,
    num_leaves=64,
    max_depth=-1,
    random_state=42,
    n_estimators=100,
    reg_alpha=0.5,   # L1 regularization
    reg_lambda=2.0   # L2 regularization
)

# NOTE: the validation split is also the early-stopping monitor, so the
# accuracy reported below is measured on data that influenced the stopping
# point and may be somewhat optimistic.
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[early_stopping(stopping_rounds=10), log_evaluation(0)]
)

# === 5. Evaluate on the validation split ===
y_pred_val = model.predict(X_val)
acc = accuracy_score(y_val, y_pred_val)
print(f'验证集准确率: {acc:.4f}')
print('模型参数:')
print(model.get_params())

# === 6. Load the test set and apply the same preprocessing ===
df_test = pd.read_csv('test_data.csv')

# Encode categorical columns with the training encoders; values not seen
# during training become -1.
for col in cat_cols:
    le = label_encoders[col]
    mapping = dict(zip(le.classes_, range(len(le.classes_))))
    df_test[col] = df_test[col].map(mapping).fillna(-1).astype(int)

# Standardize with the scaler fitted on the training split
X_test = pd.DataFrame(scaler.transform(df_test[features]), columns=features, index=df_test.index)

# === 7. Predict the test set ===
y_test_pred = model.predict(X_test)
# Map the integer predictions back to the original label values
y_test_labels = label_le.inverse_transform(y_test_pred)

# === 8. Save the predictions ===
df_test['attack_cat'] = y_test_labels
df_test[['id', 'attack_cat']].to_csv('test_predictions.csv', index=False)
print("测试集预测完成，结果已保存到 test_predictions.csv")


In [14]:
import matplotlib.pyplot as plt

# === 4.1 Feature importances of the fitted model ===
# Maximum number of top-ranked features to draw. The chart title is derived
# from the number of bars actually shown, so the title and the plot cannot
# disagree (previously the title said 20 while 35 bars were drawn).
top_n = 35

# `model` and `features` are expected to come from the training cell; pair
# each importance score with its feature name and sort in descending order.
feature_importance = pd.Series(model.feature_importances_, index=features).sort_values(ascending=False)

print("\n各特征的重要性（按降序排列）：")
print(feature_importance)

# Bar chart of the highest-ranked features
top_features = feature_importance.head(top_n)
plt.figure(figsize=(10, 6))
top_features.plot(kind='bar')
plt.title(f"Top {len(top_features)} Feature Importances")
plt.ylabel("Importance")
plt.tight_layout()
plt.show()
