In [None]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the raw flow records from the CICIDS2017 CSV file.
data = pd.read_csv('../data/CICIDS2017/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv')

# Report the number of missing values per column (on the data as loaded).
print(data.isnull().sum())

# Strip stray whitespace from the header names so the 'Label' lookup below
# still works when the CSV header is padded (e.g. ' Label'). This is a no-op
# for headers that are already clean.
data.columns = data.columns.str.strip()

# StandardScaler raises ValueError on infinite values and dropna() only
# removes NaN, so convert +/-inf to NaN first and let dropna() discard them.
data = data.replace([float('inf'), float('-inf')], float('nan'))

# Drop every row that contains a missing (or formerly infinite) value.
data = data.dropna()

# Binary label encoding: 'BENIGN' -> 0, any other label -> 1.
data['Label'] = (data['Label'] != 'BENIGN').astype('int64')

# Separate the features from the target column.
X = data.drop(['Label'], axis=1)
y = data['Label']

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features. The scaler is fitted on the training split only
# and then applied to the test split, so no test statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Save the scaled training split together with its labels. The scaler returns
# a plain array, so the feature columns are written with integer headers.
output_path = Path('../data/processed/preprocessed_data.csv')
# Create the target directory if needed so to_csv does not fail on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)
processed_data = pd.DataFrame(X_train)
processed_data['Label'] = y_train.values
processed_data.to_csv(output_path, index=False)
