In [1]:
import pandas as pd


# Raw capture to clean and the destination of the cleaned feature table.
input_file = "ABB_teardrop_fast_0715.csv"
output_file = "teardrop4.0.csv"

# Columns removed from the feature table before any further processing.
columns_to_drop = ['sUrgRate', 'rUrgRate', 'sFinRate', 'rFinRate', 'sSynRate', 'rSynRate', 'sRstRate', 'rRstRate', 'sFragmentRate', 'rFragmentRate', 'sttl', 'sAckRate', 'rAckRate']


def _min_max_scale(frame):
    """Return *frame* with every column rescaled by (x - min) / (max - min).

    A constant column has a zero range and therefore comes back as NaN;
    the caller is expected to fill those values afterwards.
    """
    low = frame.min()
    span = frame.max() - low
    return (frame - low) / span


def clean_flow_features(data, drop_cols=None, skip_leading=9):
    """Clean a raw table and return the normalized feature table.

    Steps, in order: discard the first ``skip_leading`` columns, drop
    ``drop_cols``, fill missing values with 0, integer-encode every column
    that is not int64/float64, add ``totalBytes`` / ``totalPackets`` when
    their source columns exist, move the derived columns just before the
    target, min-max scale every feature column, and binarize the target
    (1.0 where it equals 1, else 0.0).

    The target is the last column that remains after the drop step. It is
    captured *before* the derived columns are appended; reading it afterwards
    would pick ``totalPackets`` and binarize that feature instead of the
    label.

    Args:
        data: Raw ``pandas.DataFrame`` as loaded from the CSV.
        drop_cols: Column labels to remove; defaults to the module-level
            ``columns_to_drop``. Missing labels raise ``KeyError``.
        skip_leading: Number of leading columns to discard.

    Returns:
        A new ``DataFrame``; the input frame is not modified.

    Raises:
        ValueError: If no columns remain after the skip/drop steps.
    """
    if drop_cols is None:
        drop_cols = columns_to_drop

    print("初始数据中的空值数量：")
    print(data.isnull().sum())

    data = data.iloc[:, skip_leading:]
    data = data.drop(drop_cols, axis=1)
    data = data.fillna(0)

    print("填充空值后的空值数量：")
    print(data.isnull().sum())

    if len(data.columns) == 0:
        raise ValueError("no columns left after dropping; cannot locate the target column")

    # Remember the label column by name now, while it is still the last one.
    target_col = data.columns[-1]

    # Encode everything that is not already int64/float64 as category codes.
    # The codes come back as a small integer dtype, so widen them to int64
    # to make them ordinary numeric columns for the scaling step below.
    non_numeric_cols = data.select_dtypes(exclude=['int64', 'float64']).columns
    for col in non_numeric_cols:
        data[col] = data[col].astype('category').cat.codes.astype('int64')

    # Derived totals are optional: only build the ones whose inputs exist,
    # and only reorder the ones that were actually built.
    derived_cols = []
    if 'sBytesSum' in data.columns and 'rBytesSum' in data.columns:
        data['totalBytes'] = data['sBytesSum'] + data['rBytesSum']
        derived_cols.append('totalBytes')
    if 'sPackets' in data.columns and 'rPackets' in data.columns:
        data['totalPackets'] = data['sPackets'] + data['rPackets']
        derived_cols.append('totalPackets')

    # Final layout: original features, derived features, target last.
    base_cols = [
        col for col in data.columns
        if col != target_col and col not in derived_cols
    ]
    data = data[base_cols + derived_cols + [target_col]].copy()

    # Scale every feature; the target is excluded by name, not by position.
    feature_cols = base_cols + derived_cols
    if feature_cols:
        data[feature_cols] = _min_max_scale(data[feature_cols])

    # Constant columns divide 0 by 0 above and become NaN; zero them out.
    data = data.fillna(0)

    data[target_col] = (data[target_col] == 1).astype(float)

    print("处理后的空值数量：")
    print(data.isnull().sum())

    return data


# The guard keeps the file importable without touching the CSV files; in a
# notebook cell or when run as a script __name__ is "__main__", so the
# pipeline executes exactly as before.
if __name__ == "__main__":
    data = pd.read_csv(input_file)
    data = clean_flow_features(data)
    data.to_csv(output_file, index=False)
    print(f"清洗后的数据已保存到 {output_file}")

初始数据中的空值数量：
sAddress           0
rAddress           0
sMACs              0
rMACs              0
sIPs             576
                ... 
sAckDelayMin    3052
rAckDelayMin    3052
sAckDelayAvg    3052
rAckDelayAvg    3052
state              0
Length: 61, dtype: int64
填充空值后的空值数量：
start              0
end                0
startOffset        0
endOffset          0
duration           0
sPackets           0
rPackets           0
sBytesSum          0
rBytesSum          0
sBytesMax          0
rBytesMax          0
sBytesMin          0
rBytesMin          0
sBytesAvg          0
rBytesAvg          0
sLoad              0
rLoad              0
sPayloadSum        0
rPayloadSum        0
sPayloadMax        0
rPayloadMax        0
sPayloadMin        0
rPayloadMin        0
sPayloadAvg        0
rPayloadAvg        0
sInterPacketAvg    0
rInterPacketAvg    0
rttl               0
sPshRate           0
rPshRate           0
sWinTCP            0
rWinTCP            0
sAckDelayMax       0
rAckDelayMax       0
sAckDe