In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import f1_score


pandas 和 numpy：用于数据处理和数值计算。
train_test_split：将数据分割为训练集和测试集。
SimpleImputer：处理缺失值。
StandardScaler：对数据进行标准化处理。
KNeighborsClassifier, DecisionTreeClassifier, LogisticRegression, SVC：不同的分类模型。
f1_score：评估模型性能的指标。

In [2]:
file_path = 'fraudulent.csv'
data = pd.read_csv(file_path)

print("\n初始数据的概况：")
print(data.info())  # 查看数据类型、缺失值和列信息
print("\n前五行数据示例：")
print(data.head())  # 展示前五行数据内容



初始数据的概况：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10086 entries, 0 to 10085
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   contain_IP             9996 non-null   float64
 1   is_long                9997 non-null   float64
 2   is_tinyurl             9998 non-null   float64
 3   contain_at             10004 non-null  float64
 4   contain_double_slash   9970 non-null   float64
 5   contain_dash           9992 non-null   float64
 6   contain_subdomain      9989 non-null   float64
 7   is_SSL                 9990 non-null   float64
 8   with_long_history      7291 non-null   float64
 9   contain_icon           8728 non-null   float64
 10  contain_ext_domain     8559 non-null   float64
 11  contain_email_to       8007 non-null   float64
 12  allow_right_click      6679 non-null   float64
 13  contain_pop_up_window  9807 non-null   float64
 14  contain_Iframe         9427 non-null   float

In [3]:
#处理缺失值
imputer = SimpleImputer(strategy='most_frequent')
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)
#分割特征和标签
X = data_imputed.drop(columns=['y'])  # 特征
y = data_imputed['y']  # 标签
#数据集划分
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
#数据标准化
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
#定义和训练分类模型
models = {
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=1),
    'Logistic Regression': LogisticRegression(max_iter=500, random_state=1),
    'SVM': SVC(random_state=1)
}
#模型训练和评估
for name, model in models.items():
    model.fit(X_train, y_train)  # 训练模型
    y_pred = model.predict(X_test)  # 预测
    f1 = f1_score(y_test, y_pred)  # 计算F1值
    print(f"{name} 模型的F1值: {f1:.4f}")


KNN 模型的F1值: 0.8379
Decision Tree 模型的F1值: 0.8667
Logistic Regression 模型的F1值: 0.8500
SVM 模型的F1值: 0.8615
