In [5]:
# 读取csv文件
import pandas as pd
data = pd.read_csv("fraudulent.csv")

# 计算每列的缺失值数量
missing_values_count = data.isnull().sum()

# 计算数据集中的总行数
total_rows = data.shape[0]

# 剔除缺失值超过20%的列
columns_to_drop = missing_values_count[missing_values_count > total_rows * 0.20].index
data_cleaned = data.drop(columns=columns_to_drop)

# 众数填充
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = 'most_frequent')
data_cleaned_imputed = pd.DataFrame(imputer.fit_transform(data_cleaned),columns= data_cleaned.columns)

from sklearn.model_selection import train_test_split
# 将数据分为特征和标签
X = data_cleaned_imputed.drop('y', axis=1)
y = data_cleaned_imputed['y']

# 划分数据集，训练集比例为0.8，随机种子设置为1
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=1)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import f1_score

# 初始化不同的分类模型
models = {
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC()
}

# 训练每个模型并在测试集上计算F1分数
for name, model in models.items():
    # 训练模型
    model.fit(X_train, Y_train)
    # 在测试集上进行预测
    Y_pred = model.predict(X_test)
    # 计算F1分数并打印结果
    f1 = f1_score(Y_pred,Y_test)
    print(f"{name} F1 score:{f1:.4f}")



KNN F1 score:0.8412822517591868
Decision Tree F1 score:0.8708133971291866
Logistic Regression F1 score:0.85
SVM F1 score:0.8594249201277955
