# In [2]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

# Fraud-classification experiment: load the data, drop sparse columns,
# mode-impute the rest, split 80/20, then train four classifiers and print
# each one's F1 score on the held-out test set.

# Seed shared by the train/test split and by every stochastic model so that
# the printed scores are reproducible from run to run.
RANDOM_STATE = 1
# Columns whose share of missing values exceeds this fraction are dropped.
MAX_MISSING_FRACTION = 0.20

# Load the CSV file.
data = pd.read_csv("fraudulent.csv")

# Count the missing values in each column.
missing_values_count = data.isnull().sum()

# Total number of rows in the dataset.
total_rows = data.shape[0]

# Drop the columns in which more than 20% of the values are missing.
columns_to_drop = missing_values_count[
    missing_values_count > total_rows * MAX_MISSING_FRACTION
].index
data_cleaned = data.drop(columns=columns_to_drop)

# Fill the remaining gaps with each column's most frequent value (mode).
# NOTE(review): the imputer is fitted on the whole table, i.e. including the
# label column 'y' and the rows that later end up in the test set. Fit it on
# the training split only if a strictly leak-free evaluation is required.
imputer = SimpleImputer(strategy='most_frequent')
data_cleaned_imputed = pd.DataFrame(
    imputer.fit_transform(data_cleaned),
    columns=data_cleaned.columns,
)

# Separate the features from the label.
X = data_cleaned_imputed.drop('y', axis=1)
y = data_cleaned_imputed['y']

# Split the data: 80% for training, 20% for testing, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Build the classifiers. Every estimator that uses randomness internally is
# seeded; GaussianNB is deterministic and takes no seed.
models = {
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "Naive Bayes": GaussianNB(),
    "Neural Network": MLPClassifier(max_iter=1000, random_state=RANDOM_STATE),
}

# Train each model and report its F1 score on the test set.
for name, model in models.items():
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    f1 = f1_score(Y_test, Y_pred)
    print(f"{name} F1 score: {f1:.4f}")

# Output (recorded from a notebook run):
# Random Forest F1 score: 0.8649
# Gradient Boosting F1 score: 0.8612
# Naive Bayes F1 score: 0.5530
# Neural Network F1 score: 0.8678
