In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import f1_score

# 读取数据
data = pd.read_csv("data/fraudulent.csv")

# 处理缺失值
data.dropna(axis=1, thresh=int(data.shape[0] * 0.5), inplace=True)  # 剔除缺失值过多的列
imputer = SimpleImputer(strategy="most_frequent")  # 使用众数填充其余列的缺失值
data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# 划分训练集和测试集
X = data.drop("y", axis=1)
y = data["y"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# 定义模型列表
models = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    LogisticRegression(),
    SVC()
]

# 训练并测试模型，计算 F1 值
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print(f"{model.__class__.__name__}的F1值为: {f1}")

KNeighborsClassifier的F1值为: 0.838810641627543
DecisionTreeClassifier的F1值为: 0.863961813842482
LogisticRegression的F1值为: 0.8490718321226796
SVC的F1值为: 0.8585131894484412
