In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score
import numpy as np

# **1. 加载数据**
file_path = r"C:\Users\Lenovo\Desktop\数据导论\fraudulent.csv"  # 更改为你的文件路径
data = pd.read_csv(file_path)

# **2. 数据概况查看**
print("数据基本信息:")
print(data.info())

# **3. 处理缺失值**
# 计算每列的缺失值比例
missing_data = data.isnull().sum() / len(data) * 100
print("\n缺失值比例:")
print(missing_data)

# 使用众数填充缺失值
imputer = SimpleImputer(strategy='most_frequent')
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# **4. 数据分割**
# 特征和标签
X = data_imputed.drop(columns=["y"])  # 特征
y = data_imputed["y"]  # 标签

# 将数据集划分为训练集和测试集（80%训练，20%测试）
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# **5. 特征标准化**
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 强制转换为NumPy数组并确保是C顺序
X_train_scaled = np.array(X_train_scaled, order='C')
X_test_scaled = np.array(X_test_scaled, order='C')

# **6. 模型训练与评估**
# 逻辑回归
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train_scaled, y_train)
logistic_y_pred = logistic_model.predict(X_test_scaled)
logistic_f1 = f1_score(y_test, logistic_y_pred)
print(f"逻辑回归 F1 值: {logistic_f1:.4f}")

# 决策树
dt_model = DecisionTreeClassifier(random_state=1)
dt_model.fit(X_train, y_train)
dt_y_pred = dt_model.predict(X_test)
dt_f1 = f1_score(y_test, dt_y_pred)
print(f"决策树 F1 值: {dt_f1:.4f}")

# K-近邻
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
knn_y_pred = knn_model.predict(X_test)
knn_f1 = f1_score(y_test, knn_y_pred)
print(f"K-近邻 F1 值: {knn_f1:.4f}")

# 支持向量机
svm_model = SVC(random_state=1)
svm_model.fit(X_train_scaled, y_train)
svm_y_pred = svm_model.predict(X_test_scaled)
svm_f1 = f1_score(y_test, svm_y_pred)
print(f"支持向量机 F1 值: {svm_f1:.4f}")

# **7. 总结比较**
print("\n模型 F1 值比较:")
print(f"逻辑回归: {logistic_f1:.4f}")
print(f"决策树: {dt_f1:.4f}")
print(f"K-近邻: {knn_f1:.4f}")
print(f"支持向量机: {svm_f1:.4f}")

# 选择最佳模型
model_scores = {
    "逻辑回归": logistic_f1,
    "决策树": dt_f1,
    "K-近邻": knn_f1,
    "支持向量机": svm_f1,
}
best_model = max(model_scores, key=model_scores.get)

print(f"\n最佳模型是: {best_model}")




数据基本信息:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10086 entries, 0 to 10085
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   contain_IP             9996 non-null   float64
 1   is_long                9997 non-null   float64
 2   is_tinyurl             9998 non-null   float64
 3   contain_at             10004 non-null  float64
 4   contain_double_slash   9970 non-null   float64
 5   contain_dash           9992 non-null   float64
 6   contain_subdomain      9989 non-null   float64
 7   is_SSL                 9990 non-null   float64
 8   with_long_history      7291 non-null   float64
 9   contain_icon           8728 non-null   float64
 10  contain_ext_domain     8559 non-null   float64
 11  contain_email_to       8007 non-null   float64
 12  allow_right_click      6679 non-null   float64
 13  contain_pop_up_window  9807 non-null   float64
 14  contain_Iframe         9427 non-null   float64

AttributeError: 'Flags' object has no attribute 'c_contiguous'