In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier  # 可以替换为其他分类模型
from sklearn.metrics import f1_score
import numpy as np

# 1. Load the dataset.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs where this exact file exists. Consider a path relative to the
# notebook (or a configurable data directory) instead.
file_path = 'C:/Users/y2209/Git/homework2/fraudulent.csv'
data = pd.read_csv(file_path)

# 2. Inspect the raw data.
print("原始数据集基本信息：")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
data.info()
print("数据集描述：")
print(data.describe())

# 3. Handle missing values.
# Count missing entries per column and show only the columns that have any.
missing_values = data.isnull().sum()
print("缺失值统计：")
print(missing_values[missing_values > 0])

# A row without a label cannot be used for supervised training or scoring.
# Filling a missing label with the most frequent class would fabricate ground
# truth, so such rows are dropped instead (a no-op when 'y' has no gaps).
data_labeled = data.dropna(subset=['y'])

# 4. Separate features and label.
X = data_labeled.drop(columns='y')  # feature matrix
y = data_labeled['y']  # target

# Split 70% / 15% / 15% with random seed 1. The split happens BEFORE
# imputation so that held-out rows cannot influence the fill values.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=1)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=1)

# Fill missing feature values with each column's mode. The modes are learned
# from the training split only and then applied unchanged to the validation
# and test splits, which avoids leaking held-out information into training.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(X_train)
X_train = pd.DataFrame(imputer.transform(X_train), columns=X_train.columns, index=X_train.index)
X_val = pd.DataFrame(imputer.transform(X_val), columns=X_val.columns, index=X_val.index)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns, index=X_test.index)

# Reassemble the imputed rows (features plus label) in their original order
# so the post-imputation missing-value check covers the whole dataset.
data_imputed = pd.concat([X_train, X_val, X_test]).sort_index().assign(y=y)

# Verify that no missing values remain after imputation.
print("填充后的缺失值统计：")
print(data_imputed.isnull().sum())

print(f"训练集样本数量: {len(X_train)}, 验证集样本数量: {len(X_val)}, 测试集样本数量: {len(X_test)}")

# 5. Build and train the classifier (a random forest, used here as an example).
# fit() returns the fitted estimator itself, so construction and training
# can be written as a single chained expression.
model = RandomForestClassifier(random_state=1).fit(X_train, y_train)

# 6. Predict on the test split and score the predictions with F1.
y_pred = model.predict(X_test)
f1 = f1_score(y_true=y_test, y_pred=y_pred)

print(f"测试集的F1值: {f1:.4f}")

# Optional: evaluate on the validation split as well (if needed).
# y_val_pred = model.predict(X_val)
# val_f1 = f1_score(y_val, y_val_pred)
# print(f"验证集的F1值: {val_f1:.4f}")

原始数据集基本信息：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10086 entries, 0 to 10085
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   contain_IP             9996 non-null   float64
 1   is_long                9997 non-null   float64
 2   is_tinyurl             9998 non-null   float64
 3   contain_at             10004 non-null  float64
 4   contain_double_slash   9970 non-null   float64
 5   contain_dash           9992 non-null   float64
 6   contain_subdomain      9989 non-null   float64
 7   is_SSL                 9990 non-null   float64
 8   with_long_history      7291 non-null   float64
 9   contain_icon           8728 non-null   float64
 10  contain_ext_domain     8559 non-null   float64
 11  contain_email_to       8007 non-null   float64
 12  allow_right_click      6679 non-null   float64
 13  contain_pop_up_window  9807 non-null   float64
 14  contain_Iframe         9427 non-null   floa