In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Load the raw train / test tables (semicolon-separated text files).
train_data = pd.read_csv("./data/traininingdata.txt", sep=";")
test_data = pd.read_csv("./data/testdata.txt", sep=";")

# Columns to discard before modelling (an empty list keeps every column).
filtered_column_name = []
train_data = train_data.drop(columns=filtered_column_name)
test_data = test_data.drop(columns=filtered_column_name)

# The last column is used as the target. Pull it out first so that only the
# features (not the target) go through one-hot encoding below.
y = train_data.iloc[:, -1].values
y_test = test_data.iloc[:, -1].values

# Encode the target labels as integer class codes (0/1 for a binary target).
# The encoder is fitted on the training labels only and reused for the test
# labels so both share the same label -> integer mapping.
le = LabelEncoder()
y = le.fit_transform(y)
y_test = le.transform(y_test)

# Drop the target column, leaving only the feature columns.
train_data = train_data.iloc[:, :-1]
test_data = test_data.iloc[:, :-1]

# One-hot encode the categorical features.
# The training frame defines the feature layout (first level of each
# categorical column dropped as the baseline). The test frame is encoded with
# ALL of its levels and then aligned to the training columns: this keeps the
# column set and order identical even when the test set is missing a category
# or contains an unseen one, and it guarantees the same baseline level is
# dropped on both sides. Categories absent from the test set become all-zero
# columns; categories unseen in training are discarded.
train_data = pd.get_dummies(train_data, drop_first=True)
test_data = pd.get_dummies(test_data).reindex(
    columns=train_data.columns, fill_value=0
)

# Convert to float32 numpy arrays.
train_data = train_data.values.astype(np.float32)
test_data = test_data.values.astype(np.float32)

# Feature matrices used by the model cells.
X = train_data
X_test = test_data

# Feature scaling (disabled).
# NOTE(review): if re-enabled, this standardizes the test set with its own
# mean/std rather than with the training statistics - confirm that is intended.
# X = (X - X.mean(axis=0)) / X.std(axis=0)
# X_test = (X_test - X_test.mean(axis=0)) / X_test.std(axis=0)

# from sklearn.utils import resample

# # Split the samples into positives and negatives by class label
# positive = train_data[y == 1]
# negative = train_data[y == 0]

# # Upsample the positives
# positive_upsampled = resample(
#     positive,
#     replace=True,  # a sample may be drawn more than once
#     n_samples=len(negative),  # grow the positives to match the number of negatives
#     random_state=42,
# )  # random number generator seed

# # Combine the upsampled positives with the original negatives
# # NOTE(review): at this point train_data holds only the feature columns (the
# # target column was removed above), so the last column sliced off below as
# # "y" would be a feature, not the label - fix before re-enabling.
# upsampled = np.vstack([positive_upsampled, negative])
# X_upsampled = upsampled[:, :-1]
# y_upsampled = upsampled[:, -1]
# X_upsampled = (X_upsampled - X_upsampled.mean(axis=0)) / X_upsampled.std(axis=0)

# # Downsample the negatives
# negative_downsampled = resample(
#     negative,
#     replace=False,  # a sample cannot be drawn more than once
#     n_samples=len(positive),  # shrink the negatives to match the number of positives
#     random_state=42,
# )  # random number generator seed

# # Combine the downsampled negatives with the original positives
# # NOTE(review): same caveat as above - the last column is a feature, not the label.
# downsampled = np.vstack([negative_downsampled, positive])
# X_downsampled = downsampled[:, :-1]
# y_downsampled = downsampled[:, -1]
# X_downsampled = (X_downsampled - X_downsampled.mean(axis=0)) / X_downsampled.std(axis=0)

# # At this point either X_upsampled, y_upsampled or X_downsampled, y_downsampled can be used to train the model

In [35]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters. The seed is fixed so that the
# bootstrap samples / feature subsets - and therefore the reported score -
# are reproducible across "Restart & Run All".
clf = RandomForestClassifier(random_state=42)
clf.fit(X, y)

# Alternative, explicit way of computing the same accuracy (kept for reference):
# y_pred = clf.predict(X_test)

# from sklearn.metrics import accuracy_score

# print("accuracy: ", accuracy_score(y_test, y_pred))

# Mean accuracy of the fitted model on the held-out test set (cell output).
clf.score(X_test, y_test)

0.9037929890523057

In [36]:
from sklearn.model_selection import cross_validate
from sklearn import metrics

# Metrics computed on every fold.
scoring_metrics = ["accuracy", "precision", "recall", "f1"]

# 5-fold cross-validation; the returned dict of per-fold timings and scores is
# the cell output.
# NOTE(review): this cross-validates on the *test* split - cross_validate
# refits fresh clones of `clf` on folds of X_test rather than evaluating the
# model fitted above. Confirm whether CV on (X, y) or metrics of the fitted
# model on the test set was intended.
cross_validate(clf, X_test, y_test, scoring=scoring_metrics, cv=5)

{'fit_time': array([0.55571198, 0.53867579, 0.5521965 , 0.55686951, 0.5589056 ]),
 'score_time': array([0.01799989, 0.01900792, 0.01800346, 0.01853299, 0.01799989]),
 'test_accuracy': array([0.89718076, 0.90049751, 0.90381426, 0.90431416, 0.8949115 ]),
 'test_precision': array([0.63265306, 0.64035088, 0.69      , 0.68627451, 0.60194175]),
 'test_recall': array([0.29245283, 0.34433962, 0.3254717 , 0.33175355, 0.29383886]),
 'test_f1': array([0.4       , 0.44785276, 0.44230769, 0.44728435, 0.39490446])}