In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Load the semicolon-separated training and test sets.
train_data = pd.read_csv("./data/traininingdata.txt", sep=";")
test_data = pd.read_csv("./data/testdata.txt", sep=";")

# Columns to discard before modelling (currently none).
filtered_column_name = []
train_data = train_data.drop(filtered_column_name, axis=1)
test_data = test_data.drop(filtered_column_name, axis=1)

# Set the raw target values (last column) aside: only the features are
# one-hot encoded, not the target.
y = train_data.iloc[:, -1].values
y_test = test_data.iloc[:, -1].values

# Encode the target labels as integer class codes. The encoder is fitted on
# the training labels only and then reused for the test labels.
le = LabelEncoder()
y = le.fit_transform(y)
y_test = le.transform(y_test)

# Drop the target column so that only the features remain.
train_data = train_data.iloc[:, :-1]
test_data = test_data.iloc[:, :-1]

# One-hot encode the training features; the first level of every categorical
# column is dropped and acts as the implicit baseline.
train_data = pd.get_dummies(train_data, drop_first=True)

# Encode the test features with ALL levels kept, then project them onto the
# training columns. Encoding the two frames independently with drop_first=True
# can yield different column sets (or a different dropped baseline) whenever a
# level is missing from one split, which silently misaligns X_test with X.
# Levels that never occur in the training data are discarded; training columns
# absent from the test data are filled with 0.
test_data = pd.get_dummies(test_data).reindex(
    columns=train_data.columns, fill_value=0
)

# Convert to float32 numpy arrays.
train_data = train_data.values.astype(np.float32)
test_data = test_data.values.astype(np.float32)

# Feature matrices used by the model cells.
X = train_data
X_test = test_data

# from sklearn.utils import resample

# # 根据类别标签分离出正例和负例
# positive = train_data[y == 1]
# negative = train_data[y == 0]

# # 上采样正例
# positive_upsampled = resample(
#     positive,
#     replace=True,  # 样本可以被多次抽样
#     n_samples=len(negative),  # 将正例的数量增加到与负例相同
#     random_state=42,
# )  # 随机数生成器种子

# # Merge the upsampled positives with the original negatives
# upsampled = np.vstack([positive_upsampled, negative])
# X_upsampled = upsampled[:, :-1]
# y_upsampled = upsampled[:, -1]
# X_upsampled = (X_upsampled - X_upsampled.mean(axis=0)) / X_upsampled.std(axis=0)

# # 下采样负例
# negative_downsampled = resample(
#     negative,
#     replace=False,  # 样本不能被多次抽样
#     n_samples=len(positive),  # 将负例的数量减少到与正例相同
#     random_state=42,
# )  # 随机数生成器种子

# # Merge the downsampled negatives with the original positives
# downsampled = np.vstack([negative_downsampled, positive])
# X_downsampled = downsampled[:, :-1]
# y_downsampled = downsampled[:, -1]
# X_downsampled = (X_downsampled - X_downsampled.mean(axis=0)) / X_downsampled.std(axis=0)

# # 此时你可以选择使用 X_upsampled, y_upsampled 或 X_downsampled, y_downsampled 来进行模型的训练

In [2]:
from sklearn import svm

# Support vector classifier with a polynomial kernel, fitted on the
# training features X and the encoded training labels y.
clf = svm.SVC(
    C=2.0,
    kernel="poly",
    gamma="auto",
)
clf.fit(X, y)

# y_pred = clf.predict(X_test)

# from sklearn.metrics import accuracy_score

# print("accuracy: ", accuracy_score(y_test, y_pred))

In [None]:
from sklearn.model_selection import cross_validate
from sklearn import metrics

# 5-fold cross-validation reporting accuracy, precision, recall and F1.
# NOTE(review): the folds are drawn from X_test / y_test, and cross_validate
# fits a fresh copy of the estimator on each fold, so the fit performed on
# X / y beforehand does not influence these scores - confirm this is intended.
cross_validate(
    estimator=clf,
    X=X_test,
    y=y_test,
    cv=5,
    scoring=["accuracy", "precision", "recall", "f1"],
)

{'fit_time': array([0.95518732, 1.14083266, 0.72142744, 0.75035501, 1.06036115]),
 'score_time': array([0.04692197, 0.04592013, 0.04541206, 0.04633236, 0.04563975]),
 'test_accuracy': array([0.88391376, 0.88225539, 0.87893864, 0.88440265, 0.88440265]),
 'test_precision': array([0.53846154, 0.48275862, 0.42222222, 0.52941176, 0.54166667]),
 'test_recall': array([0.06603774, 0.06603774, 0.08962264, 0.08530806, 0.06161137]),
 'test_f1': array([0.11764706, 0.11618257, 0.14785992, 0.14693878, 0.1106383 ])}

In [5]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_validate

# AdaBoost ensemble of 100 estimators with a fixed seed, fitted on the
# training features X and the encoded training labels y.
clf = AdaBoostClassifier(random_state=0, n_estimators=100)
clf.fit(X, y)

# 5-fold cross-validation reporting accuracy, precision, recall and F1.
# NOTE(review): the folds are drawn from X_test / y_test, and cross_validate
# fits a fresh copy of the estimator on each fold, so the fit on X / y above
# does not influence these scores - confirm this is intended.
cross_validate(
    estimator=clf,
    X=X_test,
    y=y_test,
    cv=5,
    scoring=["accuracy", "precision", "recall", "f1"],
)

{'fit_time': array([0.440377  , 0.43655038, 0.43436193, 0.43319225, 0.43302631]),
 'score_time': array([0.01539373, 0.01505542, 0.01500082, 0.01624966, 0.0153048 ]),
 'test_accuracy': array([0.90270868, 0.90049751, 0.89662797, 0.90707965, 0.89988938]),
 'test_precision': array([0.640625  , 0.61267606, 0.59398496, 0.67768595, 0.61538462]),
 'test_recall': array([0.38679245, 0.41037736, 0.37264151, 0.38862559, 0.37914692]),
 'test_f1': array([0.48235294, 0.49152542, 0.45797101, 0.4939759 , 0.46920821])}