In [1]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
def load_data(data_path):
    """Read the dataset stored at ``data_path`` into a DataFrame.

    Args:
        data_path: Location of the input file; it is parsed as CSV.

    Returns:
        pandas.DataFrame: The parsed table, exactly as ``pd.read_csv`` yields it.
    """
    # The input is assumed to be in CSV format.
    return pd.read_csv(data_path)

In [3]:
def split_data(data, test_size=0.2, random_state=42):
    """Split a table into train/test feature frames and label series.

    The last column of ``data`` is used as the label; every column before it
    is used as a feature.

    Args:
        data: DataFrame whose final column holds the label.
        test_size: Share (or absolute number) of rows placed in the test set,
            forwarded to ``train_test_split``. Defaults to 0.2.
        random_state: Seed forwarded to ``train_test_split`` so the shuffle is
            repeatable. Defaults to 42.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    X = data.iloc[:, :-1]  # all feature columns
    y = data.iloc[:, -1]  # label column

    # Split the dataset into a training set and a test set.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [6]:
 # Load the feature table from the CSV file at this relative path.
 data = load_data('../data/features.csv')

In [7]:
# Print a summary of the loaded frame (index range, column count, dtypes, memory usage).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50048 entries, 0 to 50047
Columns: 993 entries, 0 to Label
dtypes: float64(992), int64(1)
memory usage: 379.2 MB


In [8]:
# Preview the first five rows of the loaded frame.
data.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,983,984,985,986,987,988,989,990,991,Label
0,-0.999912,0.975937,-1.0,0.993943,0.985628,0.999032,-0.999644,0.998977,0.970259,0.9997,...,0.603955,3.19325,1.110541,0.62053,0.757994,1.292508,1.17032,1.944303,0.741259,0
1,0.999969,-0.987782,1.0,-0.967026,-0.92499,-0.957387,0.966745,-0.999295,-0.986237,-0.999977,...,0.374383,0.781333,0.667129,1.026212,0.848688,1.286111,1.244434,0.477745,0.329097,1
2,0.972141,1.0,-0.999986,0.999996,0.999957,0.999998,-0.999999,-0.99948,0.99944,-0.736712,...,0.332628,2.958337,0.692625,0.934682,0.767804,1.364639,0.75903,2.331257,0.687632,0
3,0.999982,0.676503,0.999994,0.124561,0.096042,-0.276798,0.032117,-0.999862,-0.474581,-0.999981,...,0.564951,0.912515,1.083173,1.797331,1.170643,1.871673,1.187126,1.278171,0.766261,1
4,-0.999967,0.997787,-1.0,0.99927,0.997904,0.999819,-0.999952,0.994997,0.994416,0.99972,...,1.054142,3.082209,0.931048,1.011173,1.176199,1.323773,1.60112,1.529859,0.865902,0


In [9]:
# Separate features from the label (last column) and create the train/test split.
X_train, X_test, y_train, y_test = split_data(data)

In [18]:
# Print a summary of the training feature frame produced by the split.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40038 entries, 21726 to 15795
Columns: 992 entries, 0 to 991
dtypes: float64(992)
memory usage: 303.3 MB


In [10]:
def save_model(model, model_path):
    """Serialize ``model`` to ``model_path`` with ``joblib.dump``.

    Args:
        model: The object to persist (e.g. a fitted estimator).
        model_path: Destination passed to ``joblib.dump``. When it is a
            ``str`` or path-like, any missing parent directory is created
            first; other values (such as an open file object) are forwarded
            untouched.
    """
    # joblib.dump does not create folders, so saving into a directory that
    # does not exist yet would otherwise fail with FileNotFoundError.
    if isinstance(model_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(model_path))
        if parent_dir:  # empty when the path has no directory component
            os.makedirs(parent_dir, exist_ok=True)
    joblib.dump(model, model_path)


# Model loading
def load_model(model_path):
    """Return the object that ``joblib.load`` reads from ``model_path``.

    Args:
        model_path: Location of a file previously written with joblib.

    Returns:
        The deserialized object.
    """
    # NOTE: joblib files are pickle-based; only load files from a trusted source.
    return joblib.load(model_path)

In [40]:
import re

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
def evaluate_model(model, X_test, y_test):
    """Print accuracy, precision, recall, F1 and a classification report.

    Args:
        model: Object exposing ``predict(X_test)``.
        X_test: Feature data passed to ``model.predict``.
        y_test: True labels the predictions are compared against.

    The precision, recall and F1 calls use the scikit-learn defaults.
    Nothing is returned; all results are written with ``print``.
    """
    y_pred = model.predict(X_test)

    # Compute every score before printing so a failure prints nothing partial.
    scores = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
    }
    report_text = classification_report(y_test, y_pred)

    for label, value in scores.items():
        print(f"{label}: {value:.4f}")
    print(f"Classification Report:\n{report_text}")

In [37]:
# Model training
def train_model(X_train, y_train):
    """Fit a random-forest classifier on the given training data.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        RandomForestClassifier: The fitted estimator (100 trees, seed 42).
    """
    # A random forest is used here as the example classifier.
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    # fit() returns the estimator itself, so the fitted forest is handed back.
    return forest.fit(X_train, y_train)

In [38]:
# Fit the random-forest model on the training split.
model = train_model(X_train, y_train)

In [41]:
# Print the random-forest metrics on the held-out test split.
evaluate_model(model, X_test, y_test)

Accuracy: 0.9796
Precision: 0.9912
Recall: 0.9677
F1: 0.9793
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      5019
           1       0.99      0.97      0.98      4991

    accuracy                           0.98     10010
   macro avg       0.98      0.98      0.98     10010
weighted avg       0.98      0.98      0.98     10010



In [42]:
# Persist the fitted random-forest model to the given relative path.
save_model(model,"machine_model_pt/rf_model.pkl")

In [43]:
from sklearn.svm import SVC

In [44]:
# Support-vector classifier with an RBF kernel and balanced class weights.
svm_model = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    class_weight='balanced',
    random_state=42,
)

In [45]:
# Fit the SVM on the training split.
svm_model.fit(X_train, y_train)

In [49]:
# Print the SVM metrics on the held-out test split.
evaluate_model(svm_model, X_test, y_test)

Accuracy: 0.9829
Precision: 0.9835
Recall: 0.9657
F1: 0.9826
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      5019
           1       1.00      0.97      0.98      4991

    accuracy                           0.98     10010
   macro avg       0.98      0.98      0.98     10010
weighted avg       0.98      0.98      0.98     10010



In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
# Additional candidate classifiers; each is fitted and evaluated in its own cell below.
knn_model = KNeighborsClassifier(n_neighbors=7, weights='distance', metric='cosine')
lr_model = LogisticRegression(max_iter=1000, C=1.0, solver='saga', class_weight='balanced', random_state=42)
# The label is binary (classes 0 and 1), so the binary log-loss metric applies;
# 'mlogloss' is the multi-class variant.
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, eval_metric='logloss', random_state=42)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
# AdaBoost re-seeds each base estimator from its own random_state, so the seed
# on rf_model alone does not make the ensemble reproducible; seed AdaBoost too.
# NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases — confirm
# the installed version still accepts it before re-running.
ada_model = AdaBoostClassifier(estimator=rf_model, algorithm='SAMME.R', n_estimators=50, random_state=42)

In [51]:
# Fit the k-nearest-neighbours model, then print its metrics on the test split.
knn_model.fit(X_train, y_train)
evaluate_model(knn_model, X_test, y_test)

Accuracy: 0.9795
Precision: 0.9796
Recall: 0.9746
F1: 0.9794
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      5019
           1       0.98      0.97      0.98      4991

    accuracy                           0.98     10010
   macro avg       0.98      0.98      0.98     10010
weighted avg       0.98      0.98      0.98     10010



In [52]:
# Fit the logistic-regression model, then print its metrics on the test split.
lr_model.fit(X_train, y_train)
evaluate_model(lr_model, X_test, y_test)

Accuracy: 0.9827
Precision: 0.9828
Recall: 0.9750
F1: 0.9825
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      5019
           1       0.99      0.97      0.98      4991

    accuracy                           0.98     10010
   macro avg       0.98      0.98      0.98     10010
weighted avg       0.98      0.98      0.98     10010





In [53]:
# Persist the fitted k-nearest-neighbours model.
save_model(knn_model,"machine_model_pt/knn_model.pkl")

In [54]:
# Persist the fitted logistic-regression model.
save_model(lr_model,"machine_model_pt/lr_model.pkl")

In [55]:
# Persist the fitted SVM model.
save_model(svm_model,"machine_model_pt/svm_model.pkl")

In [56]:
# Fit the XGBoost model, then print its metrics on the test split.
xgb_model.fit(X_train, y_train)
evaluate_model(xgb_model, X_test, y_test)

Accuracy: 0.9800
Precision: 0.9801
Recall: 0.9724
F1: 0.9798
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      5019
           1       0.99      0.97      0.98      4991

    accuracy                           0.98     10010
   macro avg       0.98      0.98      0.98     10010
weighted avg       0.98      0.98      0.98     10010



In [57]:
# Persist the fitted XGBoost model.
save_model(xgb_model,"machine_model_pt/xgb_model.pkl")

In [58]:
# Fit the AdaBoost ensemble (random-forest base estimator), then print its metrics on the test split.
ada_model.fit(X_train, y_train)
evaluate_model(ada_model, X_test, y_test)



Accuracy: 0.9789
Precision: 0.9792
Recall: 0.9657
F1: 0.9786
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      5019
           1       0.99      0.97      0.98      4991

    accuracy                           0.98     10010
   macro avg       0.98      0.98      0.98     10010
weighted avg       0.98      0.98      0.98     10010



In [59]:
# Persist the fitted AdaBoost model.
save_model(ada_model,"machine_model_pt/ada_model.pkl")