In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [2]:
def load_data(data_path):
    """Read the CSV file at ``data_path`` and return it as a DataFrame.

    Parameters
    ----------
    data_path : str or path-like
        Location of the CSV file; handed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, read with ``read_csv`` defaults.
    """
    # The input is assumed to be CSV-formatted.
    return pd.read_csv(data_path)

In [3]:
def split_data(data, test_size=0.2, random_state=42):
    """Split a table into train/test features and labels.

    The last column of ``data`` is taken as the label and every preceding
    column as a feature; the selection is positional, not by column name.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose final column holds the labels.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``. The default reproduces the
        previously hard-coded 80/20 split.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split``. The default reproduces the
        previously hard-coded seed.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``, in that order.
    """
    X = data.iloc[:, :-1]  # all feature columns
    y = data.iloc[:, -1]  # label column

    # Partition the rows into a training set and a test set.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [4]:
 # Load the feature CSV into `data`; the path is relative to the current working directory.
 # NOTE(review): this statement starts with a stray leading space. IPython evidently
 # accepted it (the cell ran as In [4]), but it would be an IndentationError if the
 # cell were exported to a plain .py script.
 data = load_data('../data/en_features_scl.csv')

In [5]:
# Summarize the loaded frame: row count, column count, dtypes and memory footprint.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9295 entries, 0 to 9294
Columns: 993 entries, 0 to Label
dtypes: float64(992), int64(1)
memory usage: 70.4 MB


In [6]:
# Preview the first five rows; the last column ("Label") is what split_data treats as the target.
data.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,983,984,985,986,987,988,989,990,991,Label
0,0.917893,-0.446911,0.165036,-0.862138,0.346598,-0.076533,-0.954673,0.497804,-0.036434,0.772667,...,0.008139,0.0,0.274141,0.128219,0.0,0.0,0.0,0.71406,0.194853,0
1,0.067741,0.416022,-0.63413,0.036708,0.897156,0.164955,-0.421121,-0.572758,0.290052,0.81119,...,0.907365,1.164968,0.920369,0.659421,0.59789,2.071712,0.602551,0.699861,1.366862,2
2,0.907648,-0.110102,0.939094,-0.921174,-0.768509,0.397498,-0.92978,0.359127,0.749208,0.987719,...,0.064376,0.118219,0.358266,0.118669,0.332982,0.0,0.0,0.6363,0.293007,0
3,0.958276,-0.042656,0.483127,-0.886986,0.281154,0.289998,-0.964709,0.096086,0.098463,0.995674,...,0.387142,0.475762,0.73102,0.694133,0.369052,2.000726,0.314952,0.955568,0.910409,0
4,-0.240267,0.365375,-0.741992,0.218018,0.920943,0.033666,-0.085969,-0.576424,0.183722,0.535149,...,0.780262,1.157436,0.887722,0.665232,0.791488,2.101362,0.560659,0.597252,1.407343,2


In [7]:
# Split features/labels into train and test partitions (split_data defaults: 20% test, seed 42).
X_train, X_test, y_train, y_test = split_data(data)

In [8]:
# Check the training partition's size and that only feature columns remain.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7436 entries, 7485 to 7270
Columns: 992 entries, 0 to 991
dtypes: float64(992)
memory usage: 56.3 MB


In [9]:
def save_model(model, model_path):
    """Serialize ``model`` to ``model_path`` with ``joblib.dump``.

    Parameters
    ----------
    model : object
        Any object joblib can pickle (in this notebook: fitted estimators).
    model_path : str, path-like, or file object
        Destination. For string / path-like targets, missing parent
        directories are created first so that saving into a folder that does
        not exist yet no longer fails. File objects are passed through as-is.

    Returns
    -------
    None
    """
    import os  # local import keeps this cell self-contained

    # joblib.dump does not create directories, so prepare the parent folder.
    if isinstance(model_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(model_path))
        if parent_dir:  # empty for a bare filename in the working directory
            os.makedirs(parent_dir, exist_ok=True)

    joblib.dump(model, model_path)


# Model loading
def load_model(model_path):
    """Deserialize and return the object stored at ``model_path``.

    The file is read with ``joblib.load``, which unpickles its contents.
    Unpickling can execute arbitrary code, so only load files that come from
    a trusted source.
    """
    return joblib.load(model_path)

In [10]:
# Inspect one training row: selecting a single position yields a Series indexed by feature name.
X_train.iloc[0].info()

<class 'pandas.core.series.Series'>
Index: 992 entries, 0 to 991
Series name: 7485
Non-Null Count  Dtype  
--------------  -----  
992 non-null    float64
dtypes: float64(1)
memory usage: 47.8+ KB


# NOTE(review): this reads rf_model_en.pkl before any cell in this notebook has
# written it (the save_model call for that file comes later), so it presumably
# relies on a file left by an earlier run — confirm, or move this check after the save.
test_model = load_model('./machine_model_pt/rf_model_en.pkl')

# Check how many input features the loaded model was fitted on
print(test_model.n_features_in_)


In [11]:
# Confirm that wrapping the row Series in pd.DataFrame produces a DataFrame.
type(pd.DataFrame(X_train.iloc[0]))

pandas.core.frame.DataFrame

In [12]:
# Transpose so the single sample becomes one row with one column per feature.
pd.DataFrame(X_train.iloc[0]).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,982,983,984,985,986,987,988,989,990,991
7485,-0.060317,0.285803,-0.721763,0.046385,0.905387,0.025148,-0.183045,-0.519955,0.3299,0.509422,...,1.440313,0.891747,1.073319,0.849709,0.624339,0.714614,2.120273,0.630238,0.607971,1.412452


# Predict for the first training sample. Selecting with a list (`iloc[[0]]`)
# keeps the result a one-row DataFrame with the original feature columns and
# dtypes, so the Series -> DataFrame -> transpose round trip is unnecessary.
test_model.predict(X_train.iloc[[0]])

In [13]:
import re

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
def evaluate_model(model, X_test, y_test):
    """Print accuracy, macro precision/recall/F1 and a per-class report.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict``.
    X_test : array-like
        Feature rows passed to ``model.predict``.
    y_test : array-like
        True labels aligned with ``X_test``.

    Returns
    -------
    None
        Results are written to stdout only.
    """
    y_pred = model.predict(X_test)

    # The three summary scores share the same macro averaging.
    macro = {"average": "macro"}
    precision = precision_score(y_test, y_pred, **macro)
    recall = recall_score(y_test, y_pred, **macro)
    f1 = f1_score(y_test, y_pred, **macro)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    # One print call, newline-separated: same stdout as five separate prints.
    print(
        f"Accuracy: {accuracy:.4f}",
        f"Precision: {precision:.4f}",
        f"Recall: {recall:.4f}",
        f"F1: {f1:.4f}",
        f"Classification Report:\n{report}",
        sep="\n",
    )

In [14]:
# Model training
def train_model(X_train, y_train, n_estimators=100, random_state=42):
    """Fit a random-forest classifier on the training data.

    Parameters
    ----------
    X_train : array-like
        Training feature rows.
    y_train : array-like
        Training labels aligned with ``X_train``.
    n_estimators : int, default 100
        Number of trees; the default reproduces the previously hard-coded value.
    random_state : int or None, default 42
        Seed for the forest; the default reproduces the previously hard-coded seed.

    Returns
    -------
    RandomForestClassifier
        The fitted model.
    """
    # A random forest classifier is used here as the example model.
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    return model

In [15]:
# Fit the random forest on the training partition.
model = train_model(X_train, y_train)

In [16]:
# Score the random forest on the held-out test partition.
evaluate_model(model, X_test, y_test)

Accuracy: 0.9758
Precision: 0.9761
Recall: 0.9761
F1: 0.9761
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       668
           1       0.97      0.97      0.97       617
           2       0.98      0.99      0.99       574

    accuracy                           0.98      1859
   macro avg       0.98      0.98      0.98      1859
weighted avg       0.98      0.98      0.98      1859



In [17]:
# Persist the fitted random forest; this is the file the earlier load_model check reads back.
save_model(model,"machine_model_pt/rf_model_en.pkl")

In [18]:
from sklearn.svm import SVC

In [19]:
# RBF-kernel SVM with class-balanced weights.
svm_model = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    class_weight='balanced',
    # Enables predict_proba; scikit-learn documents that this slows down fit
    # because probabilities are calibrated with an internal cross-validation.
    probability=True,
    random_state=42,
)

In [20]:
# Fit the SVM on the training partition.
svm_model.fit(X_train, y_train)

In [21]:
# Run the fitted SVM on the training rows and collect its decision scores
# (the displayed shape shows one score column per class).
decision_scores = svm_model.decision_function(X_train)
decision_scores.shape

(7436, 3)

In [22]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression on the SVM decision scores (Platt-style calibration).
# - `multi_class` is intentionally not passed: it is deprecated in recent
#   scikit-learn, and the default already uses the multinomial formulation for
#   the `saga` solver on a multiclass target.
# - `random_state` pins the stochastic `saga` solver so the saved calibrator is
#   reproducible, matching the other seeded models in this notebook.
# NOTE(review): decision_scores were computed on the same rows the SVM was
# trained on, so the calibrator only sees in-sample scores — consider
# out-of-fold scores (e.g. via cross-validation) instead.
platt_scaler = LogisticRegression(max_iter=1000, solver='saga', penalty='l2', random_state=42)
platt_scaler.fit(decision_scores, y_train)
save_model(platt_scaler,"machine_model_pt/platt_scaler_en.pkl")

In [23]:
# Persist the fitted SVM.
save_model(svm_model,"machine_model_pt/svm_model_en.pkl")

In [24]:
# Score the SVM on the held-out test partition.
evaluate_model(svm_model, X_test, y_test)

Accuracy: 0.9774
Precision: 0.9777
Recall: 0.9778
F1: 0.9778
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       668
           1       0.97      0.97      0.97       617
           2       0.99      0.99      0.99       574

    accuracy                           0.98      1859
   macro avg       0.98      0.98      0.98      1859
weighted avg       0.98      0.98      0.98      1859



In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
# Additional classifiers to compare; each is fitted and evaluated in its own cell below.
knn_model = KNeighborsClassifier(n_neighbors=7, weights='distance', metric='cosine')
lr_model = LogisticRegression(max_iter=1000, C=1.0, solver='saga', class_weight='balanced', random_state=42)
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, eval_metric='mlogloss',random_state=42)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
# AdaBoost over a random-forest base estimator.
# - `algorithm='SAMME.R'` is no longer passed: that option is deprecated in
#   recent scikit-learn releases and rejected by the newest ones. On versions
#   where it is still the default, omitting it behaves identically.
# - `random_state` is set because AdaBoost re-seeds each clone of the base
#   estimator from its own random state; without it the run is not
#   reproducible even though `rf_model` is seeded.
ada_model = AdaBoostClassifier(estimator=rf_model, n_estimators=50, random_state=42)

In [26]:
# Fit the k-nearest-neighbours model, then score it on the held-out test partition.
knn_model.fit(X_train, y_train)
evaluate_model(knn_model, X_test, y_test)

Accuracy: 0.9763
Precision: 0.9767
Recall: 0.9766
F1: 0.9766
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       668
           1       0.97      0.97      0.97       617
           2       0.99      0.99      0.99       574

    accuracy                           0.98      1859
   macro avg       0.98      0.98      0.98      1859
weighted avg       0.98      0.98      0.98      1859



In [27]:
# Fit the logistic-regression model, then score it on the held-out test partition.
lr_model.fit(X_train, y_train)
evaluate_model(lr_model, X_test, y_test)

Accuracy: 0.9753
Precision: 0.9757
Recall: 0.9757
F1: 0.9757
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       668
           1       0.97      0.97      0.97       617
           2       0.99      0.99      0.99       574

    accuracy                           0.98      1859
   macro avg       0.98      0.98      0.98      1859
weighted avg       0.98      0.98      0.98      1859



In [28]:
# Persist the fitted k-nearest-neighbours model.
save_model(knn_model,"machine_model_pt/knn_model_en.pkl")

In [29]:
# Persist the fitted logistic-regression model.
save_model(lr_model,"machine_model_pt/lr_model_en.pkl")

In [30]:
# Fit the XGBoost model, then score it on the held-out test partition.
xgb_model.fit(X_train, y_train)
evaluate_model(xgb_model, X_test, y_test)

Accuracy: 0.9763
Precision: 0.9767
Recall: 0.9767
F1: 0.9767
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       668
           1       0.97      0.97      0.97       617
           2       0.99      0.99      0.99       574

    accuracy                           0.98      1859
   macro avg       0.98      0.98      0.98      1859
weighted avg       0.98      0.98      0.98      1859



In [31]:
# Persist the fitted XGBoost model.
save_model(xgb_model,"machine_model_pt/xgb_model_en.pkl")

In [32]:
# Fit AdaBoost (random-forest base estimator), then score it on the held-out test partition.
ada_model.fit(X_train, y_train)
evaluate_model(ada_model, X_test, y_test)



Accuracy: 0.9769
Precision: 0.9772
Recall: 0.9772
F1: 0.9772
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       668
           1       0.97      0.97      0.97       617
           2       0.99      0.99      0.99       574

    accuracy                           0.98      1859
   macro avg       0.98      0.98      0.98      1859
weighted avg       0.98      0.98      0.98      1859



In [33]:
# Persist the fitted AdaBoost model.
save_model(ada_model,"machine_model_pt/ada_model_en.pkl")