1. 模型：随机森林分类器
2. 自变量：物种组成
3. 因变量：age_group分类

In [30]:
# 1. Load the data
import pandas as pd
from sklearn import preprocessing
## Feature table: the column at position 5 becomes the row index, the five
## taxonomy-rank columns are dropped, and the table is transposed so that
## each remaining column (presumably one per sample -- confirm against the
## CSV) becomes a row of df_X.
df = pd.read_csv("sub_sp.csv",sep=",",header=0,index_col=5)
df.drop(columns=["Kindom","Phylum","Class","Order","Family"],inplace=True)
df_X = df.T
## age_group label of every sample, indexed by sample ID
df = pd.read_csv("sub_meta.csv",sep=",",header=0)
df_y = df[['sample', 'age_group']].set_index('sample')
## Put the labels in exactly the same row order as the features.
## train_test_split pairs X and y by position, not by index, so any ordering
## difference between the two CSV files would silently mislabel samples.
## .loc raises KeyError if a sample in df_X has no metadata row.
df_y = df_y.loc[df_X.index]
## Encode the string labels as integers
le = preprocessing.LabelEncoder()
age_label = le.fit_transform(df_y['age_group'])
## fit_transform assigns code i to le.classes_[i], so this is the class -> code map
label_mapping = dict(zip(le.classes_, range(len(le.classes_))))
print(label_mapping)
df_y = pd.DataFrame(age_label,index=df_y.index,columns=['age_group'])


{'0~6m': 0, '12~24m': 1, '24~36m': 2, '36m+': 3, '6~12m': 4}


In [31]:
# 2. Split the data first, then select features
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
import numpy as np
## 2.1 Train/test split (20% held out, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2, random_state=42)
## 2.2 Feature selection
### 2.2.1 Variance filter (the default threshold removes zero-variance features)
selector = VarianceThreshold()
X_train1 = selector.fit_transform(X_train)
# Apply the filter that was fitted on the training split. Refitting it on the
# test split would leak test-set statistics and could keep a different set of
# columns than X_train1, making the two matrices incompatible.
X_test1 = selector.transform(X_test)
# NOTE(review): the model cells fit and predict on X_train / X_test, not on
# X_train1 / X_test1 -- confirm whether the filtered matrices were meant to be used.

In [None]:
# 3. Build and train the baseline model
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
# y_train is a single-column DataFrame; flatten it to 1-D so that fit() does
# not receive a column vector (which triggers sklearn's DataConversionWarning).
rfc.fit(X_train, y_train.values.ravel())

In [None]:
# 4. Evaluate the baseline model on the held-out test split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt

predictions = rfc.predict(X_test)

# Fraction of test samples whose predicted label matches the true label
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy, sep="\n")

# Counts of true label (rows) versus predicted label (columns)
c_matrix = confusion_matrix(y_test, predictions)
print("Confusion_matrix:", c_matrix, sep="\n")

# Per-class precision / recall / F1 summary
c_report = classification_report(y_test, predictions)
print("Classification Report:", c_report, sep="\n")

In [None]:
# 5. Hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid to search
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20],
    'criterion': ["gini", "entropy"]
}

# Exhaustive grid search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)

# fit() cross-validates every parameter combination and, with the default
# refit=True, retrains the best one on the full training split. It returns the
# fitted search object itself; the tuned model is exposed as best_estimator_.
# y_train is flattened to 1-D so the per-fold fits do not each raise
# sklearn's DataConversionWarning about a column-vector y.
grid_search.fit(X_train, y_train.values.ravel())

# Report the best cross-validation score, its index in cv_results_,
# the winning parameters and the refitted model
print("Best score: ", grid_search.best_score_)
print("Best index: ", grid_search.best_index_)
print("Best parameters: ", grid_search.best_params_)
print("Best model: ", grid_search.best_estimator_)

In [40]:
# 6. Evaluate the tuned model
from sklearn.metrics import accuracy_score

# Predict on the held-out test split with the fitted grid search and score
# the result against the true labels
predictions = grid_search.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.5875


In [41]:
# 7. Persist the tuned model
import joblib

# Serialises the whole fitted GridSearchCV object (not only best_estimator_)
# to the working directory.
joblib.dump(value=grid_search, filename='RandomForestClassifier.pkl')

['RandomForestClassifier.pkl']