In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Feature-selection pipeline:
#   1. load the preprocessed dataset,
#   2. fit a random forest and rank features by importance,
#   3. keep only the features SelectFromModel retains,
#   4. write the ranking and the reduced dataset to disk.

# Load the preprocessed data.
data = pd.read_csv('../data/processed/preprocessed_data.csv')

# Split into the feature matrix and the target column.
X = data.drop('Label', axis=1)
y = data['Label']

# Fit a random forest to estimate feature importances.
# random_state is fixed so the ranking is reproducible between runs.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X, y)

# Collect the importances into a table, most important feature first.
importances = clf.feature_importances_
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Show the ten most important features.
print(feature_importance.head(10))

# Persist the full importance ranking.
feature_importance.to_csv('../data/feature_importance.csv', index=False)

# Select features based on importance, reusing the already-fitted forest
# (prefit=True avoids training it a second time).
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X)

# transform() returns a bare array, so the column names are lost. Recover
# the names of the retained features from the selector's boolean mask so the
# reduced dataset keeps meaningful headers instead of 0, 1, 2, ...
selected_features = X.columns[model.get_support()]

# Save the reduced dataset together with the label column.
X_new_df = pd.DataFrame(X_new, columns=selected_features)
X_new_df['Label'] = y.values
X_new_df.to_csv('../data/processed/reduced_features_data.csv', index=False)
