In [None]:
import pandas as pd
import os
from glob import glob

# Merge every per-province seasonal CSV in one folder into a single cleaned CSV.

# Folder containing the input CSV files.
folder_path = "/content/drive/MyDrive/RF/Processed_Seasonss/"

# Collect all CSV paths. glob returns matches in arbitrary (filesystem) order,
# so sort them to make the row order of the merged file reproducible.
csv_files = sorted(glob(os.path.join(folder_path, "*.csv")))
if not csv_files:
    # Fail early with a clear message instead of the opaque
    # "No objects to concatenate" error that pd.concat would raise.
    raise FileNotFoundError(f"No CSV files found in {folder_path!r}")

# One cleaned DataFrame per input file.
dataframes = []

for file in csv_files:
    df = pd.read_csv(file)

    # Drop the "Additional_Data" column when the file has one.
    if "Additional_Data" in df.columns:
        df = df.drop(columns=["Additional_Data"])

    # Derive the "Province" label from the file name: strip only the real
    # extension (not every ".csv" substring) and the "_Seasonal_Data" marker.
    file_stem = os.path.splitext(os.path.basename(file))[0]
    province_name = file_stem.replace("_Seasonal_Data", "")
    df["Province"] = province_name

    # Render Season as text without a float suffix ("1.0" -> "1").
    # Anchor the pattern at the end of the string so that values which merely
    # contain ".0" (e.g. "1.05") are left intact.
    df["Season"] = df["Season"].astype(str).str.replace(r"\.0$", "", regex=True)

    dataframes.append(df)

# Stack all files into one table with a fresh 0..n-1 index.
merged_df = pd.concat(dataframes, ignore_index=True)

# Write the merged table without the index column.
output_path = "/content/drive/MyDrive/RF/Merged_Seasons_Cleaned.csv"
merged_df.to_csv(output_path, index=False)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt

# ============== 1. Load the data ==============
file_path = "/content/drive/MyDrive/RF/Merged_Seasons_Cleaned.csv"
df = pd.read_csv(file_path)

# ============== 2. Define the target and the features ==============
# Regression target.
# NOTE(review): the original comment said Drought_Index stands for annual
# yield -- confirm this is the intended target column.
target_col = "Drought_Index"

# Columns that are not used as features. "Province" is deliberately kept so
# the model can pick up differences between provinces (one-hot encoded below).
# Remove "Yield" from this list if it should be available to the model.
drop_cols = [target_col, "Season", "Yield"]

# Feature matrix X and target vector y. errors="ignore" tolerates drop
# columns that are absent from the file.
X = df.drop(columns=drop_cols, errors="ignore")
y = df[target_col]

# Show which columns are used as features.
print("特征列：", X.columns.tolist())

# ============== 3. Encode the categorical feature (Province) ==============
# One-hot encode Province when present; drop_first=True omits one dummy
# column to avoid a redundant (collinear) indicator.
if "Province" in X.columns:
    X = pd.get_dummies(X, columns=["Province"], drop_first=True)

# ============== 4. Split the data (70% train, 20% validation, 10% test) ==============
# First hold out 30% of the rows ...
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
# ... then give one third of that hold-out (10% overall) to the test set and
# the remaining two thirds (20% overall) to the validation set.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=1/3,
    random_state=42,
)

print(f"训练集: {X_train.shape}, 验证集: {X_val.shape}, 测试集: {X_test.shape}")

# ============== 5. Fit a baseline random forest ==============
rf = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42)
rf.fit(X_train, y_train)

# ============== 6. Hyper-parameter tuning (optional) ==============
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

# Exhaustive search over param_grid: 3-fold cross-validation on the training
# set, scored by R^2, using all available cores.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=3,
    scoring="r2",
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Model refitted on the full training set with the best parameter combination.
best_rf = grid_search.best_estimator_
print("最佳参数:", grid_search.best_params_)
# ============== 7. Evaluate the model on the validation and test sets ==============
def evaluate(model, X_data, y_data, dataset_name="Dataset"):
    """Score ``model`` on one dataset, print the metrics and return them.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_data: Features passed to ``model.predict``.
        y_data: True target values to compare the predictions against.
        dataset_name: Label used as the prefix of the printed summary line.

    Returns:
        A ``(r2, rmse)`` tuple: the R^2 score and the root mean squared error.
    """
    predictions = model.predict(X_data)

    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_data, predictions))
    r2 = r2_score(y_data, predictions)

    print(f"{dataset_name} 评估: R²={r2:.3f}, RMSE={rmse:.3f}")
    return r2, rmse

# Report the tuned model's metrics on the validation and test sets.
print("\n=== 验证集表现 ===")
evaluate(best_rf, X_val, y_val, "验证集")

print("\n=== 测试集表现 ===")
evaluate(best_rf, X_test, y_test, "测试集")

# ============== 8. Feature importance ==============
feature_importances = best_rf.feature_importances_
feature_names = X_train.columns

# Feature indices ordered from most to least important.
sorted_idx = np.argsort(feature_importances)[::-1]

# Plot at most the 10 most important features. Cap the count at the number of
# available features: with fewer than 10, a fixed range(10) would not match
# the number of bar widths and the plot call would fail.
top_n = min(10, len(feature_importances))
top_idx = sorted_idx[:top_n]
plt.figure(figsize=(8, 6))
plt.barh(range(top_n), feature_importances[top_idx], align='center')
plt.yticks(range(top_n), [feature_names[i] for i in top_idx])
# Put the most important feature at the top of the chart.
plt.gca().invert_yaxis()
plt.xlabel("Feature Importance")
plt.title("Top {} Important Features".format(top_n))
plt.show()
