# what if决策模拟器特征工程

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lightgbm import LGBMClassifier
from pandas import Series
from pandas.core.common import random_state
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV

# Use the SimHei font for matplotlib's sans-serif family
# (presumably so the Chinese labels used in this notebook render — confirm the font is installed).
plt.rcParams['font.sans-serif'] = ['SimHei']

# Load the preliminarily cleaned competition dataset (path is relative to the
# working directory) and preview its first 20 rows as the cell output.
df = pd.read_csv('../../data/intermediate/初步清洗_比赛数据.csv')
df.head(20)

# 接下来预设几种可以模拟的目标变量，然后用一个字典让其与对应的输入指标相对应。

In [None]:
def normalize(X):
    """Turn [0, 1]-scaled ordinal columns back into integer level codes.

    For every column the number of distinct values ``n`` is counted; the
    column is multiplied by ``n - 1`` and converted to ``int``.  Columns with
    exactly two distinct values (binary selectors) are removed.

    The frame is modified **in place** and also returned, matching the
    original contract.

    :param X: DataFrame whose columns hold evenly spaced levels in [0, 1]
        (assumption inherited from the scaling formula — verify for new data).
    :return: the same DataFrame object, with binary columns dropped and the
        remaining columns converted to integer codes.
    """
    binary_columns = []
    for col in list(X.columns):
        scaler = len(np.unique(X[col])) - 1
        if scaler == 1:  # binary selector: scheduled for removal
            binary_columns.append(col)
            continue
        # Round before casting: a level stored as 0.333333 gives 0.999999 after
        # scaling, which a bare astype(int) would truncate to 0 instead of 1.
        X[col] = X[col].multiply(scaler).round().astype(int)
    # Drop once, after the loop, so the frame is not reshaped while iterating.
    X.drop(columns=binary_columns, inplace=True)
    return X

In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif, chi2

pd.set_option('display.max_rows', 200)

# Name of the target column and the mutual-information cut-off used below.
TARGET_COLUMN = "学校整体满意度"
MI_SCORE_THRESHOLD = 0.162

# Skip the first five columns (college, major, grade, gender, political status).
X = df.iloc[:, 5:]
y = df[TARGET_COLUMN]

# Keep only columns whose maximum is <= 1.  The explicit copy makes X an
# independent frame, so the edits below never act on a slice of `df`
# (avoids chained-assignment warnings and copy-on-write surprises).
X = X.loc[:, X.max() <= 1].copy()

binary_columns = []
for col in X.columns:
    scaler = len(np.unique(X[col])) - 1
    if scaler == 1:  # binary selector: removed after the loop
        binary_columns.append(col)
        continue
    # Round before casting so that e.g. 0.999999 becomes 1 rather than 0.
    X[col] = X[col].multiply(scaler).round().astype(int)
X = X.drop(columns=binary_columns)
X = X.drop(TARGET_COLUMN, axis=1)

# The target is scaled by 4, i.e. it is assumed to have five evenly spaced levels.
y = y.multiply(4).round().astype(int)

# Method 1: mutual information
selector_mi = SelectKBest(score_func=mutual_info_classif, k='all')
X_mi = selector_mi.fit_transform(X, y)

mi_scores = pd.DataFrame({
    'feature': X.columns,
    'mutual_info_score': selector_mi.scores_
}).sort_values('mutual_info_score', ascending=False)

selected_features = mi_scores[mi_scores['mutual_info_score'] >= MI_SCORE_THRESHOLD]['feature'].tolist()

X = X[selected_features]

print("1. 互信息得分 (越高越好):")
print(mi_scores)

# # Method 2: chi-square test (disabled)
# selector_chi2 = SelectKBest(score_func=chi2, k='all')
# X_chi2 = selector_chi2.fit_transform(X, y)
#
# chi2_scores = pd.DataFrame({
#     'feature': X.columns,
#     'chi2_score': selector_chi2.scores_,
#     'chi2_p_value': selector_chi2.pvalues_
# }).sort_values('chi2_score', ascending=False)
#
# print("\n2. 卡方检验得分:")
# print(chi2_scores)

print(X)


In [None]:
# One stratified 80/20 shuffle split with a fixed seed.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(sss.split(X, y))
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Hyper-parameters of the LightGBM baseline classifier.
lgbm_params = {
    'n_estimators': 276,
    'learning_rate': 0.055387245108078006,
    'max_depth': 7,
    'num_leaves': 98,
    'subsample': 0.8454603231401624,
    'colsample_bytree': 0.7154221406536758,
    'reg_alpha': 7.377625743035262,
    'reg_lambda': 1.6389963806513481,
    'min_child_samples': 39,
    'min_child_weight': 0.000635427121143748,
    'random_state': 42,
}
model = lgb.LGBMClassifier(**lgbm_params)

# Fit on the training split, then score the held-out split.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("准确率:", accuracy_score(y_test, y_pred))
print("\n分类报告:")
print(classification_report(y_test, y_pred))

def _fit_and_report(estimator):
    """Fit on the training split, print accuracy and report, return test predictions."""
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)
    print("准确率:", accuracy_score(y_test, predictions))
    print("\n分类报告:")
    print(classification_report(y_test, predictions))
    return predictions


# XGBoost baseline
model1 = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42
)
y_pred1 = _fit_and_report(model1)

# Random-forest baseline
model2 = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42
)
y_pred2 = _fit_and_report(model2)

# Confusion matrix of `y_pred` against the held-out labels, plotted with the
# class labels taken from `model` (both are whatever the preceding statements left in scope).
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot(cmap='Blues', values_format='d')  # values_format='d' renders the cell counts as integers
plt.title('Confusion Matrix')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Also print the raw count matrix.
print("混淆矩阵（原始数值）:")
print(cm)

# Positions (within the test split) where prediction and truth disagree.
wrong_indices = np.where(y_pred != y_test)[0]

# Classes treated as minority classes for the error listing.
minority_classes = [0, 1, 2]

print("少数类错误预测详情:")
for true_class in minority_classes:
    # Misclassified samples whose true label is this minority class,
    # paired with the label that was predicted instead.
    errors_for_class = [
        (idx, y_pred[idx])
        for idx in wrong_indices
        if y_test.iloc[idx] == true_class and y_pred[idx] != true_class
    ]

    if not errors_for_class:
        print(f"\n真实类别 {true_class}: 无错误预测（但可能因为样本太少）")
        continue

    print(f"\n真实类别 {true_class} 被错误预测为:")
    for idx, pred_class in errors_for_class:
        print(f"  样本 {idx}: 预测为类别 {pred_class}")

print(lgb.__version__)

In [None]:
from sklearn.model_selection import StratifiedKFold
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

# Fixed LightGBM settings shared by every candidate in the searches below.
base_params = {
    'device': 'gpu',           # train on the GPU
    'gpu_platform_id': 0,      # explicitly pin the platform id
    'gpu_device_id': 0,
    'objective': 'multiclass',
    'num_class': len(np.unique(y.values)),
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'verbose': -1,             # silence training logs so the search output stays readable
}

lgb_model = lgb.LGBMClassifier(**base_params)

# Grid for GridSearchCV; each list currently holds a single value, with the
# wider candidate lists kept in the trailing comments.
param_grid = {
    # basic parameters
    'n_estimators': [50], # [50, 100, 200],
    'learning_rate': [0.1], # 'learning_rate': [0.01, 0.1, 0.2],
    'num_leaves': [31],# [31, 63, 127],
    'max_depth': [3], #[3, 6, 9],

    # over-fitting control
    'subsample': [0.8], #[0.8, 0.9, 1.0],
    'colsample_bytree': [0.8], #[0.8, 0.9, 1.0],
    'reg_alpha': [0.1], #[0, 0.1, 0.5],
    'reg_lambda': [0.5], #[0, 0.1, 0.5],
    #
    # training-process control
    'min_child_samples': [20], #[10, 20, 30],      # minimum number of samples per leaf
    'min_child_weight': [0.001], #[0.001, 0.01, 0.1], # minimum sum of weights per leaf

}

# # Grid search (disabled); the CV splitter below is still used by bayes_search.
stratified_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
#
# grid_search = GridSearchCV(
#     estimator=lgb_model,
#     param_grid=param_grid,
#     cv=stratified_cv,
#     verbose=1,
#     scoring='accuracy',
#     n_jobs=1
# )
# grid_search.fit(X_train, y_train)
#
#
# print("最佳参数:", grid_search.best_params_)
# print("最好分数:", grid_search.best_score_)

# Search space for BayesSearchCV (scikit-optimize dimensions).
param_bayes = {
    'n_estimators': Integer(30, 300),
    'learning_rate': Real(0.01, 0.3, prior='log-uniform'),
    'num_leaves': Integer(20, 150),
    'max_depth': Integer(2, 12),
    'subsample': Real(0.6, 1.0),
    'colsample_bytree': Real(0.6, 1.0),
    'reg_alpha': Real(1e-3, 10.0, prior='log-uniform'),
    'reg_lambda': Real(1e-3, 10.0, prior='log-uniform'),
    'min_child_samples': Integer(5, 50),
    'min_child_weight': Real(1e-4, 0.1, prior='log-uniform'),
}

# NOTE(review): bayes_search is only constructed here; no .fit() call is
# visible in this file, so the search itself never runs — confirm intent.
bayes_search = BayesSearchCV(
    lgb_model,
    param_bayes,
    n_iter=100,  # 100 iterations
    cv=stratified_cv,
    verbose=1,
    scoring='accuracy',
    n_jobs=1,
    random_state=42
)

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix
)

# Evaluate the y_pred currently in scope from several angles
# (weighted averages across classes).
print("准确率:", accuracy_score(y_test, y_pred))
print("精确率:", precision_score(y_test, y_pred, average='weighted'))
print("召回率:", recall_score(y_test, y_pred, average='weighted'))
print("F1分数:", f1_score(y_test, y_pred, average='weighted'))

In [None]:
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import StratifiedKFold

# Base parameters (n_estimators is left out because the Bayesian optimisation chooses it).
# These fixed settings are merged with the sampled ones for every trial.
base_params = {
    'device': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    'objective': 'multiclass',
    'num_class': len(np.unique(y.values)),
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'verbose': -1,
}

# 定义贝叶斯优化目标函数
def objective(trial):
    """Optuna objective: mean 3-fold stratified CV accuracy on the training split.

    Samples one LightGBM hyper-parameter set from ``trial``, merges it over
    the module-level ``base_params`` and scores the resulting classifier on
    ``X_train`` / ``y_train``.

    :param trial: the Optuna trial used to draw hyper-parameters.
    :return: mean cross-validated accuracy.
    """
    # Keep the suggest_* calls in this order so a seeded sampler draws the
    # same sequence of values.
    sampled = {}
    sampled['n_estimators'] = trial.suggest_int('n_estimators', 30, 300)
    sampled['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
    sampled['num_leaves'] = trial.suggest_int('num_leaves', 20, 150)
    sampled['max_depth'] = trial.suggest_int('max_depth', 2, 12)
    sampled['subsample'] = trial.suggest_float('subsample', 0.6, 1.0)
    sampled['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.6, 1.0)
    sampled['reg_alpha'] = trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True)
    sampled['reg_lambda'] = trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True)
    sampled['min_child_samples'] = trial.suggest_int('min_child_samples', 5, 50)
    sampled['min_child_weight'] = trial.suggest_float('min_child_weight', 1e-4, 0.1, log=True)

    # Sampled values take precedence over the shared base settings.
    merged = dict(base_params)
    merged.update(sampled)
    classifier = lgb.LGBMClassifier(**merged)

    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    # n_jobs stays at 1 because training runs on the GPU.
    fold_accuracy = cross_val_score(
        classifier, X_train, y_train, cv=folds, scoring='accuracy', n_jobs=1
    )
    return fold_accuracy.mean()

# Create the optimisation study.
study = optuna.create_study(
    direction='maximize',  # maximise accuracy
    sampler=optuna.samplers.TPESampler(seed=42)  # TPE sampler with a fixed seed
)

# Run the optimisation.
print("开始贝叶斯优化...")
study.optimize(objective, n_trials=100)  # 100 trials

# Report the best result.
print("\n🎉 优化完成！")
print("最佳分数:", study.best_value)
print("最佳参数:", study.best_params)

# Train the final model with the best parameters merged over base_params.
best_params = {**base_params, **study.best_params}
final_model = lgb.LGBMClassifier(**best_params)

# Fit with early stopping.
# NOTE(review): the early-stopping eval_set is the test split, which is also
# the split scored below, so the reported test accuracy is optimistically
# biased — consider a separate validation split.
final_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    callbacks=[
        lgb.early_stopping(stopping_rounds=30, verbose=True),
        lgb.log_evaluation(50)
    ]
)

# Final evaluation on the test split.
y_pred = final_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"\n📊 最终测试集准确率: {test_accuracy:.4f}")

# Optional: visualise the optimisation history and parameter importances.
# Plotting is best-effort, so failures are reported instead of raised, but a
# missing plotting dependency is distinguished from any other error and
# KeyboardInterrupt / SystemExit are no longer swallowed.
try:
    import matplotlib.pyplot as plt
    optuna.visualization.plot_optimization_history(study).show()
    optuna.visualization.plot_param_importances(study).show()
except ImportError:
    print("如需可视化，请安装: pip install plotly")
except Exception as exc:
    print(f"可视化失败: {exc}")

In [None]:
# 在最佳参数附近进行更精细的搜索
def refined_objective(trial):
    """Optuna objective over a narrow hyper-parameter range.

    Same scoring as the coarse search — mean 3-fold stratified CV accuracy of
    an LGBMClassifier on ``X_train`` / ``y_train`` — but every range is a
    tight window and all float ranges are sampled uniformly.

    :param trial: the Optuna trial used to draw hyper-parameters.
    :return: mean cross-validated accuracy.
    """
    # Keep the suggest_* calls in this order so a seeded sampler draws the
    # same sequence of values.
    sampled = {}
    sampled['n_estimators'] = trial.suggest_int('n_estimators', 250, 300)
    sampled['learning_rate'] = trial.suggest_float('learning_rate', 0.045, 0.065)
    sampled['num_leaves'] = trial.suggest_int('num_leaves', 90, 110)
    sampled['max_depth'] = trial.suggest_int('max_depth', 6, 8)
    sampled['subsample'] = trial.suggest_float('subsample', 0.82, 0.87)
    sampled['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.68, 0.75)
    sampled['reg_alpha'] = trial.suggest_float('reg_alpha', 6.0, 8.0)
    sampled['reg_lambda'] = trial.suggest_float('reg_lambda', 1.2, 2.0)
    sampled['min_child_samples'] = trial.suggest_int('min_child_samples', 35, 45)
    sampled['min_child_weight'] = trial.suggest_float('min_child_weight', 0.0005, 0.0008)

    # Sampled values take precedence over the shared base settings.
    merged = dict(base_params)
    merged.update(sampled)
    classifier = lgb.LGBMClassifier(**merged)

    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    # n_jobs stays at 1 because training runs on the GPU.
    fold_accuracy = cross_val_score(
        classifier, X_train, y_train, cv=folds, scoring='accuracy', n_jobs=1
    )
    return fold_accuracy.mean()

# Create the optimisation study (this rebinds `study` from the coarse search).
study = optuna.create_study(
    direction='maximize',  # maximise accuracy
    sampler=optuna.samplers.TPESampler(seed=42)  # TPE sampler with a fixed seed
)

# Run the refined optimisation.
print("开始贝叶斯优化...")
study.optimize(refined_objective, n_trials=100)  # 100 trials

# Report the best result.
print("\n🎉 优化完成！")
print("最佳分数:", study.best_value)
print("最佳参数:", study.best_params)

# Train the final model with the best parameters merged over base_params.
best_params = {**base_params, **study.best_params}
final_model = lgb.LGBMClassifier(**best_params)

# Fit with early stopping.
# NOTE(review): the early-stopping eval_set is the test split, which is also
# the split scored below, so the reported test accuracy is optimistically
# biased — consider a separate validation split.
final_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    callbacks=[
        lgb.early_stopping(stopping_rounds=30, verbose=True),
        lgb.log_evaluation(50)
    ]
)

# Final evaluation on the test split.
y_pred = final_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"\n📊 最终测试集准确率: {test_accuracy:.4f}")

In [None]:
# Inspect which features the final model relies on.
if hasattr(X_train, 'columns'):
    feature_names = X_train.columns
else:
    # No column labels available: fall back to positional indices.
    feature_names = range(X_train.shape[1])
feature_importance = final_model.feature_importances_

# One row per feature, most important first.
importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': feature_importance}
)
importance_df = importance_df.sort_values(by='importance', ascending=False)

print("Top 21重要特征:")
print(importance_df.head(21))

# Plot the metric curve recorded while fitting the final model.
lgb.plot_metric(final_model)

# 边际数据点太少，使用 SMOTE 采样补充数据

In [None]:
# from imblearn.combine import SMOTETomek
# # 方法1: 使用SMOTE过采样
# smote = SMOTETomek(random_state=42)
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

# Resampling: SMOTE first (classes 1 and 2 raised to fixed target counts),
# then Tomek-link removal on the oversampled data.
oversampler = SMOTE(sampling_strategy={ 1: 200, 2: 800}, random_state=42)
tomek_cleaner = TomekLinks()
pipeline = Pipeline([
    ('smote', oversampler),
    ('tomek', tomek_cleaner),
])
X_resampled, y_resampled = pipeline.fit_resample(X_train, y_train)

# LightGBM classifier trained on the resampled data.
resampled_lgbm_params = {
    'n_estimators': 276,
    'learning_rate': 0.055387245108078006,
    'max_depth': 7,
    'num_leaves': 98,
    'subsample': 0.8454603231401624,
    'colsample_bytree': 0.7154221406536758,
    'reg_alpha': 7.377625743035262,
    'reg_lambda': 1.6389963806513481,
    'min_child_samples': 39,
    'min_child_weight': 0.000635427121143748,
    'random_state': 42,
}
model = lgb.LGBMClassifier(**resampled_lgbm_params)

# Fit on the resampled training data; evaluate on the untouched test split.
model.fit(X_resampled, y_resampled)
y_pred = model.predict(X_test)

print("准确率:", accuracy_score(y_test, y_pred))
print("\n分类报告:")
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of `model` on the full X / y (not the resampled
# data), reporting mean accuracy and its standard deviation across folds.
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print("交叉验证准确率:", scores.mean(), "±", scores.std())

# 特征提取的成果

In [None]:
def pick_up_features(X: pd.DataFrame, y: pd.Series, score: float, random_state=None) -> pd.DataFrame:
    """
    Select the features whose mutual information with the target reaches a threshold.

    Steps:

    1. skip the first five columns of ``X`` and keep only columns whose
       maximum is <= 1;
    2. drop binary columns (exactly two distinct values) and convert the
       remaining columns from [0, 1] levels to integer codes;
    3. remove the target column itself if it is among the features;
    4. score every feature with ``mutual_info_classif`` and keep those whose
       score is >= ``score``, ordered from highest to lowest score.

    The caller's ``X`` and ``y`` are not modified.

    :param X: DataFrame that went through the common initial preprocessing.
    :param y: the column to predict, with levels scaled to [0, 1].
    :param score: mutual-information threshold.
    :param random_state: optional seed forwarded to ``mutual_info_classif`` so
        the scores are reproducible; ``None`` keeps the unseeded behaviour.
    :return: DataFrame holding the integer-coded selected features.
    """
    from functools import partial

    X = X.iloc[:, 5:]
    # Explicit copy: the edits below must never act on a slice of the caller's frame.
    X = X.loc[:, X.max() <= 1].copy()

    binary_columns = []
    for col in X.columns:
        scaler = len(np.unique(X[col])) - 1
        if scaler == 1:  # binary selector: removed after the loop
            binary_columns.append(col)
            continue
        # Round before casting so that e.g. 0.999999 becomes 1 rather than 0.
        X[col] = X[col].multiply(scaler).round().astype(int)
    X = X.drop(columns=binary_columns)

    # The target must not be used as its own predictor.
    if y.name in X.columns:
        X = X.drop(columns=[y.name])
    else:
        print("没有该列")

    # n distinct levels map to codes 0..n-1, so the scale factor is n - 1.
    # The previous `len(np.unique(y)-1)` subtracted inside len() and scaled by n.
    y = y.multiply(len(np.unique(y)) - 1).round().astype(int)

    # Nothing left to score: return the empty frame rather than letting
    # scikit-learn raise on zero features.
    if X.shape[1] == 0:
        return X

    # Mutual-information scoring of every remaining feature.
    selector_mi = SelectKBest(
        score_func=partial(mutual_info_classif, random_state=random_state),
        k='all',
    )
    selector_mi.fit(X, y)

    mi_scores = pd.DataFrame({
        'feature': X.columns,
        'mutual_info_score': selector_mi.scores_
    }).sort_values('mutual_info_score', ascending=False)

    selected_features = mi_scores[mi_scores['mutual_info_score'] >= score]['feature'].tolist()

    return X[selected_features]