In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Make Chinese text displayable in figures: select the SimHei font for the
# sans-serif family and turn off the Unicode minus sign for axis tick labels.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# -------------------------------
# Step 1: load the data
# -------------------------------
# Read the CSV, then strip leading/trailing tab characters from every string
# cell; non-string cells are passed through untouched.
df = pd.read_csv(
    '../../data/intermediate/初步清洗_比赛数据_2.csv',
    sep=',',
    engine='python',
    encoding='utf-8',
).map(lambda cell: cell.strip('\t') if isinstance(cell, str) else cell)

# -------------------------------
# Step 2: define the raw feature columns (sx_cols)
# -------------------------------
# The order of this list is the feature order used by everything downstream
# that is built from df[sx_cols].
sx_cols = (
    ['课前预学', '课堂参与', '课后复习', '延伸阅读']
    + ['完成作业时间', '自习时间', '课外阅读时间', '网络课程时间']
    + ['实验科研时间', '社团活动时间', '竞赛活动时间', '其他学习时间']
    + ['同学合作', '参与科研团队', '参与学科竞赛', '学习同学方法', '师生交流频度']
)

# Cast every feature column to float
df = df.astype(dict.fromkeys(sx_cols, float))

In [None]:
# -------------------------------
# Step 3: standardize the features
# -------------------------------
# Pull the feature matrix out of the frame (columns in sx_cols order), then
# fit the scaler on it and transform the same matrix.
scaler = StandardScaler()
X = df[sx_cols].to_numpy()
X_scaled = scaler.fit(X).transform(X)

In [None]:
# -------------------------------
# Step 4: PCA dimensionality reduction
# -------------------------------
# Number of principal components to keep (tunable). The code has always kept
# 7 components; the previous comment said 6, which contradicted the code.
N_PCA_COMPONENTS = 7

# Fit PCA on the standardized features and project every sample onto the
# retained components. X_pca has one column per retained component.
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Total share of variance captured by the retained components
print("PCA 解释的累计方差比例:", np.sum(pca.explained_variance_ratio_))

In [None]:
# Per-component explained-variance ratio and its running total
explained_ratios = pca.explained_variance_ratio_
cum_var = np.cumsum(explained_ratios)

print("PCA 各主成分方差贡献率：")
for i, ratio in enumerate(explained_ratios, start=1):
    print(f"主成分 {i}: {ratio:.3f}, 累计: {cum_var[i-1]:.3f}")


# Plot the variance contributions: bars for each component, a red step line
# for the cumulative share. Components are numbered from 1 on the x axis.
component_ids = np.arange(1, len(explained_ratios) + 1)

fig, ax = plt.subplots(figsize=(7, 5))
ax.bar(component_ids, explained_ratios, alpha=0.7, align='center',
       label='单个主成分方差贡献率')
ax.step(component_ids, cum_var, where='mid',
        label='累计方差贡献率', color='red')

ax.set_xlabel('主成分序号')
ax.set_ylabel('方差解释比例')
ax.set_title('PCA 主成分方差贡献率')
ax.legend(loc='best')
fig.tight_layout()
plt.show()

In [None]:
# Loadings: pca.components_ has one row per principal component and one
# column per input feature, in the order of the features PCA was fitted on.
loadings = pca.components_

# Row order used for display only (kept from the original hand-typed list).
loading_row_order = ['完成作业时间','自习时间','课外阅读时间','网络课程时间','其他学习时间',
                     '课前预学','课堂参与','课后复习','延伸阅读','学习同学方法',
                     '实验科研时间','参与科研团队','参与学科竞赛','竞赛活动时间',
                     '同学合作','社团活动时间','师生交流频度']

# The PCA input was built from df[sx_cols], so the feature axis MUST be
# labelled with sx_cols. Previously the display-order list above was used
# directly as the index; because its order differs from sx_cols, every
# loading was attached to the wrong feature name. Label first, then reorder
# rows with .loc (which raises KeyError if the two lists ever diverge).
loading_df = pd.DataFrame(
    loadings.T,
    index=sx_cols,
    columns=[f'PC{i+1}' for i in range(loadings.shape[0])],
).loc[loading_row_order]

print("PCA 载荷矩阵：")
print(loading_df.round(3))


In [None]:
# -------------------------------
# Step 5: choose the number of clusters K (elbow method + silhouette score)
# -------------------------------
k_values = range(2, 10)
wcss = []
silhouette_scores = []

# For each candidate K, fit KMeans on the PCA scores and record the
# within-cluster sum of squares (inertia) and the mean silhouette score.
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_pca)
    wcss.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_pca, kmeans.labels_))

# Left panel: elbow curve. Right panel: silhouette score per K.
fig, (ax_elbow, ax_sil) = plt.subplots(1, 2, figsize=(10, 4))

ax_elbow.plot(k_values, wcss, 'o-', label='WCSS')
ax_elbow.set_xlabel('聚类数 K')
ax_elbow.set_ylabel('WCSS')
ax_elbow.set_title('肘部法')

ax_sil.plot(k_values, silhouette_scores, 'o-', label='轮廓系数')
ax_sil.set_xlabel('聚类数 K')
ax_sil.set_ylabel('轮廓系数')
ax_sil.set_title('轮廓系数')
plt.show()


In [None]:
# -------------------------------
# Step 6: cluster model (KMeans, assuming 4 clusters)
# -------------------------------
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
df['cluster'] = kmeans.fit(X_pca).labels_



# -------------------------------
# Step 7: map cluster ids to persona labels (names are chosen by hand)
# -------------------------------
# NOTE(review): the id -> name pairing is hard-coded; confirm it against the
# per-cluster feature means whenever the data or the clustering changes.
cluster_mapping = dict(enumerate(
    ['科研学霸型', '社团活跃型', '学业挣扎型', '自主学习型']
))
df['student_persona'] = df['cluster'].map(cluster_mapping)

# -------------------------------
# Step 8: summary statistics
# -------------------------------
# Head count per persona (sorted by frequency, most common first)
persona_counts = df['student_persona'].value_counts()
print("各画像人数分布：")
print(persona_counts)

# Mean of every raw feature within each persona
persona_means = df.groupby('student_persona')[sx_cols].mean()
print("\n各画像特征均值：")
print(persona_means)

# -------------------------------
# Step 9: visualization
# -------------------------------
# (1) Bar chart of persona head counts, bars ordered by frequency
ax = sns.countplot(data=df, x='student_persona', order=persona_counts.index)
ax.set_title("各画像人数分布")
plt.show()

In [None]:
# (2) Cluster assignment shown on the first two principal components
fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=df['student_persona'],
    palette="Set2",
    ax=ax,
)
ax.set_title("PCA降维后的聚类分布（前2主成分）")
ax.set_xlabel("主成分1")
ax.set_ylabel("主成分2")
ax.legend()
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # kept: unused by name, imported for the '3d' projection

# Encode the persona strings as integer codes so they can drive a colormap.
# le.classes_[code] gives back the persona name for each code.
le = LabelEncoder()
numeric_labels = le.fit_transform(df['student_persona'])

# (3) Cluster assignment shown on the first three principal components
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, projection='3d')

# 3-D scatter coloured by the integer persona code
scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], X_pca[:, 2],
                     c=numeric_labels, cmap="Set2", s=50, alpha=0.8)

# Title and axis labels
ax.set_title("PCA降维 - 聚类分布（前三主成分）", fontsize=14)
ax.set_xlabel("主成分1", fontsize=12)
ax.set_ylabel("主成分2", fontsize=12)
ax.set_zlabel("主成分3", fontsize=12)

# Build the legend by hand. The colours are taken from the scatter itself
# (its colormap + normalization), so each legend entry is exactly the colour
# its points were drawn with. The previous code used Set2(i / n_classes),
# but the scatter normalizes codes to i / (n_classes - 1), so the legend
# showed different colours from the plotted points.
legend_labels = le.classes_
legend_colors = scatter.to_rgba(np.arange(len(legend_labels)))
handles = [plt.Line2D([0], [0], marker='o', color='w',
                      markerfacecolor=legend_color,
                      markersize=10) for legend_color in legend_colors]
ax.legend(handles, legend_labels, title="学生画像", loc="upper right", fontsize=10)

# Render the figure
plt.tight_layout()
plt.show()

In [None]:
# =========================
# Step 10: pick the number of clusters with GMM + BIC, then "soft" cluster
# =========================
from sklearn.mixture import GaussianMixture

Ks = range(2, 10)
bics = []
gmms = []

# The GMMs are fitted in the standardized original feature space (X_scaled),
# not on the PCA scores.
for k in Ks:
    gmm = GaussianMixture(n_components=k, covariance_type='full',
                          random_state=42).fit(X_scaled)
    gmms.append(gmm)
    bics.append(gmm.bic(X_scaled))

# Lowest BIC wins
best_idx = int(np.argmin(bics))
best_k, best_gmm = Ks[best_idx], gmms[best_idx]

print("GMM-BIC 选择的最佳簇数:", best_k)

# Posterior probability of every component for every sample
proba = best_gmm.predict_proba(X_scaled)
df['gmm_cluster'] = best_gmm.predict(X_scaled)
df['gmm_confidence'] = proba.max(axis=1)

# Flag samples whose best posterior is below the threshold (tunable)
uncertain_thr = 0.55
df['gmm_uncertain'] = df['gmm_confidence'].lt(uncertain_thr)

# =========================
# Step 11: name the GMM clusters automatically (from high / low z-scores)
# =========================
def build_auto_label(z_row, threshold=0.5, topn=2):
    """Turn one cluster's mean z-scores into a short descriptive label.

    NOTE(review): this notebook called ``build_auto_label`` without ever
    defining it, so the cell failed with NameError on a fresh kernel. This is
    a minimal reconstruction; confirm the naming rules (threshold, number of
    features, wording) with the original author.

    Parameters
    ----------
    z_row : pandas.Series
        Mean z-score per feature for one cluster, indexed by feature name.
    threshold : float, default 0.5
        A feature counts as notably high when z >= threshold and notably low
        when z <= -threshold.
    topn : int, default 2
        At most this many high and this many low features are named.

    Returns
    -------
    str
        For example "高:A、B / 低:C"; "均衡型" when no feature passes the
        threshold in either direction.
    """
    ranked = z_row.sort_values(ascending=False)
    high = [name for name, z in ranked.head(topn).items() if z >= threshold]
    # tail() is still in descending order; reverse so the lowest comes first
    low = [name for name, z in ranked.tail(topn).iloc[::-1].items() if z <= -threshold]

    parts = []
    if high:
        parts.append("高:" + "、".join(high))
    if low:
        parts.append("低:" + "、".join(low))
    return " / ".join(parts) if parts else "均衡型"


# Mean z-score of every feature within each GMM cluster. The labels are
# attached positionally (to_numpy) so the result does not depend on df's index.
Z2 = pd.DataFrame(X_scaled, columns=sx_cols)
Z2['gmm_cluster'] = df['gmm_cluster'].to_numpy()
gmm_cluster_z = Z2.groupby('gmm_cluster')[sx_cols].mean()

# One generated name per cluster id, then broadcast to every sample
gmm_auto_names = gmm_cluster_z.apply(build_auto_label, axis=1).to_dict()
df['gmm_persona'] = df['gmm_cluster'].map(gmm_auto_names)

print("GMM 自动标签：", gmm_auto_names)

# Optional: append a marker to the persona of low-confidence samples
df.loc[df['gmm_uncertain'], 'gmm_persona'] = df.loc[df['gmm_uncertain'], 'gmm_persona'] + "（混合型/不确定）"

# =========================
# Step 12: persona "feature signature" table (z-scores ranked per cluster)
# =========================
def cluster_signature(z_df, topn=5):
    """Summarize each cluster by its highest and lowest features.

    Parameters
    ----------
    z_df : pandas.DataFrame
        One row per cluster, one column per feature.
    topn : int, default 5
        Number of features to report at each end of the ranking.

    Returns
    -------
    dict
        Maps each row label to ``{'top_pos': {...}, 'top_neg': {...}}``, where
        both inner dicts map feature name -> value rounded to 2 decimals.
        ``top_pos`` holds the ``topn`` largest values and ``top_neg`` the
        ``topn`` smallest.
    """
    def _describe(z_row):
        ranked = z_row.sort_values(ascending=False)
        return {
            'top_pos': ranked.head(topn).round(2).to_dict(),
            'top_neg': ranked.tail(topn).round(2).to_dict(),
        }

    return {cluster_id: _describe(z_row) for cluster_id, z_row in z_df.iterrows()}

# Mean z-score of every feature within each KMeans cluster. `cluster_z` was
# referenced here without being defined anywhere in the notebook (NameError
# on a fresh kernel); it is built the same way as the GMM table, from the
# standardized features grouped by the KMeans labels in df['cluster'].
# NOTE(review): confirm this matches the definition originally intended.
cluster_z = (
    pd.DataFrame(X_scaled, columns=sx_cols)
    .groupby(df['cluster'].to_numpy())
    .mean()
)

print("KMeans 簇签名：", cluster_signature(cluster_z, topn=5))
print("GMM 簇签名：", cluster_signature(gmm_cluster_z, topn=5))

# =========================
# Step 13: interpretable rules (a decision tree that explains the GMM clusters)
# =========================
from sklearn.tree import DecisionTreeClassifier, export_text

# A shallow tree trained to reproduce the GMM cluster ids from the
# standardized features; its split rules are printed as text.
tree_gmm = DecisionTreeClassifier(
    max_depth=3,
    min_samples_leaf=30,
    random_state=42,
).fit(X_scaled, df['gmm_cluster'])
print(export_text(tree_gmm, feature_names=sx_cols))

# =========================
# Step 14: feature importance (permutation importance, used instead of the
# tree's built-in importances)
# =========================
from sklearn.inspection import permutation_importance

# Importance is measured on the same data the tree was fitted on.
perm = permutation_importance(
    tree_gmm,
    X_scaled,
    df['gmm_cluster'],
    n_repeats=20,
    random_state=42,
    n_jobs=-1,
)
imp = pd.Series(perm.importances_mean, index=sx_cols).sort_values(ascending=False)
print("\n置换重要度：\n", imp.round(4))

# =========================
# Step 15: per-sample "distance to cluster centre" and silhouette value
# =========================
from sklearn.metrics import silhouette_samples, pairwise_distances

# a) Silhouette value of every sample under the KMeans labels
df['km_silhouette'] = silhouette_samples(X_pca, df['cluster'])

# b) Distance to the cluster centres in the standardized feature space, as a
#    measure of how typical a sample is. The KMeans centres live in PCA
#    space, so they are mapped back with inverse_transform first.
km_centers_scaled = pca.inverse_transform(kmeans.cluster_centers_)
D = pairwise_distances(X_scaled, km_centers_scaled)  # shape: [n_samples, n_clusters]
# Smallest distance over all centres (i.e. distance to the nearest centre)
df['km_center_dist'] = np.min(D, axis=1)

# Flag atypical samples: beyond the 80th percentile of centre distance AND
# with a silhouette value below 0.1
far_cutoff = np.percentile(df['km_center_dist'], 80)
is_far = df['km_center_dist'] > far_cutoff
is_poorly_separated = df['km_silhouette'] < 0.1
df['km_atypical'] = is_far & is_poorly_separated

# =========================
# Step 16: simple stability check (agreement between KMeans restarts)
# =========================
from sklearn.metrics import adjusted_rand_score

def km_labels(seed):
    """Refit KMeans on X_pca with the given random seed and return its labels.

    Uses the same number of clusters as the already-fitted ``kmeans`` model.
    """
    model = KMeans(n_clusters=kmeans.n_clusters, random_state=seed, n_init=10)
    return model.fit(X_pca).labels_

# Compare the reference labelling with ten refits using seeds 43..52
labels_ref = df['cluster'].values
aris = [adjusted_rand_score(labels_ref, km_labels(seed)) for seed in range(43, 53)]
print("KMeans 重复拟合 ARI（越高越稳定）:", np.round(aris, 3), "平均:", np.mean(aris).round(3))
