In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from datetime import datetime


1. 特征提取

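student_ID: 学习者ID
time: 答题（提交）时间
title_ID: 题目ID
knowledge: 题目类型（如选择题、填空题、编程题等；数据中以匿名编码表示，如 b3C9s、g7R2j）
is_correct: 答题是否正确（1表示正确，0表示错误；派生字段，由 score 是否等于 full_score 得到）
score: 得分
full_score: 题目满分（用于推导 is_correct）
attempt: 第几次尝试（派生字段：同一学习者对同一题目按时间排序后的提交序号，从1开始）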

In [2]:
# Directory holding the source CSV files, relative to this notebook.
relative_path = '../../sourcedata/'

# Question ("title") information.
title_info = pd.read_csv(f'{relative_path}Data_TitleInfo.csv')

# Student information.
student_info = pd.read_csv(f'{relative_path}Data_StudentInfo.csv')

# Students' answer-submission records (the all-class file); the later cells
# all work from this frame.
submit_record = pd.read_csv(f'{relative_path}All_Class/all_class_submit_record.csv')

In [6]:
# Shared preprocessing used by the feature cells below.

# Order submissions by student, then question, then submission time, so that the
# rows of each (student, question) pair appear in time order.
# NOTE(review): in file order this cell runs before the cell that parses `time`
# with pd.to_datetime, so the sort uses the raw loaded values; confirm that they
# order chronologically (e.g. numeric timestamps or ISO-8601 strings).
submit_record = submit_record.sort_values(by=['student_ID', 'title_ID', 'time'])

# attempt: 1-based ordinal of each submission within its (student, question) pair.
submit_record['attempt'] = submit_record.groupby(['student_ID', 'title_ID']).cumcount() + 1

# is_correct: 1 when the submission's score equals the full score, otherwise 0.
# Cast to int so the column matches its documented 1/0 encoding (the bare
# comparison yields booleans). Rows with a missing score compare unequal and
# therefore become 0.
submit_record['is_correct'] = submit_record['full_score'].eq(submit_record['score']).astype(int)

In [4]:
# Time features: split each submission timestamp into calendar parts and count
# submissions per part to expose peak answering periods.
submit_record['time'] = pd.to_datetime(submit_record['time'])
# NOTE(review): pd.to_datetime reads plain integers as nanoseconds since the
# epoch; if `time` holds Unix seconds this call needs unit='s' — confirm against
# the CSV.

time_parts = ('hour', 'day', 'weekday', 'month')
for part in time_parts:
    # hour: 0-23, day: day of the month, weekday: Monday=0, month: 1-12
    submit_record[part] = getattr(submit_record['time'].dt, part)

# Number of submissions for each distinct value of every calendar part.
hourly_data, daily_data, weekly_data, monthly_data = (
    submit_record.groupby(part).size() for part in time_parts
)


In [7]:
# Question-type preference, pooled over all students: for every `knowledge`
# value, the number of submissions and the share of them that were correct.
question_type_stats = submit_record.groupby('knowledge').agg(
    question_count=('title_ID', 'count'),
    correct_rate=('is_correct', 'mean'),
)
print(question_type_stats)


           question_count  correct_rate
knowledge                              
b3C9s               14780      0.313532
g7R2j               33041      0.223117
m3D1v               47740      0.287516
r8S3g               36730      0.205717
t5V9e               37986      0.215948
y9W5d               62541      0.267840


In [None]:

# Accuracy per learner: overall, and split by `knowledge` category.
# NOTE: accuracy by time period is not computed in this cell.
correct_by_student = submit_record.groupby('student_ID')['is_correct']
student_correct_rate = correct_by_student.mean()

# unstack() pivots the (student_ID, knowledge) MultiIndex into a table with one
# row per student and one column per knowledge category; a combination with no
# submissions shows up as NaN.
correct_by_student_and_type = submit_record.groupby(['student_ID', 'knowledge'])['is_correct']
type_correct_rate = correct_by_student_and_type.mean().unstack()


In [None]:

# Attempts per question.
# `attempt` is a running 1..k counter within each (student, question) pair, so
# its largest value per pair is the number of attempts made on that question.
# Averaging the raw counter over all rows does not give attempts per question:
# one question tried three times yields mean(1, 2, 3) = 2, not 3.
attempts_per_question = submit_record.groupby(['student_ID', 'title_ID'])['attempt'].max()

# Mean number of attempts per question for each student (index: student_ID).
attempt_stats = attempts_per_question.groupby(level='student_ID').mean()


In [None]:
# Cluster analysis: group learners whose answering behaviour is similar.

def _most_frequent(values):
    """Return the most common non-null value of `values`, or NaN if there is none."""
    counts = values.value_counts()
    # idxmax() raises on an empty Series, which happens when every value is missing.
    return counts.idxmax() if len(counts) else np.nan


# One row per student (index: student_ID).
features = submit_record.groupby('student_ID').agg({
    'is_correct': 'mean',         # overall share of correct submissions
    'title_ID': 'count',          # number of submissions
    'hour': _most_frequent,       # hour with the most submissions
    'knowledge': _most_frequent,  # knowledge category with the most submissions
})

# Mean attempts per question. `attempt` is a running counter within each
# (student, question) pair, so the per-pair maximum is the attempt count;
# averaging the raw counter over rows would not give attempts per question.
mean_attempts = (
    submit_record.groupby(['student_ID', 'title_ID'])['attempt'].max()
    .groupby(level='student_ID').mean()
)
# Inserted at position 2 to keep the column order
# is_correct, title_ID, attempt, hour, knowledge.
features.insert(2, 'attempt', mean_attempts.reindex(features.index))

# Encode the two "most frequent" features as integer category codes (missing -> -1).
# NOTE(review): the codes impose an arbitrary order on `knowledge` and replace the
# hour by its rank among the observed hours; one-hot or cyclic encodings may suit
# KMeans better.
features['hour'] = features['hour'].astype('category').cat.codes
features['knowledge'] = features['knowledge'].astype('category').cat.codes

# Standardise the features before clustering.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# KMeans with a fixed seed. n_init is pinned because its default differs between
# scikit-learn versions (10 in older releases, 'auto' from 1.4), which would
# otherwise change the clusters from one environment to another.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
features['cluster'] = kmeans.fit_predict(scaled_features)


In [None]:
# 可视化聚类结果
# 使用雷达图展示不同学习者群体的特征。
import matplotlib.pyplot as plt
from math import pi

def plot_radar_chart(submit_record, title, normalize=False):
    """Draw one radar (polar) polygon per row of a cluster-summary frame.

    Parameters
    ----------
    submit_record : pandas.DataFrame
        One row per cluster and one numeric column per feature. (The parameter
        name is kept for backward compatibility; the frame passed in is a
        per-cluster summary.) The row index is used as the cluster label in
        the legend. A 'cluster' column, if present, is treated as a label and
        is not plotted.
    title : str
        Figure title.
    normalize : bool, default False
        When True, min-max scale every feature to [0, 1] across the rows before
        plotting, so that features measured on different scales are comparable.
        A feature with the same value in every row is plotted at 0.
    """
    # The caller passes `features.groupby('cluster').mean()`, where 'cluster'
    # is the index rather than a column. Dropping it unconditionally raised
    # KeyError, and slicing off the last column discarded a real feature.
    data = submit_record.drop(columns='cluster', errors='ignore')
    labels = list(data.columns)
    num_vars = len(labels)

    if normalize:
        lowest = data.min()
        span = data.max() - lowest
        # A zero span (constant column) is replaced by 1 to avoid 0/0; the
        # numerator is 0 there, so the column plots at 0.
        data = (data - lowest) / span.replace(0, 1)

    # One angle per feature; the first is repeated so every polygon closes.
    angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
    angles += angles[:1]

    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
    ax.set_theta_offset(np.pi / 2)  # put the first feature at the top
    ax.set_theta_direction(-1)      # lay the features out clockwise

    for name, row in data.iterrows():
        values = row.tolist()
        values += values[:1]
        ax.plot(angles, values, linewidth=1, linestyle='solid', label=f'Cluster {name}')
        ax.fill(angles, values, alpha=0.25)

    ax.set_yticklabels([])
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(labels)

    ax.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
    ax.set_title(title)
    plt.show()

# Cluster centres: the mean of every feature within each cluster. The cluster
# label becomes the index of the resulting frame.
cluster_centers = features.groupby('cluster').mean()
plot_radar_chart(cluster_centers, title='Cluster Characteristics')
