In [12]:
import torch
import numpy as np
from scipy import sparse
import os

# 导入你的项目中的必要模块
# 确保这些路径是正确的，如果需要，请根据你的项目结构调整
from params import DEVICE # 假设 DEVICE 已在 params.py 中定义
from data_process import min_seq_len, max_seq_len # 假设 min_seq_len, max_seq_len 已在 data_process.py 中定义
from utils import gen_sqgkt_graph, build_adj_list, build_adj_list_uq, gen_sqgkt_graph_uq


print(f"Using device: {DEVICE}")

# 模拟 train_test.py 中的 params
params = {
    'max_seq_len': max_seq_len,
    'min_seq_len': min_seq_len,
    'epochs': 2,
    'lr': 0.01,
    'lr_gamma': 0.85,
    'batch_size': 128,
    'size_q_neighbors': 4,
    'size_q_neighbors_2': 5, # 你的 u_neighbors 关联的 q_neighbors_2
    'size_s_neighbors': 10,
    'size_u_neighbors': 5,  # 你的用户邻居（问题）的最大数量
    'num_workers': 0,
    'prefetch_factor': 4,
    'agg_hops': 3,
    'emb_dim': 100,
    'hard_recap': False,
    'dropout': (0.2, 0.4),
    'rank_k': 10,
    'k_fold': 5
}
print(f"Loaded parameters: {params}")

# --- 1. 原始 uq_table.npz 文件加载后查看形状 ---
print("\n--- Debugging uq_table loading ---")

# 加载 qs_table (通常是 (num_question, num_skill))
qs_table_np = sparse.load_npz('data/qs_table.npz').toarray()
qs_table = torch.tensor(qs_table_np, dtype=torch.int64, device=DEVICE)
num_question_total = qs_table_np.shape[0] # 获取总问题数，后续用于检查 question_id 范围

print(f"Shape of qs_table_np (numpy array): {qs_table_np.shape}")
print(f"Shape of qs_table (torch.Tensor): {qs_table.shape}")


# 加载原始 uq_table 文件
raw_uq_matrix_np = sparse.load_npz('data/uq_table.npz').toarray()
print(f"DEBUG 1: Shape of raw_uq_matrix_np (numpy array from uq_table.npz): {raw_uq_matrix_np.shape}")

# 原始 uq_table 直接转换为 tensor 的形状 (这将是二维的)
uq_table_initial_tensor = torch.tensor(raw_uq_matrix_np, dtype=torch.int64, device=DEVICE)
print(f"DEBUG 2: Shape of uq_table_initial_tensor (torch.Tensor, direct conversion): {uq_table_initial_tensor.shape}")


# --- 2. 模拟构建 u_neighbors 和 q_neighbors_2 ---
# 这些图结构是构建三维 uq_table 的必要前提
print("\n--- Simulating u_neighbors and q_neighbors_2 generation ---")
u_neighbors_list, q_neighbors_list_for_uq = build_adj_list_uq() # 注意这里q_neighbors_list是用于uq的
u_neighbors_np, q_neighbors_2_np = gen_sqgkt_graph_uq(u_neighbors_list, q_neighbors_list_for_uq,
                                                      params['size_u_neighbors'], params['size_q_neighbors_2'])
u_neighbors = torch.tensor(u_neighbors_np, dtype=torch.int64, device=DEVICE)
q_neighbors_2 = torch.tensor(q_neighbors_2_np, dtype=torch.int64, device=DEVICE)

print(f"DEBUG 3: Shape of u_neighbors (torch.Tensor): {u_neighbors.shape}")
print(f"DEBUG 4: Shape of q_neighbors_2 (torch.Tensor): {q_neighbors_2.shape}")

# --- 3. 模拟构建三维 uq_table ---
print("\n--- Simulating 3D uq_table construction ---")

# 获取必要的维度信息
num_user_from_data = raw_uq_matrix_np.shape[0] # 从原始数据获取用户数
max_user_neighbors = params['size_u_neighbors'] # 从 params 中获取每个用户的最大邻居数

print(f"DEBUG 5: Derived num_user_from_data: {num_user_from_data}")
print(f"DEBUG 6: Derived max_user_neighbors (from params): {max_user_neighbors}")

# 创建一个三维的 uq_table 张量，零初始化
processed_uq_table_data = torch.zeros(num_user_from_data, max_user_neighbors, 3, dtype=torch.float32)
print(f"DEBUG 7: Shape of processed_uq_table_data (zero-initialized, expected 3D): {processed_uq_table_data.shape}")


# --- 4. 填充 processed_uq_table_data (关键逻辑，请根据你的论文和数据完善) ---
print("\n--- Populating 3D uq_table data ---")
# 遍历每个用户
for user_idx in range(num_user_from_data):
    # 获取当前用户的邻居问题列表
    current_user_neighbor_questions = u_neighbors[user_idx].cpu().numpy()

    # 遍历当前用户的每个邻居问题槽位
    for neighbor_slot_idx, question_id in enumerate(current_user_neighbor_questions):
        # 检查 question_id 是否有效 (非填充值，且在问题总数范围内)
        if question_id != -1 and question_id >= 0 and question_id < num_question_total:
            # --- 这是你需要根据论文和数据定制的逻辑 ---
            # 假设 raw_uq_matrix_np 是一个 (num_user, num_question_total) 矩阵，
            # 存储了用户 `user_idx` 和问题 `question_id` 之间的连接强度。

            connection_strength = raw_uq_matrix_np[user_idx, question_id]

            # 示例逻辑：
            c_i_val = float(connection_strength) # 例如，用原始连接强度作为 c_i
            g_p_val = 0.5 if connection_strength > 0 else 0.0 # 例如，如果有连接，给一个正向增益
            g_n_val = 0.5 if connection_strength > 0 else 0.0 # 例如，如果有连接，给一个负向增益

            # 赋值到 processed_uq_table_data
            processed_uq_table_data[user_idx, neighbor_slot_idx, 0] = c_i_val
            processed_uq_table_data[user_idx, neighbor_slot_idx, 1] = g_p_val
            processed_uq_table_data[user_idx, neighbor_slot_idx, 2] = g_n_val
# --- 填充结束 ---

# 将填充好的 processed_uq_table_data 移动到指定设备 (GPU/CPU)
uq_table_final_tensor = processed_uq_table_data.to(DEVICE)
print(f"DEBUG 8: Final uq_table_final_tensor shape after population: {uq_table_final_tensor.shape}")

print("\n--- Debugging complete ---")


Using device: cuda:0
Loaded parameters: {'max_seq_len': 200, 'min_seq_len': 20, 'epochs': 2, 'lr': 0.01, 'lr_gamma': 0.85, 'batch_size': 128, 'size_q_neighbors': 4, 'size_q_neighbors_2': 5, 'size_s_neighbors': 10, 'size_u_neighbors': 5, 'num_workers': 0, 'prefetch_factor': 4, 'agg_hops': 3, 'emb_dim': 100, 'hard_recap': False, 'dropout': (0.2, 0.4), 'rank_k': 10, 'k_fold': 5}

--- Debugging uq_table loading ---
Shape of qs_table_np (numpy array): (17622, 123)
Shape of qs_table (torch.Tensor): torch.Size([17622, 123])
DEBUG 1: Shape of raw_uq_matrix_np (numpy array from uq_table.npz): (2131, 17622)
DEBUG 2: Shape of uq_table_initial_tensor (torch.Tensor, direct conversion): torch.Size([2131, 17622])

--- Simulating u_neighbors and q_neighbors_2 generation ---
DEBUG 3: Shape of u_neighbors (torch.Tensor): torch.Size([2131, 5])
DEBUG 4: Shape of q_neighbors_2 (torch.Tensor): torch.Size([17622, 5])

--- Simulating 3D uq_table construction ---
DEBUG 5: Derived num_user_from_data: 2131
DEBUG

In [13]:
import pandas as pd
import numpy as np
import os
from scipy import sparse
from scipy.stats import norm, poisson

min_seq_len = 20
max_seq_len = 200
k = 0.3
d = 0.7
b = 10

if __name__ == '__main__':
    data = pd.read_csv(filepath_or_buffer='data/assist09_origin.csv', encoding="ISO-8859-1")
    data = data.sort_values(by='user_id', ascending=True)
    data = data.drop(data[data['skill_id'] == ' '].index)
    data = data.dropna(subset=['skill_id'])
    data = data.drop(data[data['original'] == 0].index)
    is_valid_user = data.groupby('user_id').size() >= min_seq_len
    data = data[data['user_id'].isin(is_valid_user[is_valid_user].index)]
    data = data.loc[:, ['order_id', 'user_id', 'problem_id', 'correct', 'skill_id', 'skill_name',
                        'ms_first_response', 'answer_type', 'attempt_count','hint_count']]
    question_attempt_stats = data.groupby('problem_id')['attempt_count'].mean().reset_index()
    question_attempt_stats.rename(columns={'attempt_count': 'mean_attempt'}, inplace=True)
    data = pd.merge(data, question_attempt_stats, on='problem_id', suffixes=('', '_attempt'))
    data['attempt_factor'] = 1 - poisson(data['mean_attempt']).cdf(data['attempt_count'] - 1)
    data['attempt_factor_g'] = k + (1 - k) / (1 + np.exp(-d * (data['attempt_factor'] - b)))


    question_hint_stats = data.groupby('problem_id')['hint_count'].agg('mean').reset_index()
    question_hint_stats.rename(columns={'hint_count': 'mean_hint'}, inplace=True)
    data = pd.merge(data, question_hint_stats, on='problem_id')

    # data['hint_count'] = data['hint_count']
    #CDF
    data['hint_factor'] = 1 - poisson(data['mean_hint']).cdf(data['hint_count'] - 1)
    data['hint_factor_g'] = k + (1 - k) / (1 + np.exp(-d * (data['hint_factor'] - b)))
    data['ability_factor'] = data.groupby('user_id')['correct'].transform('mean')

    num_answer = data.shape[0]
    questions = set()
    skills = set()
    users = set()

    for row in data.itertuples(index=False):
        users.add(row[1])
        questions.add(row[2])
        if isinstance(row[4], (int, float)):
            skills.add(int(row[4]))
        else:
            skill_add = set(int(s) for s in row[4].split('_'))
            skills = skills.union(skill_add)
    data.to_csv('data/data_processed.csv', sep=',', index=False)

    num_q = len(questions)
    num_s = len(skills)
    num_user = len(users)
    if not os.path.exists('data/question2idx.npy'):

        questions = list(questions)
        skills = list(skills)
        users = list(users)
        question2idx = {questions[i]: i + 1 for i in range(num_q)}
        question2idx[0] = 0
        skill2idx = {skills[i]: i for i in range(num_s)}
        user2idx = {users[i]: i for i in range(num_user)}
        num_q += 1
        idx2question = {question2idx[q]: q for q in question2idx}
        idx2skill = {skill2idx[s]: s for s in skill2idx}
        idx2user = {user2idx[u]: u for u in user2idx}

        np.save('data/question2idx.npy', question2idx)
        np.save('data/skill2idx.npy', skill2idx)
        np.save('data/user2idx.npy', user2idx)
        np.save('data/idx2question.npy', idx2question)
        np.save('data/idx2skill.npy', idx2skill)
        np.save('data/idx2user.npy', idx2user)
    else:
        question2idx = np.load('data/question2idx.npy', allow_pickle=True).item()
        skill2idx = np.load('data/skill2idx.npy', allow_pickle=True).item()
        user2idx = np.load('data/user2idx.npy', allow_pickle=True).item()
        idx2question = np.load('data/idx2question.npy', allow_pickle=True).item()
        idx2skill = np.load('data/idx2skill.npy', allow_pickle=True).item()
        idx2user = np.load('data/idx2user.npy', allow_pickle=True).item()

    #row[1]:user_id, row[2]:problem_id, row[4]:skill_id
    if not os.path.exists('data/qs_table.npz'):
        qs_table = np.zeros([num_q, num_s], dtype=float)
        q_set = data['problem_id'].drop_duplicates()
        q_samples = pd.concat([data[data['problem_id'] == q_id].sample(1) for q_id in q_set])
        for row in q_samples.itertuples(index=False):
            if isinstance(row[4], (int, float)):
                qs_table[question2idx[row[2]], skill2idx[int(row[4])]] = 1
            else:
                skill_add = [int(s) for s in row[4].split('_')]
                for s in skill_add:
                    qs_table[question2idx[row[2]], skill2idx[s]] = 1
        qq_table = np.matmul(qs_table, qs_table.T)
        ss_table = np.matmul(qs_table.T, qs_table)
        qs_table = sparse.coo_matrix(qs_table)
        qq_table = sparse.coo_matrix(qq_table)
        ss_table = sparse.coo_matrix(ss_table)
        sparse.save_npz('data/qs_table.npz', qs_table)
        sparse.save_npz('data/qq_table.npz', qq_table)
        sparse.save_npz('data/ss_table.npz', ss_table)
    else:
        qs_table = sparse.load_npz('data/qs_table.npz').toarray()
        qq_table = sparse.load_npz('data/qq_table.npz').toarray()
        ss_table = sparse.load_npz('data/ss_table.npz').toarray()
    weights = np.array([0.4, 0.4, 0.2])
    if not os.path.exists('data/uq_table.npz'):
        uq_table = np.zeros([num_user, num_q], dtype=float)
        u_set = data['user_id'].drop_duplicates()
        u_samples = pd.concat([data[data['user_id'] == u_id].sample(1) for u_id in u_set])
        for row in u_samples.itertuples(index=False):
            user_index = user2idx[row[1]]
            question_index = question2idx[row[2]]
            factors = np.array([row.attempt_factor_g, row.hint_factor_g, row.ability_factor])
            factor_value = np.sum(factors * weights)
            uq_table[user2idx[row[1]], question2idx[(row[2])]] = factor_value
        uq_table = sparse.coo_matrix(uq_table)
        sparse.save_npz('data/uq_table.npz', uq_table)
    else:
        uq_table = sparse.load_npz('data/uq_table.npz').toarray()

    if not os.path.exists('data/user_seq.npy'):
        user_seq = np.zeros([num_user, max_seq_len])
        user_res = np.zeros([num_user, max_seq_len])
        user_user = np.zeros([num_user, max_seq_len])
        num_seq = [0 for _ in range(num_user)]
        user_mask = np.zeros([num_user, max_seq_len])
        for row in data.itertuples(index=False):
            user_id = user2idx[row[1]]
            if num_seq[user_id] < max_seq_len - 1:
                user_seq[user_id, num_seq[user_id]] = question2idx[row[2]]
                user_res[user_id, num_seq[user_id]] = row[3]
                user_mask[user_id, num_seq[user_id]] = 1
                user_user[user_id, num_seq[user_id]] = user_id
                num_seq[user_id] += 1
        np.save('data/user_seq.npy', user_seq)
        np.save('data/user_res.npy', user_res)
        np.save('data/user_mask.npy', user_mask)
        np.save('data/user_user.npy', user_mask)

FileNotFoundError: [Errno 2] No such file or directory: 'data/assist09_origin.csv'

In [14]:
# 检查uq_table数据
print("=== uq_table数据检查 ===")
print(f"uq_table形状: {uq_table_3d.shape}")
print(f"零值比例: {torch.sum(uq_table_3d == 0) / uq_table_3d.numel():.4f}")
print(f"数值范围: [{uq_table_3d.min():.4f}, {uq_table_3d.max():.4f}]")
print(f"各通道均值: {torch.mean(uq_table_3d, dim=(0,1))}")

# 如果发现数据全为0，需要检查数据加载和预处理过程

=== uq_table数据检查 ===


NameError: name 'uq_table_3d' is not defined