In [21]:
import random
import pandas as pd
import tqdm
import numpy as np

data = pd.read_csv(
    '../dataset/exam_record.csv',
    usecols=['order_id', 'student_id', 'tag_id', 'correct', 'tag_v'],encoding='GB2312'
).dropna(subset=['tag_id'])
print(data)

       order_id  student_id  tag_id  tag_v  correct
0             1          46     101  穷举与递推        0
1             2          39     101  穷举与递推        0
2             3          40     101  穷举与递推        1
3             4          41     101  穷举与递推        1
4             5          45     101  穷举与递推        0
...         ...         ...     ...    ...      ...
31539     31540         125      98     链表        0
31540     31541          24      98     链表        1
31541     31542          40      98     链表        1
31542     31543          24      98     链表        1
31543     31544         125      98     链表        0

[31544 rows x 5 columns]


In [23]:
raw_question = data.tag_id.unique().tolist()
num_skill = len(raw_question)

# question id from 0 to (num_skill - 1)
questions = { p: i for i, p in enumerate(raw_question) }
print("questions:", questions)
print("number of skills: %d" % num_skill)

questions: {'穷举与递推': 0, '基本类型变量作函数参数': 1, '分类统计': 2, '一维数组作函数参数': 3, '排序查找算法': 4, '字符串': 5, '字符数组': 6, '矩阵运算': 7, '二维数组': 8, '循环控制结构': 9, '最值计算': 10, '字符数组作函数参数': 11, '指针': 12, '数值计算': 13, '累加累乘': 14, '选择控制结构': 15, '日期转换': 16, '键盘输入和屏幕输出': 17, '函数': 18, '数据类型、运算符与表达式': 19, '递归函数': 20, '输出图形': 21, '一维数组': 22, '数组': 23, '结构体': 24, '动态数据结构': 25, '变量的作用域和存储类型': 26, '链表': 27, '共用体': 28, '流程转移控制': 29, '字符指针': 30}
number of skills: 31


In [18]:
def parse_all_seq(students):
    all_sequences = []
    for student_id in tqdm.tqdm(students, 'parse student sequence:\t'):
        student_sequence = parse_student_seq(data[data.student_id == student_id])
        all_sequences.extend([student_sequence])
    return all_sequences


def parse_student_seq(student):

    seq = student.sort_values('order_id')
    q = [questions[q] for q in seq.tag_id.tolist()]
    a = seq.correct.tolist()
    return q, a

# [(question_sequence_0, answer_sequence_0), ..., (question_sequence_n, answer_sequence_n)]
sequences = parse_all_seq(data.student_id.unique())
with open('../dataset/sequences.txt', 'a', encoding='utf8') as f:
    for seq in tqdm.tqdm(sequences, 'write into file: '):
            print("seq:", seq)
            questions, answers = seq
            seq_len = len(questions)
            f.write(str(seq_len) + '\n')
            f.write(','.join([str(q) for q in questions]) + '\n')
            f.write(','.join([str(a) for a in answers]) + '\n')

parse student sequence:	: 100%|██████████| 117/117 [00:00<00:00, 1907.49it/s]


答题序列数: 117
整个数据中共有学生: 117


write into file: 100%|██████████| 117/117 [00:00<00:00, 5865.81it/s]

seq: ([0, 1, 2, 3, 0, 9, 1, 10, 3, 2, 5, 11, 5, 11, 7, 8, 12, 1, 14, 12, 3, 4, 5, 17, 13, 15, 17, 15, 13, 9, 9, 0, 10, 1, 14, 1, 14, 1, 20, 3, 2, 4, 3, 4, 4, 10, 10, 3, 3, 22, 25, 17, 13, 2, 13, 3, 15, 5, 11, 0, 9, 11, 5, 7, 8, 15, 21, 9, 9, 9, 15, 13, 9, 21, 9, 0, 22, 5, 12, 9, 14, 7, 8, 3, 0, 0, 9, 0, 9, 3, 23, 15, 15, 8, 22, 8, 22, 8, 22, 22, 12, 0, 3, 22, 7, 8, 7, 8, 7, 8, 7, 8, 7, 8, 25, 17, 13, 15, 20, 15, 15, 15, 15, 15, 9, 0, 4, 22, 5, 11, 11, 5, 12, 12, 25, 14, 9, 9, 14, 14, 9, 9, 14, 9, 14, 9, 9, 14, 1, 14, 20, 26, 14, 3, 23, 13, 9, 15, 14, 22, 0, 9, 0, 9, 11, 5, 12, 5, 27, 27, 13, 19, 0, 9, 9, 0, 5, 11, 9, 25, 5, 15, 13, 13, 5, 5, 12, 8, 15, 0, 9, 7, 8, 11, 5, 5, 11, 10, 8, 5, 11, 20, 12, 25, 9, 0, 14, 29, 11, 5, 9, 29, 5, 15, 22, 7, 8, 25, 30, 12, 5, 5, 11, 25, 25, 22, 22, 12, 12, 16, 12, 19, 13, 15, 9, 14, 9, 14, 14, 9, 0, 9, 14, 9, 9, 21, 20, 20, 20, 8, 8, 8, 8, 8, 8, 19, 13, 15, 4, 22, 9, 14, 0, 9, 9, 15, 13, 9, 5, 22, 0, 0, 9, 4, 12, 4, 12, 12, 10, 0, 9, 8, 13, 22, 4, 1




In [19]:
def train_test_split(data, train_size=.7, shuffle=True):
    if shuffle:
        random.shuffle(data)
    boundary = round(len(data) * train_size)
    return data[: boundary], data[boundary:]


train_sequences, test_sequences = train_test_split(sequences)
print(len(train_sequences))
print(len(test_sequences))
print(train_sequences)

82
35
[([13, 17, 15, 17, 15, 15, 15, 15, 14, 1, 20, 9, 15, 17, 21, 9, 0, 22, 20, 0, 3, 17, 13, 20, 15, 13, 17, 17, 13, 19, 13, 13, 19, 17, 17, 15, 15, 19, 14, 9, 9, 14, 20, 20, 20, 3, 23, 13, 9, 15, 13, 19, 13, 19, 12, 7, 8, 29, 14, 13, 17, 17, 13, 9, 29, 19, 13, 15, 2, 22, 14, 9, 20, 8, 8, 19, 13, 9, 29, 13, 0, 9, 13, 9, 9, 0, 9, 0, 4, 12, 19, 13, 13, 17, 17, 13, 15, 21, 9, 22, 15, 26, 24, 15, 19, 15, 19, 9, 0, 9, 17, 17, 21, 9, 0, 18, 5, 17, 5, 15, 0, 18, 5, 5, 6, 5, 6], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]), ([7, 8, 10, 3, 11, 5, 13, 0, 9, 5, 11, 3, 4, 10, 3, 9, 0, 14, 1, 13, 17, 13, 13, 19, 19, 17, 13, 15, 17, 15, 15, 15, 15, 15, 15, 15, 15, 1

In [11]:
def sequences2tl(sequences, trgpath):
    with open(trgpath, 'a', encoding='GB2312') as f:
        for seq in tqdm.tqdm(sequences, 'write into file: '):
            print("seq:", seq)
            questions, answers = seq
            seq_len = len(questions)
            f.write(str(seq_len) + '\n')
            f.write(','.join([str(q) for q in questions]) + '\n')
            f.write(','.join([str(a) for a in answers]) + '\n')
    f.close()
# save triple line format for other tasks
sequences2tl(train_sequences, '../dataset/train.txt')
sequences2tl(test_sequences, '../dataset/test.txt')

write into file: 100%|██████████| 82/82 [00:00<00:00, 4836.50it/s]


seq: ([13, 11, 5, 12, 5, 13, 17, 19, 13, 17, 13, 15, 17, 15, 9, 14, 14, 1, 3, 10, 4, 7, 8, 8, 23, 3, 3, 10, 5, 5, 5, 5, 11, 11, 3, 12, 12, 12, 24, 25, 3, 2, 13, 0, 9, 11, 5, 9, 14, 1, 22, 1, 25, 25, 20, 7, 8, 0, 9, 8, 22, 0, 9, 4, 22, 7, 8, 20, 9, 14, 17, 13, 17, 13, 5, 5, 11, 11, 11, 5, 12, 12, 12, 25, 13, 19, 13, 17, 13, 17, 17, 17, 17, 13, 19, 14, 9, 14, 9, 9, 14, 3, 4, 4, 23, 13, 3, 12, 6, 5, 5, 11, 7, 8, 13, 19, 12, 5, 5, 11, 5, 6, 15, 13, 17, 9, 14, 12, 12, 12, 8, 9, 9, 5, 11, 5, 11, 13, 17, 17, 17, 13, 5, 5, 14, 9, 4, 4, 22, 13, 4, 3, 5, 30, 6, 25, 30, 12, 11, 5, 5, 10, 8, 12, 12, 8, 8, 8, 16, 12, 19, 13, 15, 9, 14, 20, 1, 9, 1, 12, 12, 17, 14, 9, 20, 20, 8, 8, 8, 8, 0, 18, 22, 13, 2, 9, 13, 18, 13, 9, 9, 0, 13, 18, 4, 12, 10, 12, 17, 13, 17, 13, 12, 12, 12, 12, 13, 18, 13, 18, 5, 18, 12, 18, 9, 24, 13, 17, 13, 5, 5, 12, 0, 18, 2, 18, 13, 18, 21, 18, 15, 18, 17, 22, 3, 21, 9, 0, 18, 0, 18, 0, 18, 0, 18, 0, 18, 5, 17, 25, 25, 5, 15, 1, 0, 0, 18, 5, 13, 5, 11], [1, 1, 1, 1, 1, 1, 

write into file: 100%|██████████| 35/35 [00:00<00:00, 4379.23it/s]

seq: ([0, 9, 15, 13, 17, 13, 19, 17, 13, 15, 17, 15, 15, 15, 15, 15, 15, 15, 15, 13, 9, 9, 14, 13, 0, 3, 17, 17, 13, 13, 0, 9, 15, 0, 9, 2, 3, 15, 13, 17, 17, 13, 17, 15, 15, 9, 15, 15, 13, 9, 15, 17, 7, 8, 20, 20, 9, 14, 0, 9, 22, 0, 9, 17, 13, 15, 15, 15, 15, 9, 13, 17, 17, 17, 13, 17, 13, 15, 19, 13, 15, 15, 15, 15, 15, 9, 0, 19, 13, 13, 17, 13, 17, 17, 17, 19, 13, 15, 15, 15, 19, 15, 9, 14, 14, 9, 9, 14, 1, 14, 16, 8, 8, 12, 14, 22, 0, 9, 13, 19, 13, 19, 9, 13, 17, 9, 15, 13, 15, 15, 15, 15, 20, 9, 0, 9, 0, 14, 29, 13, 17, 17, 17, 13, 4, 15, 17, 4, 4, 22, 10, 8, 12, 8, 8, 19, 13, 15, 15, 15, 9, 14, 20, 1, 13, 19, 17, 15, 15, 9, 21, 20, 8, 8, 19, 13, 15, 9, 15, 9, 14, 9, 14, 9, 9, 14, 7, 8, 29, 13, 13, 17, 13, 9, 2, 9, 7, 8, 15, 13, 9, 0, 9, 21, 9, 19, 13, 17, 13, 17, 13, 13, 17, 17, 13, 13, 17, 13, 17, 15, 15, 15, 26, 9, 18, 15, 15, 19, 17, 13, 15, 19, 15, 15, 9, 2, 0, 9, 15, 4, 22, 4, 4, 22, 18, 15, 17, 13, 19, 17, 17, 5, 6, 5, 17, 15, 15, 17, 9, 0, 5, 15, 15, 9, 14, 15, 0, 18, 4,




In [20]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
MAX_STEP = 50
NUM_QUESTIONS = num_skill


def encode_onehot(sequences, max_step, num_questions):
    result = []
    count = 0
    for q, a in tqdm.tqdm(sequences, 'convert to one-hot format: '):
        length = len(q)
        print('length:', length)
        count = count +1

        mod = 0 if length % max_step == 0 else (max_step - length % max_step)
        print(mod)
        onehot = np.zeros(shape=[length + mod, 2 * num_questions])
        print(onehot.shape)
        for i, q_id in enumerate(q):
            index = int(q_id if a[i] > 0 else q_id + num_questions)
            onehot[i][index] = 1
        result = np.append(result, onehot)
    print(result.shape)
    print(count)
    return result.reshape(-1, max_step, 2 * num_questions)


# reduce the amount of data for example running faster
percentage = 1

train_data = encode_onehot(train_sequences[: int(len(train_sequences) * percentage)], MAX_STEP, NUM_QUESTIONS)
print(test_sequences[: int(len(test_sequences) * percentage)])
test_data = encode_onehot(test_sequences[: int(len(test_sequences) * percentage)], MAX_STEP, NUM_QUESTIONS)
print(test_data.shape[0])



convert to one-hot format: 100%|██████████| 82/82 [00:00<00:00, 492.33it/s]


length: 127
23
(150, 62)
length: 426
24
(450, 62)
length: 309
41
(350, 62)
length: 325
25
(350, 62)
length: 405
45
(450, 62)
length: 35
15
(50, 62)
length: 160
40
(200, 62)
length: 230
20
(250, 62)
length: 404
46
(450, 62)
length: 299
1
(300, 62)
length: 304
46
(350, 62)
length: 208
42
(250, 62)
length: 398
2
(400, 62)
length: 18
32
(50, 62)
length: 389
11
(400, 62)
length: 399
1
(400, 62)
length: 231
19
(250, 62)
length: 265
35
(300, 62)
length: 286
14
(300, 62)
length: 257
43
(300, 62)
length: 310
40
(350, 62)
length: 347
3
(350, 62)
length: 290
10
(300, 62)
length: 311
39
(350, 62)
length: 271
29
(300, 62)
length: 400
0
(400, 62)
length: 220
30
(250, 62)
length: 144
6
(150, 62)
length: 420
30
(450, 62)
length: 140
10
(150, 62)
length: 190
10
(200, 62)
length: 198
2
(200, 62)
length: 430
20
(450, 62)
length: 290
10
(300, 62)
length: 153
47
(200, 62)
length: 245
5
(250, 62)
length: 117
33
(150, 62)
length: 288
12
(300, 62)
length: 374
26
(400, 62)
length: 229
21
(250, 62)
length: 298


convert to one-hot format:   0%|          | 0/35 [00:00<?, ?it/s]

length: 260
40
(300, 62)
length: 275
25
(300, 62)
length: 439
11
(450, 62)
length: 177
23
(200, 62)
length: 139
11
(150, 62)
length: 249
1
(250, 62)
length: 569
31
(600, 62)
length: 383
17
(400, 62)
length: 481
19
(500, 62)
length: 231
19
(250, 62)
length: 384
16
(400, 62)
length: 170
30
(200, 62)
length: 409
41
(450, 62)
length: 473
27
(500, 62)
length: 226
24
(250, 62)
length: 345
5
(350, 62)
length: 408
42
(450, 62)
length: 601
49
(650, 62)
length: 309
41
(350, 62)
length: 17
33
(50, 62)


convert to one-hot format: 100%|██████████| 35/35 [00:00<00:00, 1002.70it/s]

length: 265
35
(300, 62)
length: 522
28
(550, 62)
length: 5
45
(50, 62)
length: 269
31
(300, 62)
length: 7
43
(50, 62)
length: 226
24
(250, 62)
length: 3
47
(50, 62)
length: 469
31
(500, 62)
length: 150
0
(150, 62)
length: 245
5
(250, 62)
length: 348
2
(350, 62)
length: 153
47
(200, 62)
length: 153
47
(200, 62)
length: 444
6
(450, 62)
length: 159
41
(200, 62)
(675800,)
35
218





In [28]:
# save onehot data
np.save('../dataset/train_data.npy', train_data)
np.save('../dataset/test_data.npy', test_data)

In [29]:
train_data = np.load('../dataset/train_data.npy')
print(train_data.shape[0],train_data.shape[1])

500 50
