In [33]:
from EduData import get_data
import os

# Fetch the ASSISTments 2017 release only when its CSV is not already on disk.
dataset_csv = '../../data/anonymized_full_release_competition_dataset/anonymized_full_release_competition_dataset.csv'
dataset_present = os.path.exists(dataset_csv)
if not dataset_present:
    get_data("assistment-2017", "../../data")

In [34]:
import pandas as pd
import tqdm

# Name of the dataset directory under ../../data; the CSV inside it has the same stem.
pathname = 'anonymized_full_release_competition_dataset'
# pathname = 'lianxi'  # alternative dataset directory

csv_path = '../../data/' + pathname + '/' + pathname + '.csv'
wanted_columns = ['startTime', 'timeTaken', 'studentId', 'skill', 'problemId', 'correct']

# Read only the needed columns, discard rows without a skill or a problem ID,
# and order all interactions by their start time.
data = (
    pd.read_csv(csv_path, usecols=wanted_columns)
    .dropna(subset=['skill', 'problemId'])
    .sort_values('startTime')
)

# Answer times are used as dictionary keys later on, so store them as integers.
data['timeTaken'] = data['timeTaken'].astype(int)

In [35]:
import json

# Build the skill vocabulary. A row's `skill` field may name several skills
# separated by commas (assumption carried over from the original notebook),
# so each field is split and the parts are pooled into one set.
all_skills = set()
for skill_str in data.skill:
    all_skills.update(skill_str.split(','))

# Sort before numbering: the iteration order of a set of strings changes from
# one interpreter session to the next (string hashes are randomized), which
# would give every run a different skill -> ID mapping and make the saved
# knowledge2id.json disagree with files produced by another run.
skills = sorted(all_skills)

problems = data.problemId.unique().tolist()
at = data.timeTaken.unique()

# Skill IDs and problem IDs start at 1; answer-time IDs start at 0.
skill2id = {p: i + 1 for i, p in enumerate(skills)}
# Persist the skill -> ID mapping as JSON.
with open('knowledge2id.json', 'w') as json_file:
    json.dump(skill2id, json_file)
problem2id = {p: i + 1 for i, p in enumerate(problems)}
at2id = {a: i for i, a in enumerate(at)}

print("number of skills: %d" % len(skills))
print("number of problems: %d" % len(problems))
print("number of answer time: %d" % len(at))

number of skills: 102
number of problems: 3162
number of answer time: 1326


In [36]:
import numpy as np

# Upper bound applied to every interval before it is recorded.
# NOTE(review): if startTime is expressed in seconds, the `// 60` below yields
# minutes and 43200 minutes equals 30 days - confirm the unit against the dataset.
MAX_INTERVAL = 43200

# Collect the distinct (clamped) gaps between consecutive interactions of the
# same student. Only an ID per distinct gap is kept, not the gap of every row.
# A single groupby pass replaces filtering the whole frame once per student;
# sort=False keeps students in order of first appearance, and rows inside each
# group keep the frame's row order.
it = set()
for _, start_times in data.groupby('studentId', sort=False)['startTime']:
    gaps = np.diff(start_times.to_numpy()) // 60
    it.update(np.minimum(gaps, MAX_INTERVAL).tolist())

it2id = {a: i for i, a in enumerate(it)}
print("number of interval time: %d" % len(it))

number of interval time: 2839


In [37]:
# Map every problem ID to the list of skill IDs attached to it.
# A row's `skill` field may name several comma-separated skills (assumption
# carried over from the original notebook), so each problem maps to a list of
# IDs rather than to a single ID. When a problem appears on several rows, the
# skills of the last such row are the ones that remain in the mapping.
problem2skill = {}
# The loop variables are deliberately not called `skills`: that name holds the
# notebook-level skill vocabulary and must not be replaced by a row's string.
for skill_str, problem in zip(np.array(data.skill), np.array(data.problemId)):
    problem2skill[problem2id[problem]] = [skill2id[name] for name in skill_str.split(',')]

# The mapping is stored as the text representation of the dict.
with open('../../data/' + pathname + '/problem2skill', 'w', encoding='utf-8') as f:
    f.write(str(problem2skill))
    

In [45]:
def parse_all_seq(students):
    """Encode the interaction log of every listed student.

    For each entry of ``students`` the rows of the notebook-level ``data``
    frame whose ``studentId`` equals that entry are handed to
    ``parse_student_seq``.

    Args:
        students: iterable of student IDs.

    Returns:
        A list holding one ``parse_student_seq`` result per student, in the
        order the students were given.
    """
    progress = tqdm.tqdm(students, 'parse student sequence:\t')
    return [
        parse_student_seq(data[data.studentId == student_id], student_id)
        for student_id in progress
    ]


# Convert one student's answer log into parallel sequences of IDs.
def parse_student_seq(student, student_id):
    """Encode a single student's rows as ID sequences.

    Args:
        student: DataFrame with that student's rows; the columns ``skill``,
            ``correct``, ``problemId``, ``startTime`` and ``timeTaken`` are read.
        student_id: identifier passed through unchanged as the first element
            of the result.

    Returns:
        Tuple ``(student_id, skill_ids, answers, problem_ids, interval_ids,
        answer_time_ids)``. ``interval_ids`` starts with 0 for the first row.
    """
    # Only the first skill named in each row is encoded here. The complete
    # skill list of a problem is available from the problem2skill file; this
    # sequence is kept so the downstream format stays as it was.
    skill_ids = [skill2id[names.split(',')[0]] for names in student.skill.tolist()]
    answers = student.correct.tolist()
    problem_ids = [problem2id[pid] for pid in student.problemId.tolist()]

    start_times = np.array(student.startTime)
    interval_ids = [0]
    # Gap between each row and the one before it, integer-divided by 60 and
    # capped at 43200 before being looked up in it2id.
    for previous, current in zip(start_times[:-1], start_times[1:]):
        gap = min((current - previous) // 60, 43200)
        interval_ids.append(it2id[gap])

    answer_time_ids = [at2id[int(taken)] for taken in student.timeTaken.tolist()]
    return student_id, skill_ids, answers, problem_ids, interval_ids, answer_time_ids


# Encode the log of every student that occurs in the data.
student_ids = data.studentId.unique()
sequences = parse_all_seq(student_ids)
# Illustration of the resulting structure:
# sequences = [
#     (
#         studentId,
#         [1, 2, 3],        # skill-ID sequence
#         [1, 0, 1],        # correctness sequence
#         [5, 6, 7],        # problem-ID sequence
#         [0, 15, 30],      # interval-time sequence
#         [120, 180, 240]   # answer-time sequence
#     ),
#     # ... sequences of the other students
# ]


parse student sequence:	: 100%|██████████| 1709/1709 [00:05<00:00, 340.62it/s]


In [46]:
from sklearn.model_selection import train_test_split, KFold
# Hold out 20% of the student sequences as test data (fixed random_state).
train_data, test_data = train_test_split(sequences, test_size=0.2, random_state=10)
# Wrap both parts in object arrays so they can be indexed with index arrays later.
train_data, test_data = (
    np.array(part, dtype=object) for part in (train_data, test_data)
)

In [47]:
def sequences2l(sequences, trg_path):
    """Write student sequences to a text file, seven lines per student.

    Lines per student, in order: sequence length, student ID, then the skill,
    answer, problem, interval-time and answer-time sequences, each as
    comma-separated values.

    Args:
        sequences: iterable of 6-item records
            ``(studentId, s_seq, a_seq, p_seq, it_seq, at_seq)``.
        trg_path: path of the file to create; an existing file is overwritten.
    """
    def as_line(values):
        # Comma-joined string forms of the values, newline-terminated.
        return ','.join(map(str, values)) + '\n'

    with open(trg_path, 'w', encoding='utf8') as out:
        for record in tqdm.tqdm(sequences, 'write data into file: %s' % trg_path):
            studentId, s_seq, a_seq, p_seq, it_seq, at_seq = record
            out.write(str(len(s_seq)) + '\n')
            out.write(str(studentId) + '\n')
            for column in (s_seq, a_seq, p_seq, it_seq, at_seq):
                out.write(as_line(column))

# Alternative kept for reference: 5-fold cross-validation over train_data.
# kfold = KFold(n_splits=5, shuffle=True, random_state=10)
# idx = 0
# for train_data_1, valid_data in kfold.split(train_data):
#     sequences2l(train_data[train_data_1], '../../data/' + pathname + '/train' + str(idx) + '.txt')
#     sequences2l(train_data[valid_data], '../../data/' + pathname + '/valid' + str(idx) + '.txt')
#     idx += 1

# Share of train_data that goes to the training file; the rest is validation.
train_ratio = 0.8
num_train = int(len(train_data) * train_ratio)

# Pick the rows through a seeded permutation of positions instead of shuffling
# train_data in place. An in-place shuffle makes this cell non-idempotent:
# running it a second time would reshuffle the already shuffled array and
# produce a different train/valid split despite the fixed seed.
# NOTE(review): on a fresh run this is expected to give the same order as
# seeding the global RNG with 10 and shuffling in place - confirm if exact
# continuity with previously generated files matters.
shuffled_positions = np.random.RandomState(10).permutation(len(train_data))

train_data_1 = train_data[shuffled_positions[:num_train]]
valid_data = train_data[shuffled_positions[num_train:]]

# Write the three splits next to the source CSV.
sequences2l(train_data_1, '../../data/' + pathname + '/train.txt')
sequences2l(valid_data, '../../data/' + pathname + '/valid.txt')

sequences2l(test_data, '../../data/' + pathname + '/test.txt')


write data into file: ../../data/anonymized_full_release_competition_dataset/train.txt:  28%|██▊       | 310/1093 [00:00<00:00, 1551.61it/s]

6507
6697
6866
843
6048
3912
5245
2646
3817
5653
530
1159
7417
7294
3334
4032
6459
2161
7116
251
7649
4341
636
2493
498
7583
3862
5116
2498
37
413
1313
834
993
1858
6482
2533
6411
3645
6421
2607
4767
7542
1635
5431
5574
3007
6129
4848
4104
372
1538
243
2703
3999
1627
938
4693
4981
3098
3377
6364
5386
4562
11
3573
4331
7228
6476
1800
2999
631
1108
351
1007
4255
3027
2425
6513
5778
7015
6400
4390
5705
7292
3961
2489
2435
7310
3362
7505
7297
4145
3109
3899
4359
4253
6215
900
3207
922
5444
5605
4846
3688
4164
492
4006
5590
4635
6230
1305
743
4962
5660
828
4315
2772
3863
7455
6731
5213
1565
430
1460
811
2092
1701
1162
1135
598
3881
5295
3879
1915
4582
3272
7531
6876
3799
6584
4127
1621
7493
3567
3587
1177
3904
2021
5363
809
3744
5950
6727
2865
6511
7275
3277
3157
4956
2131
3097
3834
837
2492
2519
2420
7537
3296
2643
4453
1241
5131
2156
6311
4662
6646
3435
2403
3848
7021
7723
283
4413
1539
2589
4997
609
3128
3873
7223
3067
5529
1317
514
401
2518
6322
5580
1350
4087
4174
1380
3687
2231
6287
4

write data into file: ../../data/anonymized_full_release_competition_dataset/train.txt:  60%|██████    | 661/1093 [00:00<00:00, 1693.86it/s]

4139
7365
5388
3039
2481
7083
752
6250
4813
387
6912
1896
7398
2776
7543
1672
690
7666
7617
6029
5159
7175
6327
4915
6918
4061
3771
7308
4333
744
1962
808
6862
3727
873
6815
5246
4425
6652
3763
6183
457
6577
7782
4291
2151
2146
819
6752
2082
6950
2972
6462
3829
7769
2026
5638
2691
4388
6974
2662
726
6006
5429
1450
3307
7613
3312
5332
6131
1765
1726
2148
6069
6502
4267
1309
4510
5819
6012
4654
2655
581
5354
65
7028
6294
4905
5662
4659
3159
7737
6858
6460
3002
6869
1231
2603
4817
341
949
7073
4412
781
7532
7058
2258
4079
1983
2095
2393
6979
6125
410
3992
2339
4366
6038
6279
7224
1913
7003
5163
3816
404
3026
3866
4162
6236
4867
3061
2582
6093
5999
2900
2714
5202
4879
1079
3321
2989
5078
4302
2396
546
2288
1981
5171
7076
1818
7511
6292
7345
6626
1143
3323
4270
3266
5869
5475
2737
6480
2497
2311
2591
4685
4665
7253
134
5482
4958
417
5310
4072
6504
499
1648
847
1649
3812
2634
1124
4226
4498
7636
43
2487
7012
7651
6695
269
5051
415
3245
3791
2416
2070
6304
363
640
5870
7219
1534
4553
6603
623

write data into file: ../../data/anonymized_full_release_competition_dataset/train.txt:  77%|███████▋  | 840/1093 [00:00<00:00, 1725.04it/s]

1272
2279
4520
7022
5155
4760
2932
5222
810
3768
1408
1667
2575
5491
3467
5209
1237
2343
842
299
3843
2959
7214
3250
998
6953
1064
1459
41
161
1374
4137
7724
1576
5774
1750
7295
6453
3083
6911
1229
154
589
3790
4873
6593
2975
2192
1178
6776
4695
1044
4071
5432
5717
1598
1089
1755
838
7190
6756
5335
1172
3499
7278
6578
2213
2766
3507
6828
5628
6535
6057
2784
2755
7090
3751
7680
2319
3391
2555
2509
7730
1771
7717
4227
5954
4724
2042
3572
160
6212
3932
6270
3259
383
7728
3256
3188
7170
2895
493
2041
337
237
7501
2902
932
1054
7211
391
6326
125
2471
7775
4842
2370
4573
894
1992
4375
7629
6879
7541
6214
7745
5048
1723
5417
6308
1208
5201
5687
7586
3431
5469
892
3559
4969
459
1055
7495
7104
1622
5423
5032
1232
1375
1570
663
1117
1548
268
5387
5056
6741
3731
495
3267
4433
3837
2923
7322
6610
7525
934
7432
584
2075
4518
3759
5467
7753
538
6495
4953
4165
403
3521
501
6422
3921
1977
3395
1287
3640
5517
2225
566
7283
5254
278
1154
2762
1320
3545
3352
6758
2140
6225
6568
4595
4230
462
3436
2955
64

write data into file: ../../data/anonymized_full_release_competition_dataset/train.txt: 100%|██████████| 1093/1093 [00:00<00:00, 1654.53it/s]


848
3253
4369
2738
1631
7655
4877
6227
1823
3260
6768
6346
2453
5130
3457
3195
6778
2010
7507
2175
7174
4013
2490
3100
282
505
5240
4786
5176
5027
1952
1404
4703
376
7067
447
4461
688
3861
1892
6345
1821
476
4666
4179
6312
1016
3311
1364
5712
4088
3970
6854
3189
7449
7469
2711
756
164
7302
224
7341
5684
6826
2939
4681
4172
5966
4563
8
7508
5984
6205
2586
4074
5558
2802
1185
1781
7487
1238
5035
4040
1250
5225
7439
7046
3252
7571
504
35
6307
5576


write data into file: ../../data/anonymized_full_release_competition_dataset/valid.txt:  56%|█████▌    | 153/274 [00:00<00:00, 1528.93it/s]

4761
3315
3609
6932
27
2044
1527
2033
176
3080
5779
4440
3761
4403
2501
2216
1439
2625
3515
5625
7703
3506
2969
4658
516
1438
3620
6409
5418
3867
5122
5683
6371
523
4452
3445
2642
4884
5873
5435
542
4650
5054
937
4896
3456
7438
6621
1965
6405
7605
1972
1417
4354
7596
3982
3403
1884
3198
570
6607
520
2924
2207
3764
738
1572
2942
2233
7189
3091
2537
1372
132
5347
3455
3487
6368
4466
6419
131
6325
2269
5082
2631
3283
4675
295
5663
6605
2402
3149
2048
6785
1919
4590
3282
5745
961
7164
901
7557
7289
6005
5752
4551
887
5212
638
2864
2933
6407
696
4625
2136
2469
5572
5944
6656
3908
3892
4737
1788
3408
6760
7646
1338
3749
7155
2495
5468
3066
2620
1881
5090
2572
3860
3778
1586
1099
5036
1568
4381
2071
261
3514
5652
6309
7446
6317
1857
2661
1110
7239
2238
5656
2340
4642
6722
5810
1684
7778
6022
1670
4012
5460
6606
4651
203
5859
2210
2433
3815
2898
87
1266
2242
6694
5103
2440
3320
355
215
1951
5599
7756
4286
3933
2253
3502
321
6640
2273
6501
903
6849
1088
5976
3886


write data into file: ../../data/anonymized_full_release_competition_dataset/valid.txt: 100%|██████████| 274/274 [00:00<00:00, 1473.40it/s]


6797
5231
6298
6938
156
5462
3300
607
2775
3135
5645
7601
6084
6252
2165
4557
1046
645
4292
6857
7431
5788
4830
7178
9
1047
4524
2309
1668
496
6186
1752
5265
3069
2719
4663
6880
5760
7079
1301
6043
5205
2035
1922
5096
4483
7251
2636
5781
2842
1607
6028
3215
5259
1960
4209
5642
6916
1911
3684
826
2968
522
3343
684
6102
6010
7233
4220
6023
668
229
4358
5736
6032


write data into file: ../../data/anonymized_full_release_competition_dataset/test.txt:   0%|          | 0/342 [00:00<?, ?it/s]

802
3392
2701
1855
1337
7585
1824
3029
3872
3275
101
6218
4576
747
2697
7478
4019
2535
7635
4052
2538
3336
7604
5013
1483


write data into file: ../../data/anonymized_full_release_competition_dataset/test.txt:  44%|████▍     | 152/342 [00:00<00:00, 1489.13it/s]

4819
4475
1941
4881
5777
7093
5734
1234
2334
5166
7355
2293
2431
4096
2115
7238
5633
1045
4622
6609
83
2725
7074
7110
263
5554
6144
240
2581
7650
6315
5927
4713
3905
7609
6782
7750
2229
3700
4605
7517
2993
3772
3618
1707
349
6388
5409
3115
6226
1013
4604
3511
4457
1590
4667
7014
5832
6939
594
4547
7236
898
2297
3828
1200
2976
7598
7137
6455
1437
1905
6154
4159
6267
3929
2848
3755
2688
3089
2888
5922
7143
1762
5565
2981
3553
3967
3203
303
6001
1470
1469
6369
5593
6030
2197
6984
2819
3262
597
7660
2614
1867
2132
3206
3497
185
2147
2222
1735
5494
3787
1203
6989
6886
1826
710
5650
625
5306
4640
3233
3465
7679
5607
954
4393
4062
3348
3476
5389
2740
2831
3494
6375
739
6427
3634
5776
3910
4446
5614
33
6962
6008
7662
2839
7064
432
1764
4874
3870
6297
1599
7258
5719
7023
3410
474
968
1925
3747
2163
1294
2278
7060
2664
683
6096
6278
7731
5863
3601
6700
491
6814
972
7594
782
3485
4511
2220
7401
5105
2722
3937
7187
77
1658
126
94
1575
5741
3797
1993
1394
7473
1125
6108
616
2372
3738
1866
1776
2816

write data into file: ../../data/anonymized_full_release_competition_dataset/test.txt:  88%|████████▊ | 301/342 [00:00<00:00, 1469.37it/s]

2494
312
771
2284
6754
3346
3853
3432
6135
6615
7443
3381
364
4691
4592
7713
5744
4263
2570
1825
2003
799
7313
5673
6053
1720
2058
6027
2106
314
4542
6805


write data into file: ../../data/anonymized_full_release_competition_dataset/test.txt: 100%|██████████| 342/342 [00:00<00:00, 1474.33it/s]

3390
4876
4946
3708
1257
2764
2866
3404
2460
6661
7783
6712
4970



