In [1]:
import pandas as pd
import numpy as np
import tqdm
import os
import math
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, KFold

In [2]:
%matplotlib inline

In [3]:
data = pd.read_csv(
    'anonymized_full_release_competition_dataset.csv',
    usecols=['startTime', 'studentId', 'skill', 'problemId', 'correct']
).dropna(subset=['skill', 'problemId']).sort_values('studentId')

In [4]:
data

Unnamed: 0,studentId,skill,problemId,startTime,correct
0,8,properties-of-geometric-figures,1118,1096470301,0
695,8,pattern-finding,948,1108146052,1
696,8,pattern-finding,949,1108146072,1
697,8,pattern-finding,950,1108146089,1
698,8,pattern-finding,951,1108146096,1
...,...,...,...,...,...
942628,7783,noskill,9774,1137163799,0
942627,7783,noskill,9774,1137163662,0
942626,7783,point-plotting,9772,1137163637,1
942632,7783,noskill,9774,1137163860,1


In [5]:
def get_shortest_sequence(df):
    students = df.studentId.unique()
    shortest  = len(df[df['studentId']==8])
    the_sid  = 0
    for sid in students:
        sid_len = len(df[df['studentId']==sid])
        if sid_len < shortest:
            shortest = sid_len
            the_sid = sid
    return shortest , the_sid
shortest_sequence, s_sid = get_shortest_sequence(data)

In [6]:
def get_longest_sequence(df):
    students = df.studentId.unique()
    longest  = 0
    the_sid  = 0
    for sid in students:
        sid_len = len(df[df['studentId']==sid])
        if sid_len > longest:
            longest = sid_len
            the_sid = sid
    return longest , the_sid
longest_sequence, l_sid = get_longest_sequence(data)

In [7]:
skills = data.skill.unique().tolist()
students = data.studentId.unique().tolist()
problems = data.problemId.unique().tolist()
skill2id = { p: i+1 for i, p in enumerate(skills) }
problem2id = { p: i+1 for i, p in enumerate(problems) }
student2id = {p: i for i, p in enumerate(students)}
print("number of skills: %d" % len(skills))
print("number of problems: %d" % len(problems))
print("number of students: %d" % len(data.studentId.unique()))
print("length of the longest sequence: %d" % longest_sequence)
print("studentId of the longest sequence: %d" % l_sid)
print("length of the shortest sequence: %d" % shortest_sequence)
print("studentId of the shortest sequence: %d" % s_sid)

number of skills: 102
number of problems: 3162
number of students: 1709
length of the longest sequence: 3057
studentId of the longest sequence: 1483
length of the shortest sequence: 2
studentId of the shortest sequence: 5163


In [8]:
np.save('problem2id.npy',problem2id)
np.save('skill2id.npy',skill2id)

In [9]:
data = data.sort_values(['studentId','problemId','startTime'])

In [10]:
#Historical total attempt time
pid_try = [0 for x in range(len(data))]
plist =  np.array(data.problemId)
for i in range(1,942816):
    if plist[i] == plist[i-1]:
        pid_try[i] = pid_try[i-1]+1
    else:
        continue
data['pid_try']=pid_try

In [11]:
max(pid_try)

90

In [12]:
data = data.sort_values(['studentId','startTime'])

In [13]:
#Continuous attempt time
timestamp_try = [0 for x in range(len(data))]
plist =  np.array(data.problemId)
for i in range(1,942816):
    if plist[i] == plist[i-1]:
        timestamp_try[i] = timestamp_try[i-1]+1
    else:
        continue
data['timestamp_try']=timestamp_try

In [14]:
max(timestamp_try)

59

In [15]:
Q_matrix = np.zeros((len(problems)+1,len(skills)+1))
for s, p in zip(np.array(data.skill), np.array(data.problemId)):
    Q_matrix[problem2id[p],skill2id[s]] = 1
np.savetxt('Q_matrix.txt', Q_matrix)

In [16]:
Q_matrix_idx = np.zeros((len(problems)+1,len(skills)+1))
for s, p in zip(np.array(data.skill), np.array(data.problemId)):
    Q_matrix_idx[problem2id[p],skill2id[s]] = skill2id[s]
np.savetxt('Q_matrix_idx.txt', Q_matrix_idx)

In [17]:
#interval time
data = data.sort_values(['studentId','startTime'])
intervaltime = [1 for x in range(len(data))]
studentlist =  np.array(data.studentId)
time =  np.array(data.startTime)
for i in range(1,len(data)):
    if studentlist[i] == studentlist[i-1]:
        raw_time = (time[i]-time[i-1])//60
        if raw_time <=43200:
            if raw_time <=20:
                intervaltime[i]=1
            elif raw_time<=60:
                intervaltime[i]=2
            elif raw_time<=1440:
                intervaltime[i]=3
            elif raw_time<=10080:
                intervaltime[i]=4
            elif raw_time<=43200:
                intervaltime[i]=5
        else:
            intervaltime[i]=5
    else:
        continue
data['intervaltime']=intervaltime

In [18]:
#generate trainset to assess difficulty

train_data = data[data.studentId==8][0:800]
students.remove(8)
for s in students:
    sample_data = data[data.studentId==s]
    len_sequence = len(sample_data)
    if len_sequence>1000:
        sample_data = sample_data[0:1000]
    sample_data = sample_data[0: math.floor(0.8*len_sequence)]

    train_data = pd.concat([train_data,sample_data],  ignore_index = True)

In [19]:
train_data

Unnamed: 0,studentId,skill,problemId,startTime,correct,pid_try,timestamp_try,intervaltime
0,8,properties-of-geometric-figures,1118,1096470301,0,0,0,1
1,8,properties-of-geometric-figures,1119,1096470350,1,0,0,1
2,8,sum-of-interior-angles-more-than-3-sides,1120,1096470354,0,0,0,1
3,8,sum-of-interior-angles-more-than-3-sides,1120,1096470360,0,1,1,1
4,8,sum-of-interior-angles-more-than-3-sides,1121,1096470378,1,0,0,1
...,...,...,...,...,...,...,...,...
718021,7783,substitution,405,1147445571,1,3,3,1
718022,7783,substitution,406,1147445581,0,0,0,1
718023,7783,substitution,406,1147445591,0,1,1,1
718024,7783,substitution,406,1147445593,0,2,2,1


In [20]:
# finding the knowledge concepts that are not included in the training set.
for i in data.skill.unique().tolist():
    if i not in train_data.skill.unique().tolist():
        print(i)
print(skill2id['slope'])

slope
102


In [21]:
#get ability
def get_ability(df):
    all_ability = []
    ability = pd.DataFrame(df.studentId.unique().tolist(),columns=['studentId'])
    ability = ability.sort_values('studentId')
    p_all = len(df.problemId.unique())
    for sid in ability.studentId.unique():
        sid_data = df[df['studentId']==sid].sort_values(['problemId','startTime'])
        sequence_len= len(sid_data)
        p_len = len(sid_data.problemId.unique())
        count_correct = sid_data.correct.sum()
        g = (count_correct/sequence_len)*math.log((p_all/(p_all-p_len)),10)
        all_ability.append(g)
    ability['ability']=all_ability
    return ability

stu_ability = get_ability(train_data)
print(stu_ability)

      studentId   ability
0             8  0.035685
1             9  0.012710
2            11  0.009705
3            25  0.016779
4            27  0.001550
...         ...       ...
1704       7772  0.010926
1705       7775  0.008979
1706       7778  0.005209
1707       7782  0.006412
1708       7783  0.009319

[1709 rows x 2 columns]


In [22]:
print("min of ability: " , stu_ability.ability.min())
print("max of ability: ",stu_ability.ability.max())

min of ability:  0.0
max of ability:  0.05847667215606971


In [23]:
min_max_scaler = lambda x : (x-np.min(x))/(np.max(x)-np.min(x))
stu_ability['ability'] = stu_ability[['ability']].apply(min_max_scaler)

In [24]:
stu_ability

Unnamed: 0,studentId,ability
0,8,0.610251
1,9,0.217360
2,11,0.165959
3,25,0.286934
4,27,0.026512
...,...,...
1704,7772,0.186844
1705,7775,0.153543
1706,7778,0.089083
1707,7782,0.109651


In [25]:
train_data = pd.merge(train_data,stu_ability,how='left',on='studentId')

In [26]:
#get problem difficulty
def get_problem_difficulty(df,epsilon=0.003):
    all_difficulty = []
    problem_d = pd.DataFrame(df.problemId.unique().tolist(),columns=['problemId'])
    problem_d = problem_d.sort_values('problemId')
    stu_num = len(df.studentId.unique())
    for pid in problem_d.problemId.unique():
        pid_data = df[df['problemId']==pid].sort_values(['studentId','startTime'])
        sequence_len = len(pid_data)
        s_len = len(pid_data.studentId.unique())
        pid_data_wrong = pid_data[pid_data['correct']==0]
        sum_ability = pid_data_wrong.ability.sum()
        d = (sum_ability/sequence_len)*math.log(stu_num /(1+s_len),10)
        all_difficulty.append(d)
    problem_d['problem_difficulty']=all_difficulty
    return problem_d

In [27]:
problem_difficulty = get_problem_difficulty(train_data)

In [28]:
print("min of difficulty: " , problem_difficulty.problem_difficulty.min())
print("max of difficulty: ",problem_difficulty.problem_difficulty.max())

min of difficulty:  0.0
max of difficulty:  2.9317120670567554


In [29]:
problem_difficulty

Unnamed: 0,problemId,problem_difficulty
369,1,0.195480
1078,2,0.184821
1079,3,0.226257
1080,4,0.167115
727,5,0.115611
...,...,...
2748,22572,0.000000
2694,22646,0.000000
2691,22685,2.931712
2692,22686,0.000000


In [30]:
train_data = pd.merge(train_data,problem_difficulty,how='left',on='problemId')

In [31]:
problem_all = pd.DataFrame(data.problemId.unique().tolist(),columns=['problemId'])
#first use average difficulty value to fill df
problem_all['problem_difficulty']=problem_difficulty.problem_difficulty.mean()
for p, d in zip(problem_difficulty.problemId,problem_difficulty.problem_difficulty):
    problem_all.loc[problem_all['problemId']==p, 'problem_difficulty']=d

In [32]:
problem_difficulty

Unnamed: 0,problemId,problem_difficulty
369,1,0.195480
1078,2,0.184821
1079,3,0.226257
1080,4,0.167115
727,5,0.115611
...,...,...
2748,22572,0.000000
2694,22646,0.000000
2691,22685,2.931712
2692,22686,0.000000


In [33]:
problem_all = problem_all.sort_values('problemId')

In [34]:
problem_all

Unnamed: 0,problemId,problem_difficulty
369,1,0.195480
1217,2,0.184821
1218,3,0.226257
1219,4,0.167115
866,5,0.115611
...,...,...
2874,22686,0.000000
2875,22687,0.000000
2823,22759,0.212090
2824,22760,0.212090


In [35]:
#get knowledge concept difficulty
problem_difficulty_dict = problem_difficulty.set_index('problemId')['problem_difficulty'].to_dict()

In [36]:
Q_matrix_pdifficulty = np.zeros((len(problems)+1,len(skills)+1))
for s, p in zip(np.array(train_data.skill), np.array(train_data.problemId)):
    Q_matrix_pdifficulty[problem2id[p],skill2id[s]] = problem_difficulty_dict[p]

Q_matrix_pdifficulty = Q_matrix_pdifficulty.sum(0)

In [37]:
Q_matrix_pdifficulty

array([0.00000000e+00, 5.27571466e+00, 3.94186653e+01, 8.77726975e+00,
       2.68177728e+01, 1.05189107e+01, 6.53884968e+00, 1.15212395e+00,
       5.94712607e+00, 7.47302445e+00, 9.62911977e+00, 8.34790196e+00,
       3.83133813e+00, 1.18073304e+01, 5.94186025e+00, 2.75683202e+01,
       1.05940960e+01, 9.98613512e+00, 2.86169356e+01, 3.77987793e+00,
       1.63169078e+01, 1.70223690e+01, 1.14465555e+01, 3.06304537e+00,
       7.95383234e+00, 3.39361234e+00, 7.45719633e+00, 7.83819285e+00,
       6.63812873e+00, 1.04912557e+00, 2.93888406e+00, 1.22817284e+00,
       7.60241674e+00, 8.75771367e+00, 1.81593687e+00, 2.57356347e+00,
       7.19367913e+00, 1.27650272e+00, 9.67835664e-01, 1.99564282e+00,
       1.08837854e+00, 5.88226245e+00, 8.97900683e+00, 2.40512055e+00,
       8.71422409e+00, 2.89436121e+00, 2.48341608e-01, 9.50451313e+00,
       3.37101414e+00, 9.25826000e+00, 4.28212012e-01, 2.30174087e+00,
       4.05305206e-01, 3.35371321e+00, 3.58636049e+00, 4.76310601e+00,
      

In [38]:
Q_matrix[0][0]=1
Q_matrix_tomean = Q_matrix.sum(0)
mean_Q_matrix_pdifficulty = Q_matrix_pdifficulty/Q_matrix_tomean

In [39]:
mean_Q_matrix_pdifficulty

array([0.        , 0.1388346 , 0.18081957, 0.19505044, 0.26817773,
       0.30937973, 0.10546532, 0.14401549, 0.15249041, 0.43958967,
       0.18517538, 0.18972504, 0.18244467, 0.1639907 , 0.15235539,
       0.15753326, 0.20772737, 0.13869632, 0.24458919, 0.17999419,
       0.19658925, 0.19126257, 0.2543679 , 0.17016919, 0.17675183,
       0.16968062, 0.13809623, 0.1703955 , 0.22890099, 0.14987508,
       0.1469442 , 0.13646365, 0.16175355, 0.19461586, 0.10088538,
       0.21446362, 0.12402895, 0.10637523, 0.13826224, 0.15351099,
       0.15548265, 0.1838207 , 0.16325467, 0.09620482, 0.16137452,
       0.13782672, 0.1241708 , 0.1760095 , 0.11236714, 0.12858694,
       0.14273734, 0.12787449, 0.1013263 , 0.14581362, 0.17931802,
       0.1323085 , 0.11037457, 0.31816723, 0.19211821, 0.086956  ,
       0.16031495, 0.19298875, 0.        , 0.05781864, 0.        ,
       0.14665042, 0.        , 0.1861137 , 0.17211088, 0.14407758,
       0.20488188, 0.14523152, 0.10325393, 0.16553838, 0.14609

In [40]:
#use average value fill the knowledge concept that do not appear in the training set
mean_Q_matrix_pdifficulty[-1] = mean_Q_matrix_pdifficulty[1:-1].sum()/101

In [41]:
mean_Q_matrix_pdifficulty.reshape(-1)

array([0.        , 0.1388346 , 0.18081957, 0.19505044, 0.26817773,
       0.30937973, 0.10546532, 0.14401549, 0.15249041, 0.43958967,
       0.18517538, 0.18972504, 0.18244467, 0.1639907 , 0.15235539,
       0.15753326, 0.20772737, 0.13869632, 0.24458919, 0.17999419,
       0.19658925, 0.19126257, 0.2543679 , 0.17016919, 0.17675183,
       0.16968062, 0.13809623, 0.1703955 , 0.22890099, 0.14987508,
       0.1469442 , 0.13646365, 0.16175355, 0.19461586, 0.10088538,
       0.21446362, 0.12402895, 0.10637523, 0.13826224, 0.15351099,
       0.15548265, 0.1838207 , 0.16325467, 0.09620482, 0.16137452,
       0.13782672, 0.1241708 , 0.1760095 , 0.11236714, 0.12858694,
       0.14273734, 0.12787449, 0.1013263 , 0.14581362, 0.17931802,
       0.1323085 , 0.11037457, 0.31816723, 0.19211821, 0.086956  ,
       0.16031495, 0.19298875, 0.        , 0.05781864, 0.        ,
       0.14665042, 0.        , 0.1861137 , 0.17211088, 0.14407758,
       0.20488188, 0.14523152, 0.10325393, 0.16553838, 0.14609

In [42]:
X=5*((mean_Q_matrix_pdifficulty- np.min(mean_Q_matrix_pdifficulty))/((np.max(mean_Q_matrix_pdifficulty)+0.00001) - np.min(mean_Q_matrix_pdifficulty)))

In [43]:
int_list = []
for x in X:
    int_list.append(math.floor(x)+1)
int_Q_matrix_pdifficulty = np.array(int_list)

In [44]:
kcd_matrix = np.zeros((len(problems)+1,len(skills)+1))
for i in range(1,len(problems)+1):
    kcd_matrix[i] = int_Q_matrix_pdifficulty

In [45]:
kcd_matrix_final = kcd_matrix * Q_matrix

In [46]:
np.savetxt('Q_matrix_kcdifficulty_5.txt', kcd_matrix_final)

In [47]:
#  problem difficulty classification
l = 5
category_scaler = lambda x : ((x-np.min(x))/((np.max(x)+0.00001)-np.min(x)))*l
category_scaler_int = lambda x : math.floor(x)+1

In [48]:
problem_all['problem_difficulty_temp'] = problem_all[['problem_difficulty']].apply(category_scaler)
problem_all['problem_difficulty_temp'] = problem_all['problem_difficulty_temp'].apply(category_scaler_int)

In [49]:
data = pd.merge(data,problem_all,how='left',on='problemId')

In [50]:
# parse sequence
data  = data.sort_values(['studentId','startTime'])

In [51]:
data

Unnamed: 0,studentId,skill,problemId,startTime,correct,pid_try,timestamp_try,intervaltime,problem_difficulty,problem_difficulty_temp
0,8,properties-of-geometric-figures,1118,1096470301,0,0,0,1,0.100126,1
1,8,properties-of-geometric-figures,1119,1096470350,1,0,0,1,0.060015,1
2,8,sum-of-interior-angles-more-than-3-sides,1120,1096470354,0,0,0,1,0.064156,1
3,8,sum-of-interior-angles-more-than-3-sides,1120,1096470360,0,1,1,1,0.064156,1
4,8,sum-of-interior-angles-more-than-3-sides,1121,1096470378,1,0,0,1,0.081103,1
...,...,...,...,...,...,...,...,...,...,...
942811,7783,mean,2272,1147447464,1,3,3,1,0.157785,1
942812,7783,mean,2273,1147447468,0,0,0,1,0.176045,1
942813,7783,mean,2273,1147447539,0,1,1,1,0.176045,1
942814,7783,mean,2273,1147447543,1,2,2,1,0.176045,1


In [52]:
def filtered_parse_all_seq(students):
    all_total_train_sequences = []
    all_train_sequences = []
    all_valid_sequences = []
    all_test_sequences  = []
    for student_id in tqdm.tqdm(students,'parse studen sequence:\t'):
        len_allsequences = len(data[data['studentId']==student_id])
        if len_allsequences < 10 :
            continue
        else:
            total_train_sequences,train_sequences, valid_sequences, test_sequences = filtered_parse_tudent_seq(data[data['studentId']==student_id])
            all_total_train_sequences.extend([total_train_sequences])
            all_train_sequences.extend([train_sequences])
            all_valid_sequences.extend([valid_sequences])
            all_test_sequences.extend([test_sequences])
    return all_total_train_sequences, all_train_sequences, all_valid_sequences, all_test_sequences
    
def filtered_parse_tudent_seq(student):
    if len(student)>1000:
        student = student.iloc[:1000]
    end_train = math.floor(len(student)*0.6)
    end_valid = math.floor(len(student)*0.8)
    seq = student.sort_values('startTime')

    
    
    
    all_train_p = [problem2id[p] for p in seq.problemId.tolist()[0:end_valid]]
    all_train_ptry = seq.pid_try.tolist()[0:end_valid]
    all_train_p_d = seq.problem_difficulty_temp.tolist()[0:end_valid]
    all_train_ttry = seq.timestamp_try.tolist()[0:end_valid]
    all_train_a = seq.correct.tolist()[0:end_valid]
    all_train_sid = [student2id[p] for p in seq.studentId.tolist()[0:end_valid]]
    all_train_it = seq.intervaltime.tolist()[0:end_valid]

    
    train_p = [problem2id[p] for p in seq.problemId.tolist()[0:end_train]]
    train_ptry = seq.pid_try.tolist()[0:end_train]
    train_p_d = seq.problem_difficulty_temp.tolist()[0:end_train]
    train_ttry = seq.timestamp_try.tolist()[0:end_train]
    train_a = seq.correct.tolist()[0:end_train]
    train_sid = [student2id[p] for p in seq.studentId.tolist()[0:end_train]]
    train_it = seq.intervaltime.tolist()[0:end_train]

    
    valid_p = [problem2id[p] for p in seq.problemId.tolist()[end_train:end_valid]]
    valid_ptry = seq.pid_try.tolist()[end_train:end_valid]
    valid_p_d = seq.problem_difficulty_temp.tolist()[end_train:end_valid]
    valid_ttry = seq.timestamp_try.tolist()[end_train:end_valid]
    valid_a = seq.correct.tolist() [end_train:end_valid]
    valid_sid = [student2id[p] for p in seq.studentId.tolist()[end_train:end_valid]]
    valid_it = seq.intervaltime.tolist()[end_train:end_valid]

    
    test_p = [problem2id[p] for p in seq.problemId.tolist()[end_valid:]]
    test_ptry = seq.pid_try.tolist()[end_valid:]
    test_p_d = seq.problem_difficulty_temp.tolist()[end_valid:]
    test_ttry = seq.timestamp_try.tolist()[end_valid:]
    test_a = seq.correct.tolist()[end_valid:]
    test_sid = [student2id[p] for p in seq.studentId.tolist()[end_valid:]]
    test_it = seq.intervaltime.tolist()[end_valid:]

    
    return (all_train_p,all_train_ptry,all_train_p_d,all_train_ttry,all_train_a, all_train_sid,all_train_it),(train_p,train_ptry,train_p_d,train_ttry,train_a, train_sid, train_it),(valid_p,valid_ptry,valid_p_d,valid_ttry,valid_a, valid_sid, valid_it),(test_p,test_ptry,test_p_d,test_ttry,test_a, test_sid, test_it)

In [53]:
filtered_all_train_sequence,filtered_train_sequences, filtered_valid_sequences, filtered_test_sequences= filtered_parse_all_seq(data.studentId.unique())

parse studen sequence:	: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1709/1709 [00:05<00:00, 302.77it/s]


In [54]:
def sequences2l(sequences, trg_path):
    with open(trg_path, 'a', encoding='utf8') as f:
        for seq in tqdm.tqdm(sequences,'write data into file:%s'% trg_path):
            p_seq, ptry_seq, pd_seq , ttry_seq, a_seq ,sid_seq, it_seq= seq
            seq_len = len(p_seq)
            f.write(str(seq_len)+'\n')
            f.write(','.join([str(p) for p in p_seq]) + '\n')
            f.write(','.join([str(ptry) for ptry in ptry_seq]) + '\n')
            f.write(','.join([str(pd) for pd in pd_seq]) + '\n')
            f.write(','.join([str(ttry) for ttry in ttry_seq]) + '\n')
            f.write(','.join([str(a) for a in a_seq]) + '\n')
            f.write(','.join([str(sid) for sid in sid_seq]) + '\n')
            f.write(','.join([str(it) for it in it_seq]) + '\n')


sequences2l(filtered_all_train_sequence, 'total_train' + '_filtered_5' + '.txt')
sequences2l(filtered_train_sequences, 'train' + '_filtered_5' + '.txt')
sequences2l(filtered_valid_sequences, 'valid' + '_filtered_5' + '.txt')
sequences2l(filtered_test_sequences, 'test'+'_filtered_5.txt')

write data into file:total_train_filtered_5.txt: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1708/1708 [00:00<00:00, 2143.35it/s]
write data into file:train_filtered_5.txt: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1708/1708 [00:00<00:00, 3124.25it/s]
write data into file:valid_filtered_5.txt: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1708/1708 [00:00<00:00, 8875.44it/s]
write data into file:test_filtered_5.txt: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1