### Implementation of DKT:
#### Part 1: Define constants

In [10]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from deepkt import data_util,deepkt,metrics

In [11]:
import redis  
pool = redis.ConnectionPool(host='ubuntu1', port=6379, decode_responses=True)
r = redis.Redis(host='ubuntu1', port=6379, decode_responses=True)

In [12]:
data = []
students = r.keys()  
max_feat = 0
for student_id in students:
    try:
        li=r.lrange(student_id, 0, -1)
        if len(li) <= 1:
            continue
        di=[]
        for row in li:
            s= row.split(',')
            feat = int(s[0])+int(s[1])*2
            if feat>max_feat:
                max_feat = feat
            s.append(feat)
            di.append(s)
        df=pd.DataFrame(di,columns=['correct','skill_id','skill_with_answer'])
        df['correct']=df['correct'].astype(dtype=int)
        df['skill_id']=df['skill_id'].astype(dtype=int)
        data.append(tuple((df['skill_with_answer'].values[:-1],df['skill_id'].values[1:],df['correct'].values[1:])))
    except:
        print(student_id)
        continue
# students.remove('name')

name


In [13]:
students.remove('name')

In [14]:
seq = pd.Series(data, index=students)
nb_users = len(seq)
dataset = tf.data.Dataset.from_generator(
        generator=lambda: seq,
        output_types=(tf.int32, tf.int32, tf.float32)
    )

In [16]:
features_depth = max_feat
skill_depth = 91+1
batch_size = 32
MASK_VALUE = -1.
verbose = 1  # Verbose = {0,1,2}
best_model_weights = "weights/bestmodel"  # File to save the model.
log_dir = "logs"  # Path to save the logs.
optimizer = "adam"  # Optimizer to use
lstm_units = 50  # Number of LSTM units
epochs = 3  # Number of epochs to train
dropout_rate = 0.3 

In [79]:
features_depth

181

In [17]:
dataset = dataset.map(
        lambda feat, skill, label: (
            tf.one_hot(feat, depth=features_depth),
            tf.concat(
                values=[
                    tf.one_hot(skill, depth=skill_depth),
                    tf.expand_dims(label, -1)
                ],
                axis=-1
            )
        )
    )

In [18]:
dataset = dataset.padded_batch(
        batch_size=batch_size,
        padding_values=(MASK_VALUE, MASK_VALUE),
        padded_shapes=([None, None], [None, None]),
        drop_remainder=True
    )

In [19]:
length = nb_users // batch_size

In [20]:
test_fraction = 0.1
validation_fraction =0.1
train_set, test_set, val_set = data_util.split_dataset(dataset=dataset,
                                                           total_size=length,
                                                           test_fraction=test_fraction,
                                                           val_fraction=validation_fraction)

In [21]:
nb_skills =skill_depth
nb_features = features_depth
set_sz = length * batch_size
test_set_sz = (set_sz * test_fraction)
val_set_sz = (set_sz - test_set_sz) * validation_fraction
train_set_sz = set_sz - test_set_sz - val_set_sz
print("============= Data Summary =============")
print("Total number of students: %d" % set_sz)
print("Training set size: %d" % train_set_sz)
print("Validation set size: %d" % val_set_sz)
print("Testing set size: %d" % test_set_sz)
print("Number of skills: %d" % nb_skills)
print("Number of features in the input: %d" % nb_features)
print("========================================")

Total number of students: 18432
Training set size: 14929
Validation set size: 1658
Testing set size: 1843
Number of skills: 92
Number of features in the input: 181


In [22]:
student_model = deepkt.DKTModel(
        nb_features=nb_features,
        nb_skills=nb_skills,
        hidden_units=lstm_units,
        dropout_rate=dropout_rate)

In [23]:
student_model.compile(
        optimizer=optimizer,
        metrics=[
            metrics.BinaryAccuracy(),
            metrics.AUC(),
            metrics.Precision(),
            metrics.Recall()
        ])

In [37]:
print(student_model.summary())

Model: "DKTModel"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          [(None, None, 181)]       0         
_________________________________________________________________
masking_1 (Masking)          (None, None, 181)         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, None, 50)          46400     
_________________________________________________________________
outputs (TimeDistributed)    (None, None, 92)          4692      
Total params: 51,092
Trainable params: 51,092
Non-trainable params: 0
_________________________________________________________________
None


In [38]:
epochs = 10
history = student_model.fit(dataset=train_set,
                                epochs=epochs,
                                verbose=verbose,
                                validation_data=val_set,
                                callbacks=[
                                    tf.keras.callbacks.CSVLogger(f"{log_dir}/train.log"),
                                    tf.keras.callbacks.ModelCheckpoint(best_model_weights,
                                                                       save_best_only=True,
                                                                       save_weights_only=True),
                                    tf.keras.callbacks.TensorBoard(log_dir=log_dir)
                                ])

Epoch 1/10
    465/Unknown - 11s 23ms/step - loss: 0.0826 - binary_accuracy: 0.9799 - auc_1: 0.9911 - precision_1: 0.9119 - recall_1: 0.9892

TypeError: evaluate() got an unexpected keyword argument 'x'

In [25]:
student_model.load_weights(best_model_weights)
result = student_model.evaluate(test_set, verbose=verbose)



In [52]:
data = []
student_id = 'U_8033986'
li=r.lrange(student_id, 0, -1)
if len(li) <= 1:
    print("没有做题记录，无法预测")
di=[]
for row in li:
    s= row.split(',')
    feat = int(s[0])+int(s[1])*2
    if feat>max_feat:
        max_feat = feat
    s.append(feat)
    di.append(s)
df=pd.DataFrame(di,columns=['correct','skill_id','skill_with_answer'])
df['correct']=df['correct'].astype(dtype=int)
df['skill_id']=df['skill_id'].astype(dtype=int)
data.append(tuple((df['skill_with_answer'].values[:-1],df['skill_id'].values[1:],df['correct'].values[1:])))
data

[(array([  3, 129, 101,  65, 129,   3, 101], dtype=int64),
  array([64, 50, 32, 64,  1, 50, 32]),
  array([1, 1, 1, 1, 1, 1, 1]))]

In [64]:
seq = pd.Series(data, index=[student_id])
nb_users = len(seq)
dataset = tf.data.Dataset.from_generator(
        generator=lambda: seq,
        output_types=(tf.int32, tf.int32, tf.float32)
)

In [65]:
features_depth = max_feat
skill_depth = 91+1
batch_size = 32
MASK_VALUE = -1.
verbose = 1  # Verbose = {0,1,2}
best_model_weights = "weights/bestmodel"  # File to save the model.
log_dir = "logs"  # Path to save the logs.
optimizer = "adam"  # Optimizer to use
lstm_units = 50  # Number of LSTM units
epochs = 3  # Number of epochs to train
dropout_rate = 0.3 

In [66]:
dataset = dataset.map(
        lambda feat, skill, label: (
            tf.one_hot(feat, depth=features_depth),
            tf.concat(
                values=[
                    tf.one_hot(skill, depth=skill_depth),
                    tf.expand_dims(label, -1)
                ],
                axis=-1
            )
        )
    )

In [67]:
dataset = dataset.padded_batch(
        batch_size=1,
        padding_values=(MASK_VALUE, MASK_VALUE),
        padded_shapes=([None, None], [None, None]),
        drop_remainder=True
    )

In [68]:
type(dataset)

tensorflow.python.data.ops.dataset_ops.PaddedBatchDataset

In [69]:
# 测试单个学生
student_id = 


SyntaxError: invalid syntax (<ipython-input-69-8e1cc0bbe2ee>, line 2)

In [71]:
p =student_model.predict(dataset)

In [76]:
p[0][-1]

array([0.92451835, 0.9864709 , 0.03341067, 0.9616207 , 0.6442864 ,
       0.9507751 , 0.51043624, 0.58850646, 0.97118694, 0.5627289 ,
       0.7308159 , 0.83594775, 0.43803224, 0.53803194, 0.89481056,
       0.8470025 , 0.597956  , 0.9635488 , 0.99976045, 0.84536135,
       0.8132259 , 0.8747394 , 0.9861593 , 0.42026725, 0.40344357,
       0.86272776, 0.9731734 , 0.45123354, 0.84522223, 0.96729696,
       0.92790276, 0.9829868 , 0.9591638 , 0.83736897, 0.8970469 ,
       0.44312084, 0.91761553, 0.3324117 , 0.4663628 , 0.40743768,
       0.9312024 , 0.30116743, 0.9342462 , 0.49234852, 0.95417786,
       0.5668284 , 0.9774703 , 0.84237736, 0.06274936, 0.6192659 ,
       0.92499316, 0.2280215 , 0.48547888, 0.08118367, 0.17802924,
       0.97667515, 0.4140955 , 0.96509796, 0.9829143 , 0.9188718 ,
       0.9383851 , 0.36076826, 0.4011748 , 0.45156395, 0.97441614,
       0.76680756, 0.3680261 , 0.88891757, 0.55423933, 0.9976928 ,
       0.892238  , 0.48494303, 0.9939953 , 0.8603007 , 0.93670

In [None]:
train1 = pd.read_json(fdir+'/problem_act_train.json',lines=True,encoding='utf-8')
train2 = pd.read_json(fdir+'/problem_act_train_2.json',lines=True,encoding='utf-8')
train = pd.concat([train1,train2],ignore_index=True).drop_duplicates()
problem_info = pd.read_json(fdir+'/problem_info.json',lines=True,encoding='utf-8')
train_data = pd.merge(train,problem_info,on='problem_id')
train_data["skill"]=train_data["concept"]
for i in range(len(train_data)):
    train_data["skill"].iloc[i] = train_data["concept"].iloc[i][0]
train_data["skill_id"]=train_data["skill"]

In [None]:
from category_encoders import OrdinalEncoder
encoder = OrdinalEncoder(cols = ['skill_id','student_id','problem_id'], 
                         handle_unknown = 'value', 
                         handle_missing = 'value').fit(train_data) 
encoded_train = encoder.transform(train_data)
encoded_train['user_id'] =encoded_train['student_id']
encoded_train['correct'] =encoded_train['label']
encoded_train=encoded_train.drop(['problem_id','student_id','label','concept','detail'],axis=1)
encoded_train.to_csv('train_data.csv',index=0,encoding='utf-8')

In [1]:
# fn = "data/ASSISTments_skill_builder_data.csv" # Dataset path
fn = "train_data.csv"
verbose = 1 # Verbose = {0,1,2}
best_model_weights = "weights/bestmodel" # File to save the model.
log_dir = "logs" # Path to save the logs.
optimizer = "adam" # Optimizer to use
lstm_units = 50 # Number of LSTM units
batch_size = 16 # Batch size
epochs = 3 # Number of epochs to train
dropout_rate = 0.3 # Dropout rate
test_fraction = 0.2 # Portion of data to be used for testing
validation_fraction = 0.2 # Portion of training data to be used for validation

#### Part 2: Pre-processing

In [2]:
from deepkt import deepkt, data_util, metrics


dataset, length, nb_features, nb_skills = data_util.load_dataset(fn=fn,
                                                                 batch_size=batch_size,
                                                                 shuffle=True)

train_set, test_set, val_set = data_util.split_dataset(dataset=dataset,
                                                       total_size=length,
                                                       test_fraction=test_fraction,
                                                       val_fraction=validation_fraction)


set_sz = length * batch_size
test_set_sz = (set_sz * test_fraction)
val_set_sz = (set_sz - test_set_sz) * validation_fraction
train_set_sz = set_sz - test_set_sz - val_set_sz
print("============= Data Summary =============")
print("Total number of students: %d" % set_sz)
print("Training set size: %d" % train_set_sz)
print("Validation set size: %d" % val_set_sz)
print("Testing set size: %d" % test_set_sz)
print("Number of skills: %d" % nb_skills)
print("Number of features in the input: %d" % nb_features)
print("========================================")

Total number of students: 13408
Training set size: 8581
Validation set size: 2145
Testing set size: 2681
Number of skills: 61
Number of features in the input: 122


#### Part 3: Building the model

In [3]:
student_model = deepkt.DKTModel(
                        nb_features=nb_features,
                        nb_skills=nb_skills,
                        hidden_units=lstm_units,
                        dropout_rate=dropout_rate)

student_model.compile(
        optimizer=optimizer,
        metrics=[
            metrics.BinaryAccuracy(),
            metrics.AUC(),
            metrics.Precision(),
            metrics.Recall()
        ])

student_model.summary()

Model: "DKTModel"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          [(None, None, 122)]       0         
_________________________________________________________________
masking (Masking)            (None, None, 122)         0         
_________________________________________________________________
lstm (LSTM)                  (None, None, 50)          34600     
_________________________________________________________________
outputs (TimeDistributed)    (None, None, 61)          3111      
Total params: 37,711
Trainable params: 37,711
Non-trainable params: 0
_________________________________________________________________


#### Part 4: Train the Model

In [None]:
import tensorflow as tf
history = student_model.fit(dataset=train_set,
                            epochs=epochs,
                            verbose=verbose,
                            validation_data=val_set,
                            callbacks=[ 
                                tf.keras.callbacks.CSVLogger(f"{log_dir}/train.log"),
                                tf.keras.callbacks.ModelCheckpoint(best_model_weights,
                                                                   save_best_only=True,
                                                                   save_weights_only=True),
                                tf.keras.callbacks.TensorBoard(log_dir=log_dir)
        ])

In [None]:
tf.__version__

#### Part 5: Load the Model with the Best Validation Loss

In [None]:
student_model.load_weights(best_model_weights)

#### Part 6: Test the Model

In [4]:
student_model = deepkt.DKTModel(
                        nb_features=nb_features,
                        nb_skills=nb_skills,
                        hidden_units=lstm_units,
                        dropout_rate=dropout_rate)

student_model.compile(
        optimizer=optimizer,
        metrics=[
            metrics.BinaryAccuracy(),
            metrics.AUC(),
            metrics.Precision(),
            metrics.Recall()
        ])
student_model.load_weights('./weights/bestmodel')
result = student_model.evaluate(test_set, verbose=verbose)



In [None]:
student_model