<a href="https://colab.research.google.com/github/wwells/dkt/blob/master/notebooks/torch_DKT_ww.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive 
drive.mount('/content/gdrive')

import pandas as pd
import torch
import torch.nn.functional as F
import math
import numpy as np
import os 

Mounted at /content/gdrive


## Getting Working Env Ready

These sections are used to prepare and clone our recommender-infra repo.   Be sure not to check in any commits with your GITHUB_PAT.   

Example usecase: 
https://medium.com/analytics-vidhya/how-to-use-google-colab-with-github-via-google-drive-68efb23a42d

In [2]:
# get our working env ready
HOME = '/content/drive/MyDrive'
GITHUB_DIR = HOME + '/Github'
NDA_RECOMMENDER_INFRA = GITHUB_DIR + '/recommender-infra'

if not os.path.exists(GITHUB_DIR):
   os.makedirs(GITHUB_DIR)

In [5]:
# IMPORTANT:  DO NOT commit this notebook with your GITHUB_PAT checked in.   
USERNAME = 'wwells'
GITHUB_PAT = 'somestring'

In [6]:
%cd $GITHUB_DIR
!git clone https://$GITHUB_PAT@github.com/khan-nda/recommender-infra.git
%cd $NDA_RECOMMENDER_INFRA

/content/drive/MyDrive/Github
Cloning into 'recommender-infra'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 6 (delta 0), reused 6 (delta 0), pack-reused 0[K
Unpacking objects: 100% (6/6), done.
/content/drive/MyDrive/Github/recommender-infra


In [8]:
%pwd

'/content/drive/MyDrive/Github/recommender-infra'

## Checking in any commits



## Modeling

In [None]:
inter_df = pd.read_csv('gdrive/My Drive/interactions.csv', sep=',') # example data from HawkesKT repo 
inter_df

In [None]:
# aggregate data by user 
max_step=50 # caps the sequences
user_wise_dict = dict()
cnt, n_inters = 0, 0
for user, user_df in inter_df.groupby('user_id'):
            df = user_df[:max_step]  # consider the first 50 interactions
            user_wise_dict[cnt] = {
                'user_id': user,
                'skill_seq': df['skill_id'].values.tolist(),
                'correct_seq': [round(x) for x in df['correct']],
                'time_seq': df['timestamp'].values.tolist(),
                'problem_seq': df['problem_id'].values.tolist()
            }
            cnt += 1
            n_inters += len(df)
user_seq_df = pd.DataFrame.from_dict(user_wise_dict, orient='index')
user_seq_df # this is the same as our seq step

n_users = max(inter_df['user_id'].values) + 1
n_skills = max(inter_df['skill_id']) + 1
n_problems = max(inter_df['problem_id']) + 1
user_seq_df

Unnamed: 0,user_id,skill_seq,correct_seq,time_seq,problem_seq
0,0,"[0, 0, 0, 1, 1, 1]","[1, 1, 1, 0, 0, 1]","[0, 80, 129, 130, 220, 260]","[0, 1, 2, 3, 4, 5]"
1,1,"[0, 0, 0, 0, 1, 1, 1]","[1, 1, 1, 1, 0, 1, 0]","[0, 40, 80, 120, 200, 250, 280]","[0, 1, 2, 3, 4, 5, 6]"


In [None]:
# train test split, although more like setup for k-fold cross-v
# data_df['dev'] is validation data
# here we take a fifth of the available data and reserve it as test
# then we take 10% of the remaining data as validation 
# the rest is training data. 
# all this is saved as 
k_fold=5
data_df = {
            'train': pd.DataFrame(), 'dev': pd.DataFrame(), 'test': pd.DataFrame()
        }

# def gen_fold_data(self, k):
# assert k < k_fold
k=0
n_examples = len(user_seq_df)
fold_size = math.ceil(n_examples / k_fold) # say we have 100 examples then this is 100/5 = 20 rows in each fold
fold_begin = k * fold_size # ??
fold_end = min((k + 1) * fold_size, n_examples)
data_df['test'] = user_seq_df.iloc[fold_begin:fold_end]
residual_df = pd.concat([user_seq_df.iloc[0:fold_begin], user_seq_df.iloc[fold_end:n_examples]])
dev_size = int(0.1 * len(residual_df))
dev_indices = np.random.choice(residual_df.index, dev_size, replace=False)  # random
data_df['dev'] = user_seq_df.iloc[dev_indices]
data_df['train'] = residual_df.drop(dev_indices)
#logging.info('# Train: {}, # Dev: {}, # Test: {}'.format(len(data_df['train']), len(data_df['dev']), len(data_df['test'])))
# so the above splits the two rows of seq data into one row per test, one train and 0 for dev

In [None]:
#batch_end = min(len(data), batch_start + batch_size)
#real_batch_size = batch_end - batch_start
# ignoring all batch stuff for now 

# padding function 
def pad_lst(lst, value=0, dtype=np.int64):
    inner_max_len = max(map(len, lst))
    result = np.ones([len(lst), inner_max_len], dtype) * value
    for i, row in enumerate(lst):
        for j, val in enumerate(row):
            result[i][j] = val
    return result

# data setup
user_ids = user_seq_df['user_id'].values
user_seqs = user_seq_df['skill_seq'].values
label_seqs = user_seq_df['correct_seq'].values

lengths = np.array(list(map(lambda lst: len(lst), user_seqs)))
indice = np.array(np.argsort(lengths, axis=-1)[::-1])
inverse_indice = np.zeros_like(indice)
for i, idx in enumerate(indice):
        inverse_indice[idx] = i

feed_dict = {
            'user_id': torch.from_numpy(user_ids[indice]),
            'skill_seq': torch.from_numpy(pad_lst(user_seqs[indice])),    # [batch_size, num of items to predict]
            'label_seq': torch.from_numpy(pad_lst(label_seqs[indice])),   # [batch_size, num of items to predict]
            'length': torch.from_numpy(lengths[indice]),                        # [batch_size]
            'inverse_indice': torch.from_numpy(inverse_indice),
            'indice': torch.from_numpy(indice)
        }

feed_dict
# feed dict is an input to the model 
# we have user id tenosr, 
# then skill_sequence tensor - which is padded with 0 to the longest sequence (here it's 7)
# then we have label tensor, which is again length 7 each 

{'user_id': tensor([1, 0]), 'skill_seq': tensor([[0, 0, 0, 0, 1, 1, 1],
         [0, 0, 0, 1, 1, 1, 0]]), 'label_seq': tensor([[1, 1, 1, 1, 0, 1, 0],
         [1, 1, 1, 0, 0, 1, 0]]), 'length': tensor([7, 6]), 'inverse_indice': tensor([1, 0]), 'indice': tensor([1, 0])}

In [None]:
# set up the model 
skill_num=n_skills
emb_size=10
hidden_size=10
num_layer=1

skill_embeddings = torch.nn.Embedding(skill_num * 2, emb_size) # embedding layer is skill_num*2 (which is feature length) by emb dim
rnn = torch.nn.LSTM(
            input_size=emb_size, hidden_size=hidden_size, batch_first=True,
            num_layers=num_layer
   )
out = torch.nn.Linear(hidden_size, skill_num)
loss_function = torch.nn.BCELoss()



In [None]:
# forward_pass
# in earlier step we define three layers:
  # 1 embedding layer with as many rows as skills*2 (feature_length) and as many columns as emb dims
  # LSTM Layer with input of emb dim columns, num of hidden units, layers
  # output linear layer that as input takes 10 cols from LSTM layer, outputs as many columns as skills (here 2) I think (at least out dim =2)

seq_sorted = feed_dict['skill_seq']     # [batch_size, history_max]
labels_sorted = feed_dict['label_seq']  # [batch_size, history_max]
lengths = feed_dict['length']           # [batch_size]

# this is the feature we had before, where skill_id*2+correct
# so this is passing the feature to the embeding layer that's size feature_length, emd dimensions
# so very similar to the step where we one hot encoded the feature. 
# each user sequence is now converted to an embedding but here we get 10 dimensional embedding for each step in the sequence.
embed_history_i = skill_embeddings(seq_sorted + labels_sorted * skill_num) # 7 X 10 embedings for two tensors (two users) 

# packing a tensor is converting several tensors into 1 by interleaving elements of both tensors, length-1 here takes 1st to n-1 elements, so time shifting piece 
embed_history_i_packed = torch.nn.utils.rnn.pack_padded_sequence(embed_history_i, lengths - 1, batch_first=True) # output here is 11 embeding vectors, 
# outpup is 11 embeding vectors, 64 dims each, Hidden layer is 4 X 64
# it's 11 vectors because we took two tensors of 7 and 6 length, removed the last element and then packed into 1 so 6+5=11
output, hidden = rnn(embed_history_i_packed, None) 

# output is 6 X 64 vectors, two tensors,second tensor last row is 0,since we had 6 and 5 elem input 
# I think pad_packed_sequence reverses the procedure from pack_padded_sequence()
output, _ = torch.nn.utils.rnn.pad_packed_sequence(output, batch_first=True) 
# 2 tensors, each 6X2, out() is the Linear dense layer, I think here we have two values for each student, each sequence step, likely logits
pred_vector = out(output) 

# this takes skill sequences from 2nd element to last, so timeshifted. SO these are also length 6 each like the output of pred_vector
target_item = seq_sorted[:, 1:] 

# gather creates a new tensor by picking values from input tensor based on index values provided, these index row or col values of input depending on dim
# so here we have prediction tensor that's 6 by 2, index comes from unsqueezing (reshaping) 2x6 skill seq tensor along column dim.
# index ends up looking like this for one seq (6 rows, 1 col) 
 #  [[0],
 #   [0],
 #   [0],
 #   [1],
 #   [1],
 #   [1]] 
# we create a new tensor by picking a value from pred_vector tensor that match the index and then squeezing back into 2x6 shape  
prediction_sorted = torch.gather(pred_vector, dim=-1, index=target_item.unsqueeze(dim=-1)).squeeze(dim=-1) # need to understand the squeez stuff, but this gives two tensors 6 vals each 
label = labels_sorted[:, 1:] # this is also timeshifted correct 

prediction_sorted = torch.sigmoid(prediction_sorted)
        
prediction = prediction_sorted[feed_dict['inverse_indice']]
label = label[feed_dict['inverse_indice']].double()

out_dict = {'prediction': prediction, 'label': label}
# # embed_history_i # two tensors 7 x 64 each. (7 is the sequence length, so each sequence element is converted into 64 emb)
# # seq_sorted+labels_sorted*2
# embed_history_i_packed
# target item is 2 by 6 tensor (this is just skill seq), unsqueeze reshapes it into 2 by 6 rows by 1 column (sort of wide to long), used as index in gather()


In [None]:
# now loss function

indice = feed_dict['indice']
lengths = feed_dict['length'] - 1
predictions, labels = out_dict['prediction'][indice], out_dict['label'][indice]
predictions = torch.nn.utils.rnn.pack_padded_sequence(predictions, lengths, batch_first=True).data
labels = torch.nn.utils.rnn.pack_padded_sequence(labels, lengths, batch_first=True).data
loss_function(predictions.float(), labels.float())


tensor(0.7479, grad_fn=<BinaryCrossEntropyBackward0>)

In [None]:
### FULL CODE FOR LSTM DKT BASED ON HawkesKT REPO 

# Read data 
from google.colab import drive 
drive.mount('/content/gdrive')
import pandas as pd
import numpy as np
import torch 
import math

inter_df = pd.read_csv('gdrive/My Drive/ka_lstm_df_big.txt', sep=',', header=None) # example data from HawkesKT repo 
inter_df.columns=['user_id', 'skill_id', 'correct']

# prepare sequence data
max_step=100 # caps the sequences
user_wise_dict = dict()
cnt, n_inters = 0, 0
for user, user_df in inter_df.groupby('user_id'):
            df = user_df[:max_step]  # consider the first 50 interactions
            user_wise_dict[cnt] = {
                'user_id': user,
                'skill_seq': df['skill_id'].values.tolist(),
                'correct_seq': [round(x) for x in df['correct']]
            }
            cnt += 1
            n_inters += len(df)
user_seq_df = pd.DataFrame.from_dict(user_wise_dict, orient='index')
user_seq_df # this is the same as our seq step

n_users = max(inter_df['user_id'].values) + 1
n_skills = max(inter_df['skill_id']) + 1
user_seq_df['seq_length'] = user_seq_df['correct_seq'].apply(len)
user_seq_df=user_seq_df[user_seq_df.seq_length > 3] # filter out short sequences

# set up feed_dict dataset 
# padding function 
def pad_lst(lst, value=0, dtype=np.int64):
    inner_max_len = max(map(len, lst))
    result = np.ones([len(lst), inner_max_len], dtype) * value
    for i, row in enumerate(lst):
        for j, val in enumerate(row):
            result[i][j] = val
    return result

# data setup
user_ids = user_seq_df['user_id'].values
user_seqs = user_seq_df['skill_seq'].values
label_seqs = user_seq_df['correct_seq'].values

lengths = np.array(list(map(lambda lst: len(lst), user_seqs))) # same as seq_length above for each user 
# next we add an index by length
# argsort returns indices that would sort an array, but the indices are in the same order as original array 
# [::2] is called Slice notation it means <start_index> <end_index> <by>, eg start at 0th index, end at last index, by 2 (every second element)
# [::-1] -1 reverses the array, so start with the 0th, to last, by 1 from back 
indice = np.array(np.argsort(lengths, axis=-1)[::-1]) 
inverse_indice = np.zeros_like(indice) # array of zeros same shape as indice
for i, idx in enumerate(indice):
        inverse_indice[idx] = i

feed_dict = {
            'user_id': torch.from_numpy(user_ids[indice]), # this is ordred by longest sequence?
            'skill_seq': torch.from_numpy(pad_lst(user_seqs[indice])),    # [batch_size, num of items to predict]
            'label_seq': torch.from_numpy(pad_lst(label_seqs[indice])),   # [batch_size, num of items to predict]
            'length': torch.from_numpy(lengths[indice]),                        # [batch_size]
            'inverse_indice': torch.from_numpy(inverse_indice),
            'indice': torch.from_numpy(indice)
        }



Mounted at /content/gdrive


In [None]:


# At this point we are done reading in data, transforming it, and running the forward pass of the model. 
# now need to figure out train and predict 
# good reference on steps - https://blog.floydhub.com/a-beginners-guide-on-recurrent-neural-networks-with-pytorch/

# define the device
device = torch.device("cpu")

# define the model 

class dkt_model(torch.nn.Module):
  def __init__(self, emb_size, skill_num, hidden_size, num_layer):
   super(dkt_model, self).__init__()

   # define params
   self.skill_num=n_skills
   self.emb_size=emb_size
   self.hidden_size=hidden_size
   self.num_layer=num_layer
   

   # define the layers
   self.skill_embeddings = torch.nn.Embedding(self.skill_num * 2, self.emb_size) # embedding layer is skill_num*2 (which is feature length) by emb dim
   self.rnn = torch.nn.LSTM(
            input_size=self.emb_size, hidden_size=self.hidden_size, batch_first=True,
            num_layers=self.num_layer
   )
   self.out = torch.nn.Linear(self.hidden_size, self.skill_num)

  # define the forward pass
  #  1) get embeddings, 2) then pack them, 3) then pass through lstm layer, 4) unpack, 5) then dense layer
  def forward(self, feed_dict):
    seq_sorted = feed_dict['skill_seq']     # [batch_size, history_max]
    labels_sorted = feed_dict['label_seq']  # [batch_size, history_max]
    lengths = feed_dict['length']           # [batch_size]

    embed_history_i = self.skill_embeddings(seq_sorted + labels_sorted * self.skill_num)  
    embed_history_i_packed = torch.nn.utils.rnn.pack_padded_sequence(embed_history_i, lengths - 1, batch_first=True)
    output, hidden = self.rnn(embed_history_i_packed, None) 
    output, _ = torch.nn.utils.rnn.pad_packed_sequence(output, batch_first=True) 
    pred_vector = self.out(output) 

    target_item = seq_sorted[:, 1:] # timeshifted sequence of exerciees 
    label = labels_sorted[:, 1:] # timeshifted sequence of labels
 
    prediction_sorted = torch.gather(pred_vector, dim=-1, index=target_item.unsqueeze(dim=-1)).squeeze(dim=-1) 
    prediction_sorted = torch.sigmoid(prediction_sorted)
    prediction = prediction_sorted[feed_dict['inverse_indice']]
    label = label[feed_dict['inverse_indice']].double()
    out_dict = {'prediction': prediction, 'label': label}
    return out_dict

    
# Instantiate the model with paramters
dktm = dkt_model(emb_size=64, skill_num=n_skills, hidden_size=64, num_layer=1)  
dktm.to(device)

# Define hyperparameters
n_epochs = 6
lr=0.001

# Define Loss, Optimizer
optimizer = torch.optim.Adam(dktm.parameters(), lr=lr)
loss_function = torch.nn.BCELoss()



In [None]:
feed_dict

{'user_id': tensor([3181, 6443, 3221,  ..., 3689, 9104, 3237]),
 'skill_seq': tensor([[ 66,  66,  66,  ..., 123, 123,  20],
         [ 83,  83,  83,  ..., 101, 101, 101],
         [ 30,  30,  30,  ..., 148,  37,  37],
         ...,
         [ 67,  67,  67,  ...,   0,   0,   0],
         [101, 101, 101,  ...,   0,   0,   0],
         [ 15,  15,  15,  ...,   0,   0,   0]]),
 'label_seq': tensor([[0, 1, 1,  ..., 1, 0, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'length': tensor([100, 100, 100,  ...,   4,   4,   4]),
 'inverse_indice': tensor([2790, 4429, 5556,  ..., 2492, 7799, 7403]),
 'indice': tensor([2949, 5996, 2983,  ..., 3425, 8489, 2998])}

In [None]:
#tmp=dktm(feed_dict)
tmp['label'].shape

torch.Size([8789, 99])

In [None]:
# now define the training run

for epoch in range(1, n_epochs + 1):
    optimizer.zero_grad() # Clears existing gradients from previous epoch
    out_dict = dktm(feed_dict) # this step outputs the result of the forward pass (8799 X 99 dataset)
    indice = feed_dict['indice']
    lengths = feed_dict['length'] - 1
    predictions, labels = out_dict['prediction'][indice], out_dict['label'][indice]
    predictions = torch.nn.utils.rnn.pack_padded_sequence(predictions, lengths, batch_first=True).data
    labels = torch.nn.utils.rnn.pack_padded_sequence(labels, lengths, batch_first=True).data
    loss=loss_function(predictions.float(), labels.float())
    loss.backward() # Does backpropagation and calculates gradients
    optimizer.step() # updates the weights
    
    if epoch%2 == 0:
        print('Epoch: {}/{}.............'.format(epoch, n_epochs), end=' ')
        print("Loss: {:.4f}".format(loss.item()))


Epoch: 2/6............. Loss: 0.6924
Epoch: 4/6............. Loss: 0.6817
Epoch: 6/6............. Loss: 0.6716


In [None]:
# w = list(dktm.parameters())
# len(w) 

for name, param in dktm.named_parameters():
    if param.requires_grad:
        print (name, param.data)