In [1]:
import numpy as np
import pandas as pd 

import gc
import random
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

In [9]:
%%time
dtype = {'timestamp': 'int64', 'user_id': 'int32' ,'content_id': 'int16','content_type_id': 'int8','user_answer':'int8','answered_correctly':'int8', 'prior_question_elapsed_time':'float64'}

train_df = pd.read_csv('train.csv', usecols=[1, 2, 3,4, 6,7,8,9], dtype=dtype)
train_df.head()

CPU times: user 36.2 s, sys: 5.53 s, total: 41.7 s
Wall time: 42.3 s


Unnamed: 0,timestamp,user_id,content_id,content_type_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,115,5692,0,3,1,,
1,56943,115,5716,0,2,1,37000.0,False
2,118363,115,128,0,0,1,55000.0,False
3,131167,115,7860,0,0,1,19000.0,False
4,137965,115,7922,0,1,1,11000.0,False


In [10]:
user_flags = train_df.groupby('user_id').agg({'answered_correctly':['mean','count','std']}).reset_index()
user_flags.columns=['user_id','user_answer_mean','user_answer_count','user_std']
user_flags.user_std = user_flags.user_std.fillna(user_flags.user_std.mean())

train_df=train_df.merge(user_flags,on=['user_id'],how='left')

train_df = train_df[train_df.content_type_id == False]

#arrange by timestamp
train_df = train_df.sort_values(['timestamp'], ascending=True).reset_index(drop = True)

In [11]:
train_df

Unnamed: 0,timestamp,user_id,content_id,content_type_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,user_answer_mean,user_answer_count,user_std
0,0,115,5692,0,3,1,,,0.695652,46,0.465215
1,0,1805962620,5547,0,1,0,,,0.297297,37,0.463373
2,0,2015251289,4024,0,0,1,,,0.520548,219,0.585206
3,0,867941388,6659,0,3,1,,,0.563131,2772,0.539976
4,0,867946278,3977,0,2,1,,,0.450000,20,0.510418
...,...,...,...,...,...,...,...,...,...,...,...
99271295,87193076570,626308830,8185,0,2,0,13000.0,True,0.682344,12545,0.481410
99271296,87193279051,626308830,6686,0,0,0,10000.0,True,0.682344,12545,0.481410
99271297,87193332075,626308830,5860,0,2,1,21000.0,True,0.682344,12545,0.481410
99271298,87193355096,626308830,11465,0,0,0,25000.0,True,0.682344,12545,0.481410


In [None]:
# %%time
# questions = pd.read_csv("questions.csv")
# train_questions=train.merge(questions,left_on='content_id',right_on='question_id',how='left')
# del questions
# train_questions_gp=train_questions.groupby('question_id').agg({'answered_correctly':['mean','count','std']}).reset_index()
# train_questions_gp.columns=['question_id','q_correct','q_count','q_std']
# train_questions_gp.q_std = train_questions_gp.q_std.fillna(train_questions_gp.q_std.mean())
# train=train.merge(train_questions_gp,left_on='content_id',right_on='question_id',how='left')
# user_flags = train.groupby('user_id').agg({'answered_correctly':['mean','count','min','max','std']}).reset_index()
# user_flags.columns=['user_id','user_answer_mean','user_answer_count','user_min','user_max','user_std']
# user_flags.user_std = user_flags.user_std.fillna(user_flags.user_std.mean())
# user_flags.isnull().sum()
# train_df=train.merge(user_flags,on=['user_id'],how='left')

In [12]:
skills = train_df["content_id"].unique()
n_skill = len(skills)
print("number skills", len(skills))

number skills 13523


In [13]:
group = train_df[['user_id', 'content_id', 'answered_correctly']].groupby('user_id').apply(lambda r: (
            r['content_id'].values,
            r['answered_correctly'].values))

In [18]:
group[115]

(array([5692, 5716,  128, 7860, 7922,  156,   51,   50, 7896, 7863,  152,
         104,  108, 7900, 7901, 7971,   25,  183, 7926, 7927,    4, 7984,
          45,  185,   55, 7876,    6,  172, 7898,  175,  100, 7859,   57,
        7948,  151,  167, 7897, 7882, 7962, 1278, 2063, 2065, 2064, 3363,
        3365, 3364], dtype=int16),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0,
        0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0,
        0, 1], dtype=int8))

In [19]:
class SAKTDataset(Dataset):
    def __init__(self, group, n_skill, max_seq=100):
        super(SAKTDataset, self).__init__()
        self.max_seq = max_seq
        self.n_skill = n_skill
        self.samples = group
        
#         self.user_ids = [x for x in group.index]
        self.user_ids = []
        for user_id in group.index:
            q, qa = group[user_id]
            if len(q) < 10:
                continue
            self.user_ids.append(user_id)

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, index):
        user_id = self.user_ids[index]
        q_, qa_ = self.samples[user_id]
        seq_len = len(q_)

        q = np.zeros(self.max_seq, dtype=int)
        qa = np.zeros(self.max_seq, dtype=int)
        if seq_len >= self.max_seq:
            q[:] = q_[-self.max_seq:]
            qa[:] = qa_[-self.max_seq:]
        else:
            q[-seq_len:] = q_
            qa[-seq_len:] = qa_
        
        target_id = q[1:]
        label = qa[1:]

        x = np.zeros(self.max_seq-1, dtype=int)
        x = q[:-1].copy()
        x += (qa[:-1] == 1) * self.n_skill

        return x, target_id, label

In [20]:
train, val = train_test_split(group, test_size=0.2)

train_dataset = SAKTDataset(train, n_skill)
train_dataloader = DataLoader(train_dataset, batch_size=2048, shuffle=True, num_workers=8)
del train

val_dataset = SAKTDataset(val, n_skill)
val_dataloader = DataLoader(val_dataset, batch_size=2048, shuffle=True, num_workers=8)
del val
# print(item[0])
# print(item[1])
# print(item[2])

In [22]:
class FFN(nn.Module):
    def __init__(self, state_size=200):
        super(FFN, self).__init__()
        self.state_size = state_size

        self.lr1 = nn.Linear(state_size, state_size)
        self.relu = nn.ReLU()
        self.lr2 = nn.Linear(state_size, state_size)
        self.dropout = nn.Dropout(0.2)
    
    def forward(self, x):
        x = self.lr1(x)
        x = self.relu(x)
        x = self.lr2(x)
        return self.dropout(x)

def future_mask(seq_length):
    future_mask = np.triu(np.ones((seq_length, seq_length)), k=1).astype('bool')
    return torch.from_numpy(future_mask)


class SAKTModel(nn.Module):
    def __init__(self, n_skill, max_seq=100, embed_dim=128):
        super(SAKTModel, self).__init__()
        self.n_skill = n_skill
        self.embed_dim = embed_dim

        self.embedding = nn.Embedding(2*n_skill+1, embed_dim)
        self.pos_embedding = nn.Embedding(max_seq-1, embed_dim)
        self.e_embedding = nn.Embedding(n_skill+1, embed_dim)

        self.multi_att = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=8, dropout=0.2)

        self.dropout = nn.Dropout(0.2)
        self.layer_normal = nn.LayerNorm(embed_dim) 

        self.ffn = FFN(embed_dim)
        self.pred = nn.Linear(embed_dim, 1)
    
    def forward(self, x, question_ids):
        device = x.device        
        x = self.embedding(x)
        pos_id = torch.arange(x.size(1)).unsqueeze(0).to(device)

        pos_x = self.pos_embedding(pos_id)
        x = x + pos_x

        e = self.e_embedding(question_ids)

        x = x.permute(1, 0, 2) # x: [bs, s_len, embed] => [s_len, bs, embed]
        e = e.permute(1, 0, 2)
        att_mask = future_mask(x.size(0)).to(device)
        att_output, att_weight = self.multi_att(e, x, x, attn_mask=att_mask)
        att_output = self.layer_normal(att_output + e)
        att_output = att_output.permute(1, 0, 2) # att_output: [s_len, bs, embed] => [bs, s_len, embed]

        x = self.ffn(att_output)
        x = self.layer_normal(x + att_output)
        x = self.pred(x)

        return x.squeeze(-1), att_weight

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")


model = SAKTModel(n_skill, embed_dim=128)
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.99, weight_decay=0.005)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

model.to(device)
criterion.to(device)

BCEWithLogitsLoss()

In [24]:
def train_epoch(model, train_iterator, optim, criterion, device="cpu"):
    model.train()

    train_loss = []
    num_corrects = 0
    num_total = 0
    labels = []
    outs = []

    tbar = tqdm(train_iterator)
    for item in tbar:
        x = item[0].to(device).long()
        target_id = item[1].to(device).long()
        label = item[2].to(device).float()
        target_mask = (target_id != 0)

        optim.zero_grad()
        output, atten_weight = model(x, target_id)
        
        output = torch.masked_select(output, target_mask)
        label = torch.masked_select(label, target_mask)
        
        loss = criterion(output, label)
        loss.backward()
        optim.step()
        train_loss.append(loss.item())
        pred = (torch.sigmoid(output) >= 0.5).long()
        
        num_corrects += (pred == label).sum().item()
        num_total += len(label)

        labels.extend(label.view(-1).data.cpu().numpy())
        outs.extend(output.view(-1).data.cpu().numpy())

        tbar.set_description('loss - {:.4f}'.format(loss))

    acc = num_corrects / num_total
    auc = roc_auc_score(labels, outs)
    loss = np.average(train_loss)

    return loss, acc, auc

In [25]:
def val_epoch(model, val_iterator, criterion, device="cpu"):
    model.eval()

    train_loss = []
    num_corrects = 0
    num_total = 0
    labels = []
    outs = []

    tbar = tqdm(val_iterator)
    for item in tbar:
        x = item[0].to(device).long()
        target_id = item[1].to(device).long()
        label = item[2].to(device).float()
        target_mask = (target_id != 0)

        with torch.no_grad():
            output, atten_weight = model(x, target_id)
        
        output = torch.masked_select(output, target_mask)
        label = torch.masked_select(label, target_mask)

        loss = criterion(output, label)
        train_loss.append(loss.item())

        pred = (torch.sigmoid(output) >= 0.5).long()
        
        num_corrects += (pred == label).sum().item()
        num_total += len(label)

        labels.extend(label.view(-1).data.cpu().numpy())
        outs.extend(output.view(-1).data.cpu().numpy())

        tbar.set_description('loss - {:.4f}'.format(loss))

    acc = num_corrects / num_total
    auc = roc_auc_score(labels, outs)
    loss = np.average(train_loss)

    return loss, acc, auc

In [None]:
epochs = 20

over_fit = 0
last_auc = 0
train_losses = []
valid_losses = []
train_auces = []
valid_auces = []
for epoch in range(epochs):
    train_loss, train_acc, train_auc = train_epoch(model, train_dataloader, optimizer, criterion, device)
    print("epoch - {} train_loss - {:.2f} acc - {:.3f} auc - {:.3f}".format(epoch, train_loss, train_acc, train_auc))
    
    val_loss, avl_acc, val_auc = val_epoch(model, val_dataloader, criterion, device)
    print("epoch - {} val_loss - {:.2f} acc - {:.3f} auc - {:.3f}".format(epoch, val_loss, avl_acc, val_auc))
    train_losses.append(train_loss)
    valid_losses.append(val_loss)
    train_auces.append(train_auc)
    valid_auces.append(val_auc)
    if val_auc > last_auc:
        last_auc = val_auc
        over_fit = 0
    else:
        over_fit += 1
        
    
    if over_fit >= 2:
        print("early stop epoch ", epoch)
        break

  0%|          | 0/153 [00:00<?, ?it/s]

In [None]:
 
plt.figure(figsize=(10,7))
plt.title("Loss vs Epoch")
plt.plot(train_losses, label = "train")
plt.plot(valid_losses, label = "val")

plt.legend()
plt.show() 


plt.figure(figsize=(10,7))
plt.title("AUC")
plt.plot(train_auces, label = "train")
plt.plot(valid_auces, label = "val")

plt.legend()
plt.show() 