In [72]:
import gc
# import psutil
import joblib
import random
import time
from tqdm import tqdm

import numpy as np
import pandas as pd
import datatable as dt

from itertools import combinations_with_replacement

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [18]:
seed_value = 42
torch.manual_seed(seed_value)
torch.cuda.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value) # gpu vars
# torch.backends.cudnn.deterministic = True  #needed
# torch.backends.cudnn.benchmark = False

# Hyperparameters

In [19]:
TRAIN_SAMPLES = 320000
MAX_SEQ = 180
MIN_SAMPLES = 5
EMBED_DIM = 128
DROPOUT_RATE = 0.2
LEARNING_RATE = 2e-3
MAX_LEARNING_RATE = 2e-3
EPOCHS = 30
TRAIN_BATCH_SIZE = 64
ACCEPTED_USER_CONTENT_SIZE = 4

# Load Data

In [20]:
%%time

dtypes = {'timestamp': 'int64', 'user_id': 'int32' ,'content_id': 'int16','content_type_id': 'int8','answered_correctly':'int8'}
train_df = dt.fread('./train.csv', columns=set(dtypes.keys())).to_pandas()
for col, dtype in dtypes.items():
    train_df[col] = train_df[col].astype(dtype)
train_df = train_df[train_df.content_type_id == False]
train_df = train_df.sort_values(['timestamp'], ascending=True)
train_df.reset_index(drop=True, inplace=True)

CPU times: user 1min 51s, sys: 14.2 s, total: 2min 6s
Wall time: 1min 12s


# Statistics

In [21]:
skills = train_df["content_id"].unique()
df = train_df.values
degree_of_difficulty_count = np.zeros((len(skills), 2))
for i in tqdm(range(len(df))):
    degree_of_difficulty_count[df[i][2]][df[i][4]] += 1
correctness = np.zeros(len(skills))
for i in range(len(degree_of_difficulty_count)):
    tmp = degree_of_difficulty_count[i][1] / (degree_of_difficulty_count[i][0] + degree_of_difficulty_count[i][1])
    correctness[i] = int(tmp / 0.2)

joblib.dump(correctness, 'correctness.pkl.zip')

100%|██████████| 99271300/99271300 [03:20<00:00, 494572.51it/s]


['correctness.pkl.zip']

In [75]:
question = pd.read_csv('./questions.csv')
tags = question['tags'].values
for i in range(len(tags)):
    if i == 10033:
        tags[i] = [162]
    else:
        tags[i] = [int(tag) for tag in tags[i].split(' ')]

tags_num = 188 # total number of category of tags 

cooccurrence_matrix = np.zeros((188, 188)) # total tags
for i in range(len(tags)):
    if len(tags[i]) == 1:
        cooccurrence_matrix[tags[i][0]][tags[i][0]] += 1
    for tag_1, tag_2 in list(combinations_with_replacement(tags[i], 2)):
        cooccurrence_matrix[tag_1][tag_2] += 1
        cooccurrence_matrix[tag_2][tag_1] += 1

98.0
1257.0
288.0
90.0
150.0
170.0
16.0
232.0
2240.0
248.0
586.0
30.0
678.0
48.0
696.0
36.0
420.0
76.0
208.0
114.0
200.0
1194.0
46.0
32.0
51.0
102.0
65.0
865.0
75.0
3414.0
210.0
30.0
238.0
71.0
18.0
430.0
70.0
408.0
4512.0
154.0
54.0
200.0
588.0
54.0
62.0
150.0
20.0
99.0
167.0
48.0
86.0
142.0
154.0
1611.0
83.0
932.0
78.0
30.0
33.0
114.0
326.0
98.0
388.0
10.0
143.0
59.0
123.0
654.0
36.0
138.0
92.0
122.0
216.0
1877.0
952.0
69.0
236.0
30.0
18.0
885.0
110.0
3938.0
1054.0
36.0
470.0
72.0
3.0
40.0
224.0
608.0
428.0
383.0
4538.0
116.0
54.0
51.0
1129.0
664.0
519.0
72.0
406.0
104.0
1578.0
384.0
144.0
84.0
914.0
240.0
60.0
624.0
98.0
76.0
30.0
312.0
242.0
63.0
299.0
90.0
530.0
186.0
36.0
10.0
596.0
86.0
36.0
171.0
114.0
59.0
110.0
100.0
74.0
1300.0
38.0
186.0
195.0
520.0
2066.0
230.0
208.0
14.0
224.0
130.0
34.0
1424.0
324.0
246.0
80.0
249.0
220.0
106.0
60.0
48.0
159.0
128.0
30.0
418.0
129.0
412.0
52.0
151.0
396.0
292.0
1831.0
264.0
112.0
30.0
180.0
33.0
138.0
144.0
74.0
162.0
32.0
674.0
126.0
60

# Preprocess

In [6]:
skills = train_df["content_id"].unique()
joblib.dump(skills, 'skills.pkl.zip')
n_skill = len(skills)
print("number skills", len(skills))

number skills 13523


In [7]:
group = train_df[['user_id', 'content_id', 'answered_correctly']].groupby('user_id').apply(lambda r: (
            r['content_id'].values,
            r['answered_correctly'].values))

joblib.dump(group, "group.pkl.zip")
del train_df
gc.collect()

0

In [8]:
train_indexes = list(group.index)[:TRAIN_SAMPLES]
valid_indexes = list(group.index)[TRAIN_SAMPLES:]
train_group = group[group.index.isin(train_indexes)]
valid_group = group[group.index.isin(valid_indexes)]
del group, train_indexes, valid_indexes
print(len(train_group), len(valid_group))

320000 73656


In [9]:
class SAKTDataset(Dataset):
    def __init__(self, group, n_skill, correctness, max_seq=100):
        super(SAKTDataset, self).__init__()
        self.samples, self.n_skill, self.max_seq, self.correctness = {}, n_skill, max_seq, correctness
        
        self.user_ids = []
        for i, user_id in enumerate(group.index):
            # if(i % 10000 == 0):
            #     print(f'Processed {i} users')
            content_id, answered_correctly = group[user_id]
            if len(content_id) >= ACCEPTED_USER_CONTENT_SIZE:
                if len(content_id) > self.max_seq:
                    total_questions = len(content_id)
                    last_pos = total_questions // self.max_seq
                    for seq in range(last_pos):
                        index = f"{user_id}_{seq}"
                        self.user_ids.append(index)
                        start = seq * self.max_seq
                        end = (seq + 1) * self.max_seq
                        self.samples[index] = (content_id[start:end], answered_correctly[start:end])
                    if len(content_id[end:]) >= ACCEPTED_USER_CONTENT_SIZE:
                        index = f"{user_id}_{last_pos + 1}"
                        self.user_ids.append(index)
                        self.samples[index] = (content_id[end:], answered_correctly[end:])
                else:
                    index = f'{user_id}'
                    self.user_ids.append(index)
                    self.samples[index] = (content_id, answered_correctly)
                
                
    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, index):
        user_id = self.user_ids[index]
        content_id, answered_correctly = self.samples[user_id]
        seq_len = len(content_id)
        
        content_id_seq = np.zeros(self.max_seq, dtype=int)
        answered_correctly_seq = np.zeros(self.max_seq, dtype=int)
        if seq_len >= self.max_seq:
            content_id_seq[:] = content_id[-self.max_seq:]
            answered_correctly_seq[:] = answered_correctly[-self.max_seq:]
        else:
            content_id_seq[-seq_len:] = content_id
            answered_correctly_seq[-seq_len:] = answered_correctly

        target_id = content_id_seq[1:]
        label = answered_correctly_seq[1:]
        
        x = content_id_seq[:-1].copy()
        for i in range(len(x)):
            x[i] += self.correctness[x[i]]
        x += (answered_correctly_seq[:-1] == 1) * self.n_skill
        
        return x, target_id, label

In [10]:
train_dataset = SAKTDataset(train_group, n_skill, correctness, max_seq=MAX_SEQ)
train_dataloader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=8)
del train_group
valid_dataset = SAKTDataset(valid_group, n_skill, correctness, max_seq=MAX_SEQ)
valid_dataloader = DataLoader(valid_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=False, num_workers=8)
del valid_group

In [11]:
del df

# Define Model

In [12]:
class FFN(nn.Module):
    def __init__(self, state_size=200, forward_expansion=1, bn_size=MAX_SEQ-1, dropout=0.2):
        super(FFN, self).__init__()
        self.state_size = state_size
        
        self.lr1 = nn.Linear(state_size, forward_expansion * state_size)
        self.relu = nn.ReLU()
        self.bn = nn.BatchNorm1d(bn_size)
        self.lr2 = nn.Linear(forward_expansion * state_size, state_size)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        x = self.relu(self.lr1(x))
        x = self.bn(x)
        x = self.lr2(x)
        return self.dropout(x)

def future_mask(seq_length):
    future_mask = (np.triu(np.ones([seq_length, seq_length]), k = 1)).astype('bool')
    return torch.from_numpy(future_mask)

class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, heads=8, dropout=DROPOUT_RATE, forward_expansion=1):
        super(TransformerBlock, self).__init__()
        self.multi_att = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=heads, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.layer_normal = nn.LayerNorm(embed_dim)
        self.ffn = FFN(embed_dim, forward_expansion = forward_expansion, dropout=dropout)
        self.layer_normal_2 = nn.LayerNorm(embed_dim)
        

    def forward(self, value, key, query, att_mask):
        att_output, att_weight = self.multi_att(value, key, query, attn_mask=att_mask)
        att_output = self.dropout(self.layer_normal(att_output + value))
        att_output = att_output.permute(1, 0, 2) # att_output: [s_len, bs, embed] => [bs, s_len, embed]
        x = self.ffn(att_output)
        x = self.dropout(self.layer_normal_2(x + att_output))
        return x.squeeze(-1), att_weight
    
class Encoder(nn.Module):
    def __init__(self, n_skill, max_seq=100, embed_dim=128, dropout=DROPOUT_RATE, forward_expansion=1, num_layers=1, heads = 8):
        super(Encoder, self).__init__()
        self.n_skill, self.embed_dim = n_skill, embed_dim
        self.embedding = nn.Embedding(10 * n_skill + 1, embed_dim)
        self.pos_embedding = nn.Embedding(max_seq - 1, embed_dim)
        self.e_embedding = nn.Embedding(n_skill+1, embed_dim)
        self.layers = nn.ModuleList([TransformerBlock(embed_dim, forward_expansion = forward_expansion) for _ in range(num_layers)])
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, question_ids):
        device = x.device
        x = self.embedding(x)
        pos_id = torch.arange(x.size(1)).unsqueeze(0).to(device)
        pos_x = self.pos_embedding(pos_id)
        x = self.dropout(x + pos_x)
        x = x.permute(1, 0, 2) # x: [bs, s_len, embed] => [s_len, bs, embed]
        e = self.e_embedding(question_ids)
        e = e.permute(1, 0, 2)
        for layer in self.layers:
            att_mask = future_mask(e.size(0)).to(device)
            x, att_weight = layer(e, x, x, att_mask=att_mask)
            x = x.permute(1, 0, 2)
        x = x.permute(1, 0, 2)
        return x, att_weight

class SAKTModel(nn.Module):
    def __init__(self, n_skill, max_seq=100, embed_dim=128, dropout=DROPOUT_RATE, forward_expansion = 1, enc_layers=1, heads = 8):
        super(SAKTModel, self).__init__()
        self.encoder = Encoder(n_skill, max_seq, embed_dim, dropout, forward_expansion, num_layers=enc_layers)
        self.pred = nn.Linear(embed_dim, 1)
        
    def forward(self, x, question_ids):
        x, att_weight = self.encoder(x, question_ids)
        x = self.pred(x)
        return x.squeeze(-1), att_weight

In [13]:
def train_fn(model, dataloader, optimizer, scheduler, criterion, device="cpu"):
    model.train()

    train_loss = []
    num_corrects = 0
    num_total = 0
    labels = []
    outs = []

    for item in dataloader:
        x = item[0].to(device).long()
        target_id = item[1].to(device).long()
        label = item[2].to(device).float()
        target_mask = (target_id != 0)

        optimizer.zero_grad()
        output, _, = model(x, target_id)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        scheduler.step()
        train_loss.append(loss.item())

        output = torch.masked_select(output, target_mask)
        label = torch.masked_select(label, target_mask)
        pred = (torch.sigmoid(output) >= 0.5).long()
        
        num_corrects += (pred == label).sum().item()
        num_total += len(label)

        labels.extend(label.view(-1).data.cpu().numpy())
        outs.extend(output.view(-1).data.cpu().numpy())

    acc = num_corrects / num_total
    auc = roc_auc_score(labels, outs)
    loss = np.mean(train_loss)

    return loss, acc, auc

In [14]:
def valid_fn(model, dataloader, criterion, device="cpu"):
    model.eval()

    valid_loss = []
    num_corrects = 0
    num_total = 0
    labels = []
    outs = []

    for item in dataloader:
        x = item[0].to(device).long()
        target_id = item[1].to(device).long()
        label = item[2].to(device).float()
        target_mask = (target_id != 0)

        output, _, = model(x, target_id)
        loss = criterion(output, label)
        valid_loss.append(loss.item())

        output = torch.masked_select(output, target_mask)
        label = torch.masked_select(label, target_mask)
        pred = (torch.sigmoid(output) >= 0.5).long()
        
        num_corrects += (pred == label).sum().item()
        num_total += len(label)

        labels.extend(label.view(-1).data.cpu().numpy())
        outs.extend(output.view(-1).data.cpu().numpy())

    acc = num_corrects / num_total
    auc = roc_auc_score(labels, outs)
    loss = np.mean(valid_loss)

    return loss, acc, auc

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SAKTModel(n_skill, max_seq=MAX_SEQ, embed_dim=EMBED_DIM, dropout=DROPOUT_RATE, enc_layers=1)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.BCEWithLogitsLoss()
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer, max_lr=MAX_LEARNING_RATE, steps_per_epoch=len(train_dataloader), epochs=EPOCHS
)

model.to(device)
criterion.to(device)

BCEWithLogitsLoss()

In [16]:
best_auc = 0
max_steps = 3
step = 0
for epoch in tqdm(range(EPOCHS)):
    loss, acc, auc = train_fn(model, train_dataloader, optimizer, scheduler, criterion, device)
    print("[epoch - {}/{}] [train: - {:.3f}] [acc - {:.4f}] [auc - {:.4f}]".format(epoch+1, EPOCHS, loss, acc, auc))
    loss, acc, auc = valid_fn(model, valid_dataloader, criterion, device)
    print("[epoch - {}/{}] [valid: - {:.3f}] [acc - {:.4f}] [auc - {:.4f}]\n".format(epoch+1, EPOCHS, loss, acc, auc))
    if auc > best_auc:
        best_auc = auc
        step = 0
        torch.save(model.state_dict(), "sakt_model.pt")
    else:
        step += 1
        if step >= max_steps:
            break
torch.save(model.state_dict(), "sakt_model_final.pt")

  0%|          | 0/30 [00:00<?, ?it/s][epoch - 1/30] [train: - 0.405] [acc - 0.6835] [auc - 0.6681]
[epoch - 1/30] [valid: - 0.375] [acc - 0.7133] [auc - 0.7406]

  3%|▎         | 1/30 [07:18<3:32:10, 438.97s/it][epoch - 2/30] [train: - 0.376] [acc - 0.7105] [auc - 0.7358]
[epoch - 2/30] [valid: - 0.369] [acc - 0.7201] [auc - 0.7549]

  7%|▋         | 2/30 [14:19<3:22:16, 433.46s/it][epoch - 3/30] [train: - 0.369] [acc - 0.7175] [auc - 0.7500]
[epoch - 3/30] [valid: - 0.366] [acc - 0.7224] [auc - 0.7587]

 10%|█         | 3/30 [24:31<3:40:40, 490.41s/it]


KeyboardInterrupt: 

In [None]:
del train_dataset, valid_dataset

# Ouput Test

In [24]:
question = pd.read_csv('./questions.csv')
