- Replaced the data-loading method with `datatable` (`dt.fread`) to reduce memory use.

In [1]:
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl > /dev/null 2>&1

In [2]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import gc
import random
import datatable as dt
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

In [3]:
SEED = 1       # single seed shared by every random number generator used below
MAX_SEQ = 160  # length of the interaction window fed to the model

# Seed all three RNGs, not only Python's: model initialisation and DataLoader
# shuffling draw from torch's generator, so seeding `random` alone does not make
# a run repeatable.
# NOTE(review): DataLoader worker processes presumably re-seed `random` from
# torch's base seed, which makes `torch.manual_seed` the one that governs the
# sampling done inside the dataset -- confirm against the PyTorch docs.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

## Load data

In [4]:
%%time
dtype = {'timestamp':'int64', 
         'user_id':'int32' ,
         'content_id':'int16',
         'content_type_id':'int8',
         'answered_correctly':'int8'}

train_df = dt.fread('../input/riiid-test-answer-prediction/train.csv', columns=set(dtype.keys())).to_pandas()
train_df.head()

CPU times: user 1min 17s, sys: 10.5 s, total: 1min 28s
Wall time: 2min 37s


Unnamed: 0,timestamp,user_id,content_id,content_type_id,answered_correctly
0,0,115,5692,False,1
1,56943,115,5716,False,1
2,118363,115,128,False,1
3,131167,115,7860,False,1
4,137965,115,7922,False,1


In [5]:
# Keep only rows whose content_type_id is 0/False, drop the now-constant column,
# and order the interactions chronologically.
# The sort is stable (mergesort) so rows sharing a timestamp keep their original
# relative order; the default quicksort may reorder such ties arbitrarily.
# Written as one chained expression, so no column is dropped in place on a filtered slice.
train_df = (
    train_df.loc[train_df["content_type_id"] == False]
    .drop(columns="content_type_id")
    .sort_values("timestamp", ascending=True, kind="mergesort")
    .reset_index(drop=True)
)

## Preprocess

In [6]:
# Distinct question ids seen in training; n_skill sizes the model's embedding tables.
skills = train_df["content_id"].unique()
n_skill = len(skills)

# content_id values are used directly as embedding indices, and a correct answer is
# encoded as content_id + n_skill. Both only work if every id lies in [0, n_skill);
# fail early with a readable message instead of an out-of-range embedding lookup.
assert skills.min() >= 0 and skills.max() < n_skill, \
    "content_id values must lie in [0, n_skill) to be used as embedding indices"

print("number skills", n_skill)

number skills 13523


In [7]:
def _to_sequences(user_rows):
    """Collapse one user's rows into (content_id array, answered_correctly array)."""
    return (
        user_rows['content_id'].values,
        user_rows['answered_correctly'].values,
    )


# One entry per user_id: a tuple of parallel arrays in the row order of train_df.
interaction_columns = ['user_id', 'content_id', 'answered_correctly']
group = train_df[interaction_columns].groupby('user_id').apply(_to_sequences)

# The raw frame is no longer needed once the per-user sequences exist.
del train_df
gc.collect()

0

In [8]:
class SAKTDataset(Dataset):
    """Training dataset that serves one randomly cropped interaction window per user.

    `group` is indexed by user id and holds, per user, a pair of parallel arrays
    (question ids, correctness flags). Users with fewer than two interactions are
    skipped. Every sample is a fixed-length window of `max_seq` steps, left-padded
    with zeros when the user's history is shorter.
    """

    def __init__(self, group, n_skill, max_seq=MAX_SEQ): 
        super().__init__()
        self.max_seq = max_seq
        self.n_skill = n_skill
        self.samples = group

        # Only users with at least two interactions can yield an (input, target) pair.
        self.user_ids = [uid for uid in group.index if len(group[uid][0]) >= 2]

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, index):
        """Return (x, target_id, label), each of length max_seq - 1.

        x         -- past interactions: question id, plus n_skill when answered correctly
        target_id -- the question id to predict at each step (window shifted by one)
        label     -- the correctness flag of each target question
        """
        history_q, history_a = self.samples[self.user_ids[index]]
        total = len(history_q)

        q = np.zeros(self.max_seq, dtype=int)
        qa = np.zeros(self.max_seq, dtype=int)

        # 90% of the time draw a random crop, otherwise use the deterministic one.
        random_crop = random.random() > 0.1

        if total >= self.max_seq:
            # Long history: random window, or the most recent max_seq steps.
            if random_crop:
                first = random.randint(0, total - self.max_seq)
            else:
                first = total - self.max_seq
            q[:] = history_q[first:first + self.max_seq]
            qa[:] = history_a[first:first + self.max_seq]
        else:
            # Short history: a random-length prefix (at least 2 steps), or all of it,
            # right-aligned so the padding sits at the front.
            keep = random.randint(2, total) if random_crop else total
            q[-keep:] = history_q[:keep]
            qa[-keep:] = history_a[:keep]

        # Targets are the window shifted one step into the future.
        target_id = q[1:]
        label = qa[1:]

        # Inputs fold correctness into the id: correct answers are offset by n_skill.
        x = q[:-1] + (qa[:-1] == 1) * self.n_skill

        return x, target_id, label

In [9]:
# Wrap the per-user sequences in the training dataset and a shuffling, multi-worker loader.
dataset = SAKTDataset(group, n_skill)
dataloader = DataLoader(
    dataset,
    batch_size=2048,
    shuffle=True,
    num_workers=8,
)

# Smoke test: fetch a single (x, target_id, label) sample to check that indexing works.
item = dataset[5]

## Define model

In [10]:
class FFN(nn.Module):
    """Feed-forward block: Linear -> ReLU -> Linear -> Dropout(0.2).

    Input and output share the same trailing dimension, `state_size`.
    """

    def __init__(self, state_size=200):
        super().__init__()
        self.state_size = state_size

        # Sub-module names are part of the saved state_dict keys; keep them stable.
        self.lr1 = nn.Linear(state_size, state_size)
        self.relu = nn.ReLU()
        self.lr2 = nn.Linear(state_size, state_size)
        self.dropout = nn.Dropout(0.2)
    
    def forward(self, x):
        """Apply the block to `x`; the shape is preserved."""
        hidden = self.relu(self.lr1(x))
        return self.dropout(self.lr2(hidden))

def future_mask(seq_length):
    """Return a (seq_length, seq_length) boolean tensor that is True strictly above the diagonal.

    Entry [i, j] is True exactly when j > i, i.e. when position j lies after position i.
    """
    positions = np.arange(seq_length)
    strictly_upper = positions[np.newaxis, :] > positions[:, np.newaxis]
    return torch.from_numpy(strictly_upper)


class SAKTModel(nn.Module):
    """Single-block attention model that scores each target question from past interactions.

    The target-question embeddings act as attention queries; the position-augmented
    interaction embeddings act as keys and values. A future mask keeps each step from
    attending to later positions.
    """

    def __init__(self, n_skill, max_seq=MAX_SEQ, embed_dim=128): # 100->MAX_SEQ
        super().__init__()
        self.n_skill = n_skill
        self.embed_dim = embed_dim

        # Interaction ids span question id (+ n_skill when correct); positions span max_seq - 1 steps.
        self.embedding = nn.Embedding(2 * n_skill + 1, embed_dim)
        self.pos_embedding = nn.Embedding(max_seq - 1, embed_dim)
        self.e_embedding = nn.Embedding(n_skill + 1, embed_dim)

        self.multi_att = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=8, dropout=0.2)

        self.dropout = nn.Dropout(0.2)
        # The same LayerNorm instance is applied after attention and after the FFN.
        self.layer_normal = nn.LayerNorm(embed_dim)

        self.ffn = FFN(embed_dim)
        self.pred = nn.Linear(embed_dim, 1)
    
    def forward(self, x, question_ids):
        """Return (logits, attention weights).

        x            -- [bs, s_len] interaction ids
        question_ids -- [bs, s_len] target question ids
        logits       -- [bs, s_len], one raw score per step (no sigmoid applied)
        """
        device = x.device
        seq_len = x.size(1)

        # Keys/values: interaction embedding plus a learned position embedding.
        positions = torch.arange(seq_len).unsqueeze(0).to(device)
        interactions = self.embedding(x) + self.pos_embedding(positions)

        # Queries: embedding of the question to be answered at each step.
        queries = self.e_embedding(question_ids)

        # nn.MultiheadAttention is fed [s_len, bs, embed] here.
        interactions = interactions.transpose(0, 1)
        queries = queries.transpose(0, 1)

        causal = future_mask(interactions.size(0)).to(device)
        attended, att_weight = self.multi_att(queries, interactions, interactions, attn_mask=causal)

        # Residual + norm, then back to [bs, s_len, embed].
        attended = self.layer_normal(attended + queries).transpose(0, 1)

        # Feed-forward with a second residual + norm, then project to one logit per step.
        hidden = self.layer_normal(self.ffn(attended) + attended)
        logits = self.pred(hidden)

        return logits.squeeze(-1), att_weight

In [11]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the model on the target device, then bind Adam to its parameters.
model = SAKTModel(n_skill, embed_dim=128).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Binary cross-entropy on raw logits (the model does not apply a sigmoid itself).
criterion = nn.BCEWithLogitsLoss().to(device)
criterion

BCEWithLogitsLoss()

In [12]:
def train_epoch(model, train_iterator, optim, criterion, device="cpu"):
    """Run one optimisation pass over `train_iterator` and report epoch metrics.

    Args:
        model: network called as `model(x, target_id)`, returning (logits, attention weights).
        train_iterator: iterable of batches `(x, target_id, label)`.
        optim: optimiser bound to the model's parameters.
        criterion: loss applied to the full `(logits, label)` tensors.
        device: device the batch tensors are moved to.

    Returns:
        (loss, acc, auc): mean batch loss, plus accuracy and ROC AUC computed on the
        last position of every sequence only.
    """
    model.train()

    train_loss = []
    num_corrects = 0
    num_total = 0
    labels = []
    outs = []

    tbar = tqdm(train_iterator)
    for item in tbar:
        x = item[0].to(device).long()
        target_id = item[1].to(device).long()
        label = item[2].to(device).float()

        optim.zero_grad()
        output, _ = model(x, target_id)  # attention weights are not needed for training
        loss = criterion(output, label)
        loss.backward()
        optim.step()

        batch_loss = loss.item()
        train_loss.append(batch_loss)

        # Metrics use only the final step of each sequence. They are computed without
        # autograd tracking, which replaces the discouraged `.data` access and avoids
        # building a graph that is never used.
        with torch.no_grad():
            last_logit = output[:, -1]
            last_label = label[:, -1]
            pred = (torch.sigmoid(last_logit) >= 0.5).long()

            num_corrects += (pred == last_label).sum().item()
            num_total += len(last_label)

            labels.extend(last_label.view(-1).cpu().numpy())
            outs.extend(last_logit.view(-1).cpu().numpy())

        tbar.set_description('loss - {:.4f}'.format(batch_loss))

    acc = num_corrects / num_total
    auc = roc_auc_score(labels, outs)  # raw logits suffice: AUC depends only on ranking
    loss = np.mean(train_loss)

    return loss, acc, auc

In [13]:
epochs = 35
epoch_report = "epoch - {} train_loss - {:.2f} acc - {:.3f} auc - {:.3f}"

# One full pass over the training loader per epoch, followed by a one-line summary.
for epoch in range(epochs):
    loss, acc, auc = train_epoch(
        model,
        dataloader,
        optimizer,
        criterion,
        device,
    )
    print(epoch_report.format(epoch, loss, acc, auc))

loss - 0.2228: 100%|██████████| 193/193 [00:58<00:00,  3.32it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 0 train_loss - 0.24 acc - 0.612 auc - 0.645


loss - 0.2033: 100%|██████████| 193/193 [00:56<00:00,  3.39it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 1 train_loss - 0.21 acc - 0.668 auc - 0.727


loss - 0.2120: 100%|██████████| 193/193 [00:57<00:00,  3.39it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 2 train_loss - 0.21 acc - 0.680 auc - 0.743


loss - 0.2118: 100%|██████████| 193/193 [00:57<00:00,  3.37it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 3 train_loss - 0.21 acc - 0.684 auc - 0.749


loss - 0.2113: 100%|██████████| 193/193 [00:57<00:00,  3.38it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 4 train_loss - 0.21 acc - 0.685 auc - 0.751


loss - 0.1982: 100%|██████████| 193/193 [00:56<00:00,  3.39it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 5 train_loss - 0.21 acc - 0.687 auc - 0.752


loss - 0.2096: 100%|██████████| 193/193 [00:57<00:00,  3.38it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 6 train_loss - 0.21 acc - 0.686 auc - 0.753


loss - 0.1938: 100%|██████████| 193/193 [00:57<00:00,  3.35it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 7 train_loss - 0.20 acc - 0.686 auc - 0.753


loss - 0.1980: 100%|██████████| 193/193 [00:57<00:00,  3.36it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 8 train_loss - 0.20 acc - 0.687 auc - 0.755


loss - 0.2078: 100%|██████████| 193/193 [00:56<00:00,  3.40it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 9 train_loss - 0.20 acc - 0.690 auc - 0.757


loss - 0.1927: 100%|██████████| 193/193 [00:57<00:00,  3.38it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 10 train_loss - 0.20 acc - 0.688 auc - 0.754


loss - 0.1918: 100%|██████████| 193/193 [00:57<00:00,  3.38it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 11 train_loss - 0.20 acc - 0.689 auc - 0.756


loss - 0.1869: 100%|██████████| 193/193 [00:57<00:00,  3.36it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 12 train_loss - 0.20 acc - 0.690 auc - 0.757


loss - 0.1974: 100%|██████████| 193/193 [00:56<00:00,  3.39it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 13 train_loss - 0.20 acc - 0.690 auc - 0.757


loss - 0.1929: 100%|██████████| 193/193 [00:57<00:00,  3.35it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 14 train_loss - 0.20 acc - 0.689 auc - 0.757


loss - 0.1969: 100%|██████████| 193/193 [00:57<00:00,  3.37it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 15 train_loss - 0.20 acc - 0.690 auc - 0.758


loss - 0.1950: 100%|██████████| 193/193 [00:57<00:00,  3.35it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 16 train_loss - 0.20 acc - 0.692 auc - 0.759


loss - 0.1949: 100%|██████████| 193/193 [00:57<00:00,  3.34it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 17 train_loss - 0.20 acc - 0.691 auc - 0.758


loss - 0.2077: 100%|██████████| 193/193 [00:56<00:00,  3.40it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 18 train_loss - 0.20 acc - 0.690 auc - 0.756


loss - 0.1959: 100%|██████████| 193/193 [00:57<00:00,  3.36it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 19 train_loss - 0.20 acc - 0.690 auc - 0.758


loss - 0.2096: 100%|██████████| 193/193 [00:57<00:00,  3.37it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 20 train_loss - 0.20 acc - 0.691 auc - 0.759


loss - 0.1849: 100%|██████████| 193/193 [00:57<00:00,  3.38it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 21 train_loss - 0.20 acc - 0.691 auc - 0.758


loss - 0.2000: 100%|██████████| 193/193 [00:57<00:00,  3.37it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 22 train_loss - 0.20 acc - 0.691 auc - 0.759


loss - 0.2045: 100%|██████████| 193/193 [00:57<00:00,  3.37it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 23 train_loss - 0.20 acc - 0.691 auc - 0.759


loss - 0.1771: 100%|██████████| 193/193 [00:56<00:00,  3.41it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 24 train_loss - 0.20 acc - 0.690 auc - 0.758


loss - 0.2299: 100%|██████████| 193/193 [00:57<00:00,  3.35it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 25 train_loss - 0.20 acc - 0.690 auc - 0.757


loss - 0.1983: 100%|██████████| 193/193 [00:57<00:00,  3.37it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 26 train_loss - 0.20 acc - 0.692 auc - 0.759


loss - 0.2216: 100%|██████████| 193/193 [00:57<00:00,  3.36it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 27 train_loss - 0.20 acc - 0.691 auc - 0.759


loss - 0.2140: 100%|██████████| 193/193 [00:57<00:00,  3.38it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 28 train_loss - 0.20 acc - 0.692 auc - 0.759


loss - 0.1819: 100%|██████████| 193/193 [00:56<00:00,  3.39it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 29 train_loss - 0.20 acc - 0.692 auc - 0.760


loss - 0.1825: 100%|██████████| 193/193 [00:56<00:00,  3.40it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 30 train_loss - 0.20 acc - 0.692 auc - 0.760


loss - 0.2256: 100%|██████████| 193/193 [00:57<00:00,  3.38it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 31 train_loss - 0.20 acc - 0.693 auc - 0.760


loss - 0.2132: 100%|██████████| 193/193 [00:57<00:00,  3.34it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 32 train_loss - 0.20 acc - 0.692 auc - 0.759


loss - 0.2085: 100%|██████████| 193/193 [00:57<00:00,  3.36it/s]
  0%|          | 0/193 [00:00<?, ?it/s]

epoch - 33 train_loss - 0.20 acc - 0.692 auc - 0.759


loss - 0.1956: 100%|██████████| 193/193 [00:57<00:00,  3.37it/s]


epoch - 34 train_loss - 0.20 acc - 0.691 auc - 0.759


In [14]:
# Persist only the model's parameters (its state_dict) to SAKT.pt in the working directory.
torch.save(model.state_dict(), "SAKT.pt")

In [15]:
# Drop the notebook-level reference to the training dataset and run a collection pass
# before inference.
# NOTE(review): `dataloader` was built from `dataset` and `group` is still used below,
# so this presumably releases little memory -- confirm.
del dataset
gc.collect()

40

## Test

In [16]:
class TestDataset(Dataset):
    """Inference dataset: one sample per row of `test_df`.

    For each test row the user's stored history (from `samples`, indexed by user id
    and holding (question ids, correctness flags) pairs) is turned into a fixed-length
    input, and the row's own content_id is appended as the final target question.
    Users with no stored history get an all-zero (padding) input.
    """

    def __init__(self, samples, test_df, skills, max_seq=MAX_SEQ): 
        super().__init__()
        self.samples = samples
        self.user_ids = [x for x in test_df["user_id"].unique()]
        self.test_df = test_df
        self.skills = skills
        self.n_skill = len(skills)
        self.max_seq = max_seq

    def __len__(self):
        return self.test_df.shape[0]

    def __getitem__(self, index):
        """Return (x, questions), each an integer array of length max_seq - 1.

        x         -- the most recent interactions (question id, + n_skill when correct)
        questions -- the target ids aligned with x; the last one is this row's content_id
        """
        test_info = self.test_df.iloc[index]

        user_id = test_info["user_id"]
        target_id = test_info["content_id"]

        q = np.zeros(self.max_seq, dtype=int)
        qa = np.zeros(self.max_seq, dtype=int)

        if user_id in self.samples.index:
            q_, qa_ = self.samples[user_id]

            # Always copy the tail of the history INTO the preallocated buffers.
            # Rebinding q/qa to slices of the stored arrays (as was done for long
            # histories) made the sample inherit the stored dtype, so a batch could
            # mix dtypes; it also left no guard for an empty history.
            seq_len = min(len(q_), self.max_seq)
            if seq_len > 0:
                q[-seq_len:] = q_[-seq_len:]
                qa[-seq_len:] = qa_[-seq_len:]

        # Inputs drop the oldest step; correct answers are offset by n_skill.
        x = q[1:] + (qa[1:] == 1) * self.n_skill

        # Targets are the history shifted by one more step, ending with the question
        # that is actually being predicted for this row.
        questions = np.append(q[2:], [target_id])

        return x, questions

In [17]:
# NOTE(review): imported here rather than with the other imports at the top of the
# notebook; presumably the module is only available inside the competition runtime -- confirm.
import riiideducation

# Create the competition environment and the iterator consumed by the inference loop,
# which unpacks each item as (test_df, sample_prediction_df).
env = riiideducation.make_env()
iter_test = env.iter_test()

In [18]:
import ast
import psutil

model.eval()

# The answers for a batch only arrive with the NEXT batch (in its
# `prior_group_answers_correct` field), so the previous batch is kept around
# until it can be labelled and folded into the per-user histories.
prev_test_df = None

for (test_df, sample_prediction_df) in tqdm(iter_test):
    # --- 1) Update the user histories with the previous batch, memory permitting. ---
    # The update is skipped once RAM usage reaches 90%.
    if prev_test_df is not None and psutil.virtual_memory().percent < 90:
        print(psutil.virtual_memory().percent)
        # The field holds the text of a Python list; parse it with literal_eval so
        # externally supplied text is never executed (this used to be a bare eval()).
        prev_test_df['answered_correctly'] = ast.literal_eval(test_df['prior_group_answers_correct'].iloc[0])
        prev_test_df = prev_test_df[prev_test_df.content_type_id == False]
        prev_group = prev_test_df[['user_id', 'content_id', 'answered_correctly']].groupby('user_id').apply(lambda r: (
            r['content_id'].values,
            r['answered_correctly'].values))
        for prev_user_id in prev_group.index:
            prev_group_content, prev_group_ac = prev_group[prev_user_id]
            if prev_user_id in group.index:
                # Known user: extend the stored history with the new interactions.
                known_content, known_ac = group[prev_user_id]
                prev_group_content = np.append(known_content, prev_group_content)
                prev_group_ac = np.append(known_ac, prev_group_ac)
            # Only the most recent MAX_SEQ interactions are ever used, so keep just those.
            group[prev_user_id] = (prev_group_content[-MAX_SEQ:], prev_group_ac[-MAX_SEQ:])

    prev_test_df = test_df.copy()

    # --- 2) Predict the current batch (rows with content_type_id == 0/False only). ---
    # .copy() makes the filtered frame independent, so adding a column below does not
    # write into a slice of the frame handed out by the environment.
    test_df = test_df[test_df.content_type_id == False].copy()

    test_dataset = TestDataset(group, test_df, skills)
    test_dataloader = DataLoader(test_dataset, batch_size=51200, shuffle=False)

    outs = []

    for item in tqdm(test_dataloader):
        x = item[0].to(device).long()
        target_id = item[1].to(device).long()

        with torch.no_grad():
            output, att_weight = model(x, target_id)
            # Only the last position corresponds to the question being asked now.
            output = torch.sigmoid(output)[:, -1]

        outs.extend(output.view(-1).cpu().numpy())

    test_df['answered_correctly'] = outs

    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])

0it [00:00, ?it/s]
100%|██████████| 1/1 [00:00<00:00, 66.69it/s]

100%|██████████| 1/1 [00:00<00:00, 87.23it/s]
2it [00:00, 11.59it/s]
100%|██████████| 1/1 [00:00<00:00, 63.90it/s]

100%|██████████| 1/1 [00:00<00:00, 55.84it/s]


37.0
37.0
37.0


4it [00:00,  5.20it/s]
