In [1]:
import time
import copy
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, auc

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau,CosineAnnealingLR

## 数据转换

In [2]:
#读取文件
dtype = {'row_id': 'int64',
         'timestamp': 'int64',
         'user_id': 'int32',
         'content_id': 'int16',
         'content_type_id': 'int8',
         'task_container_id': 'int16',
         'user_answer': 'int8',
         'answered_correctly': 'int8',
         'prior_question_elapsed_time': 'float32',
         'prior_question_had_explanation': 'boolean'}

columns = ['timestamp','user_id','content_id','content_type_id','task_container_id','user_answer','answered_correctly','prior_question_elapsed_time']

train_df = pd.read_csv(
    '/home/yao/dataset/Riiid-AIEd-Challenge-2020/train.csv',
    header=0,
    usecols=columns,
    dtype=dtype)

diff_df = pd.read_csv('data_init/difficulty.csv')
lectures_df = pd.read_csv('/home/yao/dataset/Riiid-AIEd-Challenge-2020/lectures.csv')

#过滤与合并df
train_df = train_df[
    train_df['prior_question_elapsed_time'].notnull() &
    (train_df['answered_correctly']!=-1) & 
    (train_df['prior_question_elapsed_time']!=0 )
]

train_df["prior_question_elapsed_time"] = train_df["prior_question_elapsed_time"] // 1000

ques_df = pd.read_csv('/home/yao/dataset/Riiid-AIEd-Challenge-2020/questions.csv')

train_df = train_df.merge(ques_df,how="left",left_on='content_id',right_on='question_id')
train_df = train_df.drop(columns=['question_id'])

diff_df['difficulty'] *= 10
diff_df['difficulty']  = diff_df['difficulty'].round().astype('int')

train_df = train_df.merge(diff_df,left_on='content_id',right_on='content_id')

train_df = train_df.merge(lectures_df,how='left',left_on='content_id',right_on='lecture_id')
train_df = train_df.drop(columns=['lecture_id',
                                  'part_y','type_of','tag','correct_answer'])

In [3]:
train_df

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,bundle_id,part_x,tags,difficulty
0,56943,115,5716,0,2,2,1,37.0,5716,5,168,6
1,8270862135,24418,5716,0,2892,2,1,30.0,5716,5,168,6
2,18755288862,91216,5716,0,1452,3,0,9.0,5716,5,168,6
3,18755325533,91216,5716,0,1457,2,1,11.0,5716,5,168,6
4,6811228776,141455,5716,0,1232,0,0,28.0,5716,5,168,6
...,...,...,...,...,...,...,...,...,...,...,...,...
98683972,7327293505,1896513376,5823,0,4239,0,1,21.0,5823,5,8,8
98683973,4339553896,2070144393,10008,0,1406,3,1,23.0,10005,6,55,8
98683974,4339553896,2070144393,10007,0,1406,1,0,23.0,10005,6,27,1
98683975,4339553896,2070144393,10006,0,1406,0,1,23.0,10005,6,73,8


In [13]:
#统计用户做题数量,发现相差悬殊
user_content_num_dist = train_df.groupby("user_id")['timestamp'].agg('count')
user_content_num_dist.describe()

count    393531.000000
mean        250.765447
std         732.270049
min           1.000000
25%          29.000000
50%          39.000000
75%         153.000000
max       17608.000000
Name: timestamp, dtype: float64

In [14]:
#根据上面的统计,把序列长度设置为128
PAD_TOKEN = 0
SEED = 1
SEQ_LEN = 128

In [15]:
#在用户中随机取SEQ_LEN*5数量的题目的,按4:1构建训练集验证集
groups = train_df.sample(frac=1,random_state=SEED).groupby("user_id").head(SEQ_LEN*5)
groups = groups.sort_values(['user_id','timestamp'])
groups

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,bundle_id,part_x,tags,difficulty
0,56943,115,5716,0,2,2,1,37.0,5716,5,168,6
23880,118363,115,128,0,0,0,1,55.0,128,1,131 149 92,8
29279,131167,115,7860,0,3,0,1,19.0,7860,1,131 104 81,8
50877,137965,115,7922,0,4,1,1,11.0,7922,1,131 149 92,8
70262,157063,115,156,0,5,2,1,5.0,156,1,131 101 162 38,8
...,...,...,...,...,...,...,...,...,...,...,...,...
28383230,428564420,2147482888,3586,0,22,0,1,18.0,3586,5,8,6
63790257,428585000,2147482888,6341,0,23,3,1,14.0,6341,5,60,5
55980911,428613475,2147482888,4212,0,24,3,1,14.0,4212,5,95,5
22617902,428649406,2147482888,6343,0,25,1,0,22.0,6343,5,60,6


In [16]:
train_data = []
val_data = []
for _, row in  groups.groupby("user_id").agg({
    "content_id":list,
    "content_type_id":list, 
    'difficulty':list,
    "part_x":list, 
    'content_type_id':list,
    'tags':list,
    "prior_question_elapsed_time":list,
    "answered_correctly":list, 
}).iterrows():
    
    length = len(row["content_id"])
    row = row.to_dict()
    if length % SEQ_LEN == 0 and length>SEQ_LEN:
        for i in range(int(length/SEQ_LEN)-2):
            row_ = {k:row[k][i*SEQ_LEN:(i+1)*SEQ_LEN] for k in row}
            
            train_data.append({
            "content_id" : row_["content_id"],
            "content_type_id":row_["content_type_id"],
            "difficulty_id":row_["difficulty"],
            "tag_id":row_["tags"],
            "prior_question_elapsed_time" : row_["prior_question_elapsed_time"],
            "part_id": row_["part_x"],
            "pad_flag" : [False]*SEQ_LEN,
            "answered_correctly" : row_["answered_correctly"],
            })

        row_ = {k:row[k][(i+1)*SEQ_LEN:(i+2)*SEQ_LEN] for k in row}
        
        if len(row_["content_id"]) ==0:
            continue
        val_data.append({
                "content_id" : row_["content_id"],
                "content_type_id":row_["content_type_id"],
                "difficulty_id":row_["difficulty"],
                "tag_id":row_["tags"],
                "prior_question_elapsed_time" : row_["prior_question_elapsed_time"],
                "part_id": row_["part_x"],
                "pad_flag" : [False]*SEQ_LEN,
                "answered_correctly" : row_["answered_correctly"],
            })
        
        
    elif length<SEQ_LEN:
        pads = [PAD_TOKEN]*(SEQ_LEN-length)
        train_data.append({
            "content_id" : row["content_id"]+pads,
            "content_type_id": row["content_type_id"]+pads,
            "difficulty_id":row["difficulty"]+pads,
            "tag_id": row["tags"]+pads,
            "prior_question_elapsed_time" : row["prior_question_elapsed_time"]+pads,
            "part_id": row["part_x"]+pads,
            "pad_flag" : [False]*length + [True]*(SEQ_LEN-length),
            "answered_correctly" : row["answered_correctly"]+pads,
        })
        
    
    else:
        for i in range(length//SEQ_LEN):
            row_ = {k:row[k][i*SEQ_LEN:(i+1)*SEQ_LEN] for k in row}
            train_data.append({
                "content_id" : row_["content_id"],
                "content_type_id":row_["content_type_id"],
                "difficulty_id":row_["difficulty"],
                "tag_id":row_["tags"],
                "prior_question_elapsed_time" : row_["prior_question_elapsed_time"],
                "part_id": row_["part_x"],
                "pad_flag" : [False]*SEQ_LEN,
                "answered_correctly" : row_["answered_correctly"],
            })
            
            
        pads = [PAD_TOKEN]*(SEQ_LEN-length%SEQ_LEN)
        row_ = {k:row[k][(i+1)*SEQ_LEN:(i+2)*SEQ_LEN] for k in row}
        
        if len(row_["content_id"]) ==0:
            continue
        
        val_data.append({
                "content_id" : row_["content_id"]+pads,
                "content_type_id": row_["content_type_id"]+pads,
                "difficulty_id":row_["difficulty"]+pads,
                "tag_id": row_["tags"]+pads,
                "prior_question_elapsed_time" : row_["prior_question_elapsed_time"]+pads,
                "part_id": row_["part_x"]+pads,
                "pad_flag" : [False]*len(row_["content_id"]) + [True]*(SEQ_LEN-length%SEQ_LEN),
                "answered_correctly" : row_["answered_correctly"]+pads,
            })

In [17]:
for i in range(len(val_data)-1,0,-1):
    if len(val_data[i]['content_id']) == 0:
        print(val_data[i]['content_type_id'])

In [320]:
class RiiidData(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        
        return idx, self.data[idx]["content_id"], \
            self.data[idx]["content_type_id"], \
            self.data[idx]["difficulty_id"], \
            self.data[idx]["tag_id"], \
            self.data[idx]["part_id"], \
            self.data[idx]["prior_question_elapsed_time"], \
            self.data[idx]["pad_flag"], \
            self.data[idx]["answered_correctly"]

    
def collate_fn(batch):
    _, content_id, task_id, difficulty_id, tag_id, \
        part_id, prior_question_elapsed_time, pad_flag, labels = zip(*batch)
    
    content_id = torch.Tensor(content_id).long()
    task_id = torch.Tensor(task_id).long()
    difficulty_id = torch.Tensor(difficulty_id).long()
#     tag_id = torch.Tensor(tag_id).long()
    part_id = torch.Tensor(part_id).long()
    prior_question_elapsed_time = torch.Tensor(prior_question_elapsed_time).long()
    masks = torch.Tensor(pad_flag).bool()
    labels = torch.Tensor(labels).long()

    return content_id, task_id, difficulty_id, tag_id, part_id, \
        prior_question_elapsed_time, masks, labels

In [321]:
train_set = RiiidData(train_data)
val_set = RiiidData(val_data)

In [322]:
len(train_set)

527525

In [323]:
len(val_set)

108157

## 模型训练

In [324]:
import math
import logging
import torch
import torch.nn as nn
from torch.nn import TransformerEncoder,TransformerEncoderLayer


device = torch.device('cuda:1')
# device = torch.device('cpu')

In [325]:
class Riiid(nn.Module):
    def __init__(self,dmodel,max_len,nhead=8,nhid=256,nlayers=6,dropout=0.3):
        super(Riiid,self).__init__()

        self.dmodel = dmodel
        self.task_embed = nn.Embedding(num_embeddings=2,embedding_dim=dmodel)
        self.difficulty_embed = nn.Embedding(num_embeddings=11,embedding_dim=dmodel)
        self.tag_embed = nn.Embedding(num_embeddings=189,embedding_dim=dmodel)
        self.elapsetime_embed = nn.Embedding(num_embeddings=301,embedding_dim=dmodel)
        self.part_embed = nn.Embedding(num_embeddings=8,embedding_dim=dmodel)
        # self.pos_encoder = PositionalEncoding(dmodel)
        self.pos_embed = nn.Embedding(max_len, dmodel)
        
        encoder_layer = TransformerEncoderLayer(d_model=dmodel,
            nhead=nhead,dim_feedforward=nhid,dropout=dropout)

        self.encoder = TransformerEncoder(encoder_layer=encoder_layer,
            num_layers=nlayers)

        self.fc = nn.Linear(dmodel,1)
        
        self.init_weights()


    def init_weights(self):
        initrange = 0.1

        self.task_embed.weight.data.uniform_(-initrange, initrange)
        self.difficulty_embed.weight.data.uniform_(-initrange, initrange)
        self.tag_embed.weight.data.uniform_(-initrange, initrange)
        self.elapsetime_embed.weight.data.uniform_(-initrange, initrange)
        self.part_embed.weight.data.uniform_(-initrange, initrange)
        self.pos_embed.weight.data.uniform_(-initrange, initrange)

        self.fc.bias.data.zero_()
        self.fc.weight.data.uniform_(-initrange, initrange)


    def forward(self,x_pos,x_task,x_diff,x_tag,x_et,x_part,pad_mask):
        
        # embeds = self.task_embed(x_task) + \
        #     self.difficulty_embed(x_diff) + \
        #     self.tag_embed(x_tag) + \
        #     self.elapsetime_embed(x_et) + \
        #     self.part_embed(x_part) + \
        #     self.pos_embed(x_pos)
        
        tag_embeds= []
        for tag_batch in list(x_tag):
            t_embeds= []
            for tag_id in tag_batch:
                if type(tag_id) == str:
                    tag_id = [int(tid) for tid in tag_id.split(' ')]
                    s = self.tag_embed(torch.tensor(tag_id).to(device))
                    t_embeds.append(torch.sum(s,dim=0))
                else:
                    s = self.tag_embed(torch.tensor([tag_id]).to(device))
                    t_embeds.append(s.squeeze(0))
            tag_embeds.append(torch.stack(t_embeds))
        
        tag_embeds = torch.stack(tag_embeds).to(device)

        embeds = self.difficulty_embed(x_diff) + \
            tag_embeds + \
            self.elapsetime_embed(x_et) + \
            self.part_embed(x_part) + \
            self.pos_embed(x_pos)
        
        embeds = embeds.transpose(0,1)
        embeds = embeds * math.sqrt(self.dmodel)
        output = self.encoder(src=embeds,src_key_padding_mask=pad_mask)
        output = output.transpose(1,0)

        output = self.fc(output)

        return output

In [326]:
NUM_EPOCH = 50
BATCH_SIZE = 64
LR = 5e-4
DMODEL = 128


In [327]:
train_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=BATCH_SIZE, collate_fn=collate_fn, num_workers=2)
val_loader = torch.utils.data.DataLoader(dataset=val_set, batch_size=BATCH_SIZE, collate_fn=collate_fn, num_workers=2)

dataloaders = {
    'train':train_loader,
    'val':val_loader
}

In [328]:
model = Riiid(dmodel = DMODEL,max_len = SEQ_LEN)
model.to(device)

Riiid(
  (task_embed): Embedding(2, 128)
  (difficulty_embed): Embedding(11, 128)
  (tag_embed): Embedding(189, 128)
  (elapsetime_embed): Embedding(301, 128)
  (part_embed): Embedding(8, 128)
  (pos_embed): Embedding(128, 128)
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=256, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
        (linear2): Linear(in_features=256, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.3, inplace=False)
        (dropout2): Dropout(p=0.3, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): 

In [329]:
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
scheduler = ReduceLROnPlateau(optimizer,'min',factor=0.1,patience=5)

In [330]:
def trainer(model,dataloaders,criterion,optimizer,num_epochs=10):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    

    
    for epoch in range(num_epochs):
        
        print('Epoch {}/{}'.format(epoch,num_epochs -1))
        print('-' * 10)
        
        epoch_start = time.time()
        
        for phase in ['train','val']:
            if phase == 'train':
                model.train()
            else :
                model.eval()

            running_loss = []
            running_acc = []
            running_auc = []
            
            for data in dataloaders[phase]:
                content_id, task_id, difficulty_id, tag_id, part_id, \
                    elapsed_time, masks, labels = data
                pos_id = torch.arange(0, content_id.shape[1]).unsqueeze(0).repeat(
                    content_id.shape[0], 1)
#                 print(tag_id)
                pos_id = pos_id.to(device)
                task_id = task_id.to(device)
                difficulty_id = difficulty_id.to(device)
#                 tag_id = tag_id.to(device)
                part_id = part_id.to(device)
                elapsed_time = elapsed_time.to(device)
                masks = masks.to(device)
                labels = labels.to(device)
#                 labels_embed= F.one_hot(labels).float()
                labels_embed= labels.view(-1,SEQ_LEN,1).float()
    
                optimizer.zero_grad()
                
                with torch.set_grad_enabled(phase == 'train'):
                    
                    outputs = model(pos_id,task_id, difficulty_id,
                        tag_id,elapsed_time,part_id,masks)

#                     _,preds = torch.max(outputs,2)
                    preds = (F.sigmoid(outputs)>=0.5).long()
                    loss = criterion(outputs, labels_embed)
                    running_loss.append(loss.item())
                    
                    running_acc.append(torch.sum(preds == labels_embed.data).item()/labels.numel())
                    
#                     if len(preds.unique())==2 and len(labels.unique())==2:
#                         print(labels.unique())
#                         running_auc.append(roc_auc_score(
#                                 labels.cpu().numpy(),
#                                 preds.cpu().numpy()
#                             )/labels.numel())
                    
                    if phase =='train':
                        loss.backward()
                        optimizer.step()
                

            epoch_acc = np.mean(running_acc)
            epoch_auc = np.mean(running_auc)
            epoch_loss = np.mean(running_loss)
            # writer.add_scalar('Loss/{}'.format(phase), epoch_loss, epoch+1)
            
            
            print('{} Loss: {:.4f} ACC: {:.4f}  ACC: {:.4f}  Lr:: {}  spend: {}s'.
                  format(phase,epoch_loss,epoch_acc,epoch_auc,
                         optimizer.param_groups[0]['lr'],int(time.time()-epoch_start)))
            
            if phase =='val':
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                
            if phase == 'val':
                scheduler.step(epoch_loss)
                
        print()
        
        if (epoch) %5 == 0:
            path = './acc_{}.pth'.format(round(float(best_acc),6))
            torch.save(best_model_wts,path)
        
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m  {:.0f}s'.format(time_elapsed // 60,time_elapsed % 60))
          
    model.load_state_dict(best_model_wts)
    
    return model

In [None]:
trainer(model,dataloaders,criterion,optimizer,NUM_EPOCH)

Epoch 0/49
----------
train Loss: 0.3596 ACC: 0.8120  ACC: nan  Lr:: 0.0005  spend: 10917s


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


val Loss: 0.3624 ACC: 0.8143  ACC: nan  Lr:: 0.0005  spend: 12104s

Epoch 1/49
----------
train Loss: 0.3546 ACC: 0.8157  ACC: nan  Lr:: 0.0005  spend: 11029s
val Loss: 0.3603 ACC: 0.8150  ACC: nan  Lr:: 0.0005  spend: 12189s

Epoch 2/49
----------
train Loss: 0.3530 ACC: 0.8168  ACC: nan  Lr:: 0.0005  spend: 10760s
val Loss: 0.3595 ACC: 0.8151  ACC: nan  Lr:: 0.0005  spend: 11966s

Epoch 3/49
----------
train Loss: 0.3521 ACC: 0.8175  ACC: nan  Lr:: 0.0005  spend: 10941s
val Loss: 0.3602 ACC: 0.8148  ACC: nan  Lr:: 0.0005  spend: 12097s

Epoch 4/49
----------
train Loss: 0.3515 ACC: 0.8179  ACC: nan  Lr:: 0.0005  spend: 11245s
val Loss: 0.3573 ACC: 0.8161  ACC: nan  Lr:: 0.0005  spend: 12441s

Epoch 5/49
----------
train Loss: 0.3510 ACC: 0.8183  ACC: nan  Lr:: 0.0005  spend: 10964s
val Loss: 0.3570 ACC: 0.8159  ACC: nan  Lr:: 0.0005  spend: 12144s

Epoch 6/49
----------
