In [1]:
import time
import copy
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, auc, roc_curve

import torch
import torch.nn as nn
import torch.nn.functional as F

## 数据转换

In [None]:
# Load the raw tables, keep only usable interaction rows, and join the
# per-question metadata (part, difficulty bucket, tag) onto every row.

# Explicit dtypes for the columns of train.csv.
dtype = {'row_id': 'int64',
         'timestamp': 'int64',
         'user_id': 'int32',
         'content_id': 'int16',
         'content_type_id': 'int8',
         'task_container_id': 'int16',
         'user_answer': 'int8',
         'answered_correctly': 'int8',
         'prior_question_elapsed_time': 'float32',
         'prior_question_had_explanation': 'boolean'}

# Only these columns are actually read from train.csv.
columns = ['timestamp','user_id','content_id','content_type_id','task_container_id','user_answer','answered_correctly','prior_question_elapsed_time']

# NOTE(review): machine-specific absolute path -- point this at your own copy
# of the dataset before running.
DATA_DIR = '/home/yao/dataset/Riiid-AIEd-Challenge-2020'

train_df = pd.read_csv(
    DATA_DIR + '/train.csv',
    header=0,
    usecols=columns,
    dtype=dtype)

diff_df = pd.read_csv('data_init/difficulty.csv')
lectures_df = pd.read_csv(DATA_DIR + '/lectures.csv')

# Filter: drop rows without an elapsed time, rows whose answered_correctly is
# -1, and rows whose elapsed time is exactly 0.
# `.copy()` makes the result an independent frame so the column assignment
# below does not write into a view of the original (SettingWithCopyWarning).
train_df = train_df[
    train_df['prior_question_elapsed_time'].notnull() &
    (train_df['answered_correctly']!=-1) &
    (train_df['prior_question_elapsed_time']!=0 )
].copy()

# Coarsen the elapsed time by floor-dividing by 1000
# (presumably milliseconds -> whole seconds; confirm against the data docs).
train_df["prior_question_elapsed_time"] = train_df["prior_question_elapsed_time"] // 1000

ques_df = pd.read_csv(DATA_DIR + '/questions.csv')

# Attach question metadata; question_id duplicates content_id after the join.
train_df = train_df.merge(ques_df,how="left",left_on='content_id',right_on='question_id')
train_df = train_df.drop(columns=['question_id'])

# Turn the difficulty score into an integer bucket (score * 10, rounded).
diff_df['difficulty'] *= 10
diff_df['difficulty']  = diff_df['difficulty'].round().astype('int')

# Inner join (the pandas default, spelled out): rows whose content_id has no
# difficulty entry are dropped.
train_df = train_df.merge(diff_df,how='inner',left_on='content_id',right_on='content_id')

# NOTE(review): this joins content_id against lecture_id. If rows with
# answered_correctly == -1 are the lecture rows (they were filtered out above),
# the remaining content_ids are question ids and any match here is a
# coincidence of the two id spaces -- confirm that `tag` is really meant to
# come from lectures.csv rather than from the questions' `tags` column.
train_df = train_df.merge(lectures_df,how='left',left_on='content_id',right_on='lecture_id')
train_df = train_df.drop(columns=['lecture_id',
                                  'part_y','type_of','tags','correct_answer'])

# Shift tags by one so that 0 can stand for "no tag" (rows without a match).
train_df['tag'] += 1
train_df['tag'] = train_df['tag'].fillna(0).astype('int')


In [None]:
# Display the filtered and merged training table for a visual check.
train_df

In [None]:
# List the distinct content_type_id values that remain after filtering.
train_df['content_type_id'].unique()

In [None]:
# Count how many rows each user has; the summary shows that the per-user
# counts differ enormously.
user_content_num_dist = (
    train_df
    .groupby("user_id")['timestamp']
    .count()
)
user_content_num_dist.describe()

In [None]:
# Based on the per-user statistics above, the sequence length is set to 128.
# PAD_TOKEN is the filler value appended to sequences shorter than SEQ_LEN.
PAD_TOKEN = 0
# SEED is the random_state used when shuffling rows before per-user sampling.
SEED = 1
SEQ_LEN = 128

In [None]:
# For every user, randomly keep at most SEQ_LEN*5 rows (shuffle the whole
# table, then take the first SEQ_LEN*5 rows per user) and put the survivors
# back into per-user chronological order. These rows are later split into
# training and validation windows (intended ratio 4:1).
groups = (
    train_df
    .sample(frac=1, random_state=SEED)
    .groupby("user_id")
    .head(SEQ_LEN * 5)
    .sort_values(['user_id', 'timestamp'])
)
groups

In [None]:
def _build_sample(row, start, seq_len, pad_token):
    """Cut one window of up to ``seq_len`` steps out of a user's history.

    Args:
        row: dict mapping column name -> list of per-step values for one user.
        start: index of the first step of the window.
        seq_len: window length; shorter windows are right-padded.
        pad_token: value appended to every feature list as padding.

    Returns:
        A sample dict whose lists all have length ``seq_len``; ``pad_flag`` is
        True exactly at the padded positions.
    """
    window = {key: values[start:start + seq_len] for key, values in row.items()}
    n_real = len(window["content_id"])
    n_pad = seq_len - n_real
    pads = [pad_token] * n_pad
    return {
        "content_id": window["content_id"] + pads,
        "content_type_id": window["content_type_id"] + pads,
        "difficulty_id": window["difficulty"] + pads,
        "tag_id": window["tag"] + pads,
        "prior_question_elapsed_time": window["prior_question_elapsed_time"] + pads,
        "part_id": window["part_x"] + pads,
        "pad_flag": [False] * n_real + [True] * n_pad,
        "answered_correctly": window["answered_correctly"] + pads,
    }


def _split_user_history(row, seq_len, pad_token):
    """Split one user's history into (train_samples, val_samples).

    * Shorter than one window: a single padded training sample, no validation.
    * Exactly one window: a single training sample, no validation.
    * Exact multiple of ``seq_len`` (more than one window): the last full
      window is held out for validation, all earlier ones are training data.
    * Otherwise: every full window is training data and the padded remainder
      is the validation sample.
    """
    length = len(row["content_id"])
    if length == 0:
        return [], []

    n_full, remainder = divmod(length, seq_len)
    if n_full == 0:
        return [_build_sample(row, 0, seq_len, pad_token)], []

    # Hold out the last full window only when there is no remainder to use
    # for validation and at least one other window is left for training.
    n_train = n_full - 1 if (remainder == 0 and n_full > 1) else n_full

    train_samples = [
        _build_sample(row, i * seq_len, seq_len, pad_token)
        for i in range(n_train)
    ]
    val_samples = []
    if n_train * seq_len < length:
        val_samples.append(_build_sample(row, n_train * seq_len, seq_len, pad_token))
    return train_samples, val_samples


train_data = []
val_data = []

# One row per user, every feature aggregated into a chronological list.
user_sequences = groups.groupby("user_id").agg({
    "content_id": list,
    "content_type_id": list,
    "difficulty": list,
    "part_x": list,
    "tag": list,
    "prior_question_elapsed_time": list,
    "answered_correctly": list,
})

for _, row in user_sequences.iterrows():
    user_train, user_val = _split_user_history(row.to_dict(), SEQ_LEN, PAD_TOKEN)
    train_data.extend(user_train)
    val_data.extend(user_val)

In [None]:
# Sanity check: report any validation sample whose sequence is empty.
# Walks the list back to front and includes index 0.
for sample in reversed(val_data):
    if len(sample['content_id']) == 0:
        print(sample['content_type_id'])

In [None]:
class RiiidData(torch.utils.data.Dataset):
    """Map-style dataset over a list of pre-built sample dicts."""

    def __init__(self, data):
        # `data` is indexed by position; each element is a dict of feature lists.
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(idx, <eight feature lists>)`` for sample ``idx``.

        The order of the features is the order `collate_fn` unpacks them in.
        """
        sample = self.data[idx]
        ordered_keys = (
            "content_id",
            "content_type_id",
            "difficulty_id",
            "tag_id",
            "part_id",
            "prior_question_elapsed_time",
            "pad_flag",
            "answered_correctly",
        )
        return (idx,) + tuple(sample[key] for key in ordered_keys)

    
def collate_fn(batch):
    """Stack a list of `RiiidData` items into batch tensors.

    Args:
        batch: sequence of 9-tuples ``(idx, content_id, content_type_id,
            difficulty_id, tag_id, part_id, prior_question_elapsed_time,
            pad_flag, answered_correctly)``; every list in one batch must have
            the same length.

    Returns:
        Tuple ``(content_id, content_type_id, difficulty_id, tag_id, part_id,
        prior_question_elapsed_time, masks, labels)``. The id tensors and the
        elapsed time are int64, ``masks`` is bool, and ``labels`` is float32.
    """
    _, content_id, content_type_id, difficulty_id, tag_id, \
        part_id, prior_question_elapsed_time, pad_flag, labels = zip(*batch)

    def _as_long(sequences):
        # Build the integer tensor directly: the legacy torch.Tensor(...)
        # constructor goes through float32 and is inexact above 2**24.
        return torch.tensor(sequences, dtype=torch.long)

    content_id = _as_long(content_id)
    content_type_id = _as_long(content_type_id)
    difficulty_id = _as_long(difficulty_id)
    tag_id = _as_long(tag_id)
    part_id = _as_long(part_id)
    # Elapsed times are floats in the source data; truncate them to integers.
    prior_question_elapsed_time = torch.tensor(
        prior_question_elapsed_time, dtype=torch.float32).long()
    masks = torch.tensor(pad_flag, dtype=torch.bool)
    labels = torch.tensor(labels, dtype=torch.float32)

    return content_id, content_type_id, difficulty_id, tag_id, part_id, \
        prior_question_elapsed_time, masks, labels

In [None]:
# Wrap the prepared sample lists in Dataset objects for the DataLoaders.
train_set = RiiidData(train_data)
val_set = RiiidData(val_data)

In [None]:
# Sanity check: every training sample must be exactly SEQ_LEN steps long.
# Prints the length of any sample that is not; silent when all are fine.
for sample in train_data:
    if len(sample['content_id']) != SEQ_LEN:
        print(len(sample['content_id']))

In [None]:
# Number of training samples.
len(train_set)

In [None]:
# Number of validation samples.
len(val_set)

## 模型训练

In [None]:
from model.transformer_model import Riiid

In [None]:
# Training hyper-parameters.
NUM_EPOCH = 50
BATCH_SIZE = 32
LR = 3e-3
DMODEL = 256

# Prefer the second GPU (as originally configured), but fall back to the
# first GPU or the CPU so the notebook also runs on hosts with fewer devices.
if torch.cuda.device_count() > 1:
    device = torch.device('cuda:1')
elif torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [None]:
# Training batches are reshuffled every epoch: the samples were appended user
# by user, so without shuffling each batch would hold consecutive windows of
# the same few users. The generator is seeded so the order is reproducible.
train_loader = torch.utils.data.DataLoader(
    dataset=train_set,
    batch_size=BATCH_SIZE,
    shuffle=True,
    generator=torch.Generator().manual_seed(SEED),
    collate_fn=collate_fn,
    num_workers=2)
# Validation keeps a fixed order; shuffling would not change the metrics.
val_loader = torch.utils.data.DataLoader(
    dataset=val_set,
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn,
    num_workers=2)

# Keyed by phase name, as expected by `trainer`.
dataloaders = {
    'train':train_loader,
    'val':val_loader
}

In [None]:
# Drop the reference to any previously built model and ask PyTorch to release
# its cached GPU memory before a new model is created.
model = None
torch.cuda.empty_cache()

In [None]:
# Build the model (class imported from model/transformer_model.py), move it
# to the selected device, and display its structure.
model = Riiid(dmodel = DMODEL,max_len = SEQ_LEN)
model.to(device)
model

In [None]:
# Binary cross-entropy on raw logits, optimised with Adam at learning rate LR.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

In [None]:
def trainer(model,dataloaders,criterion,optimizer,num_epochs=10):
    """Train and validate ``model``; return it loaded with the best weights.

    Every epoch runs a 'train' phase followed by a 'val' phase. Loss and
    accuracy are computed on non-padded positions only. The weights of the
    epoch with the highest validation accuracy are kept, written to
    ``./acc_<best_acc>.pth`` every 20 epochs, and loaded back into ``model``
    before returning.

    Uses the notebook-level ``device`` for all tensors.

    Args:
        model: module called as ``model(pos_id, task_id, difficulty_id,
            tag_id, elapsed_time, part_id, masks)``; its output is indexed as
            ``outputs[:, :, 1]`` for the loss and arg-maxed over dim 2 for
            the prediction.
        dataloaders: dict with 'train' and 'val' iterables yielding the
            8-tuples produced by ``collate_fn`` (``masks`` is True at padding).
        criterion: loss called as ``criterion(logits, targets)``.
        optimizer: optimizer over ``model``'s parameters.
        num_epochs: number of train+val epochs to run.

    Returns:
        ``model``, with the best validation weights loaded.
    """
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    # -inf so that the first validation epoch always becomes the initial best.
    best_acc = float('-inf')

    for epoch in range(num_epochs):

        print('Epoch {}/{}'.format(epoch,num_epochs -1))
        print('-' * 10)

        epoch_start = time.time()

        for phase in ['train','val']:
            is_train = phase == 'train'
            if is_train:
                model.train()
            else:
                model.eval()

            running_loss = []
            n_correct = 0   # correctly predicted non-padded positions
            n_scored = 0    # all non-padded positions seen in this phase

            for data in dataloaders[phase]:
                # content_id is only used for its shape; the model call below
                # does not take it as an input.
                content_id, task_id, difficulty_id, tag_id, part_id, \
                    elapsed_time, masks, labels = data
                pos_id = torch.arange(0, content_id.shape[1], device=device).unsqueeze(0).repeat(
                    content_id.shape[0], 1)

                task_id = task_id.to(device)
                difficulty_id = difficulty_id.to(device)
                tag_id = tag_id.to(device)
                part_id = part_id.to(device)
                elapsed_time = elapsed_time.to(device)
                masks = masks.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()

                with torch.set_grad_enabled(is_train):

                    outputs = model(pos_id,task_id, difficulty_id,
                        tag_id,elapsed_time,part_id,masks)

                    # Padded steps carry a dummy label; leave them out of both
                    # the loss and the accuracy.
                    valid = ~masks
                    targets = labels[valid]
                    # NOTE(review): the loss uses only logit 1 while the
                    # prediction is an argmax over dim 2 -- confirm the two
                    # are meant to disagree.
                    loss = criterion(outputs[:,:,1][valid], targets)
                    _,preds = torch.max(outputs,2)

                    if is_train:
                        loss.backward()
                        optimizer.step()

                running_loss.append(loss.item())
                n_correct += torch.sum(preds[valid] == targets).item()
                n_scored += targets.numel()

            # Exact accuracy over all scored positions (not a mean of batch means).
            epoch_acc = n_correct / n_scored if n_scored else float('nan')
            epoch_loss = np.mean(running_loss) if running_loss else float('nan')

            print('{} Loss: {:.4f} ACC: {:.6f}  Lr:: {}  spend: {}s'.format(phase,epoch_loss,epoch_acc,
                                                                           optimizer.param_groups[0]['lr'],int(time.time()-epoch_start)))

            # Keep the weights only when validation accuracy actually improves.
            if phase =='val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

        # Periodic checkpoint of the best weights seen so far.
        if (epoch+1) %20 == 0:
            path = './acc_{}.pth'.format(round(float(best_acc),6))
            torch.save(best_model_wts,path)

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m  {:.0f}s'.format(time_elapsed // 60,time_elapsed % 60))

    model.load_state_dict(best_model_wts)

    return model

In [None]:
# Run the full training loop; the returned model is shown as the cell output.
trainer(model,dataloaders,criterion,optimizer,NUM_EPOCH)

In [None]:
# Show current GPU utilisation and memory usage via the nvidia-smi shell command.
!nvidia-smi