In [465]:
import matplotlib
import matplotlib.pyplot as plt

import os
import pdb
import itertools
import numpy as np
import pandas as pd
import datatable as dt
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn import metrics

import gc
import h5py
import joblib

import seaborn as sns
import lightgbm as lgb
from lightgbm import LGBMClassifier

In [466]:
# Use the first CUDA device when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:6


In [467]:
class AverageMeter(object):
    """Tracks the latest value and the running weighted average of a metric.

    Attributes set by ``reset`` / ``update``:
        val:   the value passed to the most recent ``update`` call.
        sum:   weighted sum of all values since the last ``reset``.
        count: total weight (sample count) since the last ``reset``.
        avg:   ``sum / count``; stays 0 while ``count`` is 0.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear every accumulated statistic back to zero."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed over ``n`` samples.

        Args:
            val: the new measurement (typically a per-batch mean).
            n: weight of the measurement, e.g. the batch size.
        """
        self.val = val
        self.sum += val * n
        self.count += n
        # An empty batch (n == 0) on a fresh meter would otherwise divide by
        # zero; keep the average at 0 until some weight has been accumulated.
        self.avg = self.sum / self.count if self.count else 0

In [468]:
class Params:
    """Attribute bag: every keyword argument becomes an instance attribute."""

    def __init__(self, **entries):
        vars(self).update(entries)

    def update(self, **kargs):
        """Add new attributes or overwrite existing ones from keyword arguments."""
        vars(self).update(kargs)

In [469]:
# Location of the competition files (relative to the notebook's working directory).
path = './kaggle/input/riiid-test-answer-prediction'
train_file = path + '/train.csv'

# Explicit column dtypes passed to read_csv for train.csv.
train_dtypes = dict(
    row_id='int64',
    timestamp='int64',
    user_id='int32',
    content_id='int16',
    content_type_id='int8',
    task_container_id='int16',
    user_answer='int8',
    answered_correctly='int8',
    prior_question_elapsed_time='float32',
    prior_question_had_explanation='boolean',
)

# The question and lecture metadata tables are loaded whole.
questions = pd.read_csv(path + '/questions.csv')
lectures = pd.read_csv(path + '/lectures.csv')
print(f'Question shapes: {questions.shape}')
print(f'Lecture shapes: {lectures.shape}')

Question shapes: (13523, 5)
Lecture shapes: (418, 4)


In [470]:
# Column names of train.csv in file order.
colnames = ['row_id', 'timestamp', 'user_id', 'content_id', 'content_type_id',
            'task_container_id', 'user_answer', 'answered_correctly',
            'prior_question_elapsed_time', 'prior_question_had_explanation']

# Lazily iterate over train.csv one million rows at a time. `chunksize` is
# documented as an int, so pass an int instead of the float literal 1e6.
chunks = pd.read_csv(train_file, chunksize=1_000_000, dtype=train_dtypes)

In [471]:
# Collect the distinct tag ids that occur in questions.csv. Every tag is
# shifted by +1 so that id 0 can stand for "this question has no tags".
question_tags = list(set(itertools.chain.from_iterable(
    [0] if str(raw_tags).strip() == 'nan'
    else (int(tag) + 1 for tag in str(raw_tags).split())
    for raw_tags in questions.tags.values
)))

n_tags = len(question_tags)
n_parts = len(set(questions['part'].unique()))

print(f'n_tags {n_tags}, n_parts {n_parts}')

n_tags 189, n_parts 7


In [472]:
# Every tunable of the run lives in this one mapping; `Params` then exposes
# the entries as attributes (params.batch_size, params.limit_f1, ...).
params_dict = dict(
    load_state=True,
    use_buffer=True,
    is_offline=False,
    batch_norm=False,
    is_test=False,
    n_chunks=10,
    n_epoch=1,
    learning_rate=3e-3,
    batch_size=256,
    num_workers=4,
    cuda=torch.cuda.is_available(),
    num_questions=questions.question_id.nunique(),
    num_lectures=lectures.lecture_id.nunique(),
    num_total_q_tags=n_tags,
    num_total_q_part=n_parts,
    dropout=0.1,
    fm_n_layers=1,
    fm_h_size=32,
    emb_size=64,
    sparse_size=8,
    dnn_n_layers=4,
    dnn_h_size=64,
    input_size=512,
    buffer_size_limit=1e3,
    limit_f1=10,  # lec-ques
    limit_f2=5,  # part
    limit_f3=5,  # tag
    limit_f4=5,  # user
    n_features=8,
    output_dir='/kaggle/working/data',
    extra_dir='/kaggle/input/deepfm-input',
)
params = Params(**params_dict)
print(vars(params))

{'load_state': True, 'use_buffer': True, 'is_offline': False, 'batch_norm': False, 'is_test': False, 'n_chunks': 10, 'n_epoch': 1, 'learning_rate': 0.003, 'batch_size': 256, 'num_workers': 4, 'cuda': True, 'num_questions': 13523, 'num_lectures': 418, 'num_total_q_tags': 189, 'num_total_q_part': 7, 'dropout': 0.1, 'fm_n_layers': 1, 'fm_h_size': 32, 'emb_size': 64, 'sparse_size': 8, 'dnn_n_layers': 4, 'dnn_h_size': 64, 'input_size': 512, 'buffer_size_limit': 1000.0, 'limit_f1': 10, 'limit_f2': 5, 'limit_f3': 5, 'limit_f4': 5, 'n_features': 8, 'output_dir': '/kaggle/working/data', 'extra_dir': '/kaggle/input/deepfm-input'}


In [473]:
def split_data(train_part, n_tail=10):
    """Hold out the last ``n_tail`` rows of every user.

    Args:
        train_part: frame with a ``user_id`` column; rows are matched back by
            index label.
        n_tail: number of trailing rows per user that go to the held-out part.

    Returns:
        ``(train, valid)`` where ``valid`` holds each user's last ``n_tail``
        rows and ``train`` holds the rows whose index label is not in ``valid``.
    """
    held_out = train_part.groupby('user_id').tail(n_tail)
    is_held_out = train_part.index.isin(held_out.index)
    return train_part[~is_held_out], held_out

In [474]:
class LectureData(Dataset):
    
    def __init__(self, params, train_df=None, question_df=None, lecture_df=None, is_train=True):
        """Set up scalers and column lists, then either restore or build state.

        When ``params.load_state`` is truthy the saved state is read via
        ``load_state``; otherwise the three frames are processed via
        ``set_traindata``.
        """
        self.params = params
        self.is_train = is_train

        # One StandardScaler per continuous feature.
        for scaler_name in ('scaler_ans_correct_mean',
                            'scaler_ans_correct_count',
                            'scaler_prior_question_time_mean',
                            'scaler_prior_question_elapsed_time'):
            setattr(self, scaler_name, StandardScaler())

        # Columns kept in the per-user statistics frame.
        self.user_columns = [
            'user_id', 'ans_correct_sum', 'ans_correct_mean', 'ans_correct_count',
            'lecture_set', 'lecture_tags_set', 'prior_question_time_mean',
        ]

        # Columns kept from a raw interaction frame.
        self.train_columns = [
            'user_id', 'content_id', 'content_type_id', 'task_container_id',
            'answered_correctly', 'prior_question_elapsed_time',
            'prior_question_had_explanation',
        ]

        # Columns of the final merged frame returned by finalize_data.
        self.merge_columns = [
            'user_id', 'question_id', 'question_index', 'answered_correctly',
            'ans_correct_mean', 'ans_correct_count',
            'part', 'tags_set', 'prior_question_time_mean', 'task_ans_correct_mean',
            'question_ans_correct_mean', 'question_ans_correct_count',
            'lecture_question_correct_mean', 'user_question_part_scores_mean',
            'user_question_tag_scores_mean',
            'prior_question_had_explanation', 'prior_question_elapsed_time',
        ]

        # Batch bookkeeping and id -> index maps start out empty.
        for empty_attr in ('prior_batch', 'current_batch', 'buffer_df', 'merge_df',
                           'user2idx', 'question2idx', 'lecture2idx'):
            setattr(self, empty_attr, None)

        # Restore all data, or compute it from the supplied frames.
        if not params.load_state:
            self.set_traindata(train_df, question_df, lecture_df)
        else:
            self.load_state()
        
    def proc_traindata(self, train_df):
        
        train_df['prior_question_elapsed_time'].fillna(0., inplace=True)
        train_df['prior_question_had_explanation'].fillna(False, inplace=True)
        train_df.loc[:, 'prior_question_had_explanation'] = train_df['prior_question_had_explanation'].astype(int)        
        train_df.loc[:, 'prior_question_elapsed_time'] = train_df['prior_question_elapsed_time'].map(lambda x: np.log(x + 1.))
#         train_df.loc[:, 'user_index'] = train_df['user_id'].apply(lambda x: self.user2idx[x])

        return train_df
    
    def proc_questiondata(self, question_df):
        
        question_df.tags.fillna('-1', inplace=True)
        question_df.loc[:, 'question_index'] = question_df['question_id'].map(lambda x: self.question2idx[x])
        question_df['tags_set'] = question_df['tags'].map(lambda x: list(map(lambda s: int(s) + 1, str(x).split())))
        question_df['part'] = question_df['part'].map(lambda x: x - 1)

        return question_df
        
    def proc_lecturedata(self, lecture_df):
        
        lecture_df['lecture_index'] = lecture_df['lecture_id'].map(lambda x: self.lecture2idx[x])
        lecture_df['lecture_id'] = lecture_df['lecture_id'].astype(int)

        return lecture_df
    
    def load_state(self):
        """Restore previously saved statistics from ``params.extra_dir``.

        Reads six parquet frames (indexed by their key columns afterwards),
        the three id arrays stored in ``data2idx.h5`` (turned into
        id -> position dicts) and two fitted scalers.

        NOTE(review): ``self.task_df`` and
        ``self.scaler_prior_question_time_mean`` are not restored here, yet
        ``finalize_data`` joins ``self.task_df`` and ``transform_scaler`` calls
        that scaler's ``transform`` -- confirm they are set elsewhere before
        use after a restore.
        NOTE(review): ``joblib.load`` unpickles its input; only point
        ``extra_dir`` at trusted files.
        """
        extra_dir = self.params.extra_dir

        self.user_df = pd.read_parquet(os.path.join(extra_dir, 'user_df.gzip'))
        self.question_df = pd.read_parquet(os.path.join(extra_dir, 'question_df.gzip'))
        self.lecture_df = pd.read_parquet(os.path.join(extra_dir, 'lecture_df.gzip'))

        self.lecture_question_scores = pd.read_parquet(os.path.join(extra_dir, 'lecture_question_scores.gzip'))
        self.user_question_tag_scores = pd.read_parquet(os.path.join(extra_dir, 'user_question_tag_scores.gzip'))
        self.user_question_part_scores = pd.read_parquet(os.path.join(extra_dir, 'user_question_part_scores.gzip'))

        # Same indexing as set_index(), minus task_df (not loaded here).
        self.user_df = self.user_df.set_index('user_id')
        self.lecture_question_scores = self.lecture_question_scores.set_index(['question_id', 'lecture_id'])
        self.user_question_tag_scores = self.user_question_tag_scores.set_index(['user_id', 'tag_id'])
        self.user_question_part_scores = self.user_question_part_scores.set_index(['user_id', 'part'])
        self.question_df = self.question_df.set_index('question_id')
        self.lecture_df = self.lecture_df.set_index('lecture_id')

        # The context manager closes the HDF5 handle even if a dataset is
        # missing or a read fails.
        with h5py.File(os.path.join(extra_dir, 'data2idx.h5'), 'r') as f:
            user_ids = f['user2idx'][:]
            question_ids = f['question2idx'][:]
            lecture_ids = f['lecture2idx'][:]

        # Position in the stored array is the index assigned to each id.
        self.n_users = len(user_ids)
        self.user2idx = dict(zip(user_ids, range(self.n_users)))

        self.n_questions = len(question_ids)
        self.question2idx = dict(zip(question_ids, range(self.n_questions)))

        self.n_lectures = len(lecture_ids)
        self.lecture2idx = dict(zip(lecture_ids, range(self.n_lectures)))

        self.scaler_ans_correct_count = joblib.load(os.path.join(extra_dir, 'scaler_ans_correct_count.save'))
        self.scaler_prior_question_elapsed_time = joblib.load(os.path.join(extra_dir, 'scaler_prior_question_elapsed_time.save'))
    
    def init_info(self, train_df, question_df, lecture_df):
        
        self.user_list = list(train_df['user_id'].unique())
        self.question_list = list(question_df['question_id'].unique())
        self.lecture_list = list(lecture_df['lecture_id'].unique())

        self.n_users = len(self.user_list)
        self.n_questions = len(self.question_list)
        self.n_lectures = len(self.lecture_list)
        
        self.user2idx = dict(zip(self.user_list, range(self.n_users)))
        self.question2idx = dict(zip(self.question_list, range(self.n_questions)))
        self.lecture2idx = dict(zip(self.lecture_list, range(self.n_lectures)))
        
    def fit_scaler(self, user_df):
        
        self.scaler_ans_correct_count.partial_fit(np.expand_dims(user_df['ans_correct_count'].values, 1))
        self.scaler_prior_question_time_mean.partial_fit(np.expand_dims(user_df['prior_question_time_mean'].values, 1))
        
        
    def transform_scaler(self, user_df):
                
        user_df['prior_question_elapsed_time'] = \
            self.scaler_prior_question_elapsed_time.transform(np.expand_dims(user_df['prior_question_elapsed_time'].values, 1)).squeeze(1)
        
#         user_df['timestamp'] = \
#             self.scaler_timestamp.transform(np.expand_dims(user_df['timestamp'].values, 1)).squeeze(1)

        user_df['ans_correct_count'] = \
            self.scaler_ans_correct_count.transform(np.expand_dims(user_df['ans_correct_count'].values, 1)).squeeze(1)

        user_df['prior_question_time_mean'] = \
            self.scaler_prior_question_time_mean.transform(np.expand_dims(user_df['prior_question_time_mean'].values, 1)).squeeze(1)
        
        return user_df

    def feature_traindata(self, train_df):
        
        self.scaler_prior_question_elapsed_time.partial_fit(np.expand_dims(train_df['prior_question_elapsed_time'].values, 1))
#         self.scaler_timestamp.partial_fit(np.expand_dims(train_df['timestamp'].values, 1))

        grouped_by_task_id = train_df[train_df.content_type_id==0].groupby('task_container_id').agg({
            'answered_correctly': ['sum', 'count', 'mean']
        }).reset_index()
        grouped_by_task_id.columns = ['task_container_id', 'task_ans_correct_sum', 'task_ans_correct_count', 'task_ans_correct_mean']
        
        return grouped_by_task_id

    def feature_questiondata(self, train_df):
        
        question_train_df = train_df[train_df['content_type_id']==0]
        
        grouped_by_question = question_train_df.groupby('content_id').agg({
            'answered_correctly': ['sum', 'count', 'mean']
        }).reset_index() 
        grouped_by_question.columns = ['question_id', 'question_ans_correct_sum', 'question_ans_correct_count', 'question_ans_correct_mean']
    
        return grouped_by_question
    
    def feature_lecturedata(self, train_df):
        
        lecture_train_df = train_df[train_df['content_type_id']==1]
        
        grouped_by_lecture = lecture_train_df.groupby('content_id').agg({
            'user_id': 'count'
        }).reset_index()

        grouped_by_lecture.columns = ['lecture_id', 'lecture_population']
        
        return grouped_by_lecture
        
    def feature_userdata(self, train_df):
        
        
        # F: user ans correct mean 
        q_train_x = train_df[train_df['content_type_id']==0]        
        q_train = q_train_x.merge(self.question_df, left_on='content_id', right_on='question_id', how='left', right_index=True)
        question_grouped_by_user = q_train.groupby(['user_id']).agg({
            'answered_correctly':['sum', 'count', 'mean'],
            'prior_question_elapsed_time': 'mean',
        }).reset_index()
        question_grouped_by_user.columns = ['user_id', 'ans_correct_sum', 'ans_correct_count', 'ans_correct_mean',
                                           'prior_question_time_mean']        
        question_grouped_by_user.fillna(0, inplace=True)
#         assert question_grouped_by_user.isnull().any().sum() == 0
        
        # new feature
        user_question_part_scores = q_train.groupby(['user_id', 'part']).agg({
            'answered_correctly': ['sum', 'count', 'mean']
        }).reset_index()
        user_question_part_scores.columns = ['user_id', 'part', 'user_question_part_scores_sum', 
                                             'user_question_part_scores_count', 'user_question_part_scores_mean']
        user_question_part_scores['part'] = user_question_part_scores['part'].astype(int)
        
#         assert user_question_part_scores.isnull().any().sum() == 0
        
        # new feature
        q_train_subset = q_train[['user_id', 'tags_set', 'answered_correctly']]
        q_train_subset = q_train_subset.explode('tags_set')
        q_train_subset.columns = ['user_id', 'tag_id', 'answered_correctly']
        
        user_question_tag_scores = q_train_subset.groupby(['user_id', 'tag_id']).agg({
            'answered_correctly': ['sum', 'count', 'mean']
        }).reset_index()
        user_question_tag_scores.columns = ['user_id', 'tag_id', 'user_question_tag_scores_sum', 
                                            'user_question_tag_scores_count', 'user_question_tag_scores_mean']
        user_question_tag_scores['tag_id'] = user_question_tag_scores['tag_id'].astype(int)
                        
#         assert user_question_tag_scores.isnull().any().sum() == 0
        
        l_train_x = train_df[train_df['content_type_id']==1]        
        l_train = l_train_x.merge(self.lecture_df, left_on='content_id', right_on='lecture_id', how='left', right_index=True)
        lecture_grouped_by_user = l_train.groupby(['user_id']).agg({
            'lecture_id': lambda x: list(set(x)),
            'tag': lambda x: list(set(x)),
#             'part': lambda x: list(set(x)),
#             'type_of': lambda x: list(set(x))
        }).reset_index()

        lecture_grouped_by_user.columns = ['user_id', 'lecture_set', 'lecture_tags_set']
#                                           'lecture_part_set', 'lecture_typeof_set']
        lecture_grouped_by_user.fillna(0, inplace=True)
#         assert lecture_grouped_by_user.isnull().any().sum() == 0

        # Gen user-feature
        question_grouped_by_user = question_grouped_by_user.set_index('user_id')
        lecture_grouped_by_user = lecture_grouped_by_user.set_index('user_id')
        
        user_df = question_grouped_by_user.join(lecture_grouped_by_user, on='user_id', how='outer').reset_index()
        
        if self.is_train:
            self.fit_scaler(user_df)
#             user_df = self.transform_scaler(user_df)
                
        # Get lecture-question scores
        l_train = l_train_x[['user_id', 'content_id']]
        l_train = l_train.rename(columns={'content_id': 'lecture_id'})
        q_train = q_train_x[['user_id', 'content_id', 'answered_correctly']]
        q_train = q_train.rename(columns={'content_id': 'question_id'})
        
        #optim        
        lecture_question_scores = l_train.merge(q_train, on='user_id')
        lecture_question_scores = lecture_question_scores.groupby(['lecture_id', 'question_id']).agg({
            'answered_correctly': ['sum', 'count', 'mean']
        }).reset_index()
        lecture_question_scores.columns = ['lecture_id', 'question_id', 
                                           'lecture_question_correct_sum', 'lecture_question_correct_count', 'lecture_question_correct_mean']
        lecture_question_scores['lecture_id'] = lecture_question_scores['lecture_id'].astype(int)
        lecture_question_scores['question_id'] = lecture_question_scores['question_id'].astype(int)
        
#         assert lecture_question_scores.isnull().any().sum() == 0

        # filter all NAN
        user_df = user_df[self.user_columns]
        
#         user_df['tag_vec_score_sum'] = user_df['tag_vec_score_sum'].apply(lambda x: x if isinstance(x, list) else list(np.zeros(self.params.num_total_q_tags)))
#         user_df['tag_vec_score_count'] = user_df['tag_vec_score_count'].apply(lambda x: x if isinstance(x, list) else list(np.zeros(self.params.num_total_q_tags)))
        
#         user_df['part_vec_score_sum'] = user_df['part_vec_score_sum'].apply(lambda x: x if isinstance(x, list) else list(np.zeros(self.params.num_total_q_part)))
#         user_df['part_vec_score_count'] = user_df['part_vec_score_count'].apply(lambda x: x if isinstance(x, list) else list(np.zeros(self.params.num_total_q_part)))

        user_df['lecture_set'] = user_df['lecture_set'].apply(lambda x: x if isinstance(x, list) else [])
        user_df['lecture_tags_set'] = user_df['lecture_tags_set'].apply(lambda x: x if isinstance(x, list) else [])
#         user_df['lecture_part_set'] = user_df['lecture_part_set'].apply(lambda x: x if isinstance(x, list) else [])
#         user_df['lecture_typeof_set'] = user_df['lecture_typeof_set'].apply(lambda x: x if isinstance(x, list) else [])
                             
#         assert user_df.isnull().any().sum() == 0
        
        return (user_df, 
                lecture_question_scores, 
                user_question_part_scores, 
                user_question_tag_scores)
    
    def finalize_data(self, data_df):
        # Filter out all
        self.lecture_question_scores.loc[self.lecture_question_scores['lecture_question_correct_count'] < self.params.limit_f1, 'lecture_question_correct_mean'] = 0.5
        self.user_question_part_scores.loc[self.user_question_part_scores['user_question_part_scores_count'] < self.params.limit_f2, 'user_question_part_scores_mean'] = 0.5
        self.user_question_tag_scores.loc[self.user_question_tag_scores['user_question_tag_scores_count'] < self.params.limit_f3, 'user_question_tag_scores_mean'] = 0.5
        self.user_df.loc[self.user_df['ans_correct_count'] < self.params.limit_f4, 'ans_correct_mean'] = 0.5

        # Only filter content-type this step
        data_df = data_df[data_df['content_type_id']==0]
        data_df = data_df[self.train_columns]
        data_df = data_df.rename(columns={'content_id': 'question_id'})
       
        # Get lectures
        ql_df = data_df[['user_id', 'question_id']].set_index('user_id').join(self.user_df, on='user_id', how='left')
        ql_df = ql_df.explode('lecture_set')
        ql_df = ql_df.rename(columns={'lecture_set': 'lecture_id'})
        ql_df.reset_index(inplace=True)
        ql_df = ql_df.set_index(['question_id', 'lecture_id']).join(self.lecture_question_scores, on=['question_id', 'lecture_id'], how='left')
        ql_df = ql_df.groupby(['user_id', 'question_id'])['lecture_question_correct_mean'].mean()
        
        # user-tags-scores
        user_tags_df = data_df[['user_id', 'question_id']].set_index('question_id').join(self.question_df, on='question_id', how='left').reset_index()
        user_tags_df = user_tags_df[['user_id', 'question_id', 'tags_set']]
        user_tags_df = user_tags_df.explode('tags_set')
        user_tags_df = user_tags_df.rename(columns={'tags_set': 'tag_id'})
        user_tags_df = user_tags_df.set_index(['user_id', 'tag_id']).join(self.user_question_tag_scores, on=['user_id', 'tag_id'], how='left')\
                                   .groupby(['user_id', 'question_id'])['user_question_tag_scores_mean'].mean()
        
        # Final df
        merge_df = data_df.join(ql_df, on=['user_id', 'question_id'], how='left')\
                          .join(user_tags_df, on=['user_id', 'question_id'], how='left')\
                          .join(self.question_df, on='question_id', how='left')\
                          .join(self.user_df, on='user_id', how='left')\
                          .join(self.user_question_part_scores, on=['user_id', 'part'], how='left')\
                          .join(self.task_df, on='task_container_id', how='left')
                          
        del data_df, ql_df, user_tags_df
        merge_df = merge_df[self.merge_columns]

        # scaler transform
        merge_df = self.transform_scaler(merge_df)        
        merge_df['ans_correct_mean'].fillna(0.65, inplace=True)
        merge_df['ans_correct_count'].fillna(0., inplace=True)
        merge_df['prior_question_time_mean'].fillna(0., inplace=True)
        merge_df['question_ans_correct_mean'].fillna(0.5, inplace=True)
        merge_df['lecture_question_correct_mean'].fillna(0.5, inplace=True)
        merge_df['user_question_part_scores_mean'].fillna(0.5, inplace=True)
        merge_df['user_question_tag_scores_mean'].fillna(0.5, inplace=True)
        merge_df['task_ans_correct_mean'].fillna(0.5, inplace=True)
        
        merge_df['prior_question_elapsed_time'].fillna(0., inplace=True)
        merge_df['prior_question_had_explanation'].fillna(0, inplace=True)
        
#         merge_df['timestamp'].fillna(0., inplace=True)
#         merge_df['part'] = merge_df['part'].fillna(0).astype(int)
#         merge_df['tags_set'] = merge_df['tags_set'].apply(lambda x: x if isinstance(x, list) else [])
        
#         merge_df['tag_vec_score_count'] = merge_df['tag_vec_score_count'].apply(lambda x: x if isinstance(x, list) else list(np.zeros(self.params.num_total_q_tags)))
#         merge_df['tag_vec_score_sum'] = merge_df['tag_vec_score_sum'].apply(lambda x: x if isinstance(x, list) else list(np.zeros(self.params.num_total_q_tags)))
        
#         merge_df['part_vec_score_count'] = merge_df['part_vec_score_count'].apply(lambda x: x if isinstance(x, list) else list(np.zeros(self.params.num_total_q_part)))
#         merge_df['part_vec_score_sum'] = merge_df['part_vec_score_sum'].apply(lambda x: x if isinstance(x, list) else list(np.zeros(self.params.num_total_q_part)))
        
#         assert merge_df.isnull().any().sum() == 0
        
        return merge_df  
    
    def set_traindata(self, train_df, question_df, lecture_df):
        """Build every statistics frame from scratch and finalize ``train_df``.

        The steps run in a fixed order: id maps, column clean-up, per-task /
        per-question / per-lecture aggregates, per-user aggregates, indexing of
        the stored frames, and finally ``finalize_data``.

        Note: ``question_df`` and ``lecture_df`` are modified in place (the
        ``proc_*`` helpers add columns and ``set_index(..., inplace=True)``
        re-indexes them), so the caller's frames change too.
        """
        # Fetch init info
        self.init_info(train_df, question_df, lecture_df)
        
        # Refine all dfs
        train_df = self.proc_traindata(train_df)
        question_df = self.proc_questiondata(question_df)
        lecture_df = self.proc_lecturedata(lecture_df)
                
        # Update all data
        self.task_df = self.feature_traindata(train_df)
        
        # Attach per-question answer statistics to the question metadata.
        question_stats_df = self.feature_questiondata(train_df)
        question_stats_df.set_index('question_id', inplace=True)
        question_df.set_index('question_id', inplace=True)
        
        self.question_df = question_df.merge(question_stats_df, on='question_id', how='left').reset_index()
        # Every remaining NaN, in any column, becomes 0.5.
        self.question_df.fillna(0.5, inplace=True)
#         assert self.question_df.isnull().any().sum() == 0, 'Question with NAN'
        
        # Attach per-lecture row counts to the lecture metadata.
        lecture_stats_df = self.feature_lecturedata(train_df)
        lecture_stats_df.set_index('lecture_id', inplace=True)
        lecture_df.set_index('lecture_id', inplace=True)
        
        self.lecture_df = lecture_df.merge(lecture_stats_df, on='lecture_id', how='left').reset_index()
        # Every remaining NaN, in any column, becomes 0.
        self.lecture_df.fillna(0., inplace=True)
#         assert self.lecture_df.isnull().any().sum() == 0, 'Lecture with NAN'
        
        # update all question/lecture before user-df
        # (feature_userdata merges against self.question_df / self.lecture_df)
        (self.user_df, self.lecture_question_scores,
            self.user_question_part_scores, self.user_question_tag_scores) = self.feature_userdata(train_df)
        
        # Index the stored frames by their keys, which finalize_data joins on.
        self.set_index()
        self.merge_df = self.finalize_data(train_df)
        
    def update_newdata(self, batch_df, skip_stat=True):
        
        if skip_stat:
            n_newusers, n_newquestions, n_newlectures = 0, 0, 0
        else:
            # fetch all new instances        
            new_users = list(set(batch_df.user_id.unique()) - set(self.user2idx.keys()))
            new_questions = list(set(batch_df.loc[batch_df['content_type_id']==0, 'content_id'].unique()) - set(self.question2idx.keys()))
            new_lectures = list(set(batch_df.loc[batch_df['content_type_id']==1, 'content_id'].unique()) - set(self.lecture2idx.keys()))

            n_newusers = len(new_users)
            n_newquestions = len(new_questions)
            n_newlectures = len(new_lectures)

            new_user2idx = dict(zip(new_users, range(self.n_users, self.n_users + n_newusers)))
            new_question2idx = dict(zip(new_questions, range(self.n_questions, self.n_questions + n_newquestions)))
            new_lecture2idx = dict(zip(new_lectures, range(self.n_lectures, self.n_lectures + n_newlectures)))

            self.user2idx.update(new_user2idx)
            self.question2idx.update(new_question2idx)
            self.lecture2idx.update(new_lecture2idx)

            self.n_users = self.n_users + n_newusers
            self.n_questions = self.n_questions + n_newquestions
            self.n_lectures = self.n_lectures + n_newlectures
        
            # Append new questions + lectures
            if n_newquestions > 0:
                extra_questions = pd.DataFrame(list(new_question2idx.items()), 
                                               columns=['question_id', 'question_index'])
                self.question_df = pd.concat([self.question_df, extra_questions], axis=0, ignore_index=True)

            if n_newlectures > 0:
                extra_lectures = pd.DataFrame(list(new_lecture2idx.items()), 
                                               columns=['lecture_id', 'lecture_index'])
                self.lecture_df = pd.concat([self.lecture_df, extra_lectures], axis=0, ignore_index=True)
            
        # Add user-index
        batch_df = self.proc_traindata(batch_df)                
        return batch_df, n_newusers, n_newquestions, n_newlectures    
    
    def test_batch(self, batch_df):
        # update new data
        self.is_train = False
        batch_df, n_newusers, n_newquestions, n_newlectures = self.update_newdata(batch_df)
        
        # fetch prior labels
        gt_prior_batch = eval(batch_df.iloc[0]["prior_group_answers_correct"])
        
        # HERE stop updating first for 1st submission
        if self.current_batch is not None and len(gt_prior_batch) > 0:
            # save prior-batch with labels
            self.prior_batch = self.current_batch
            
            # Assign label to prev-batch
            self.prior_batch['answered_correctly'] = gt_prior_batch
            self.prior_batch = self.prior_batch[self.train_columns]
            
            # add to buffer-df
            if self.params.use_buffer:
                self.buffer_df = pd.concat([self.buffer_df, self.prior_batch], axis=0, ignore_index=True)
                del self.prior_batch
             
        else:
            self.prior_batch = batch_df
        
        self.current_batch = batch_df
        # create dummy labels
        self.current_batch['answered_correctly'] = 0
                            
        # Update new-batch-data
        self.merge_df = self.finalize_data(self.current_batch)       
        
        return n_newusers, n_newquestions, n_newlectures
    
    def set_batch(self, batch_df):
        
        self.is_train = True
        batch_df, n_newusers, n_newquestions, n_newlectures = self.update_newdata(batch_df)
        
        # Update new-batch-data
        self.merge_df = self.finalize_data(batch_df)    
        return batch_df, n_newusers, n_newquestions, n_newlectures
        
    def finetune_batch(self):
        """ Only finetune on previous batch-data with labels
        """
        if self.buffer_df is not None and len(self.buffer_df) > params.buffer_size_limit:
            print('--> Dataset activated finetune buffer')
            self.is_train = True
            
            stat_buff, train_buff = split_data(self.buffer_df, n_tail=5)
            self.agg_newdata(stat_buff)
            self.merge_df = self.finalize_data(train_buff)
            self.buffer_df = None
        
            # clean buffer
            self.lecture_question_scores = self.lecture_question_scores.iloc[-100000:]
            self.user_question_tag_scores = self.user_question_tag_scores.iloc[-100000:]
            self.user_question_part_scores = self.user_question_part_scores.iloc[-100000:]
            
            return True
        
        return False
    
    def stats_data(self, train_df):
        
        # Fetch all local-stats
        task_stats_df = self.feature_traindata(train_df)
        question_stats_df = self.feature_questiondata(train_df)
        lecture_stats_df = self.feature_lecturedata(train_df)
        user_compound = self.feature_userdata(train_df)
        
        return train_df, question_stats_df, lecture_stats_df, task_stats_df, user_compound
    
    def reset_index(self):
        
        self.user_df = self.user_df.reset_index()
        self.lecture_question_scores = self.lecture_question_scores.reset_index()
        self.user_question_tag_scores = self.user_question_tag_scores.reset_index()
        self.user_question_part_scores = self.user_question_part_scores.reset_index()
        self.question_df = self.question_df.reset_index()
        self.lecture_df = self.lecture_df.reset_index()
        self.task_df = self.task_df.reset_index()
        
    def set_index(self):
        
        self.user_df = self.user_df.set_index('user_id')
        self.lecture_question_scores = self.lecture_question_scores.set_index(['question_id', 'lecture_id'])
        self.user_question_tag_scores = self.user_question_tag_scores.set_index(['user_id', 'tag_id'])
        self.user_question_part_scores = self.user_question_part_scores.set_index(['user_id', 'part'])
        self.question_df = self.question_df.set_index('question_id')
        self.lecture_df = self.lecture_df.set_index('lecture_id')
        self.task_df = self.task_df.set_index('task_container_id')

    def agg_newdata(self, train_df, is_finetune=False):
        """Fold the statistics of ``train_df`` into the running per-entity tables.

        The cached frames are flattened with ``reset_index()`` while the merge
        runs and re-indexed with ``set_index()`` afterwards.  The re-indexing is
        done in a ``finally`` clause so that a failure in the middle of the
        merge cannot leave the dataset in the flattened layout.

        Args:
            train_df: batch of interactions whose statistics are aggregated.
            is_finetune: when True ``update_newdata`` is skipped and the three
                "new entity" counters are reported as zero.

        Returns:
            tuple: ``(n_newusers, n_newquestions, n_newlectures)`` as reported by
            ``update_newdata`` (or zeros when ``is_finetune`` is True).
        """

        def _merge_running(old, new, keys, prefix):
            # Stack old and new rows, re-sum ``<prefix>_sum`` / ``<prefix>_count``
            # per key and refresh ``<prefix>_mean`` from the merged totals.
            merged = pd.concat([old, new], axis=0, ignore_index=True)
            merged = merged.groupby(keys).agg({
                f'{prefix}_sum': 'sum',
                f'{prefix}_count': 'sum',
            }).reset_index()
            merged[f'{prefix}_mean'] = merged[f'{prefix}_sum'] / merged[f'{prefix}_count']
            return merged

        self.reset_index()
        try:
            # update info new-data
            self.is_train = True
            if is_finetune:
                n_newusers, n_newquestions, n_newlectures = 0, 0, 0
            else:
                train_df, n_newusers, n_newquestions, n_newlectures = self.update_newdata(train_df)

            # run stats on new-data (the lecture stats are computed but not merged below)
            train_df, question_stats_df, _lecture_stats_df, task_stats_df, user_compound = self.stats_data(train_df)
            (new_user_df, new_lecture_question_scores,
             new_user_question_part_scores, new_user_question_tag_scores) = user_compound

            self.task_df = _merge_running(self.task_df, task_stats_df,
                                          'task_container_id', 'task_ans_correct')

            # The user table carries extra columns, so it is merged explicitly.
            # NOTE(review): 'prior_question_time_mean' is the plain mean of the
            # stored mean and the new batch mean, not weighted by row counts.
            self.user_df = pd.concat([self.user_df, new_user_df], axis=0, ignore_index=True)
            self.user_df = self.user_df.groupby('user_id').agg({
                'ans_correct_sum': 'sum',
                'ans_correct_count': 'sum',
                'prior_question_time_mean': 'mean',
                'lecture_set': lambda x: list(set(itertools.chain(*x))),
                'lecture_tags_set': lambda x: list(set(itertools.chain(*x))),
            }).reset_index()
            self.user_df['ans_correct_mean'] = self.user_df['ans_correct_sum'] / self.user_df['ans_correct_count']

            self.lecture_question_scores = _merge_running(
                self.lecture_question_scores, new_lecture_question_scores,
                ['lecture_id', 'question_id'], 'lecture_question_correct')

            self.user_question_part_scores = _merge_running(
                self.user_question_part_scores, new_user_question_part_scores,
                ['user_id', 'part'], 'user_question_part_scores')

            self.user_question_tag_scores = _merge_running(
                self.user_question_tag_scores, new_user_question_tag_scores,
                ['user_id', 'tag_id'], 'user_question_tag_scores')

            # update question-part: only the questions seen in this batch change.
            # The mask is computed once; 'question_id' itself is never modified.
            touched = self.question_df['question_id'].isin(question_stats_df['question_id'].values)
            # how='left' keeps the row order of the masked slice, so the merged
            # values can be written back positionally.
            subset = self.question_df.loc[touched, :].merge(question_stats_df, on='question_id', how='left')
            subset['question_ans_correct_sum'] = subset[['question_ans_correct_sum_x',
                                                         'question_ans_correct_sum_y']].sum(axis=1)
            subset['question_ans_correct_count'] = subset[['question_ans_correct_count_x',
                                                           'question_ans_correct_count_y']].sum(axis=1)
            subset['question_ans_correct_mean'] = subset['question_ans_correct_sum'] / subset['question_ans_correct_count']

            for column in ('question_ans_correct_mean',
                           'question_ans_correct_count',
                           'question_ans_correct_sum'):
                self.question_df.loc[touched, column] = subset[column].values
        finally:
            # Always restore the indexed layout, even if the merge failed.
            self.set_index()

        return n_newusers, n_newquestions, n_newlectures
    
    def cleanup(self):
        """Drop the ``buffer_df`` and ``merge_df`` frames and run the garbage collector.

        Each attribute that currently holds something is deleted and then
        re-created as ``None``; attributes that are already ``None`` are left
        untouched.
        """
        print('--> Dataset cleaning ...')
        for frame_attr in ('buffer_df', 'merge_df'):
            if getattr(self, frame_attr) is None:
                continue
            delattr(self, frame_attr)
            setattr(self, frame_attr, None)
        gc.collect()

    def __len__(self):
        return self.merge_df.shape[0] if self.merge_df is not None else 0
    
    def __getitem__(self, index):
        ins = self.merge_df.iloc[index]
        target = ins['answered_correctly']
        
        # get user-info
#         user_id = ins['user_index']
        user_ans_correct_mean = ins['ans_correct_mean']
        user_ans_correct_count = ins['ans_correct_count'] 
        user_prior_question_time_mean = ins['prior_question_time_mean']
        user_prior_question_elapsed_time = ins['prior_question_elapsed_time']
        user_prior_question_had_explanation = ins['prior_question_had_explanation']
        
        # user-item interaction
        user_item_correct_score = ins['lecture_question_correct_mean']
        user_tag_correct_score = ins['user_question_tag_scores_mean']
        user_part_correct_score = ins['user_question_part_scores_mean']
        user_task_ans_correct = ins['task_ans_correct_mean']
                
        # item/question-info
#         item_id = ins['question_index']
#         item_part_vec = np.zeros(self.params.num_total_q_part)
#         item_part_vec[list([ins['part']])] = 1.
#         item_part = item_part_vec
        
#         item_tags_vec = np.zeros(self.params.num_total_q_tags)
#         item_tags_vec[list(ins['tags_set'])] = 1.
#         item_tags = item_tags_vec
        item_ans_correct_mean = ins['question_ans_correct_mean']
        
        return (torch.FloatTensor([user_task_ans_correct]),
                torch.FloatTensor([user_item_correct_score]), 
                torch.FloatTensor([user_ans_correct_mean]), 
                torch.FloatTensor([user_ans_correct_count]),
                torch.FloatTensor([user_prior_question_time_mean]),
                torch.FloatTensor([user_tag_correct_score]),
                torch.FloatTensor([user_part_correct_score]),
                torch.FloatTensor([user_prior_question_elapsed_time]),
                torch.FloatTensor([user_prior_question_had_explanation]),
                torch.FloatTensor([item_ans_correct_mean]),
                torch.FloatTensor([target]))

In [475]:
class FM_COMP(nn.Module):
    """FM branch: bi-interaction pooling fed through an MLP, plus a linear term.

    Args:
        n_layers: number of hidden layers; layer ``i`` outputs
            ``h_size * (n_layers - i)`` features, so the last one outputs ``h_size``.
        h_size: width of the last hidden layer (input width of the scorer).
        emb_size: embedding width ``K`` of every feature.
        sparse_size: width of the raw feature vector fed to the linear term.
        n_features: kept for interface compatibility; not used by this module.
        dropout: dropout probability applied after pooling and after each layer.
        batch_norm: when True, batch normalisation is applied to the pooled
            vector and after every hidden layer.
    """

    def __init__(self, n_layers, h_size, emb_size, sparse_size, n_features, dropout=0.0, batch_norm=False):
        super(FM_COMP, self).__init__()
        self.batch_norm = batch_norm
        self.dropout = nn.Dropout(p=dropout)

        if batch_norm:
            # ``bnorm`` normalises the LAST hidden layer (width h_size); the name
            # is kept so state dicts saved with a single hidden layer still load.
            self.bnorm = nn.BatchNorm1d(h_size)
            self.bn = nn.BatchNorm1d(emb_size)
        self.layers = nn.ModuleList()
        self.score = nn.Linear(h_size, 1)

        _in_size = emb_size
        for i in range(n_layers):
            _out_size = h_size * (n_layers - i)
            self.layers.append(nn.Linear(_in_size, _out_size))
            _in_size = _out_size

        # Every hidden layer except the last is wider than h_size, so each needs
        # its own norm sized to its output.  (Previously the single h_size-wide
        # ``bnorm`` was applied to all of them, which fails for n_layers > 1.)
        self.hidden_bnorms = nn.ModuleList()
        if batch_norm:
            for layer in list(self.layers)[:-1]:
                self.hidden_bnorms.append(nn.BatchNorm1d(layer.out_features))

        self.sparse_layer = nn.Linear(sparse_size, 1)

    def forward(self, x, sp_f):
        """Score a batch.

        Args:
            x: feature embeddings, ``[None, F, K]``.
            sp_f: raw feature vector passed to the linear term (last dimension
                must equal ``sparse_size``).

        Returns:
            Un-activated score: MLP(bi-pooling) + linear(sp_f).
        """
        # bi-pooling part: 0.5 * ((sum_f x)^2 - sum_f x^2)
        sum_then_square = torch.square(torch.sum(x, dim=1))   # [None, K]
        square_then_sum = torch.sum(torch.square(x), dim=1)   # [None, K]
        bi_pool = 0.5 * (sum_then_square - square_then_sum)   # [None, K]

        if self.batch_norm:
            bi_pool = self.bn(bi_pool)
        bi_pool = self.dropout(bi_pool)

        last = len(self.layers) - 1
        for i, h_layer in enumerate(self.layers):
            bi_pool = h_layer(bi_pool)
            if self.batch_norm:
                norm = self.bnorm if i == last else self.hidden_bnorms[i]
                bi_pool = norm(bi_pool)
            bi_pool = F.relu(bi_pool)
            bi_pool = self.dropout(bi_pool)

        bi_pool = self.score(bi_pool)

        # sparse-feature part
        sparse_out = self.sparse_layer(sp_f)
        out = bi_pool + sparse_out

        return out

In [476]:
class DNN_COMP(nn.Module):
    """Deep branch: a tapering MLP followed by a single-unit scorer.

    Args:
        n_layers: number of hidden layers; layer ``i`` outputs
            ``h_size * (n_layers - i)`` features, so the last one outputs ``h_size``.
        h_size: width of the last hidden layer (input width of the scorer).
        input_size: width of the input vector.
        dropout: dropout probability applied after every hidden layer.
        batch_norm: when True, batch normalisation follows every hidden layer.
    """

    def __init__(self, n_layers, h_size, input_size, dropout=0.0, batch_norm=False):
        super(DNN_COMP, self).__init__()
        self.batch_norm = batch_norm
        self.dropout = nn.Dropout(p=dropout)
        self.layers = nn.ModuleList()

        if batch_norm:
            # ``bnorm`` normalises the LAST hidden layer (width h_size); the name
            # is kept so state dicts saved with a single hidden layer still load.
            self.bnorm = nn.BatchNorm1d(h_size)
        self.score = nn.Linear(h_size, 1)

        _in_size = input_size
        for i in range(n_layers):
            _out_size = h_size * (n_layers - i)
            self.layers.append(nn.Linear(_in_size, _out_size))
            _in_size = _out_size

        # Earlier hidden layers are wider than h_size and need norms of their
        # own width.  (Previously the single h_size-wide ``bnorm`` was applied
        # to all of them, which fails for n_layers > 1.)
        self.hidden_bnorms = nn.ModuleList()
        if batch_norm:
            for layer in list(self.layers)[:-1]:
                self.hidden_bnorms.append(nn.BatchNorm1d(layer.out_features))

    def forward(self, x):
        """Return the un-activated score for ``x`` (last dimension ``input_size``)."""
        last = len(self.layers) - 1
        for i, h_layer in enumerate(self.layers):
            x = h_layer(x)
            if self.batch_norm:
                norm = self.bnorm if i == last else self.hidden_bnorms[i]
                x = norm(x)
            x = F.relu(x)
            x = self.dropout(x)
        out = self.score(x)
        return out

In [477]:
class DEEPFM(nn.Module):
    """DeepFM-style scorer: sigmoid(FM branch + deep branch) over scalar features.

    Every scalar feature is lifted to an ``emb_size``-wide vector by its own
    bias-free ``nn.Linear(1, emb_size)``.  ``params`` must provide
    ``fm_n_layers``, ``fm_h_size``, ``dnn_n_layers``, ``dnn_h_size``,
    ``emb_size``, ``sparse_size``, ``n_features``, ``input_size``, ``dropout``
    and ``batch_norm``.
    """

    def __init__(self, params):
        super(DEEPFM, self).__init__()
        self.params = params
        self.fm_comp = FM_COMP(params.fm_n_layers,
                               params.fm_h_size,
                               params.emb_size,
                               params.sparse_size,
                               params.n_features,
                               dropout=params.dropout,
                               batch_norm=params.batch_norm)

        self.dnn_comp = DNN_COMP(params.dnn_n_layers,
                                 params.dnn_h_size,
                                 params.input_size,
                                 dropout=params.dropout,
                                 batch_norm=params.batch_norm)

        # Id embedding tables are currently disabled; ``update_size`` only acts
        # on them if they are re-enabled.
#         self.user_emb = nn.Embedding(params.num_users, params.emb_size)
#         self.question_emb = nn.Embedding(params.num_questions, params.emb_size)

        self.ans_mean = nn.Linear(1, params.emb_size, bias=False)
        self.ans_count = nn.Linear(1, params.emb_size, bias=False)

        self.prior_time_mean = nn.Linear(1, params.emb_size, bias=False)
        self.question_correct_emb = nn.Linear(1, params.emb_size, bias=False)

        self.user_question_correct_emb = nn.Linear(1, params.emb_size, bias=False)
        self.user_tag_correct_emb = nn.Linear(1, params.emb_size, bias=False)
        self.user_part_correct_emb = nn.Linear(1, params.emb_size, bias=False)
        self.task_ans_correct_emb = nn.Linear(1, params.emb_size, bias=False)

        self.prior_time = nn.Linear(1, params.emb_size, bias=False)
        self.prior_question_explained = nn.Linear(1, params.emb_size, bias=False)

    def _grow_embedding(self, table_name, n_new):
        """Append ``n_new`` uniformly initialised rows to the named embedding table.

        Does nothing when ``n_new`` is not positive or when the model has no
        attribute called ``table_name``.
        """
        table = getattr(self, table_name, None)
        if table is None or n_new <= 0:
            return
        weight = table.weight
        # New rows follow the device/dtype of the existing table so the
        # concatenation cannot mix devices.
        fresh_rows = torch.rand(n_new, self.params.emb_size).to(device=weight.device, dtype=weight.dtype)
        table.weight = torch.nn.Parameter(torch.cat([weight.detach(), fresh_rows], dim=0))
        table.num_embeddings = table.weight.shape[0]

    def update_size(self, n_users, n_questions, n_lectures):
        """Grow the user/question embedding tables by the given numbers of rows.

        The tables are optional: while ``user_emb`` / ``question_emb`` are not
        defined on the model this is a no-op (it used to raise
        ``AttributeError`` for any positive count).  ``n_lectures`` is accepted
        for interface symmetry and ignored.
        """
        self._grow_embedding('user_emb', n_users)
        self._grow_embedding('question_emb', n_questions)

    def forward(self, x_batch):
        """Return the predicted probability for each row of the batch.

        Args:
            x_batch: the ten feature tensors in the order unpacked below; each
                feeds an ``nn.Linear(1, emb_size)``, so its last dimension must be 1.

        Returns:
            ``sigmoid(fm_out + dnn_out)``.
        """
        (b_user_task_ans_correct, b_user_item_correct_score, b_ans_correct_mean, b_ans_correct_count, b_prior_question_time_mean,
        b_user_tag_correct_score, b_user_part_correct_score, b_prior_time, b_question_explained,
        b_item_correct_mean) = x_batch

        # Raw scalars for the linear term of the FM branch (eight features).
        input_sparse = torch.cat((b_user_task_ans_correct, b_user_item_correct_score, b_ans_correct_mean, b_ans_correct_count, b_prior_question_time_mean,
                                  b_question_explained, b_prior_time, b_item_correct_mean), dim=1)

        # user
        b_user_task_ans_correct = self.task_ans_correct_emb(b_user_task_ans_correct)
        b_prior_question_time_mean = self.prior_time_mean(b_prior_question_time_mean)
        b_ans_correct_mean = self.ans_mean(b_ans_correct_mean)
        b_ans_correct_count = self.ans_count(b_ans_correct_count)
        b_user_item_correct_score = self.user_question_correct_emb(b_user_item_correct_score)
        # NOTE(review): the tag/part embeddings are computed but are not part of
        # ``input_sparse`` or ``input_emb`` below -- confirm whether that is intended.
        b_user_tag_correct_score = self.user_tag_correct_emb(b_user_tag_correct_score)
        b_user_part_correct_score = self.user_part_correct_emb(b_user_part_correct_score)
        b_prior_time = self.prior_time(b_prior_time)
        b_question_explained = self.prior_question_explained(b_question_explained)

        # item
        b_item_correct_mean = self.question_correct_emb(b_item_correct_mean)

        input_emb = torch.cat((b_user_task_ans_correct, b_user_item_correct_score, b_ans_correct_mean, b_ans_correct_count, b_prior_question_time_mean,
                               b_question_explained, b_prior_time, b_item_correct_mean), dim=-1)

        # Eight embeddings were concatenated above, so the reshape keeps one row
        # per sample only when params.n_features == 8 -- TODO confirm the config.
        input_emb = input_emb.reshape(-1, self.params.n_features, self.params.emb_size)

        # FM-part
        fm_out = self.fm_comp(input_emb, input_sparse)

        # Deep-part
        input_emb = input_emb.reshape(-1, self.params.emb_size * self.params.n_features)
        dnn_out = self.dnn_comp(input_emb)

        # Combine
        out = torch.sigmoid(fm_out + dnn_out)

        return out

In [478]:
class GBMTrainer(object):
    """Incrementally trains a LightGBM binary classifier on the dataset's merged features."""

    # Columns LightGBM should treat as categorical -- applied only to those that
    # are actually part of the feature frame (see ``_split_features``).
    categorical_columns = ('part', 'prior_question_had_explanation')

    def __init__(self, dataset, params):

        self.params = params
        self.dataset = dataset

        self.lgbm_params = {
            'objective': 'binary',
            'boosting' : 'gbdt',
            'max_bin': 800,
            'learning_rate': 0.0175,
            'num_leaves': 80
        }
        self.model_lgbm = None

        # First entry is the target; the remaining ones are the model features.
        self.selected_columns = ['answered_correctly', 'ans_correct_mean', 'ans_correct_count',
                                 'prior_question_time_mean', 'task_ans_correct_mean',
                                 'question_ans_correct_mean', 'lecture_question_correct_mean',
                                 'user_question_part_scores_mean', 'user_question_tag_scores_mean',
                                 'prior_question_had_explanation', 'prior_question_elapsed_time']

    def _split_features(self, merge_df):
        """Split a merged frame into ``(features, target, categorical_names)``.

        Only ``selected_columns`` are kept.  ``categorical_names`` lists the
        entries of ``categorical_columns`` that are present among the features:
        passing a name that is not a feature column (as happened with ``'part'``)
        makes LightGBM reject the dataset.
        """
        frame = merge_df[self.selected_columns]
        features = frame.loc[:, frame.columns != 'answered_correctly']
        target = frame['answered_correctly']
        categorical = [name for name in self.categorical_columns if name in features.columns]
        return features, target, categorical

    def incre_update(self, data_batch, stat_batch, val_batch):
        """Run one incremental round: aggregate stats, train, validate, aggregate again.

        ``stat_batch`` is folded into the dataset statistics first; the model is
        then trained on ``data_batch`` with ``val_batch`` as the early-stopping
        set, continuing from the previous booster when there is one.  Validation
        AUC/accuracy are printed, ``data_batch`` is folded into the statistics
        and the feature importances are plotted.
        """
        # Stat-data
        self.dataset.agg_newdata(stat_batch)
        # Training-data
        data_batch, n_newusers, n_newquestions, n_newlectures = self.dataset.set_batch(data_batch)

        self.params.update(**{
            'num_users': self.dataset.n_users,
            'num_questions': self.dataset.n_questions,
            'num_lectures': self.dataset.n_lectures
        })

        print(f'[Train] New cases: {n_newusers}, {n_newquestions}, {n_newlectures}')
        print(f'[Train] All cases: {self.params.num_users}, {self.params.num_questions}, {self.params.num_lectures}')

        df_train_x, df_train_y, categorical = self._split_features(self.dataset.merge_df)
        lgb_train = lgb.Dataset(df_train_x, df_train_y,
                                categorical_feature=categorical)

        # Valid data (set_batch replaces dataset.merge_df with the validation rows)
        _, n_newusers, n_newquestions, n_newlectures = self.dataset.set_batch(val_batch)
        print(f'------ [Valid] new-users: {n_newusers}')
        df_val_x, df_val_y, categorical = self._split_features(self.dataset.merge_df)
        lgb_val = lgb.Dataset(df_val_x, df_val_y,
                              categorical_feature=categorical,
                              reference=lgb_train)

        # NOTE(review): verbose_eval / early_stopping_rounds are keyword
        # arguments of older LightGBM releases; newer ones expect callbacks --
        # confirm against the installed version.
        self.model_lgbm = lgb.train(self.lgbm_params, lgb_train,
                                    valid_sets=[lgb_train, lgb_val],
                                    verbose_eval=50,
                                    num_boost_round=10000,
                                    early_stopping_rounds=12,
                                    init_model=self.model_lgbm)  # None on the first round

        y_pred = self.model_lgbm.predict(df_val_x)
        y_true = np.array(df_val_y)
        auc = metrics.roc_auc_score(y_true, y_pred)
        acc = metrics.accuracy_score(y_true, (y_pred >= 0.5).astype(int))

        print(f'AUC: {auc} -- ACC: {acc}')

        self.dataset.agg_newdata(data_batch)

        lgb.plot_importance(self.model_lgbm)
        plt.show()


In [479]:
class Trainer(object):
    """Owns a DEEPFM model and its optimizer and drives training, validation
    and inference over the batches served by ``dataset``.
    """

    def __init__(self, dataset, params):

        self.params = params
        self.dataset = dataset
        self.model = DEEPFM(params)

        if params.cuda:
            print('Moving model to gpus ...')
            self.model.to(device)

        self.optimizer = optim.Adam(self.model.parameters(), lr=params.learning_rate)
        self.criteria = nn.BCELoss()

    def _to_device(self, tensor):
        """Move ``tensor`` to the global ``device`` when ``params.cuda`` is set."""
        return tensor.to(device) if self.params.cuda else tensor

    @staticmethod
    def _to_numpy(tensor):
        """Detach ``tensor`` and return it as a numpy array on the host."""
        return tensor.detach().cpu().numpy()

    def get_dataloader(self, batch_size):
        """Return a shuffling DataLoader over ``self.dataset``."""
        data_loader = DataLoader(self.dataset,
                                 batch_size=batch_size,
                                 drop_last=False,
                                 num_workers=4,
                                 shuffle=True)
        return data_loader

    def save_model(self, save_path):
        """Save the model's state dict to ``save_path`` (a .pt / .pth file)."""
        torch.save(self.model.state_dict(), save_path)

    def load_model(self, model_path):
        """Load a state dict saved by ``save_model`` into the model."""
        if torch.cuda.is_available():
            checkpoint = torch.load(model_path)
        else:
            # this helps avoid errors when loading single-GPU-trained weights onto CPU-model
            checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage)

        self.model.load_state_dict(checkpoint)

    def infer(self, databatch):
        """Predict for the question rows (``content_type_id == 0``) of ``databatch``.

        Returns:
            ``[]`` when the batch holds no question rows, otherwise a 1-D numpy
            array with the model outputs in loader order.
        """
        # All test-cases
        num_test = len(databatch.loc[databatch['content_type_id'] == 0])
        if num_test == 0:
            return []

        self.model.eval()

        _n_users, _n_questions, _n_lectures = self.dataset.test_batch(databatch)

        batch_size = min(num_test, self.params.batch_size)
        test_loader = DataLoader(self.dataset,
                                 batch_size=batch_size,
                                 drop_last=False,
                                 num_workers=4,
                                 shuffle=False)
        test_outputs = []
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            for test_batch in test_loader:
                # the last element of every item is the target and is not fed to the model
                input_batch = tuple(self._to_device(feature) for feature in test_batch[:-1])
                test_outputs.append(self._to_numpy(self.model(input_batch)))

        test_outputs = np.concatenate(test_outputs, axis=0).squeeze(1)

        return test_outputs

    def incre_update(self, data_batch, stat_batch, val_batch):
        """Run one incremental round: aggregate ``stat_batch``, train on
        ``data_batch``, validate on ``val_batch``, then fold both of the latter
        into the dataset statistics.
        """
        self.dataset.agg_newdata(stat_batch)

        _, n_newusers, n_newquestions, n_newlectures = self.dataset.set_batch(data_batch)

        self.params.update(**{
            'num_users': self.dataset.n_users,
            'num_questions': self.dataset.n_questions,
            'num_lectures': self.dataset.n_lectures
        })

        print(f'[Train] New cases: {n_newusers}, {n_newquestions}, {n_newlectures}')
        print(f'[Train] All cases: {self.params.num_users}, {self.params.num_questions}, {self.params.num_lectures}')

        self.train()

        # VALID
        self.val(val_batch)

        # Add the rest
        self.dataset.agg_newdata(data_batch)
        self.dataset.agg_newdata(val_batch)

    def finetune_batch(self):
        """Ask the dataset for a finetuning batch and, if it provides one, train on it once."""
        # update batch-data for finetuning
        is_finetune = self.dataset.finetune_batch()
        if is_finetune:
            batch_size = min(self.params.batch_size, len(self.dataset))

            # start finetune model
            self.model.train()
            train_loader = self.get_dataloader(batch_size)
            self.train_step(train_loader, print_step=100, msg='Finetune')

    def train_step(self, train_loader, print_step=200, msg='Train'):
        """Run one optimisation pass over ``train_loader``.

        Unless ``params.is_test`` is set, per-batch accuracy/AUC are computed
        and the sample-weighted mean AUC is printed at the end.
        """
        self.model.train()
        train_auc = AverageMeter()
        for i, databatch in enumerate(train_loader):

            # Move to device (the target is the last element of the batch)
            input_batch = tuple(self._to_device(feature) for feature in databatch[:-1])
            b_target = self._to_device(databatch[-1])

            # FW model
            output_batch = self.model(input_batch)
            loss = self.criteria(output_batch, b_target)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            if not self.params.is_test:
                b_target = self._to_numpy(b_target)
                output_batch = self._to_numpy(output_batch)
                try:
                    acc = metrics.accuracy_score(b_target, (output_batch >= 0.5).astype(int))
                    auc = metrics.roc_auc_score(b_target, output_batch)
                    train_auc.update(auc, b_target.shape[0])
                except ValueError:
                    # The metrics are undefined for this batch (e.g. a single
                    # class); skip it without hiding unrelated errors.
                    auc, acc = None, None

                if print_step > 0 and i % print_step == 0:
                    print(f'+++ [{msg}] Loss: {loss.item()} AUC: {auc} ACC: {acc}')

        if not self.params.is_test:
            print(f'>>> [{msg}] TOTAL AUC: {train_auc.avg}')

    def val_step(self, val_loader, print_step=300, msg='Valid'):
        """Evaluate the model over ``val_loader`` and print the sample-weighted mean AUC."""
        self.model.eval()
        val_auc = AverageMeter()
        for i, databatch in enumerate(val_loader):

            # Move to device (the target is the last element of the batch)
            input_batch = tuple(self._to_device(feature) for feature in databatch[:-1])
            b_target = self._to_device(databatch[-1])

            # FW model -- evaluation only, so no autograd graph is built
            with torch.no_grad():
                output_batch = self.model(input_batch)

            b_target = self._to_numpy(b_target)
            output_batch = self._to_numpy(output_batch)

            try:
                acc = metrics.accuracy_score(b_target, (output_batch >= 0.5).astype(int))
                auc = metrics.roc_auc_score(b_target, output_batch)
                val_auc.update(auc, b_target.shape[0])
            except ValueError:
                # The metrics are undefined for this batch (e.g. a single class).
                auc, acc = None, None

            if print_step > 0 and i % print_step == 0:
                print(f'- [{msg}] AUC: {auc} ACC: {acc}')

        print(f'>>> [{msg}] TOTAL AUC: {val_auc.avg}')

    def val(self, val_batch):
        """Load ``val_batch`` into the dataset and evaluate the model on it."""
        _, n_newusers, n_newquestions, n_newlectures = self.dataset.set_batch(val_batch)
        print(f'------ [Valid] new-users: {n_newusers}')
        val_loader = self.get_dataloader(self.params.batch_size)
        # The model is deliberately NOT resized here: ``update_size`` is
        # disabled in ``infer`` and ``incre_update`` as well, and it targets
        # embedding tables the current DEEPFM does not define.
        self.val_step(val_loader, print_step=400)

    def train(self):
        """Train on the dataset's current batch for ``params.n_epoch`` epochs."""
        self.model.train()
        train_loader = self.get_dataloader(self.params.batch_size)
        for epoch in range(self.params.n_epoch):
            print(f'Epoch: {epoch}')
            self.train_step(train_loader, print_step=200)
            

# Training all batches

In [480]:
def convert_train_to_test(train_part, group=0):
    """Turn a slice of training rows into a frame shaped like a test-API group.

    The input frame is left untouched; all changes are made on a copy:

    * ``answered_correctly`` has -1 replaced by 0, is converted to strings and
      renamed to ``prior_group_answers_correct``; its first row is overwritten
      with the bracketed, comma-joined list of all answers.
    * ``content_type_id`` is overwritten with random 0/1 values (1 with
      probability 0.5, drawn from ``np.random``).
    * the frame is indexed by a constant ``group_num`` column equal to ``group``.

    Args:
        train_part: frame with at least ``answered_correctly`` and
            ``content_type_id`` columns.
        group: value used for the ``group_num`` index.

    Returns:
        The converted copy.
    """
    # Work on a copy: the caller's frame must keep its numeric answers.
    test_part = train_part.copy()
    test_part.loc[test_part['answered_correctly'] == -1, 'answered_correctly'] = 0
    test_part['answered_correctly'] = test_part['answered_correctly'].astype(str)
    test_part = test_part.rename(columns={'answered_correctly': 'prior_group_answers_correct'})

    if test_part.shape[0] > 0:
        # Locate the column by name rather than by a fixed position so the
        # result does not depend on the column order of the input.
        answers_pos = test_part.columns.get_loc('prior_group_answers_correct')
        test_part.iloc[0, answers_pos] = '[' + ','.join(test_part['prior_group_answers_correct'].values) + ']'

    test_part.loc[:, 'content_type_id'] = [1 if np.random.rand() > 0.5 else 0 for _ in range(test_part.shape[0])]

    test_part['group_num'] = [group] * test_part.shape[0]
    test_part = test_part.set_index('group_num')

    return test_part

In [481]:
def gen_mini_chunks(data_chunk, csize=int(1e5)):
    """Lazily yield ``data_chunk`` in ``timestamp`` order, ``csize`` rows at a time.

    The frame is sorted once, when the generator is first advanced; the final
    slice may be shorter than ``csize``.
    """
    ordered = data_chunk.sort_values(by='timestamp')
    total_rows = int(ordered.shape[0])
    yield from (ordered.iloc[start:start + csize] for start in range(0, total_rows, csize))

In [482]:
def dump_all(ml_trainer, save_dir='./save'):
    """Persist the trainer's dataset state to ``save_dir``.

    Files written (names are fixed; only the directory is configurable):
      * ``<table>.gzip`` - gzip-compressed parquet for each of the dataset's
        feature/score tables listed below.
      * ``data2idx.h5`` - one HDF5 dataset per id mapping, holding only the
        mapping's keys in iteration order.
        NOTE(review): the index values themselves are not stored; presumably
        the loader rebuilds them from key order - confirm against the loader.
      * ``<scaler>.save`` - joblib dump of each fitted scaler.

    Args:
        ml_trainer: Object exposing a ``dataset`` attribute with the tables,
            mappings and scalers named below, plus ``reset_index()``.
        save_dir: Output directory; created if missing. Defaults to
            ``'./save'``.
    """
    # Create the target directory up front so a long run cannot fail at the
    # very end just because the folder is missing.
    os.makedirs(save_dir, exist_ok=True)

    dataset = ml_trainer.dataset
    dataset.reset_index()

    table_names = (
        'user_df',
        'question_df',
        'lecture_df',
        'task_df',
        'lecture_question_scores',
        'user_question_tag_scores',
        'user_question_part_scores',
    )
    for table_name in table_names:
        table = getattr(dataset, table_name)
        table.to_parquet(os.path.join(save_dir, f'{table_name}.gzip'), compression='gzip')

    # The context manager closes the HDF5 handle even if a write fails.
    with h5py.File(os.path.join(save_dir, 'data2idx.h5'), 'w') as h5_file:
        for mapping_name in ('user2idx', 'question2idx', 'lecture2idx'):
            mapping = getattr(dataset, mapping_name)
            h5_file.create_dataset(mapping_name, data=list(mapping.keys()))

    scaler_names = (
        'scaler_ans_correct_mean',
        'scaler_ans_correct_count',
        'scaler_prior_question_elapsed_time',
    )
    for scaler_name in scaler_names:
        joblib.dump(getattr(dataset, scaler_name), os.path.join(save_dir, f'{scaler_name}.save'))


In [483]:
# Driver cell: either restore a previously saved dataset/model
# (params.load_state = True) or train incrementally over `chunks`.
torch.cuda.empty_cache()
params.load_state = False
if params.load_state:
    print(f'LOADING ALL DATASET ...')
    mydata = LectureData(params)
    trainer = Trainer(mydata, params)

    # Keep only the last 10000 rows of each score table to speed up inference.
    # (Truncating user_df to its first 1000 rows was tried as well; disabled.)
    trainer.dataset.lecture_question_scores = trainer.dataset.lecture_question_scores.iloc[-10000:]
    trainer.dataset.user_question_tag_scores = trainer.dataset.user_question_tag_scores.iloc[-10000:]
    trainer.dataset.user_question_part_scores = trainer.dataset.user_question_part_scores.iloc[-10000:]

    print('LOADING PRETRAINED MODEL ...')
    trainer.load_model(os.path.join(params.extra_dir, 'model_latest.pth'))

else:
    print(f'STARTING TRAINING FROM SCRATCH ...')
    mydata = None
    trainer = None
    start = time.time()
    # Distinct names per loop level: `chunk` is one element of `chunks`,
    # `mini_chunk` one time-ordered window of it, `train_part` its train split.
    for n, chunk in enumerate(chunks):
        mini_chunks = gen_mini_chunks(chunk)
        for mini_chunk in mini_chunks:
            # For a quick smoke run, subsample here, e.g. mini_chunk.sample(frac=0.01).
            rest_part, valid_part = split_data(mini_chunk, n_tail=6)
            stat_part, train_part = split_data(rest_part, n_tail=18)

            print(f'Train: {train_part.shape} / Stats: {stat_part.shape} / Valid: {valid_part.shape}')
            train_part = train_part.sort_values(by ='timestamp')

            print(f'***Training chunk-{n}:')
            if trainer is None:
                # The first mini-chunk only bootstraps the dataset and the
                # trainer; no explicit training call is made on it here.
                mydata = LectureData(params, train_part, questions.copy(), lectures.copy())
                n_users, n_questions, n_lectures = mydata.n_users, mydata.n_questions, mydata.n_lectures
                params.update(**{
                    'num_users': n_users,
                    'num_questions': n_questions,
                    'num_lectures': n_lectures
                })
                print(f'[Train] All cases: {n_users}, {n_questions}, {n_lectures}')
                # Alternative trainer that was tried: GBMTrainer(mydata, params).
                trainer = Trainer(mydata, params)

            else:
                # A disabled alternative simulated inference instead of training:
                # convert_train_to_test(train_part, group=n) followed by timed
                # trainer.infer(...) and trainer.finetune_batch().
                trainer.incre_update(train_part, stat_part, valid_part)

        # NOTE(review): this check runs after chunk `n` was processed, so
        # chunks 0..params.n_chunks (n_chunks + 1 in total) are consumed and the
        # elapsed-time line is skipped for the last one - confirm this is intended.
        if n >= params.n_chunks:
            break

        print(f'Batch-Time elapsed: {time.time() - start}')

    print(f'Training finished in {time.time() - start} seconds')

    #--------------
    if trainer is None:
        # Typical cause: `chunks` is a one-shot reader already consumed by an
        # earlier run of this cell.
        raise RuntimeError('`chunks` yielded no data; recreate it before training - nothing to save.')

    # Make sure the checkpoint directory exists before writing into it.
    os.makedirs('./save', exist_ok=True)
    trainer.save_model('./save/model_latest.pth')
    dump_all(trainer)
    

STARTING TRAINING FROM SCRATCH ...
Train: (48084, 10) / Stats: (29704, 10) / Valid: (22212, 10)
***Training chunk-0:
[Train] All cases: 3603, 13523, 418
Moving model to gpus ...
Train: (21042, 10) / Stats: (69739, 10) / Valid: (9219, 10)
***Training chunk-0:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.6635755300521851 AUC: 0.5602557478687678 ACC: 0.61328125
>>> [Train] TOTAL AUC: 0.6470353298586643
------ [Valid] new-users: 0
- [Valid] AUC: 0.6447109588171839 ACC: 0.63671875
>>> [Valid] TOTAL AUC: 0.6725854220265585
Train: (16213, 10) / Stats: (77090, 10) / Valid: (6697, 10)
***Training chunk-0:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5874923467636108 AUC: 0.6689094477151668 ACC: 0.6875
>>> [Train] TOTAL AUC: 0.6756540828978197
------ [Valid] new-users: 0
- [Valid] AUC: 0.7202775149795018 ACC: 0.69140625
>>> [Valid] TOTAL AUC: 0.6821178194041345
Train: (13273, 10) / Stats: (81274, 10) / Valid: (5453, 10)
***Training chunk-0:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.571252167224884 AUC: 0.7196843853820597 ACC: 0.68359375
>>> [Train] TOTAL AUC: 0.6898763974428531
------ [Valid] new-users: 0
- [Valid] AUC: 0.6931910378830431 ACC: 0.65234375
>>> [Valid] TOTAL AUC: 0.6973109551899943
Train: (11262, 10) / Stats: (83984, 10) / Valid: (4754, 10)
***Training chunk-0:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5709611773490906 AUC: 0.6989202657807309 ACC: 0.7109375
>>> [Train] TOTAL AUC: 0.6973521437101553
------ [Valid] new-users: 0
- [Valid] AUC: 0.6405716060888474 ACC: 0.58203125
>>> [Valid] TOTAL AUC: 0.6936685652336131
Train: (8958, 10) / Stats: (87238, 10) / Valid: (3804, 10)
***Training chunk-0:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.575045108795166 AUC: 0.7104789590254706 ACC: 0.71484375
>>> [Train] TOTAL AUC: 0.6965451947870406
------ [Valid] new-users: 0
- [Valid] AUC: 0.7144043436601724 ACC: 0.66796875
>>> [Valid] TOTAL AUC: 0.698613509311984
Train: (7654, 10) / Stats: (89060, 10) / Valid: (3286, 10)
***Training chunk-0:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5822306871414185 AUC: 0.6996904024767803 ACC: 0.703125
>>> [Train] TOTAL AUC: 0.6926771066079345
------ [Valid] new-users: 0
- [Valid] AUC: 0.6801943060935867 ACC: 0.625
>>> [Valid] TOTAL AUC: 0.6964665117807542
Train: (5952, 10) / Stats: (91502, 10) / Valid: (2546, 10)
***Training chunk-0:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.6139647960662842 AUC: 0.6668005354752344 ACC: 0.6328125
>>> [Train] TOTAL AUC: 0.6976343628047775
------ [Valid] new-users: 0
- [Valid] AUC: 0.7417840375586854 ACC: 0.64453125
>>> [Valid] TOTAL AUC: 0.6924085406260327
Train: (4740, 10) / Stats: (93212, 10) / Valid: (2048, 10)
***Training chunk-0:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5641509294509888 AUC: 0.7424476888918791 ACC: 0.69921875
>>> [Train] TOTAL AUC: 0.6973615719516222
------ [Valid] new-users: 0
- [Valid] AUC: 0.6814743589743589 ACC: 0.63671875
>>> [Valid] TOTAL AUC: 0.6987044139711851
Train: (4392, 10) / Stats: (93793, 10) / Valid: (1815, 10)
***Training chunk-0:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5610247254371643 AUC: 0.7806522725799131 ACC: 0.72265625
>>> [Train] TOTAL AUC: 0.7062491688669731
------ [Valid] new-users: 0
- [Valid] AUC: 0.6722507773335872 ACC: 0.63671875
>>> [Valid] TOTAL AUC: 0.6890219430284128
Batch-Time elapsed: 178.84075593948364
Train: (49116, 10) / Stats: (28237, 10) / Valid: (22647, 10)
***Training chunk-1:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.6299916505813599 AUC: 0.7138671875 ACC: 0.65234375
>>> [Train] TOTAL AUC: 0.7254599092557087
------ [Valid] new-users: 0
- [Valid] AUC: 0.7299975532175189 ACC: 0.6484375
>>> [Valid] TOTAL AUC: 0.7078061755274556
Train: (21461, 10) / Stats: (69290, 10) / Valid: (9249, 10)
***Training chunk-1:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5702736377716064 AUC: 0.7283933421547462 ACC: 0.71484375
>>> [Train] TOTAL AUC: 0.7255564291284956
------ [Valid] new-users: 0
- [Valid] AUC: 0.7445597165991903 ACC: 0.69921875
>>> [Valid] TOTAL AUC: 0.7175927668536835
Train: (16284, 10) / Stats: (76855, 10) / Valid: (6861, 10)
***Training chunk-1:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5901410579681396 AUC: 0.667367535744323 ACC: 0.68359375
>>> [Train] TOTAL AUC: 0.7153256819514751
------ [Valid] new-users: 0
- [Valid] AUC: 0.6651062753036437 ACC: 0.62890625
>>> [Valid] TOTAL AUC: 0.7012510947786961
Train: (13625, 10) / Stats: (80668, 10) / Valid: (5707, 10)
***Training chunk-1:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5705718994140625 AUC: 0.7084717607973422 ACC: 0.72265625
>>> [Train] TOTAL AUC: 0.7187626477015201
------ [Valid] new-users: 0
- [Valid] AUC: 0.6759765624999999 ACC: 0.625
>>> [Valid] TOTAL AUC: 0.7065172603595911
Train: (11385, 10) / Stats: (83836, 10) / Valid: (4779, 10)
***Training chunk-1:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5693925023078918 AUC: 0.7068181818181818 ACC: 0.734375
>>> [Train] TOTAL AUC: 0.7144739122353881
------ [Valid] new-users: 0
- [Valid] AUC: 0.6882416396979503 ACC: 0.6640625
>>> [Valid] TOTAL AUC: 0.7141205754052194
Train: (9083, 10) / Stats: (87031, 10) / Valid: (3886, 10)
***Training chunk-1:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.6025427579879761 AUC: 0.6904953145917001 ACC: 0.6796875
>>> [Train] TOTAL AUC: 0.7118666617650029
------ [Valid] new-users: 0
- [Valid] AUC: 0.7389676910953507 ACC: 0.69921875
>>> [Valid] TOTAL AUC: 0.7153527003469264
Train: (7962, 10) / Stats: (88545, 10) / Valid: (3493, 10)
***Training chunk-1:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.589780330657959 AUC: 0.7353792704111176 ACC: 0.69140625
>>> [Train] TOTAL AUC: 0.7178453870885726
------ [Valid] new-users: 0
- [Valid] AUC: 0.7238202545068929 ACC: 0.7265625
>>> [Valid] TOTAL AUC: 0.7012264137927828
Train: (6328, 10) / Stats: (90981, 10) / Valid: (2691, 10)
***Training chunk-1:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5681527853012085 AUC: 0.7302697302697302 ACC: 0.73046875
>>> [Train] TOTAL AUC: 0.6982629314134725
------ [Valid] new-users: 0
- [Valid] AUC: 0.6867971419695558 ACC: 0.65234375
>>> [Valid] TOTAL AUC: 0.6819890445719606
Train: (4847, 10) / Stats: (93046, 10) / Valid: (2107, 10)
***Training chunk-1:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.6069643497467041 AUC: 0.705954533712219 ACC: 0.6875
>>> [Train] TOTAL AUC: 0.7101049780736419
------ [Valid] new-users: 0
- [Valid] AUC: 0.7271484375 ACC: 0.6796875
>>> [Valid] TOTAL AUC: 0.7141900913216791
Train: (2967, 10) / Stats: (95827, 10) / Valid: (1206, 10)
***Training chunk-1:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5511529445648193 AUC: 0.7594698243961515 ACC: 0.73046875
>>> [Train] TOTAL AUC: 0.7147942851205337
------ [Valid] new-users: 0
- [Valid] AUC: 0.6544122414575634 ACC: 0.6640625
>>> [Valid] TOTAL AUC: 0.6679162138644532
Batch-Time elapsed: 478.655104637146
Train: (52120, 10) / Stats: (22977, 10) / Valid: (24903, 10)
***Training chunk-2:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.6153711080551147 AUC: 0.7064039408866996 ACC: 0.66796875
+++ [Train] Loss: 0.5967087149620056 AUC: 0.7549182855380819 ACC: 0.66796875
>>> [Train] TOTAL AUC: 0.734615228617
------ [Valid] new-users: 0
- [Valid] AUC: 0.7271667791204074 ACC: 0.68359375
>>> [Valid] TOTAL AUC: 0.7055475776845566
Train: (23658, 10) / Stats: (65808, 10) / Valid: (10534, 10)
***Training chunk-2:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5785655975341797 AUC: 0.7370816599732263 ACC: 0.703125
>>> [Train] TOTAL AUC: 0.7281664178871309
------ [Valid] new-users: 0
- [Valid] AUC: 0.7237978643814187 ACC: 0.7421875
>>> [Valid] TOTAL AUC: 0.7216180121723176
Train: (17907, 10) / Stats: (74268, 10) / Valid: (7825, 10)
***Training chunk-2:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5876211524009705 AUC: 0.6897832817337461 ACC: 0.671875
>>> [Train] TOTAL AUC: 0.7155820737239343
------ [Valid] new-users: 0
- [Valid] AUC: 0.7902592807051405 ACC: 0.71484375
>>> [Valid] TOTAL AUC: 0.7139268223901445
Train: (14546, 10) / Stats: (79272, 10) / Valid: (6182, 10)
***Training chunk-2:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5884630084037781 AUC: 0.7073863636363636 ACC: 0.69140625
>>> [Train] TOTAL AUC: 0.7263065601405265
------ [Valid] new-users: 0
- [Valid] AUC: 0.6933333333333335 ACC: 0.6484375
>>> [Valid] TOTAL AUC: 0.7139088870636081
Train: (12280, 10) / Stats: (82609, 10) / Valid: (5111, 10)
***Training chunk-2:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5567460060119629 AUC: 0.7352597179222566 ACC: 0.71875
>>> [Train] TOTAL AUC: 0.7312988776323212
------ [Valid] new-users: 0
- [Valid] AUC: 0.6580640924612555 ACC: 0.63671875
>>> [Valid] TOTAL AUC: 0.7149598105269516
Train: (9894, 10) / Stats: (85916, 10) / Valid: (4190, 10)
***Training chunk-2:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.6102725863456726 AUC: 0.7004790801660812 ACC: 0.6796875
>>> [Train] TOTAL AUC: 0.7209454338004638
------ [Valid] new-users: 0
- [Valid] AUC: 0.6819875776397516 ACC: 0.65625
>>> [Valid] TOTAL AUC: 0.7145119985215485
Train: (8786, 10) / Stats: (87410, 10) / Valid: (3804, 10)
***Training chunk-2:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5705999732017517 AUC: 0.6948853615520282 ACC: 0.75
>>> [Train] TOTAL AUC: 0.7167943740482569
------ [Valid] new-users: 0
- [Valid] AUC: 0.800630715862504 ACC: 0.7265625
>>> [Valid] TOTAL AUC: 0.7248289703837327
Train: (7098, 10) / Stats: (89846, 10) / Valid: (3056, 10)
***Training chunk-2:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5934860706329346 AUC: 0.6921078921078921 ACC: 0.6796875
>>> [Train] TOTAL AUC: 0.7114912436552013
------ [Valid] new-users: 0
- [Valid] AUC: 0.7368653421633553 ACC: 0.6953125
>>> [Valid] TOTAL AUC: 0.7003133157558679
Train: (5919, 10) / Stats: (91471, 10) / Valid: (2610, 10)
***Training chunk-2:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5942683815956116 AUC: 0.7026307026307027 ACC: 0.6953125
>>> [Train] TOTAL AUC: 0.7226088742430784
------ [Valid] new-users: 0
- [Valid] AUC: 0.7528153153153154 ACC: 0.69140625
>>> [Valid] TOTAL AUC: 0.6919884286227899
Train: (4265, 10) / Stats: (93946, 10) / Valid: (1789, 10)
***Training chunk-2:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.626514196395874 AUC: 0.6704805491990847 ACC: 0.65625
>>> [Train] TOTAL AUC: 0.7122315883945383
------ [Valid] new-users: 0
- [Valid] AUC: 0.7782725961236906 ACC: 0.70703125
>>> [Valid] TOTAL AUC: 0.7213606769147052
Batch-Time elapsed: 811.7013437747955
Train: (48259, 10) / Stats: (29815, 10) / Valid: (21926, 10)
***Training chunk-3:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.6142556667327881 AUC: 0.7325879765395895 ACC: 0.6875
>>> [Train] TOTAL AUC: 0.7341269276904562
------ [Valid] new-users: 0
- [Valid] AUC: 0.6793587786259542 ACC: 0.609375
>>> [Valid] TOTAL AUC: 0.7060197817762983
Train: (21043, 10) / Stats: (69925, 10) / Valid: (9032, 10)
***Training chunk-3:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.6051256060600281 AUC: 0.6896371363190585 ACC: 0.65625
>>> [Train] TOTAL AUC: 0.7264950396138805
------ [Valid] new-users: 0
- [Valid] AUC: 0.7529921059332825 ACC: 0.69921875
>>> [Valid] TOTAL AUC: 0.7189164787544974
Train: (15586, 10) / Stats: (77919, 10) / Valid: (6495, 10)
***Training chunk-3:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5825940370559692 AUC: 0.6921164772727272 ACC: 0.69140625
>>> [Train] TOTAL AUC: 0.7285273567019288
------ [Valid] new-users: 0
- [Valid] AUC: 0.7669754072181412 ACC: 0.71484375
>>> [Valid] TOTAL AUC: 0.7222750588927094
Train: (12736, 10) / Stats: (82024, 10) / Valid: (5240, 10)
***Training chunk-3:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5777285099029541 AUC: 0.6896719827286023 ACC: 0.703125
>>> [Train] TOTAL AUC: 0.7291978021716219
------ [Valid] new-users: 0
- [Valid] AUC: 0.7206485904595303 ACC: 0.6875
>>> [Valid] TOTAL AUC: 0.722549490359411
Train: (10120, 10) / Stats: (85612, 10) / Valid: (4268, 10)
***Training chunk-3:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.529882550239563 AUC: 0.7265471958209389 ACC: 0.76171875
>>> [Train] TOTAL AUC: 0.7227390269699373
------ [Valid] new-users: 0
- [Valid] AUC: 0.7142590866728797 ACC: 0.66796875
>>> [Valid] TOTAL AUC: 0.7154212859977842
Train: (8719, 10) / Stats: (87646, 10) / Valid: (3635, 10)
***Training chunk-3:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.572097659111023 AUC: 0.7665461006409037 ACC: 0.70703125
>>> [Train] TOTAL AUC: 0.7223952199644272
------ [Valid] new-users: 0
- [Valid] AUC: 0.7786204642039882 ACC: 0.71484375
>>> [Valid] TOTAL AUC: 0.7170370932085824
Train: (6913, 10) / Stats: (90162, 10) / Valid: (2925, 10)
***Training chunk-3:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5334320068359375 AUC: 0.7741277290348908 ACC: 0.7421875
>>> [Train] TOTAL AUC: 0.7244351532583805
------ [Valid] new-users: 0
- [Valid] AUC: 0.7361882968290292 ACC: 0.73828125
>>> [Valid] TOTAL AUC: 0.7163472462166861
Train: (5278, 10) / Stats: (92487, 10) / Valid: (2235, 10)
***Training chunk-3:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5736026763916016 AUC: 0.7528416429863084 ACC: 0.69921875
>>> [Train] TOTAL AUC: 0.7122528449209399
------ [Valid] new-users: 0
- [Valid] AUC: 0.7107514880952381 ACC: 0.62109375
>>> [Valid] TOTAL AUC: 0.6879573029811072
Train: (4641, 10) / Stats: (93355, 10) / Valid: (2004, 10)
***Training chunk-3:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5841841101646423 AUC: 0.7287248144220573 ACC: 0.6953125
>>> [Train] TOTAL AUC: 0.7346743952222693
------ [Valid] new-users: 0
- [Valid] AUC: 0.7090909090909092 ACC: 0.62109375
>>> [Valid] TOTAL AUC: 0.7147285197517651
Train: (3332, 10) / Stats: (95224, 10) / Valid: (1444, 10)
***Training chunk-3:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5710919499397278 AUC: 0.6995896032831738 ACC: 0.70703125
>>> [Train] TOTAL AUC: 0.7310205976950043
------ [Valid] new-users: 0
- [Valid] AUC: 0.6704101562500001 ACC: 0.640625
>>> [Valid] TOTAL AUC: 0.6968882297394956
Batch-Time elapsed: 1161.9799461364746
Train: (48366, 10) / Stats: (29469, 10) / Valid: (22165, 10)
***Training chunk-4:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5886649489402771 AUC: 0.7517113783533764 ACC: 0.67578125
>>> [Train] TOTAL AUC: 0.7352859224847663
------ [Valid] new-users: 0
- [Valid] AUC: 0.6694214876033058 ACC: 0.62890625
>>> [Valid] TOTAL AUC: 0.7098500645475654
Train: (21444, 10) / Stats: (69399, 10) / Valid: (9157, 10)
***Training chunk-4:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.576583981513977 AUC: 0.7524522799575822 ACC: 0.7109375
>>> [Train] TOTAL AUC: 0.7215339255186558
------ [Valid] new-users: 0
- [Valid] AUC: 0.7566325190438666 ACC: 0.71875
>>> [Valid] TOTAL AUC: 0.7278735873996999
Train: (16139, 10) / Stats: (77076, 10) / Valid: (6785, 10)
***Training chunk-4:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.561362624168396 AUC: 0.7787114845938377 ACC: 0.72265625
>>> [Train] TOTAL AUC: 0.7222058374519074
------ [Valid] new-users: 0
- [Valid] AUC: 0.7199060854115108 ACC: 0.6875
>>> [Valid] TOTAL AUC: 0.72115139143062
Train: (12921, 10) / Stats: (81704, 10) / Valid: (5375, 10)
***Training chunk-4:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.6296743154525757 AUC: 0.6670572916666667 ACC: 0.67578125
>>> [Train] TOTAL AUC: 0.7277751486046211
------ [Valid] new-users: 0
- [Valid] AUC: 0.7041111754487551 ACC: 0.66015625
>>> [Valid] TOTAL AUC: 0.718185332508879
Train: (10142, 10) / Stats: (85572, 10) / Valid: (4286, 10)
***Training chunk-4:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5696973204612732 AUC: 0.7269944909202205 ACC: 0.703125
>>> [Train] TOTAL AUC: 0.7212376866514552
------ [Valid] new-users: 0
- [Valid] AUC: 0.6403903903903904 ACC: 0.6015625
>>> [Valid] TOTAL AUC: 0.6994755087181295
Train: (9011, 10) / Stats: (87185, 10) / Valid: (3804, 10)
***Training chunk-4:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5171425342559814 AUC: 0.7799647266313934 ACC: 0.76953125
>>> [Train] TOTAL AUC: 0.7404964917190481
------ [Valid] new-users: 0
- [Valid] AUC: 0.6897552609011148 ACC: 0.65234375
>>> [Valid] TOTAL AUC: 0.7217135066323981
Train: (7571, 10) / Stats: (89240, 10) / Valid: (3189, 10)
***Training chunk-4:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5644018054008484 AUC: 0.729858803986711 ACC: 0.703125
>>> [Train] TOTAL AUC: 0.7136402100410714
------ [Valid] new-users: 0
- [Valid] AUC: 0.6831831831831832 ACC: 0.66796875
>>> [Valid] TOTAL AUC: 0.7160854883954191
Train: (6199, 10) / Stats: (91237, 10) / Valid: (2564, 10)
***Training chunk-4:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.593055009841919 AUC: 0.6581861508270255 ACC: 0.66796875
>>> [Train] TOTAL AUC: 0.717514913412507
------ [Valid] new-users: 0
- [Valid] AUC: 0.7839556277056277 ACC: 0.73828125
>>> [Valid] TOTAL AUC: 0.7298976085242991
Train: (5101, 10) / Stats: (92748, 10) / Valid: (2151, 10)
***Training chunk-4:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5699625015258789 AUC: 0.7133505118740859 ACC: 0.69140625
>>> [Train] TOTAL AUC: 0.7229386898529627
------ [Valid] new-users: 0
- [Valid] AUC: 0.7246992215145082 ACC: 0.67578125
>>> [Valid] TOTAL AUC: 0.717979547837394
Train: (3954, 10) / Stats: (94402, 10) / Valid: (1644, 10)
***Training chunk-4:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5812650918960571 AUC: 0.7612903225806451 ACC: 0.703125
>>> [Train] TOTAL AUC: 0.731879825559731
------ [Valid] new-users: 0
- [Valid] AUC: 0.689120151371807 ACC: 0.66015625
>>> [Valid] TOTAL AUC: 0.7098923244214699
Batch-Time elapsed: 1539.8255388736725
Train: (50108, 10) / Stats: (26289, 10) / Valid: (23603, 10)
***Training chunk-5:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.6237910985946655 AUC: 0.7157869012707723 ACC: 0.66796875
>>> [Train] TOTAL AUC: 0.7375010318717737
------ [Valid] new-users: 0
- [Valid] AUC: 0.6762906777587472 ACC: 0.61328125
>>> [Valid] TOTAL AUC: 0.7039002541932299
Train: (22513, 10) / Stats: (67658, 10) / Valid: (9829, 10)
***Training chunk-5:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5834735631942749 AUC: 0.7291378059238736 ACC: 0.69140625
>>> [Train] TOTAL AUC: 0.7339752535482956
------ [Valid] new-users: 0
- [Valid] AUC: 0.7284308048639259 ACC: 0.65625
>>> [Valid] TOTAL AUC: 0.7239740711717707
Train: (16680, 10) / Stats: (76314, 10) / Valid: (7006, 10)
***Training chunk-5:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.576947808265686 AUC: 0.7145917001338686 ACC: 0.67578125
>>> [Train] TOTAL AUC: 0.7218920272901239
------ [Valid] new-users: 0
- [Valid] AUC: 0.7262264150943396 ACC: 0.6640625
>>> [Valid] TOTAL AUC: 0.7045590364389477
Train: (13715, 10) / Stats: (80628, 10) / Valid: (5657, 10)
***Training chunk-5:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.6049433350563049 AUC: 0.7029837251356239 ACC: 0.6796875
>>> [Train] TOTAL AUC: 0.7301039309256719
------ [Valid] new-users: 0
- [Valid] AUC: 0.7569391392920803 ACC: 0.6875
>>> [Valid] TOTAL AUC: 0.714754050764528
Train: (11281, 10) / Stats: (84000, 10) / Valid: (4719, 10)
***Training chunk-5:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5477458238601685 AUC: 0.7451436388508892 ACC: 0.71875
>>> [Train] TOTAL AUC: 0.7246474110643106
------ [Valid] new-users: 0
- [Valid] AUC: 0.7033112582781457 ACC: 0.69921875
>>> [Valid] TOTAL AUC: 0.7222211985915958
Train: (9871, 10) / Stats: (85797, 10) / Valid: (4332, 10)
***Training chunk-5:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0
+++ [Train] Loss: 0.5418368577957153 AUC: 0.7772273821809426 ACC: 0.7265625
>>> [Train] TOTAL AUC: 0.7289188153265111
------ [Valid] new-users: 0
- [Valid] AUC: 0.7248535777947543 ACC: 0.6796875
>>> [Valid] TOTAL AUC: 0.7189000955618998
Train: (8732, 10) / Stats: (87617, 10) / Valid: (3651, 10)
***Training chunk-5:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[Train] New cases: 0, 0, 0
[Train] All cases: 3603, 13523, 418
Epoch: 0


Exception ignored in: <function _releaseLock at 0x7f7ae9ad4820>
Traceback (most recent call last):
  File "/home/ntrong/.conda/envs/myenv/lib/python3.8/logging/__init__.py", line 223, in _releaseLock
    def _releaseLock():
KeyboardInterrupt: 


+++ [Train] Loss: 0.5756256580352783 AUC: 0.7147951288434368 ACC: 0.68359375
>>> [Train] TOTAL AUC: 0.7182633719540252
------ [Valid] new-users: 0
- [Valid] AUC: 0.7126157777068028 ACC: 0.69921875


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/ntrong/.conda/envs/myenv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3343, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-483-8d262aaa8ee8>", line 55, in <module>
    trainer.incre_update(train_part, stat_part, valid_part)
  File "<ipython-input-479-075ce0d10d0c>", line 94, in incre_update
    self.val(val_batch)
  File "<ipython-input-479-075ce0d10d0c>", line 193, in val
    self.val_step(val_loader, print_step=400)
  File "<ipython-input-479-075ce0d10d0c>", line 157, in val_step
    for i, databatch in enumerate(val_loader):
  File "/home/ntrong/.conda/envs/myenv/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 345, in __next__
    data = self._next_data()
  File "/home/ntrong/.conda/envs/myenv/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 841, in _next_data
    idx, data = self._get_data()
  File "/home/ntrong/.conda/envs/myenv/lib/p

TypeError: object of type 'NoneType' has no len()

In [None]:
import riiideducation
# Create the competition environment object.
# make_env() can only be called once, so keep the returned `env` around:
# it is needed again later for `env.predict(...)`.
env = riiideducation.make_env()

# Iterator over the test batches handed out by the environment.
# A result from `env.iter_test()` can only be iterated through once, so do not
# consume or rebind `iter_test` before the submission loop uses it.
iter_test = env.iter_test()

In [None]:
################################
# Submission
################################

print(f'Start testing ....')
for test_df, sample_prediction_df in iter_test:
    # Score the current test batch with the trained model.
    pred = trainer.infer(test_df)

    # Start from a neutral 0.5 for every row of the batch.
    test_df['answered_correctly'] = 0.5

    # Question-type rows are the ones with content_type_id == 0; build the
    # selector once and reuse it for both the fill and the submission.
    question_rows = test_df['content_type_id'] == 0
    test_df.loc[question_rows, 'answered_correctly'] = pred

    # Submit only row_id / answered_correctly for the question-type rows.
    batch_submission = test_df.loc[question_rows, ['row_id', 'answered_correctly']]
    env.predict(batch_submission)

    # Let the trainer update itself before the next batch arrives.
    trainer.finetune_batch()