In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import pickle

In [2]:
class Riiid:
    """Data loading and feature-engineering helper built around class-level state.

    Paths, lookup tables, target statistics and the running per-user
    histories are all stored as class attributes, so every instance (and
    every static/class method) reads and writes the same shared objects.
    """
    
    # Data directory as a pathlib.Path; assigned by set_data_path().
    path = None
    
    # Column -> dtype mapping for the raw interaction columns.
    # NOTE(review): presumably passed to pd.read_csv for train.csv (the only
    # visible use is a commented-out loader cell) - confirm.
    dtype={'row_id': 'int64', 'timestamp': 'int64',
           'user_id': 'int32', 'content_id': 'int16',
           'content_type_id': 'int8', 'task_container_id': 'int16',
           'user_answer': 'int8', 'answered_correctly': 'int8',
           'prior_question_elapsed_time': 'float32',
           'prior_question_had_explanation': 'boolean',
          }
    
    # Subset of raw columns to keep; 'row_id' and 'user_answer' are omitted.
    usecols=['timestamp', 'user_id', 'content_id',
             'content_type_id','task_container_id',
             'answered_correctly',
             'prior_question_elapsed_time','prior_question_had_explanation']
    
    # Filled by load_and_process_questions() / load_and_process_lectures().
    questions_df = None
    lectures_df = None
    
    # Filled by _get_user_target_stats() / _get_question_target_stats().
    stats_user_target = None
    stats_question_target = None

    # Running histories consumed and updated by the _make_* helpers.
    # The helpers join these onto a batch with on='user_id', so the index is
    # expected to hold user ids; they start out empty.
    hist_question_cumcount = pd.DataFrame(columns=['question_cumcount'])
    hist_time_between = pd.DataFrame(columns=['timestamp'])
    hist_prior_question_elapsed_time_cumsum = pd.DataFrame(columns=['prior_question_elapsed_time_cumsum'])
    hist_target_cumsum = pd.DataFrame(columns=['target_cumsum'])
    
    # features: list of model input columns, set once by get_features().
    # _na_dict / _dtype_dict: fill values and dtypes applied by
    # transform_data() when test=True; built by setup_data_stats().
    features = None
    _na_dict = None
    _dtype_dict = None

    def __init__(self):
        """No per-instance state; everything lives on the class."""
        pass

    
    @staticmethod
    def _proc_question_tags(df):
        return pd.concat([df.drop('tags', 1), df['tags'].str.get_dummies(sep=" ")], 1)
    
    @classmethod
    def load_and_process_questions(cls):
        
        cls.questions_df = pd.read_csv(cls.path/'questions.csv')
        
        # drop columns
        cls.questions_df = cls.questions_df.drop(columns=['correct_answer'])
        
        # add number of tags
        cls.questions_df['num_of_tags'] = cls.questions_df['tags'].map(lambda x: len(str(x).split()))
        
        # add number of questions in bundle
        tmp = cls.questions_df[['question_id', 'bundle_id']] \
            .groupby('bundle_id').count() \
            .rename(columns={'question_id':'bundle_size'})
        
        cls.questions_df = cls.questions_df.join(tmp, on='bundle_id')
        
        # one hot encode tags
        #cls.questions_df = cls._proc_question_tags(cls.questions_df)
        
    @classmethod
    def load_and_process_lectures(cls):
        
        cls.lectures_df = pd.read_csv(cls.path/'lectures.csv')

        # process lectures data
        cls.lectures_df['type_of'] = cls.lectures_df['type_of'].astype('category')
#         types_of = ('type_starter', 'type_concept', 'type_intention', 'type_solving question')
#         cls.lectures_df['type_of'].cat.set_categories(types_of, ordered=False, inplace=True)

    
    @classmethod
    def set_data_path(cls, path):
        cls.path = Path(path)
    
    @classmethod
    def _get_user_target_stats(cls, train):
        
        # user part
        cls.stats_user_target = train.loc[train['content_type_id']==0,
                                          ['user_id', 'answered_correctly']] \
            .groupby('user_id').agg(['mean', 'std', 'skew',])
        
        cls.stats_user_target.columns = cls.stats_user_target.columns.droplevel()
        cls.stats_user_target.columns = ['user_mean', 'user_std', 'user_skew']
        
        cls.stats_user_target = cls.stats_user_target.astype(
            dtype = {'user_mean':'float32', 'user_std':'float32', 'user_skew':'float32'})
        
        cls.stats_user_target.fillna(0.0, inplace=True)

    @classmethod
    def _get_question_target_stats(cls, train):

        # question part
        cls.stats_question_target = train.loc[train['content_type_id']==0,
                                              ['content_id', 'answered_correctly']] \
            .groupby('content_id').agg(['mean', 'std', 'skew',])
        
        cls.stats_question_target.columns = cls.stats_question_target.columns.droplevel()
        cls.stats_question_target.columns = ['question_mean', 'question_std', 'question_skew']
        
        cls.stats_question_target = cls.stats_question_target.astype(
            dtype = {'question_mean':'float32', 'question_std':'float32', 'question_skew':'float32'})
        
        cls.stats_question_target.fillna(0.0, inplace=True)
    
    @staticmethod
    def _make_question_cumcount(df, inference=True, prev_group=None):
        """Add a ``question_cumcount`` column to ``df`` using the shared history.

        Reads and (in two of the three branches) rewrites
        ``Riiid.hist_question_cumcount``, which is joined on ``user_id``.

        Args:
            df: batch to annotate; modified in place and also returned.
            inference: selects the branch.
                * True with ``prev_group``: ``prev_group`` is first given a
                  ``question_cumcount`` column (in place) and folded into the
                  history; ``df`` then receives the stored per-user value.
                * True without ``prev_group``: ``df`` receives the stored
                  per-user value; the history is not changed.
                * False: ``df`` gets a running 1-based count per
                  (user_id, content_type_id) plus the stored value, and the
                  history is updated from ``df``.
            prev_group: previous batch; needs ``user_id``, ``content_type_id``
                and ``content_id`` columns. Only used when ``inference`` is True.

        Returns:
            ``df`` with the ``question_cumcount`` column.

        NOTE(review): in both inference branches the value written to ``df``
        is the stored count only (0 for users without history) - no ``+ 1``
        and no within-batch count - unlike the ``inference=False`` branch.
        Confirm this difference is intended.
        """
        
        if inference:
            if prev_group is not None:
                # collect data from previous group first
                # running 1-based row count per (user_id, content_type_id) within prev_group
                prev_group['question_cumcount'] = prev_group[['user_id', 'content_type_id', 'content_id']]\
                    .groupby(['user_id', 'content_type_id']).transform('cumcount') + 1
            
        
                # add the stored per-user count; users missing from the history contribute 0
                prev_group['question_cumcount'] += prev_group.join(Riiid.hist_question_cumcount,
                                                                   on='user_id',
                                                                   rsuffix='_tmp')\
                                     .fillna({'question_cumcount_tmp':0})\
                                     .astype({'question_cumcount_tmp':'int16'})['question_cumcount_tmp']
            
                # update hist_question_cumcount with collected data
                # only rows with content_type_id == 0 feed the history (per-user max)
                Riiid.hist_question_cumcount = pd.concat([Riiid.hist_question_cumcount,
                                                      prev_group.loc[prev_group['content_type_id']==0,
                                                             ['user_id', 'question_cumcount']]\
                                                      .groupby('user_id').max()])
        
                # the new rows were appended last, so keep='last' lets them replace older entries
                Riiid.hist_question_cumcount = Riiid.hist_question_cumcount[
                    ~Riiid.hist_question_cumcount.index.duplicated(keep='last')]
                
                # and then build question_cumcount for current group
                df['question_cumcount'] = 0
        
                df['question_cumcount'] += df.join(Riiid.hist_question_cumcount, on='user_id', rsuffix='_tmp')\
                                     .fillna({'question_cumcount_tmp':0})\
                                     .astype({'question_cumcount_tmp':'int16'})['question_cumcount_tmp']
            
            
            else:
                # build question_cumcount for current group
                # stored per-user value only; the history itself is left unchanged
                df['question_cumcount'] = 0
        
                df['question_cumcount'] += df.join(Riiid.hist_question_cumcount, on='user_id', rsuffix='_tmp')\
                                     .fillna({'question_cumcount_tmp':0})\
                                     .astype({'question_cumcount_tmp':'int16'})['question_cumcount_tmp']
        else:
            # not for inference
            # running 1-based row count per (user_id, content_type_id) within df
            df['question_cumcount'] = df[['user_id', 'content_type_id', 'content_id']]\
                .groupby(['user_id', 'content_type_id']).transform('cumcount') + 1
        
            # add the stored per-user count; users missing from the history contribute 0
            df['question_cumcount'] += df.join(Riiid.hist_question_cumcount, on='user_id', rsuffix='_tmp')\
                                     .fillna({'question_cumcount_tmp':0})\
                                     .astype({'question_cumcount_tmp':'int16'})['question_cumcount_tmp']
        
            #update hist
            # per-user max over rows with content_type_id == 0, appended after the old entries
            Riiid.hist_question_cumcount = pd.concat([Riiid.hist_question_cumcount,
                                                      df.loc[df['content_type_id']==0,
                                                             ['user_id', 'question_cumcount']]\
                                                      .groupby('user_id').max()])
        
            # keep='last' keeps the freshly appended value for each user
            Riiid.hist_question_cumcount = Riiid.hist_question_cumcount[
                    ~Riiid.hist_question_cumcount.index.duplicated(keep='last')]

        return df
    
    @staticmethod
    def _make_target_cumsum(df, inference=True, prev_group=None):
        """Add a ``target_cumsum`` column to ``df`` using the shared history.

        Reads and (in two of the three branches) rewrites
        ``Riiid.hist_target_cumsum``, which is joined on ``user_id``.

        Args:
            df: batch to annotate; modified in place and also returned.
            inference: selects the branch.
                * True with ``prev_group``: ``prev_group`` is first given a
                  ``target_cumsum`` column (in place) from its
                  ``answered_correctly`` values and folded into the history;
                  ``df`` then receives the stored per-user value.
                * True without ``prev_group``: ``df`` receives the stored
                  per-user value; the history is not changed.
                * False: ``df`` gets the running sum of ``answered_correctly``
                  per (user_id, content_type_id) plus the stored value, and
                  the history is updated from ``df``.
            prev_group: previous batch; needs ``user_id``, ``content_type_id``
                and ``answered_correctly`` columns. Only used when
                ``inference`` is True.

        Returns:
            ``df`` with the ``target_cumsum`` column.

        Users absent from the history get 1 in the inference branches (when
        filling ``df``) and 0 everywhere else.
        """
        
        if inference:
            if prev_group is not None:
                # collect data from previous group first
                # running sum of answered_correctly per (user_id, content_type_id)
                prev_group['target_cumsum'] = prev_group[['user_id', 'content_type_id', 'answered_correctly']]\
                    .groupby(['user_id', 'content_type_id']).transform('cumsum')
                
                # add the stored per-user sum; users missing from the history contribute 0
                prev_group['target_cumsum'] += prev_group.join(Riiid.hist_target_cumsum,
                                                               on='user_id',
                                                               rsuffix='_tmp')\
                                     .fillna({'target_cumsum_tmp':0})\
                                     .astype({'target_cumsum_tmp':int})['target_cumsum_tmp']
                
                # update hist_target_cumsum with collected data
                # only rows with content_type_id == 0 feed the history (per-user max)
                Riiid.hist_target_cumsum = pd.concat([Riiid.hist_target_cumsum,
                                                      prev_group.loc[prev_group['content_type_id']==0,
                                                             ['user_id', 'target_cumsum']]\
                                                      .groupby('user_id').max()])
        
                # the new rows were appended last, so keep='last' lets them replace older entries
                Riiid.hist_target_cumsum = Riiid.hist_target_cumsum[
                    ~Riiid.hist_target_cumsum.index.duplicated(keep='last')]                

                #build target_cumsum for current group
                df['target_cumsum'] = 0
                
                # Assume that the 1st answer is correct (fillna with 1)
                df['target_cumsum'] += df.join(Riiid.hist_target_cumsum, on='user_id', rsuffix='_tmp')\
                    .fillna({'target_cumsum_tmp':1})\
                    .astype({'target_cumsum_tmp':int})['target_cumsum_tmp']                 
            else:
                # build target_cumsum for current group
                # stored per-user value only; the history itself is left unchanged
                df['target_cumsum']=0
                
                # Assume that the 1st answer is correct (fillna with 1)
                df['target_cumsum'] += df.join(Riiid.hist_target_cumsum, on='user_id', rsuffix='_tmp')\
                    .fillna({'target_cumsum_tmp':1})\
                    .astype({'target_cumsum_tmp':int})['target_cumsum_tmp'] 
        else:
            # not for inference
            # running sum of answered_correctly per (user_id, content_type_id) within df
            df['target_cumsum'] = df[['user_id', 'content_type_id', 'answered_correctly']]\
                .groupby(['user_id', 'content_type_id']).transform('cumsum')
        
            # add the stored per-user sum; users missing from the history contribute 0
            df['target_cumsum'] += df.join(Riiid.hist_target_cumsum, on='user_id', rsuffix='_tmp')\
                                     .fillna({'target_cumsum_tmp':0})\
                                     .astype({'target_cumsum_tmp':int})['target_cumsum_tmp']
        
            #update hist
            # per-user max over rows with content_type_id == 0, appended after the old entries
            Riiid.hist_target_cumsum = pd.concat([Riiid.hist_target_cumsum,
                                                      df.loc[df['content_type_id']==0,
                                                             ['user_id', 'target_cumsum']]\
                                                      .groupby('user_id').max()])
        
            # keep='last' keeps the freshly appended value for each user
            Riiid.hist_target_cumsum = Riiid.hist_target_cumsum[
                    ~Riiid.hist_target_cumsum.index.duplicated(keep='last')] 

        return df 
    
    
    @staticmethod
    def _make_prior_question_elapsed_time_cumsum(df):
        """For test/validation datasets only."""
        
        df['prior_question_elapsed_time_cumsum'] = df[['user_id', 'content_type_id', 'prior_question_elapsed_time']]\
                .groupby(['user_id', 'content_type_id']).transform('cumsum')
        
        df['prior_question_elapsed_time_cumsum'] += df.join(Riiid.hist_prior_question_elapsed_time_cumsum,
                                                            on='user_id', rsuffix='_tmp')\
                                     .fillna({'prior_question_elapsed_time_cumsum_tmp':0})\
                                     .astype({'prior_question_elapsed_time_cumsum_tmp':int})[
            'prior_question_elapsed_time_cumsum_tmp']
        
        # update history
        Riiid.hist_prior_question_elapsed_time_cumsum = pd.concat([Riiid.hist_prior_question_elapsed_time_cumsum,
                                                      df.loc[df['content_type_id']==0,
                                                             ['user_id', 'prior_question_elapsed_time_cumsum']]\
                                                      .groupby('user_id').max()])
        
        Riiid.hist_prior_question_elapsed_time_cumsum = Riiid.hist_prior_question_elapsed_time_cumsum[
                    ~Riiid.hist_prior_question_elapsed_time_cumsum.index.duplicated(keep='last')] 

        return df        
    
    @staticmethod
    def _make_time_between(df):
        """For test/validation datasets only."""
        
        df['time_between'] = df[['user_id', 'content_type_id', 'timestamp']]\
                .groupby(['user_id', 'content_type_id']).transform('diff')
        
        s = df.join(Riiid.hist_time_between,
                    on='user_id',
                    rsuffix='_tmp').fillna({'timestamp_tmp':0})['timestamp_tmp'].values

        df['time_between'] = np.where(~df['time_between'].isna(),
                                      df['time_between'].values,
                                      df['timestamp'].values - s)
        df['time_between'] = df['time_between'].astype({'time_between':'int64'})
        
        #update hist
        Riiid.hist_time_between = pd.concat([Riiid.hist_time_between,
                                             df.loc[df['content_type_id']==0, ['user_id', 'timestamp']]\
                                             .groupby('user_id').max()])
        
        Riiid.hist_time_between = Riiid.hist_time_between[~Riiid.hist_time_between.index.duplicated(keep='last')]

        return df
        
    def setup_data_stats(self, df):
        
        if Riiid.stats_user_target is None:
            Riiid._get_user_target_stats(df)
        print('train_user_target_stats - Done')
        
        if Riiid.stats_question_target is None:
            Riiid._get_question_target_stats(df)
        print('train_question_target_stats - Done')
        
        if Riiid._dtype_dict is None:
            Riiid._dtype_dict = df.dtypes.to_dict()
            del Riiid._dtype_dict['answered_correctly']
        print('_dtype_dict - Done')
        
        if Riiid._na_dict is None:
            Riiid._na_dict = {
                              'part': 0,
                              'num_of_tags': 0,
                              'bundle_size': 0,
                              'question_mean': Riiid.stats_question_target['question_mean'].mean(axis=0),
                              'question_std': Riiid.stats_question_target['question_std'].mean(axis=0),
                              'question_skew': Riiid.stats_question_target['question_skew'].mean(axis=0),
                             }
        print('_na_dict - Done')
    
    @classmethod
    def get_features(self, df):
        # save features
        if Riiid.features is None:
            Riiid.features = list(df.columns)
            Riiid.features.remove('answered_correctly')
    
    def transform_data(self, df, test=False, verbose=False, inference=True, prev_group=None):
        
        if not test: # we need questions and lectures for test
            # step 0 = keep questions only
            df = df.loc[df[df['content_type_id']==0].index]
            if verbose: print('step 0 (keep questions only) - Done')

        # step 1 = fillna for prior_question_elapsed_time and prior_question_had_explanation
        df = df.fillna({'prior_question_elapsed_time':0.,
                        'prior_question_had_explanation':False})
        if verbose: print('step 1 (fillna: prior_question_elapsed_time & prior_question_had_explanation) - Done')
        
        # step 2 merge question without question_id, and tags
        df = df.join(self.questions_df, on='content_id') \
               .drop(columns=['question_id',
                              'tags'])
        
        # fillna fillna mainly for lectures
        df = df.fillna({'prior_question_elapsed_time':0.,
                        'prior_question_had_explanation':False,
                        'bundle_id':0, 'num_of_tags':0, 'bundle_size':0,
                        'part':0, 'n_answer_options':0})
        # change dtype
        df = df.astype({'bundle_id':'int16', 'num_of_tags':'int8',
                        'bundle_size':'int8', 'prior_question_had_explanation':'bool',
                        'part':'int8',
                       })
        
        if verbose: print('step 2 (merge questions_df) - Done')
               
        # step 3 merge question target stats
        df = df.join(self.stats_question_target, on='content_id')
        if verbose: print('step 3 (merge question_target_stats) - Done')
        
        # step 4 merge train_user_target_stats
#         df = df.join(self.stats_train_user_target, on='user_id')
#         if verbose: print('step 4 (merge train_user_target_stats) - Done')
        
        # step 4a add time_between
        if test:
            df = self._make_time_between(df)
        else:
            df['time_between'] = df[['user_id', 'timestamp']].groupby('user_id').diff().fillna(0.).astype(int)        
        if verbose: print('step 4a (add time_between) - Done')
            
        # step 4b add question_cumcount
        if test:
            df = self._make_question_cumcount(df, inference=inference, prev_group=prev_group)
        else:
            df['question_cumcount'] = df[['user_id', 'content_id']]\
                .groupby(['user_id']).cumcount().astype('int16') + 1        
        if verbose: print('step 4b (add question_cumcount) - Done')
        
        # step 4c add target_cumsum
        if test:
            df = self._make_target_cumsum(df, inference=inference, prev_group=prev_group)
        else:
            df['target_cumsum'] = df[['user_id', 'answered_correctly']]\
                .groupby('user_id').cumsum().astype(int)
        if verbose: print('step 4c (add target_cumsum) - Done')
            
        # step 4d add prior_question_elapsed_time_cumsum
        if test:
            df = self._make_prior_question_elapsed_time_cumsum(df)
        else:
            df['prior_question_elapsed_time_cumsum'] = df[['user_id', 'prior_question_elapsed_time']]\
                .groupby('user_id').cumsum().astype(int)
        if verbose: print('step 4d (add prior_question_elapsed_time_cumsum) - Done')
            
        # step 4e add user_mean
        df['user_mean']=(df['target_cumsum'] / df['question_cumcount'])\
                .fillna(0.).replace(np.inf, 0.).astype('float32')
        if verbose: print('step 4e (add user_mean) - Done')
            
        # step 4f add time_per_question
        df['time_per_question']=(df['prior_question_elapsed_time_cumsum'] / df['question_cumcount'])\
                .fillna(0.).replace(np.inf, 0.).astype('float32')
        if verbose: print('step 4f (add time_per_question) - Done')
            
        # step 4g add timestamp_prior_time_cumsum_diff
        df['timestamp_prior_time_cumsum_diff']=df['timestamp']-df['prior_question_elapsed_time_cumsum']
        if verbose: print('step 4g (add timestamp_prior_time_cumsum_diff) - Done')
                   
            
        # step 5 fill remaining NAs (using _na_dict)
        if test and self._na_dict is not None:
            df = df.fillna(self._na_dict)
        if verbose: print('step 5 (fill remaining NAs) - Done')
        
        # step 6 convert dtypes (using _na_dict)
        
        if test and self._dtype_dict is not None:
            df = df.astype(self._dtype_dict)
        if verbose: print('step 6 (convert dtypes) - Done')
        
        return df
        
    def split_data(self, df, n_iter=30):
        """Split into train and validation datasets."""
        
        counter = 0
        train_idx = df.index
        val_idx = pd.RangeIndex(start=0, stop=0, step=1)
    
        while counter < n_iter:
            tmp_val_flag = (df.loc[train_idx, ['user_id', 'timestamp']]\
                            .groupby('user_id')\
                            .transform(max).squeeze() == df.loc[train_idx,'timestamp'])
        
            tmp_val_index = df.loc[train_idx][tmp_val_flag].index
        
            val_idx = val_idx.append(tmp_val_index).sort_values()
            train_idx = train_idx.drop(tmp_val_index)
            counter += 1
    
        return train_idx.to_list(), val_idx.to_list()
    
    
    def save_data(self, df, name):
        """Write ``df`` as a Feather file ``<name>.feather`` inside ``self.path``."""
        target = self.path / (name + '.feather')
        df.to_feather(target)
        
    def load_data(self, name):
        """Read the Feather file ``<name>.feather`` from ``self.path`` and return the frame."""
        source = self.path / (name + '.feather')
        return pd.read_feather(source)

In [3]:
r = Riiid()

In [4]:
r.set_data_path(path=r'./data')
r.load_and_process_questions()
r.load_and_process_lectures()

In [None]:
# r.save_data(data_df, name='data')

In [None]:
# train_idx, val_idx = r.split_data(train_df, n_iter=120)

In [None]:
# len(train_idx), len(val_idx)

In [None]:
# r.save_data(train_df.iloc[train_idx].reset_index(drop=True), name='train_p1')
# r.save_data(train_df.iloc[val_idx].reset_index(drop=True), name='train_p2')

In [5]:
# train_df = pd.read_csv(r.path/'train.csv', nrows=200000, dtype=r.dtype, usecols=r.usecols)
# train_df = r.load_data('data') # 101 230 332
# train_df = r.load_data('train') #88 777 729
# train_df = r.load_data('train_p1') # 68 999 539
# train_df = r.load_data('train_p2') # 19 778 190
train_df = r.load_data('train_m') # 23 630 479
# val_df = r.load_data('val') # 12 452 603
# val_df_p1 = r.load_data('val_p1') # 6 101 188
# val_df_p2 = r.load_data('val_p2') # 6 351 415
val_df = r.load_data('val_m') # 3 433 724

In [6]:
train_df.head(2)

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,115,5692,0,1,1,,
1,56943,115,5716,0,2,1,37000.0,False


In [7]:
train_df.shape

(23630479, 8)

In [8]:
val_df.head(2)

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,534187,115,45,0,22,0,19000.0,False
1,557677,115,185,0,23,0,21000.0,False


In [9]:
val_df.shape

(3433724, 8)

In [12]:
r.questions_df.head(2)

Unnamed: 0,question_id,bundle_id,part,tags,num_of_tags,bundle_size
0,0,0,1,51 131 162 38,4,1
1,1,1,1,131 36 81,3,1


In [13]:
r.lectures_df.head(2)

Unnamed: 0,lecture_id,tag,part,type_of
0,89,159,5,concept
1,100,70,1,concept


In [14]:
r.lectures_df['type_of'].cat.categories

Index(['concept', 'intention', 'solving question', 'starter'], dtype='object')

In [None]:
# r.setup_data_stats(train_df)

In [15]:
with open(r.path/'data_stats_user_target.pickle', mode='rb') as file:
    Riiid.stats_user_target = pickle.load(file)
with open(r.path/'data_stats_question_target.pickle', mode='rb') as file:
    Riiid.stats_question_target = pickle.load(file)

In [None]:
# with open(r.path/'data_stats_user_target.pickle', mode='wb') as file:
#     pickle.dump(Riiid.stats_user_target, file)
# with open(r.path/'data_stats_question_target.pickle', mode='wb') as file:
#     pickle.dump(Riiid.stats_question_target, file)

In [16]:
r.stats_user_target.head(2)

Unnamed: 0_level_0,user_mean,user_std,user_skew
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
115,0.695652,0.465215,-0.879359
124,0.233333,0.430183,1.328338


In [17]:
r.stats_question_target.head(2)

Unnamed: 0_level_0,question_mean,question_std,question_skew
content_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.907721,0.28944,-2.818128
1,0.890646,0.312104,-2.503986


In [18]:
(train_df.shape,
val_df.shape)

((23630479, 8), (3433724, 8))

Transformation train

In [19]:
train_df = r.transform_data(train_df, verbose=True, inference=False)
# train_df = r.load_data('train_transformed')
# train_df = r.load_data('train_p2_transformed')
# train_df = r.load_data('data_transformed')

step 0 (keep questions only) - Done
step 1 (fillna: prior_question_elapsed_time & prior_question_had_explanation) - Done
step 2 (merge questions_df) - Done
step 3 (merge question_target_stats) - Done
step 4a (add time_between) - Done
step 4b (add question_cumcount) - Done
step 4c (add target_cumsum) - Done
step 4d (add prior_question_elapsed_time_cumsum) - Done
step 4e (add user_mean) - Done
step 4f (add time_per_question) - Done
step 4g (add timestamp_prior_time_cumsum_diff) - Done
step 5 (fill remaining NAs) - Done
step 6 (convert dtypes) - Done


In [20]:
Riiid.hist_time_between = train_df[['user_id', 'timestamp']].groupby(['user_id']).max()
Riiid.hist_question_cumcount = train_df[['user_id', 'question_cumcount']]\
            .groupby(['user_id']).max()
Riiid.hist_prior_question_elapsed_time_cumsum = train_df[['user_id', 'prior_question_elapsed_time_cumsum']]\
            .groupby(['user_id']).max()
Riiid.hist_target_cumsum = train_df[['user_id', 'target_cumsum']]\
            .groupby(['user_id']).max()

In [21]:
with open(r.path/'train_m_hist_time_between.pickle', mode='wb') as file:
    pickle.dump(Riiid.hist_time_between, file)
with open(r.path/'train_m_hist_question_cumcount.pickle', mode='wb') as file:
    pickle.dump(Riiid.hist_question_cumcount, file)
with open(r.path/'train_m_hist_prior_question_elapsed_time_cumsum.pickle', mode='wb') as file:
    pickle.dump(Riiid.hist_prior_question_elapsed_time_cumsum, file)
with open(r.path/'train_m_hist_target_cumsum.pickle', mode='wb') as file:
    pickle.dump(Riiid.hist_target_cumsum, file)

In [22]:
len(r.hist_time_between), len(r.hist_question_cumcount), len(r.hist_prior_question_elapsed_time_cumsum), len(r.hist_target_cumsum)

(116335, 116335, 116335, 116335)

In [None]:
# Riiid.hist_prior_question_elapsed_time_cumsum

In [23]:
train_df.head(2)

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,bundle_id,part,...,question_mean,question_std,question_skew,time_between,question_cumcount,target_cumsum,prior_question_elapsed_time_cumsum,user_mean,time_per_question,timestamp_prior_time_cumsum_diff
0,0,115,5692,0,1,1,0.0,False,5692,5,...,0.745495,0.435589,-1.127249,0,1,1,0,1.0,0.0,0
1,56943,115,5716,0,2,1,37000.0,False,5716,5,...,0.734901,0.441395,-1.064443,56943,2,2,37000,1.0,18500.0,19943


In [24]:
train_df[train_df['user_id'] == 115].head(5)

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,bundle_id,part,...,question_mean,question_std,question_skew,time_between,question_cumcount,target_cumsum,prior_question_elapsed_time_cumsum,user_mean,time_per_question,timestamp_prior_time_cumsum_diff
0,0,115,5692,0,1,1,0.0,False,5692,5,...,0.745495,0.435589,-1.127249,0,1,1,0,1.0,0.0,0
1,56943,115,5716,0,2,1,37000.0,False,5716,5,...,0.734901,0.441395,-1.064443,56943,2,2,37000,1.0,18500.0,19943
2,118363,115,128,0,0,1,55000.0,False,128,1,...,0.966869,0.178984,-5.217423,61420,3,3,92000,1.0,30666.666016,26363
3,131167,115,7860,0,3,1,19000.0,False,7860,1,...,0.954815,0.207714,-4.37965,12804,4,4,111000,1.0,27750.0,20167
4,137965,115,7922,0,4,1,11000.0,False,7922,1,...,0.953218,0.211178,-4.292724,6798,5,5,122000,1.0,24400.0,15965


In [25]:
train_df.isna().sum().sum()

0

In [None]:
Riiid._dtype_dict = None
Riiid._na_dict = None

In [None]:
r.setup_data_stats(train_df) # refactor to setup_dtype_na_dict or similar

In [None]:
Riiid._na_dict

In [None]:
Riiid._dtype_dict

In [None]:
with open(r.path/'data_na_dict.pickle', mode='wb') as file:
    pickle.dump(Riiid._na_dict, file)
with open(r.path/'data_dtype_dict.pickle', mode='wb') as file:
    pickle.dump(Riiid._dtype_dict, file)

In [26]:
train_df = train_df.reset_index(drop=True)

# r.save_data(train_df, 'data_transformed')
# r.save_data(train_df, 'train_transformed')
# r.save_data(train_df, 'train_p1_transformed')
r.save_data(train_df, 'train_m_transformed')
# r.save_data(train_df, 'train_p2_transformed')

Transformation val

In [None]:
# Reload the per-user histories written after transforming the training data.
# The file prefix matches the `train_m_*` pickles saved earlier in this
# notebook; loading the `train_p1_*` files here would pair the `val_m`
# validation data with histories built from a different training split.
# NOTE: pickle.load executes arbitrary code - only load files you created.
with open(r.path/'train_m_hist_time_between.pickle', mode='rb') as file:
    Riiid.hist_time_between = pickle.load(file)
with open(r.path/'train_m_hist_question_cumcount.pickle', mode='rb') as file:
    Riiid.hist_question_cumcount = pickle.load(file)
with open(r.path/'train_m_hist_prior_question_elapsed_time_cumsum.pickle', mode='rb') as file:
    Riiid.hist_prior_question_elapsed_time_cumsum = pickle.load(file)
with open(r.path/'train_m_hist_target_cumsum.pickle', mode='rb') as file:
    Riiid.hist_target_cumsum = pickle.load(file)

In [27]:
with open(r.path/'data_na_dict.pickle', mode='rb') as file:
    Riiid._na_dict = pickle.load(file)
with open(r.path/'data_dtype_dict.pickle', mode='rb') as file:
    Riiid._dtype_dict = pickle.load(file)

In [None]:
# val_df.head(2)

In [28]:
val_df = r.transform_data(val_df, test=True, verbose=True, inference=False)
# train_df = r.transform_data(train_df, test=True, verbose=True, inference=False)
# val_df = r.transform_data(val_df_p2, test=True, verbose=True, inference=False)
# val_df = r.load_data('val_transformed')

step 1 (fillna: prior_question_elapsed_time & prior_question_had_explanation) - Done
step 2 (merge questions_df) - Done
step 3 (merge question_target_stats) - Done
step 4a (add time_between) - Done
step 4b (add question_cumcount) - Done
step 4c (add target_cumsum) - Done
step 4d (add prior_question_elapsed_time_cumsum) - Done
step 4e (add user_mean) - Done
step 4f (add time_per_question) - Done
step 4g (add timestamp_prior_time_cumsum_diff) - Done
step 5 (fill remaining NAs) - Done
step 6 (convert dtypes) - Done


In [29]:
r.save_data(val_df, 'val_m_transformed')
# r.save_data(val_df, 'val_p2_transformed')
# r.save_data(train_df, 'train_p2_transformed')

In [30]:
val_df.isna().sum()

timestamp                             0
user_id                               0
content_id                            0
content_type_id                       0
task_container_id                     0
answered_correctly                    0
prior_question_elapsed_time           0
prior_question_had_explanation        0
bundle_id                             0
part                                  0
num_of_tags                           0
bundle_size                           0
question_mean                         0
question_std                          0
question_skew                         0
time_between                          0
question_cumcount                     0
target_cumsum                         0
prior_question_elapsed_time_cumsum    0
user_mean                             0
time_per_question                     0
timestamp_prior_time_cumsum_diff      0
dtype: int64

In [31]:
val_df.shape

(3433724, 22)

In [None]:
r.get_features(val_df)

In [None]:
print(r.features)
print(len(r.features))

In [None]:
# r.features.remove('user_id')
# Guarded so that re-running the cell does not raise ValueError once the
# column has already been removed.
if 'content_type_id' in r.features:
    r.features.remove('content_type_id')

In [None]:
train_df[train_df['user_id']==115].tail(2)

In [None]:
val_df[val_df['user_id']==115].head(2)

In [None]:
train_df[['timestamp', 'user_id']].groupby('user_id').max().join(
    val_df[['timestamp', 'user_id']].groupby('user_id').min(), how='outer',lsuffix='_train_max', rsuffix='_val_min').isna().sum()

### Modelling

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import lightgbm as lgb

In [None]:
train_df = r.load_data('train_p2_transformed')

In [None]:
val_df_p1 = r.load_data('val_p1_transformed')
val_df_p2 = r.load_data('val_p2_transformed')

In [None]:
r.get_features(train_df)

In [None]:
# Remove columns from the feature list in place. The membership check makes the
# cell idempotent (list.remove raises ValueError once the entry is already gone).
for column in ('user_id', 'content_type_id'):
    if column in r.features:
        r.features.remove(column)

In [None]:
features= r.features
target = 'answered_correctly'

RandomForestClassifier

In [None]:
# Restore a previously pickled object into `rf` from the models/ folder under r.path.
# NOTE(review): this cell sits before the cells that construct, fit and pickle `rf`,
# so on a fresh kernel it relies on the file existing from an earlier session.
# pickle.load can execute arbitrary code - only open model files you produced yourself.
with open(r.path/'models/rf-train-small-model.sav', 'rb') as f:
    rf = pickle.load(f)

In [None]:
# Keyword arguments for RandomForestClassifier(**params).
# 'min_impurity_split' is intentionally absent: the argument was removed from
# scikit-learn in 1.0, so passing it raises TypeError there. On older versions
# leaving it out is equivalent to the previous explicit value (None, the default).
params = {
    'n_estimators': 40,
    'criterion': 'entropy',
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'min_weight_fraction_leaf': 0.0,
    'max_features': 1.0,
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
    'bootstrap': True,
    'oob_score': False,
    'n_jobs': -1,
    'random_state': 37,
    'verbose': 1,
    'warm_start': False,
    'class_weight': None,
    'ccp_alpha': 0.0,
    'max_samples': 200000,  # rows drawn per tree (bootstrap is enabled above)
}

In [None]:
rf = RandomForestClassifier(**params)

In [None]:
rf.fit(train_df[features].values, train_df[target].values)

In [None]:
with open(r.path/'models/rf-train-small-model.sav', 'wb') as f:
    pickle.dump(rf, f)

In [None]:
val_p1_preds = rf.predict_proba(val_df_p1[features])[:,1]
val_p2_preds = rf.predict_proba(val_df_p2[features])[:,1]

In [None]:
val_p1_preds

In [None]:
roc_auc_score(val_df_p2[target].values.squeeze(), val_p2_preds)

In [None]:
# NOTE(review): `preds` is not assigned in any of the surrounding cells. The
# indexing (mean over axis 0, then column 1) suggests a per-estimator stack of
# class probabilities computed on train_df - presumably built like
# `preds_estimators`, which is computed on val_df_p1; confirm or remove this cell.
roc_auc_score(train_df[target], preds.mean(0)[:,1])

In [None]:
# Class probabilities predicted by each individual tree, stacked along axis 0.
# The feature slice is loop-invariant, so it is taken once instead of once per tree.
val_p1_features = val_df_p1[features]
preds_estimators = np.stack([tree.predict_proba(val_p1_features) for tree in rf.estimators_])

In [None]:
preds_estimators.shape

In [None]:
preds_estimators[:0+1,:,1].shape

In [None]:
# AUC on val_df_p1 when averaging only the first k trees, for k = 1 .. number of trees.
n_trees = len(rf.estimators_)
auc_by_size = []
for k in range(1, n_trees + 1):
    mean_positive_proba = preds_estimators[:k, :, 1].mean(0)
    auc_by_size.append(roc_auc_score(val_df_p1[target], mean_positive_proba))
plt.plot(auc_by_size);

In [None]:
def rf_feat_importance(m, df):
    """Tabulate a model's feature importances, most important first.

    m  -- object exposing `feature_importances_`
    df -- DataFrame whose columns are paired, in order, with those importances

    Returns a DataFrame with the columns 'cols' and 'imp', sorted by 'imp'
    in descending order (the original positional index is kept).
    """
    importance_table = pd.DataFrame({'cols': df.columns, 'imp': m.feature_importances_})
    ranked = importance_table.sort_values(by='imp', ascending=False)
    return ranked

In [None]:
fi = rf_feat_importance(rf, val_df_p1[features])
fi

In [None]:
def plot_fi(fi):
    """Plot 'imp' against 'cols' as a horizontal bar chart; returns what fi.plot returns."""
    chart_options = {'figsize': (12, 7), 'legend': False}
    return fi.plot(x='cols', y='imp', kind='barh', **chart_options)

plot_fi(fi);

In [None]:
val_preds = rf.predict_proba(val_df_p1.loc[:,features])[:,1]

In [None]:
# Dot plot of the ten largest feature importances, in ascending order left to right.
importance_order = np.argsort(rf.feature_importances_)
top10 = importance_order[-10:]

plt.figure(figsize=(16, 5))
plt.margins(x=0.01, y=0.1)
plt.plot(rf.feature_importances_[top10], 'bo')
plt.xticks(np.arange(10), np.array(features)[top10],
           fontsize='small', rotation=90);

In [None]:
val_preds.max()

In [None]:
# XGBoost matrix for the training frame: feature columns as data, target as label.
dtrain_options = dict(weight=None, base_margin=None, missing=None,
                      silent=False, feature_names=features,
                      feature_types=None, nthread=None)
dtrain = xgb.DMatrix(data=train_df[features], label=train_df[target], **dtrain_options)

In [None]:
# XGBoost matrix for the first validation frame (val_df_p1).
dval1_options = dict(weight=None, base_margin=None, missing=None,
                     silent=False, feature_names=features,
                     feature_types=None, nthread=None)
dval1 = xgb.DMatrix(data=val_df_p1[features], label=val_df_p1[target], **dval1_options)

In [None]:
# XGBoost matrix for the second validation frame (val_df_p2).
dval2_options = dict(weight=None, base_margin=None, missing=None,
                     silent=False, feature_names=features,
                     feature_types=None, nthread=None)
dval2 = xgb.DMatrix(data=val_df_p2[features], label=val_df_p2[target], **dval2_options)

In [None]:
# NOTE(review): no `dval` is created in the surrounding cells (only dval1 / dval2) -
# presumably left over from an earlier version; confirm which matrix should be saved.
dval.save_binary(r.path/'dval.xgboost', silent=False)

In [None]:
dval.get_base_margin()

In [None]:
# Booster settings handed to xgb.train in the next cell.
# NOTE: this rebinds `params`, which earlier held the RandomForestClassifier
# keyword arguments - re-running the RandomForest cells after this one would
# pick up these values instead.
params={'learning_rate':0.1,
        'max_depth':5,
        'eval_metric': 'auc',
        'objective':'binary:logistic'
       }

In [None]:
# Boost for 50 rounds; the evaluation metric is logged every 10 rounds for the
# training matrix and both validation matrices.
watchlist = [(dtrain, 'train'), (dval1, 'val_p1'), (dval2, 'val_p2')]
bst = xgb.train(
    params,
    dtrain=dtrain,
    num_boost_round=50,
    evals=watchlist,
    obj=None,
    feval=None,
    maximize=False,
    early_stopping_rounds=None,
    evals_result=None,
    verbose_eval=10,
    xgb_model=None,
    callbacks=None,
)

In [None]:
xgb.plot_importance(bst)

In [None]:
# 6 351 415

In [None]:
train_ = train_df.copy()

In [None]:
def get_val_idx(df, n_iter=17):
    """Pick each user's most recent rows as a validation set.

    For every user, the rows carrying that user's `n_iter` largest distinct
    'timestamp' values are selected (rows sharing a timestamp are selected
    together).

    Parameters
    ----------
    df : DataFrame with 'user_id' and 'timestamp' columns. It is modified IN
        PLACE: the selected rows are dropped, so pass a copy if the original
        is still needed. The index is expected to be unique, since rows are
        dropped by label.
    n_iter : int, number of distinct latest timestamps to take per user.

    Returns
    -------
    list of index labels of the selected rows, ordered round by round: first
    every user's latest rows (in frame order), then the second latest, etc.
    These are labels, so use them with `.loc`, not `.iloc`.
    """
    if n_iter <= 0 or df.empty:
        return []

    # Dense rank of the timestamp within each user, 1 = latest. A single pass
    # replaces the repeated groupby + drop that was run once per round.
    recency = df.groupby('user_id')['timestamp'].rank(method='dense', ascending=False)
    selected = recency[recency <= n_iter]

    # A stable sort on the rank keeps frame order within a round, which gives
    # the round-by-round ordering described above.
    selected = selected.sort_values(kind='mergesort')
    val_idx = selected.index.to_list()

    df.drop(index=selected.index, inplace=True)
    return val_idx

In [None]:
val_idx = get_val_idx(train_, n_iter=2)

In [None]:
len(val_idx)

In [None]:
# val_idx holds index *labels* (get_val_idx gathers them through `.index`), so the
# rows are selected with .loc. .iloc would read the labels as positions and return
# the wrong rows whenever train_df does not have a default 0..n-1 index.
val_df = train_df.loc[val_idx]

In [None]:
val_df.shape

In [None]:
len(val_df['user_id'].unique())

In [None]:
val_ = val_df.copy()

In [None]:
%%time
val_idx = get_val_idx(val_, n_iter=12)

In [None]:
len(val_idx)

In [None]:
train_df.iloc[~train_df.index.isin(val_df.index)]

In [None]:
test_df_ = pd.read_csv(r.path/'example_test.csv')
submission_df = pd.read_csv(r.path/'example_sample_submission.csv')
val_df = r.load_data('val_p2_transformed')

In [None]:
# Hand-built extra row that the next cell adds to the example test set (group 0).
# It has content_type_id=1, i.e. it is one of the rows the inference loop filters
# out (it keeps content_type_id == 0 only) before predicting.
# NOTE(review): presumably simulates a lecture interaction - confirm against the data spec.
d = {'row_id':55, 'group_num':0, 'timestamp':5000, 'user_id':275030867,
     'content_id':0, 'content_type_id':1, 'task_container_id':1,
     'prior_question_elapsed_time':13000.0, 'prior_question_had_explanation':True,
     'prior_group_answers_correct':np.nan, 'prior_group_responses':np.nan}

In [None]:
# Add the hand-built row `d` to the example test set. DataFrame.append was removed
# in pandas 2.0, so the row is wrapped in a one-row frame and concatenated instead.
# NOTE: not idempotent - every run of this cell adds the row (and the -1) again.
test_df_ = pd.concat([test_df_, pd.DataFrame([d])], ignore_index=True)

# Cell (row 18, column 9) holds a stringified list; it is parsed, extended with -1
# for the added row and written back as a string.
# NOTE(review): column 9 is 'prior_group_answers_correct' if test_df_ has the same
# column order as the keys of `d` - confirm.
# NOTE(security): eval() executes whatever text the CSV cell contains. That is
# tolerable for the trusted example file only; ast.literal_eval is the safe choice.
l = eval(test_df_.iat[18,9])
l.append(-1)
test_df_.iat[18,9] = str(l)

In [None]:
# Restore the pickled target statistics onto the Riiid class attributes.
for attr_name, file_name in (
    ('stats_user_target', 'data_stats_user_target.pickle'),
    ('stats_question_target', 'data_stats_question_target.pickle'),
):
    with open(r.path/file_name, mode='rb') as file:
        setattr(Riiid, attr_name, pickle.load(file))

In [None]:
# Restore the pickled NA-fill and dtype dictionaries onto the Riiid class attributes.
for attr_name, file_name in (
    ('_na_dict', 'data_na_dict.pickle'),
    ('_dtype_dict', 'data_dtype_dict.pickle'),
):
    with open(r.path/file_name, mode='rb') as file:
        setattr(Riiid, attr_name, pickle.load(file))

In [None]:
# Restore the four pickled per-user history tables onto the Riiid class attributes.
history_files = (
    ('hist_time_between', 'data_hist_time_between.pickle'),
    ('hist_question_cumcount', 'data_hist_question_cumcount.pickle'),
    ('hist_prior_question_elapsed_time_cumsum', 'data_hist_prior_question_elapsed_time_cumsum.pickle'),
    ('hist_target_cumsum', 'data_hist_target_cumsum.pickle'),
)
for attr_name, file_name in history_files:
    with open(r.path/file_name, mode='rb') as file:
        setattr(Riiid, attr_name, pickle.load(file))

In [None]:
import lightgbm as lgb
bst = lgb.Booster(model_file = str(r.path) + '/models/lgb_p2_75')

In [None]:
# lgb.plot_importance(bst)

In [None]:
target = 'answered_correctly'
features = ['content_id', 'prior_question_elapsed_time',
            'prior_question_had_explanation', 'bundle_id', 'part',
            'num_of_tags', 'bundle_size', 'question_mean',
            'question_std', 'question_skew', 'time_between',
            'question_cumcount', 'target_cumsum',
            'prior_question_elapsed_time_cumsum', 'user_mean',
            'time_per_question', 'timestamp_prior_time_cumsum_diff']

In [None]:
sub = pd.DataFrame(columns=['row_id', target])

In [None]:
%%time
# Simulated inference over the example test set, one group at a time.
# Each group is transformed (with the previous group plus its now-known answers
# when available), non-question rows are filtered out, and predictions are made.
prev_test_df = None
group_predictions = []  # one frame per group; concatenated once after the loop

# NOTE: the number of groups (4) is hard-coded to match the example test file.
for g in range(4):
    test_df = test_df_[test_df_['group_num']==g].copy()

    if prev_test_df is None:
        # First group: there is no earlier group to attach answers to.
        prev_test_df = test_df.copy()
        test_df = r.transform_data(test_df, test=True, inference=True, prev_group=None)
    else:
        current_test = test_df.copy()
        # The first row of the current group carries the previous group's answers
        # as a stringified list.
        # NOTE(security): eval() runs whatever text the cell contains - acceptable
        # for the trusted example file only; ast.literal_eval is the safe choice.
        prev_test_df[target] = eval(test_df["prior_group_answers_correct"].iat[0])

        test_df = r.transform_data(current_test, test=True, inference=True, prev_group=prev_test_df)
        # NOTE(review): this copy is taken after transform_data, whereas the first
        # group is copied before it - confirm transform_data leaves its input intact.
        prev_test_df = current_test.copy()

    # Predict on rows with content_type_id == 0 only.
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)
    test_df[target] = bst.predict(test_df[features])

    group_predictions.append(test_df[['row_id', target]])

# DataFrame.append was removed in pandas 2.0 and re-copied `sub` on every
# iteration; a single concat gives the same rows in the same order.
sub = pd.concat([sub, *group_predictions]).reset_index(drop=True)

In [None]:
%load_ext line_profiler

In [None]:
# %lprun -f function_name_only function_call_with_arguments

In [None]:
%lprun -f r.transform_data r.transform_data(test_df_, test=True, inference=True, prev_group=None)

In [None]:
%lprun -f r._make_time_between r._make_time_between(test_df_)

In [None]:
%lprun -f r._make_target_cumsum r._make_target_cumsum(test_df_)

In [None]:
%lprun -f r._make_question_cumcount r._make_question_cumcount(test_df_)

In [None]:
%lprun -f r._make_prior_question_elapsed_time_cumsum r._make_prior_question_elapsed_time_cumsum(test_df_)

In [None]:
# Load the reference submission in this cell so the comparison does not depend on
# a later cell having been executed first (it fails with NameError on Run All otherwise).
submission = pd.read_csv(r.path/'submission.csv')

# Last ten predictions side by side with the reference values (suffix '_sub').
sub.join(submission['answered_correctly'], rsuffix='_sub').iloc[-10:]

In [None]:
submission = pd.read_csv(r.path/'submission.csv')

In [None]:
submission