In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.preprocessing import StandardScaler
import gc

In [2]:
class Riiid:
    """Feature-engineering pipeline for the Riiid answer-correctness data.

    Every lookup table (question/lecture metadata, per-user and per-question
    statistics, the fitted scaler, the NA fill values and the feature list)
    is stored on the *class*, so all instances share the same state.

    Typical order of use:
        1. ``Riiid.set_data_path(path)``
        2. ``Riiid.load_and_process_questions()`` and
           ``Riiid.load_and_process_lectures()``
        3. ``Riiid().setup_data(train_df)``
        4. ``Riiid().process_data(df)``
    """

    # Directory holding the input/output files; set with set_data_path().
    path = None

    # dtypes and column subset intended for reading train.csv with pd.read_csv.
    dtype = {'row_id': 'int64', 'timestamp': 'int64',
             'user_id': 'int32', 'content_id': 'int16',
             'content_type_id': 'int8', 'task_container_id': 'int16',
             'user_answer': 'int8', 'answered_correctly': 'int8',
             'prior_question_elapsed_time': 'float32',
             'prior_question_had_explanation': 'boolean',
             }

    usecols = ['timestamp', 'user_id', 'content_id',
               'content_type_id', 'task_container_id', 'answered_correctly',
               'prior_question_elapsed_time', 'prior_question_had_explanation']

    # Shared state, filled by the load_* / setup_data / process_data methods.
    questions_df = None
    lectures_df = None
    train_user_target_stats = None
    train_question_target_stats = None
    train_user_question_types_sum = None
    train_user_lecture_types_sum = None
    scaler = None  # ['timestamp', 'task_container_id', 'prior_question_elapsed_time']

    features = None
    _na_dict = None

    # Column-name groups shared by several methods (kept in output order).
    _PART_COLS = ['part_%d' % i for i in range(1, 8)]
    _Q_PART_SUM_COLS = ['q_part_%d_sum' % i for i in range(1, 8)]
    _LECTURE_SUM_COLS = ['type_concept_sum', 'type_intention_sum',
                         'type_solving question_sum', 'type_starter_sum'] \
        + ['l_part_%d_sum' % i for i in range(1, 8)]

    # Final dtype of every column produced by process_data().
    _dtype_dict = {
        'timestamp': 'float64',
        'task_container_id': 'float64',
        'answered_correctly': 'int8',
        'prior_question_elapsed_time': 'float64',
        'prior_question_had_explanation': 'boolean',
        'bundle_id': 'int16',
        'question_tag_count': 'int8',
        **{'part_%d' % i: 'boolean' for i in range(1, 8)},
        'bundle_size': 'int8',
        'question_mean': 'float32',
        'question_count': 'int32',
        'question_sum': 'int32',
        'user_mean': 'float32',
        'user_count': 'int16',
        'user_sum': 'int16',
        **{'q_part_%d_sum' % i: 'int16' for i in range(1, 8)},
        'type_concept_sum': 'uint8',
        'type_intention_sum': 'uint8',
        'type_solving question_sum': 'uint8',
        'type_starter_sum': 'uint8',
        **{'l_part_%d_sum' % i: 'uint8' for i in range(1, 8)},
    }

    def __init__(self):
        pass

    @staticmethod
    def _proc_questions(df):
        """Return a copy of ``df`` with one 0/1 indicator column per tag.

        Args:
            df: frame with a ``tags`` column of space-separated integer tags;
                missing values are allowed.

        Returns:
            A new frame: every missing value replaced by the string ``'-1'``
            and one extra column per distinct tag (integer column labels,
            ascending), holding 1 where the row carries that tag, else 0.
            The caller's frame is left untouched.
        """
        df = df.fillna('-1')  # str(-1): str.split()

        tag_lists = [[int(t) for t in str(tags).split()] for tags in df['tags']]

        # -1 marks "no tags"; it may be absent when no tag is missing.
        all_tags = sorted({t for tags in tag_lists for t in tags} - {-1})
        column_of = {tag: pos for pos, tag in enumerate(all_tags)}

        indicators = np.zeros((len(df), len(all_tags)), dtype='int8')
        for row, tags in enumerate(tag_lists):
            for t in tags:
                if t in column_of:
                    indicators[row, column_of[t]] = 1

        # Reusing df.index keeps the rows aligned for any index, not just 0..n-1.
        return pd.concat(
            [df, pd.DataFrame(indicators, index=df.index, columns=all_tags)],
            axis=1)

    @classmethod
    def load_and_process_questions(cls):
        """Read ``questions.csv`` from ``cls.path`` into ``cls.questions_df``.

        Drops ``correct_answer``, adds ``question_tag_count`` (number of
        space-separated tags, 0 when the tags are missing), one-hot encodes
        ``part`` into boolean ``part_1`` .. ``part_7`` and adds
        ``bundle_size`` (number of questions sharing the bundle_id).
        """
        questions = pd.read_csv(cls.path/'questions.csv') \
            .drop(columns=['correct_answer'])

        # str(NaN) is 'nan', which would count as one tag: treat missing as 0.
        questions['question_tag_count'] = questions['tags'].map(
            lambda tags: 0 if pd.isna(tags) else len(str(tags).split()))

        questions = pd.get_dummies(questions, prefix=['part'], columns=['part']) \
            .astype({col: 'bool' for col in cls._PART_COLS})

        bundle_sizes = questions[['question_id', 'bundle_id']] \
            .groupby('bundle_id').count() \
            .rename(columns={'question_id': 'bundle_size'})

        cls.questions_df = questions.merge(bundle_sizes, how='left', on='bundle_id')

    @classmethod
    def load_and_process_lectures(cls):
        """Read ``lectures.csv`` from ``cls.path`` into ``cls.lectures_df``.

        One-hot encodes ``type_of`` (prefix ``type``) and ``part`` (prefix
        ``part``) into boolean columns.
        """
        bool_cols = ['type_concept', 'type_intention',
                     'type_solving question', 'type_starter'] + cls._PART_COLS

        cls.lectures_df = pd.get_dummies(pd.read_csv(cls.path/'lectures.csv'),
                                         prefix=['type', 'part'],
                                         columns=['type_of', 'part']) \
            .astype({col: 'bool' for col in bool_cols})

    @classmethod
    def set_data_path(cls, path):
        """Store ``path`` (converted to ``pathlib.Path``) as the data directory."""
        cls.path = Path(path)

    @staticmethod
    def _target_stats(train, key, columns, dtypes):
        """Mean/count/sum of ``answered_correctly`` per ``key`` over question rows.

        Args:
            train: interaction frame with ``content_type_id``, ``key`` and
                ``answered_correctly`` columns.
            key: column to group by.
            columns: the four flat output column names (key, mean, count, sum).
            dtypes: dtype mapping applied to the result.
        """
        questions_only = train[train['content_type_id'] == 0]
        stats = questions_only[[key, 'answered_correctly']] \
            .groupby(key).agg(['mean', 'count', 'sum']).reset_index()
        stats.columns = columns
        return stats.astype(dtypes)

    @classmethod
    def _get_user_target_stats(cls, train):
        """Fill ``cls.train_user_target_stats`` (one row per user_id)."""
        # NOTE(review): int16 wraps silently above 32767 - confirm no user has
        # more question rows than that.
        cls.train_user_target_stats = cls._target_stats(
            train, 'user_id',
            ['user_id', 'user_mean', 'user_count', 'user_sum'],
            {'user_id': 'int32', 'user_mean': 'float32',
             'user_count': 'int16', 'user_sum': 'int16'})

    @classmethod
    def _get_question_target_stats(cls, train):
        """Fill ``cls.train_question_target_stats`` (one row per question)."""
        cls.train_question_target_stats = cls._target_stats(
            train, 'content_id',
            ['question_id', 'question_mean', 'question_count', 'question_sum'],
            {'question_id': 'int16', 'question_mean': 'float32',
             'question_count': 'int32', 'question_sum': 'int32'})

    @classmethod
    def _get_question_stats(cls, train):
        """Fill ``cls.train_user_question_types_sum``.

        For every user, sums the ``part_*`` indicators of ``cls.questions_df``
        over the user's question rows. Requires ``cls.questions_df``.
        """
        questions_only = train[train['content_type_id'] == 0]

        sums = questions_only[['user_id', 'content_id']] \
            .merge(cls.questions_df, how='left',
                   left_on='content_id', right_on='question_id') \
            .drop(columns=['content_id', 'question_id', 'bundle_id', 'tags',
                           'question_tag_count', 'bundle_size']) \
            .groupby('user_id').sum().reset_index()

        # Positional rename: relies on the part_1..part_7 order of questions_df.
        sums.columns = ['user_id'] + cls._Q_PART_SUM_COLS

        cls.train_user_question_types_sum = sums.astype(
            {'user_id': 'int32', **{col: 'int16' for col in cls._Q_PART_SUM_COLS}})

    @classmethod
    def _get_lecture_stats(cls, train):
        """Fill ``cls.train_user_lecture_types_sum``.

        For every user, sums the ``type_*`` and ``part_*`` indicators of
        ``cls.lectures_df`` over the user's lecture rows. Requires
        ``cls.lectures_df``.
        """
        lectures_only = train[train['content_type_id'] == 1]

        sums = lectures_only[['user_id', 'content_id']] \
            .merge(cls.lectures_df, how='left',
                   left_on='content_id', right_on='lecture_id') \
            .drop(columns=['content_id', 'lecture_id', 'tag']) \
            .groupby('user_id').sum().reset_index()

        # Positional rename: relies on lectures_df listing the type_* columns
        # before the part_* columns.
        sums.columns = ['user_id'] + cls._LECTURE_SUM_COLS

        # NOTE(review): uint8 wraps silently above 255 - confirm no user has
        # more lectures of one kind than that.
        cls.train_user_lecture_types_sum = sums.astype(
            {'user_id': 'int32', **{col: 'uint8' for col in cls._LECTURE_SUM_COLS}})

    @classmethod
    def _get_scaler(cls, train):
        """Fit a StandardScaler on the three numeric columns (NaN -> 0)."""
        numeric = train[['timestamp', 'task_container_id', 'prior_question_elapsed_time']]
        cls.scaler = StandardScaler(copy=True, with_mean=True, with_std=True) \
            .fit(numeric.fillna(0.))

    @staticmethod
    def _first_mode(series):
        """Return the first (smallest) mode of ``series``.

        ``Series.mode()`` returns every tied value; converting that Series
        directly with ``int()`` fails on ties, so one value is picked here.
        """
        return series.mode().iloc[0]

    @classmethod
    def _build_na_dict(cls):
        """Build the column -> fill value mapping used for leftover NAs.

        Means are used for ``question_mean``/``user_mean``; every other column
        uses the mode of the matching lookup table. Requires the questions
        table and the four statistics tables to be populated.
        """
        first_mode = cls._first_mode
        questions = cls.questions_df
        question_stats = cls.train_question_target_stats
        user_stats = cls.train_user_target_stats
        user_question_sums = cls.train_user_question_types_sum
        user_lecture_sums = cls.train_user_lecture_types_sum

        na_dict = {'question_tag_count': int(first_mode(questions['question_tag_count']))}
        for col in cls._PART_COLS:
            na_dict[col] = bool(int(first_mode(questions[col])))
        na_dict['bundle_size'] = int(first_mode(questions['bundle_size']))

        na_dict['question_mean'] = question_stats['question_mean'].mean()
        na_dict['question_count'] = int(first_mode(question_stats['question_count']))
        na_dict['question_sum'] = int(first_mode(question_stats['question_sum']))

        na_dict['user_mean'] = user_stats['user_mean'].mean()
        na_dict['user_count'] = int(first_mode(user_stats['user_count']))
        na_dict['user_sum'] = int(first_mode(user_stats['user_sum']))

        for col in cls._Q_PART_SUM_COLS:
            na_dict[col] = int(first_mode(user_question_sums[col]))
        for col in cls._LECTURE_SUM_COLS:
            na_dict[col] = int(first_mode(user_lecture_sums[col]))

        return na_dict

    def setup_data(self, df):
        """Compute whatever shared lookup state is still missing.

        Each statistics table and the NA fill values are computed only when
        the class attribute is still ``None`` (so tables loaded from disk are
        kept); the scaler is refitted on ``df`` on every call. A progress
        line is printed after each stage, whether or not it was recomputed.

        Args:
            df: training interactions (questions and lectures).
        """
        if Riiid.train_user_target_stats is None:
            Riiid._get_user_target_stats(df)
        print('train_user_target_stats - Done')

        if Riiid.train_question_target_stats is None:
            Riiid._get_question_target_stats(df)
        print('train_question_target_stats - Done')

        Riiid._get_scaler(df)
        print('scaler - Done')

        if Riiid.train_user_question_types_sum is None:
            Riiid._get_question_stats(df)
        print('train_user_question_types_sum - Done')

        if Riiid.train_user_lecture_types_sum is None:
            Riiid._get_lecture_stats(df)
        print('train_user_lecture_types_sum - Done')

        if Riiid._na_dict is None:
            Riiid._na_dict = Riiid._build_na_dict()
        print('_na_dict - Done')

    def process_data(self, df, test=False, verbose=False):
        """Turn raw interactions into the model feature frame.

        Args:
            df: raw interactions with the columns listed in ``usecols``;
                ``answered_correctly`` may be absent.
            test: when False, lecture rows are dropped first; when True all
                rows are kept.
            verbose: print a progress line after every step.

        Returns:
            A new frame with the question metadata, the question/user
            statistics and the scaled numeric columns, without ``user_id``,
            ``content_id`` and ``content_type_id``, cast to ``_dtype_dict``.
            On the first call the column list (minus the target) is stored in
            ``Riiid.features``.
        """
        if not test:  # we need questions and lectures for test
            # step 0 = keep questions only
            df = df[df['content_type_id'] == 0]
            if verbose: print('step 0 (keep questions only) - Done')

        # step 1 = fillna for prior_question_elapsed_time and prior_question_had_explanation
        df = df.fillna({'prior_question_elapsed_time': 0.,
                        'prior_question_had_explanation': False})
        if verbose: print('step 1 (fillna: prior_question_elapsed_time & prior_question_had_explanation) - Done')

        # step 2 merge question without question_id, content_type_id and tags
        df = df.merge(self.questions_df, how='left',
                      left_on='content_id', right_on='question_id') \
            .drop(columns=['question_id', 'content_type_id', 'tags'])

        # rows without a matching question (lectures) get 0 before the int casts
        df = df.fillna({'bundle_id': 0, 'question_tag_count': 0, 'bundle_size': 0}) \
            .astype({'bundle_id': 'int16', 'question_tag_count': 'int8',
                     'bundle_size': 'int8'})
        if verbose: print('step 2 (merge questions_df) - Done')

        # step 3 merge question target stats
        df = df.merge(self.train_question_target_stats, how='left',
                      left_on='content_id', right_on='question_id') \
            .drop(columns=['content_id', 'question_id'])
        if verbose: print('step 3 (merge train_question_target_stats) - Done')

        # step 4 merge train_user_target_stats
        df = df.merge(self.train_user_target_stats, how='left', on='user_id')
        if verbose: print('step 4 (merge train_user_target_stats) - Done')

        # step 5 merge train_user_question_types_sum
        df = df.merge(self.train_user_question_types_sum, how='left', on='user_id')
        if verbose: print('step 5 (merge train_user_question_types_sum) - Done')

        # step 6 merge train_user_lecture_types_sum, fill users without any
        # lecture row with 0 and drop user_id
        df = df.merge(self.train_user_lecture_types_sum, how='left', on='user_id') \
            .fillna({col: 0. for col in self._LECTURE_SUM_COLS}) \
            .astype({col: 'uint8' for col in self._LECTURE_SUM_COLS}) \
            .drop(columns=['user_id'])
        if verbose: print('step 6 (merge train_user_lecture_types) - Done')

        # step 7 transform ['timestamp', 'task_container_id', 'prior_question_elapsed_time']
        cols_transform = ['timestamp', 'task_container_id', 'prior_question_elapsed_time']
        df[cols_transform] = self.scaler.transform(df[cols_transform])
        if verbose: print('step 7 (scaler.transform) - Done')

        # step 8 fill remaining NAs (using _na_dict)
        df = df.fillna(Riiid._na_dict)
        if verbose: print('step 8 (fill remaining NAs) - Done')

        # step 9 convert dtypes (using _dtype_dict); the target column is
        # optional, so it is only cast when present
        final_dtypes = dict(Riiid._dtype_dict)
        if 'answered_correctly' not in df.columns:
            final_dtypes.pop('answered_correctly')
        df = df.astype(final_dtypes)
        if verbose: print('step 9 (convert dtypes) - Done')

        # save features
        if Riiid.features is None:
            Riiid.features = [col for col in df.columns if col != 'answered_correctly']

        return df

    def split_data(self, df, n_iter=30):
        """Split into train and validation datasets.

        For every user, the rows carrying one of the user's ``n_iter`` most
        recent distinct timestamps go to validation; all other rows go to
        train. ``df`` is not modified.

        Args:
            df: frame with ``user_id`` and ``timestamp`` columns.
            n_iter: number of most recent distinct timestamps per user to
                hold out.

        Returns: train and validation indices (index labels of ``df``)
        """
        # Dense rank 1 = latest timestamp of the user, 2 = second latest, ...
        recency = df.groupby('user_id')['timestamp'] \
            .rank(method='dense', ascending=False)
        is_val = (recency <= n_iter).to_numpy()

        return df.index[~is_val], df.index[is_val]

    def save_data(self, df, name):
        """Write ``df`` to ``<path>/<name>.feather``."""
        df.to_feather(self.path/(name + '.feather'))

    def load_data(self, name):
        """Read and return ``<path>/<name>.feather``."""
        return pd.read_feather(self.path/(name + '.feather'))
    

In [3]:
# Point the pipeline at the data directory (relative to the notebook); the CSV
# and feather files are read from and written to this folder.
Riiid.set_data_path(path=r'./data')

In [4]:
# Read questions.csv and build Riiid.questions_df (tag count, one-hot part
# columns, bundle size).
Riiid.load_and_process_questions()

In [5]:
# Read lectures.csv and build Riiid.lectures_df (one-hot lecture type and part
# columns).
Riiid.load_and_process_lectures()

In [6]:
Riiid.questions_df.head(2)

Unnamed: 0,question_id,bundle_id,tags,question_tag_count,part_1,part_2,part_3,part_4,part_5,part_6,part_7,bundle_size
0,0,0,51 131 162 38,4,True,False,False,False,False,False,False,1
1,1,1,131 36 81,3,True,False,False,False,False,False,False,1


In [7]:
Riiid.lectures_df.head(2)

Unnamed: 0,lecture_id,tag,type_concept,type_intention,type_solving question,type_starter,part_1,part_2,part_3,part_4,part_5,part_6,part_7
0,89,159,True,False,False,False,False,False,False,False,True,False,False
1,100,70,True,False,False,False,True,False,False,False,False,False,False


In [8]:
# The lookup tables live on the class itself; the instance is only the handle
# used to call the pipeline methods.
r = Riiid()

In [None]:
# r.save_data(data_df, name='data')

In [None]:
# train_idx, val_idx = r.split_data(data_df)

In [None]:
# r.save_data(data_df.iloc[train_idx].reset_index(drop=True), name='train')
# r.save_data(data_df.iloc[val_idx].reset_index(drop=True), name='val')

In [9]:
# Load the training rows from <data path>/train.feather.
# Alternative kept for reference: read a 500k-row sample straight from train.csv.
# train_df = pd.read_csv(r.path/'train.csv', nrows=500000, dtype=r.dtype, usecols=r.usecols)
train_df = r.load_data('train')
# NOTE(review): the raw validation frame is not loaded here, so val_df stays
# undefined until the 'val_transformed' load further down.
# val_df = r.load_data('val')

In [10]:
train_df.head(2)

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,115,5692,0,1,1,,
1,56943,115,5716,0,2,1,37000.0,False


In [None]:
# NOTE(review): on a top-to-bottom run val_df is not defined yet at this point
# (its load above is commented out and it is only assigned later from
# 'val_transformed'), so this cell would raise NameError.
val_df.head(2)

In [11]:
r.questions_df.head(2)

Unnamed: 0,question_id,bundle_id,tags,question_tag_count,part_1,part_2,part_3,part_4,part_5,part_6,part_7,bundle_size
0,0,0,51 131 162 38,4,True,False,False,False,False,False,False,1
1,1,1,131 36 81,3,True,False,False,False,False,False,False,1


In [12]:
r.lectures_df.head(2)

Unnamed: 0,lecture_id,tag,type_concept,type_intention,type_solving question,type_starter,part_1,part_2,part_3,part_4,part_5,part_6,part_7
0,89,159,True,False,False,False,False,False,False,False,True,False,False
1,100,70,True,False,False,False,True,False,False,False,False,False,False


In [13]:
# Load the previously saved statistics tables from feather files. setup_data()
# only computes tables that are still None, so these are reused as they are.
Riiid.train_user_target_stats = r.load_data('train_user_target_stats')
Riiid.train_question_target_stats =r.load_data('train_question_target_stats')
Riiid.train_user_question_types_sum = r.load_data('train_user_question_types_sum')
Riiid.train_user_lecture_types_sum =r.load_data('train_user_lecture_types_sum')

In [None]:
# Riiid.train_user_target_stats = r.load_data('data_user_target_stats')
# Riiid.train_question_target_stats =r.load_data('data_question_target_stats')
# Riiid.train_user_question_types_sum = r.load_data('data_user_question_types_sum')
# Riiid.train_user_lecture_types_sum =r.load_data('data_user_lecture_types_sum')

In [14]:
# Fit the scaler on train_df and build the NA fill values. The statistics
# tables loaded above are kept, although a "Done" line is printed for each.
r.setup_data(train_df)

train_user_target_stats - Done
train_question_target_stats - Done
scaler - Done
train_user_question_types_sum - Done
train_user_lecture_types_sum - Done
_na_dict - Done


In [None]:
# r.save_data(r.train_user_target_stats, name='train_user_target_stats')
# r.save_data(r.train_question_target_stats, name='train_question_target_stats')
# r.save_data(r.train_user_question_types_sum, name='train_user_question_types_sum')
# r.save_data(r.train_user_lecture_types_sum, name='train_user_lecture_types_sum')

In [None]:
# r.save_data(r.train_user_target_stats, name='data_user_target_stats')
# r.save_data(r.train_question_target_stats, name='data_question_target_stats')
# r.save_data(r.train_user_question_types_sum, name='data_user_question_types_sum')
# r.save_data(r.train_user_lecture_types_sum, name='data_user_lecture_types_sum')

In [15]:
r.train_user_target_stats.head(2)

Unnamed: 0,user_id,user_mean,user_count,user_sum
0,115,0.833333,12,10
1,5382,0.701031,97,68


In [16]:
r.train_question_target_stats.head(2)

Unnamed: 0,question_id,question_mean,question_count,question_sum
0,0,0.913167,6357,5805
1,1,0.896999,6932,6218


In [17]:
r.train_user_question_types_sum.head(2)

Unnamed: 0,user_id,q_part_1_sum,q_part_2_sum,q_part_3_sum,q_part_4_sum,q_part_5_sum,q_part_6_sum,q_part_7_sum
0,115,10,0,0,0,2,0,0
1,5382,12,32,0,0,53,0,0


In [18]:
r.train_user_lecture_types_sum.head(2)

Unnamed: 0,user_id,type_concept_sum,type_intention_sum,type_solving question_sum,type_starter_sum,l_part_1_sum,l_part_2_sum,l_part_3_sum,l_part_4_sum,l_part_5_sum,l_part_6_sum,l_part_7_sum
0,5382,1,0,0,0,1,0,0,0,0,0,0
1,8623,2,0,0,0,2,0,0,0,0,0,0


In [19]:
train_df.shape, #val_df.shape

((88777729, 8),)

Transformation train

In [None]:
# Build the feature frame for the training rows.
# NOTE(review): this overwrites the raw train_df, so the cell cannot be re-run
# without reloading the data first; the recorded output below stops after
# step 5 - confirm the run completed.
train_df = r.process_data(train_df, verbose=True)
# Alternative kept for reference: free the raw frame and load a cached result.
# del train_df
# gc.collect()
# train_df = r.load_data('data_transformed')
# train_df = r.load_data('train_transformed')

step 0 (keep questions only) - Done
step 1 (fillna: prior_question_elapsed_time & prior_question_had_explanation) - Done
step 2 (merge questions_df) - Done
step 3 (merge train_question_target_stats) - Done
step 4 (merge train_user_target_stats) - Done
step 5 (merge train_user_question_types_sum) - Done


In [None]:
# r.save_data(train_df, 'data_transformed')
# r.save_data(train_df, 'train_transformed')

In [None]:
train_df.isna().sum().sum()

In [None]:
train_df.head()

In [None]:
train_df.shape

In [None]:
# Load the cached, already transformed validation frame from
# <data path>/val_transformed.feather; the commented line recomputes it from
# the raw val_df instead.
# val_df = r.process_data(val_df, test=True, verbose=True)
val_df = r.load_data('val_transformed')

In [None]:
# r.save_data(val_df, 'val_transformed')

In [None]:
val_df.isna().sum().sum()

In [None]:
val_df.head(2)

In [None]:
val_df.shape

In [None]:
len(r.features)

In [None]:
# train_df[['timestamp', 'user_id']].groupby('user_id').max().join(
#     val_df[['timestamp', 'user_id']].groupby('user_id').min(), how='outer',lsuffix='_train', rsuffix='_val').isna().sum()