In [None]:
import ast
import gc
import pickle
from collections import deque
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

pd.options.display.max_columns=40
pd.options.display.max_rows=130

In [None]:
class Riiid:
    """Feature pipeline for the Riiid answer-correctness data.

    Almost all state lives on the class rather than on instances: ``path``,
    the lookup tables (``questions_df``, ``tb``, ``tpq``) and the per-user
    running statistics in ``user_hist_dict`` are class attributes, so every
    instance shares them.

    Two code paths produce the same feature columns:

    * ``transform_data(df, inference=False)`` computes them with vectorised
      group-by operations over a full interaction log (training);
    * ``transform_data(df, inference=True)`` reads them row by row from
      ``user_hist_dict`` (see ``_scan_user_data``), and ``update_hist_data``
      folds the revealed answers of the previous batch back into that state.
    """

    # dtypes / columns intended for reading train.csv
    dtype={'row_id': 'int64', 'timestamp': 'int64',
           'user_id': 'int32', 'content_id': 'int16',
           'content_type_id': 'int8', 'task_container_id': 'int16',
           'user_answer': 'int8', 'answered_correctly': 'int8',
           'prior_question_elapsed_time': 'float32',
           'prior_question_had_explanation': 'boolean',
          }

    usecols=['row_id', 'timestamp', 'user_id', 'content_id',
             'content_type_id','task_container_id',
             'answered_correctly',
             'prior_question_elapsed_time','prior_question_had_explanation']

    # lookup tables joined in transform_data
    questions_df = None  # indexed by question_id
    tb = None            # joined on 'time_between_cat'; expected to provide 'tb_mean'
    tpq = None           # joined on 'time_per_question_cat'; expected to provide 'tpq_mean'
    tptc = None          # not used by the code in this class

    # user_id -> dict of running statistics (shared by all instances)
    user_hist_dict = {}
    content_hist_dict = {}

    features = None
    _na_dict = None
    _dtype_dict = None

    def __init__(self, path):
        """Remember the data directory.

        Args:
            path: directory holding the csv / feather / pickle files. It is
                stored on the class (``Riiid.path``) because the classmethods
                read ``cls.path``.
        """
        Riiid.path = Path(path)

    def setup(self, path):
        """Load the pickled lookup tables and rebuild the per-user history.

        Reads ``data/questions_df.pickle``, ``data/tb.pickle``,
        ``data/tpq.pickle`` and ``data/data_min.pickle`` relative to ``path``
        (note: the ``path`` argument, not ``Riiid.path``).

        Args:
            path: project root that contains the ``data`` directory.
        """
        # NOTE(security): pickle.load executes arbitrary code embedded in the
        # file; only point this at pickles you produced yourself.
        with open(Path(path)/'data/questions_df.pickle', mode='rb') as file:
            Riiid.questions_df = pickle.load(file)

        with open(Path(path)/'data/tb.pickle', mode='rb') as file:
            Riiid.tb = pickle.load(file)
        with open(Path(path)/'data/tpq.pickle', mode='rb') as file:
            Riiid.tpq = pickle.load(file)
#         with open(Path(path)/'data/tptc.pickle', mode='rb') as file:
#             Riiid.tptc = pickle.load(file)

        with open(Path(path)/'data/data_min.pickle', mode='rb') as file:
            train_df = pickle.load(file)

        self._get_history_dict(train_df)
        del train_df
        gc.collect()

    @staticmethod
    def _proc_question_tags(df):
        """Replace the space-separated ``tags`` column by one indicator column per tag.

        Args:
            df: frame with a string column ``tags``.

        Returns:
            A new frame: ``df`` without ``tags`` followed by the dummy columns.
        """
        # axis is passed by keyword: positional axis arguments to
        # DataFrame.drop / pd.concat were removed in pandas 2.0.
        return pd.concat([df.drop(columns='tags'),
                          df['tags'].str.get_dummies(sep=" ")], axis=1)

    @staticmethod
    def _get_streak(s):
        """Return the signed length of the run of equal answers that ends ``s``.

        Positive for a run of 1s, negative for a run of 0s. Values other than
        0 and 1 (lectures) are skipped and do not break a run. ``prev`` starts
        at 0, so a sequence that begins with 0 starts at -1.

        Args:
            s: iterable of answers.

        Returns:
            int: the final streak, 0 for an empty sequence.
        """
        prev = 0
        streak = 0
        for i in s:
            if i == 0:
                streak = streak - 1 if prev == 0 else -1
            elif i == 1:
                streak = streak + 1 if prev == 1 else 1
            else: # lecture
                continue
            prev = i
        return streak

    @staticmethod
    def _get_history_dict(df):
        """Populate ``Riiid.user_hist_dict`` from an interaction log.

        ``df`` is modified in place: ``answered_correctly`` is replaced by its
        per-user shifted value (first row filled with 0) and a
        ``prior_question_elapsed_time_cumsum`` column is added.

        For every user the following keys are stored: ``n_attempts``
        (content_id -> number of repeats, i.e. count - 1), ``user_sum``,
        ``user_count``, ``user_mean``, ``user_L10`` (deque of the last 10
        shifted answers), ``user_L10_mean``, ``user_streak``, ``timestamp``
        (max per user), ``prior_question_elapsed_time_cumsum`` and ``tb``.

        Args:
            df: frame with columns ``user_id``, ``content_id``,
                ``answered_correctly``, ``timestamp`` and
                ``prior_question_elapsed_time``; rows of one user are assumed
                to be in chronological order.
        """
        #shift target
        df['answered_correctly'] = df.groupby('user_id')['answered_correctly'].shift()
        df['answered_correctly'] = df['answered_correctly'].fillna(0).astype(np.int8)
        gc.collect()

        # Every aggregate below comes from a groupby on user_id with the
        # default sort=True, so they all share one ordering: ascending user_id.
        target_sum = df.groupby('user_id')['answered_correctly'].sum().values.astype(np.int16)
        target_count = df.groupby('user_id')['answered_correctly'].count().values.astype(np.int16)
        user_targets = df.groupby('user_id')['answered_correctly'].apply(np.array)

        timestamp = df.groupby('user_id')['timestamp'].max().values.astype(np.int64)
        df['prior_question_elapsed_time_cumsum'] = \
            df.groupby('user_id')['prior_question_elapsed_time'].cumsum()

        prior_question_elapsed_cumsum = df.groupby('user_id')['prior_question_elapsed_time_cumsum']\
            .max().values.astype(np.int64)

        # Take the user order from the groupby index. The dictionary's own key
        # order must not be used: it follows first appearance in df (and may
        # already contain other users), which differs from the sorted order of
        # the aggregates whenever df is not sorted by user_id.
        user_ids = list(user_targets.index)

        # add user content attempts
        user_content = df.groupby('user_id')['content_id'].apply(np.array).apply(np.sort).apply(np.unique)
        user_attempts = df.groupby(['user_id', 'content_id'])['content_id'].count()\
            .astype(np.uint8).groupby('user_id').apply(np.array).values
        # count -> number of repeats (first attempt is 0)
        user_attempts = user_attempts - 1

        for user_id, content, attempt in zip(user_ids, user_content, user_attempts):
            # (re)create the user's entry
            Riiid.user_hist_dict[user_id] = {'n_attempts': dict(zip(content, attempt))}

        del user_content, user_attempts
        gc.collect()

        for idx, (user_id, answers) in enumerate(zip(user_ids, user_targets)):
            state = Riiid.user_hist_dict[user_id]

            state['user_sum'] = target_sum[idx]
            state['user_count'] = target_count[idx]
            state['user_mean'] = (state['user_sum'] / state['user_count']).astype(np.float16)

            state['user_L10'] = deque(answers, maxlen=10)
            state['user_L10_mean'] = np.mean(state['user_L10']).astype(np.float16)
            state['user_streak'] = Riiid._get_streak(answers)

            state['timestamp'] = timestamp[idx]
            state['prior_question_elapsed_time_cumsum'] = prior_question_elapsed_cumsum[idx]
            # _scan_user_data reads 'tb' when a row repeats the stored
            # timestamp; use the same default as for a brand-new user.
            state['tb'] = 0

        del timestamp, prior_question_elapsed_cumsum
        del target_sum, target_count, user_targets
        gc.collect()

    @classmethod
    def _set_up_questions_df(cls, train):
        """Create questions_df using stats from train.

        Reads ``questions.csv`` from ``cls.path``, label-encodes ``tags``,
        adds ``num_of_tags`` and ``bundle_size``, then joins mean / std / skew
        of ``answered_correctly`` per question, per bundle and per tag set
        (question rows of ``train`` only). Missing statistics are filled with
        the column mean. The result is stored in ``cls.questions_df`` indexed
        by ``question_id``; the fitted encoder is kept in ``cls.le``.

        Args:
            train: interaction log with ``content_id``, ``content_type_id``
                and the (unshifted) ``answered_correctly``.
        """

        cls.questions_df = pd.read_csv(cls.path/'questions.csv')

        # drop columns
        cls.questions_df = cls.questions_df.drop(columns=['correct_answer'])

        # add number of tags
        # NOTE(review): this runs before the fillna below, so a missing tags
        # value is str()-ed to 'nan' and counted as one tag.
        cls.questions_df['num_of_tags'] = cls.questions_df['tags'].map(lambda x: len(str(x).split()))

        # encode tags
        cls.questions_df = cls.questions_df.fillna({'tags':str(-1)})
        cls.le = LabelEncoder()
        cls.questions_df['tags'] = cls.le.fit_transform(cls.questions_df['tags'])

        # add number of questions in bundle
        tmp = cls.questions_df[['question_id', 'bundle_id']] \
            .groupby('bundle_id').count() \
            .rename(columns={'question_id':'bundle_size'})

        cls.questions_df = cls.questions_df.join(tmp, on='bundle_id')

        # add content stats
        stats = train.loc[train['content_type_id']==0, ['content_id', 'answered_correctly']] \
            .groupby('content_id').agg(['mean', 'std', 'skew'])

        stats.columns = stats.columns.droplevel()
        stats.columns = ['question_mean', 'question_std', 'question_skew']
        stats = stats.astype({'question_mean':'float16',
                              'question_std':'float16','question_skew':'float16'})

        cls.questions_df = cls.questions_df.join(stats, on='question_id')

        # add bundle stats
        tmp = cls.questions_df[['question_id', 'bundle_id']].set_index(keys='question_id')
        train = train.join(tmp, on='content_id')
        stats = train.loc[train['content_type_id']==0, ['bundle_id', 'answered_correctly']] \
            .groupby('bundle_id').agg(['mean', 'std', 'skew'])

        stats.columns = stats.columns.droplevel()
        stats.columns = ['bundle_mean', 'bundle_std', 'bundle_skew']
        stats = stats.astype({'bundle_mean':'float16',
                              'bundle_std':'float16','bundle_skew':'float16'})

        cls.questions_df = cls.questions_df.join(stats, on='bundle_id')

        # add tags stats
        tmp = cls.questions_df[['question_id', 'tags']].set_index(keys='question_id')
        train = train.join(tmp, on='content_id')
        stats = train.loc[train['content_type_id']==0, ['tags', 'answered_correctly']] \
            .groupby('tags').agg(['mean', 'std', 'skew'])

        stats.columns = stats.columns.droplevel()
        stats.columns = ['tags_mean', 'tags_std', 'tags_skew']
        stats = stats.astype({'tags_mean':'float16',
                              'tags_std':'float16','tags_skew':'float16'})

        cls.questions_df = cls.questions_df.join(stats, on='tags')

        # fillna
        cls.questions_df = cls.questions_df.fillna(
            {'tags':str(-1),
             'question_mean':cls.questions_df['question_mean'].mean(),
             'question_std':cls.questions_df['question_std'].mean(),
             'question_skew':cls.questions_df['question_skew'].mean(),
             'bundle_mean':cls.questions_df['bundle_mean'].mean(),
             'bundle_std':cls.questions_df['bundle_std'].mean(),
             'bundle_skew':cls.questions_df['bundle_skew'].mean(),
             'tags_mean':cls.questions_df['tags_mean'].mean(),
             'tags_std':cls.questions_df['tags_std'].mean(),
             'tags_skew':cls.questions_df['tags_skew'].mean()})

        # set_index to question_id for optimised join
        cls.questions_df = cls.questions_df.set_index('question_id', verify_integrity=True)

    @classmethod
    def load_and_process_lectures(cls):
        """Read ``lectures.csv`` from ``cls.path`` into ``cls.lectures_df``.

        The ``type_of`` column is converted to a pandas categorical.
        """

        cls.lectures_df = pd.read_csv(cls.path/'lectures.csv')

        # process lectures data
        cls.lectures_df['type_of'] = cls.lectures_df['type_of'].astype('category')
#         types_of = ('type_starter', 'type_concept', 'type_intention', 'type_solving question')
#         cls.lectures_df['type_of'].cat.set_categories(types_of, ordered=False, inplace=True)

    @staticmethod
    def _scan_user_data(hist, df):
        """Read per-row historical features from ``hist``, updating it as it goes.

        Rows are processed in frame order. Lecture rows (truthy
        ``content_type_id``) get zeros for every feature and leave ``hist``
        untouched. For question rows the user's entry is created on first
        sight; otherwise its attempt counter for the content (capped at 4) and
        elapsed-time cumsum are advanced. The stored timestamp / ``tb`` are
        advanced when the row's timestamp is newer than the stored one.
        Answer-based statistics are only read here; ``update_hist_data``
        writes them.

        Args:
            hist: user_id -> state dict (normally ``Riiid.user_hist_dict``);
                mutated in place.
            df: batch with ``user_id``, ``content_id``, ``content_type_id``,
                ``timestamp`` and ``prior_question_elapsed_time`` (no NaN).

        Returns:
            Tuple of lists, one value per row of ``df``, in this order:
            user_mean, user_count, user_sum, n_attempts, time_between,
            prior_question_elapsed_time_cumsum, user_L10_mean, user_streak.

        Raises:
            ValueError: if a question row is older than the user's stored
                timestamp.
        """

        # prev
        user_mean, user_sum, user_count = [], [], []
        user_L10_mean,  user_streak = [], []
        # current
        n_attempts= []
        time_between = []
        prior_question_elapsed_time_cumsum = []

        # iterrows upcasts the five numeric columns to one float dtype; dict
        # lookups still hit the integer keys because equal numbers hash alike.
        for idx, (user_id, content_id, content_type_id,
                  timestamp, prior_question_elapsed_time) in df[['user_id', 'content_id',
                                                                 'content_type_id', 'timestamp',
                                                                 'prior_question_elapsed_time']].iterrows():
            # fill in dummy for lectures
            if content_type_id:
                n_attempts.append(0)
                user_mean.append(0.)
                user_sum.append(0)
                user_count.append(0)
                user_L10_mean.append(0.)
                user_streak.append(0)

                time_between.append(0)
                prior_question_elapsed_time_cumsum.append(0)
                continue

            # check if user exists
            if user_id in hist:
                # check if user already answered the question, if so update it to a maximum of 4
                if content_id in hist[user_id]['n_attempts']:
                    hist[user_id]['n_attempts'][content_id] = min(4, hist[user_id]['n_attempts'][content_id] + 1)
                # if user did not answered the question already, set the number of attempts to 0
                else:
                    hist[user_id]['n_attempts'][content_id] = 0

                hist[user_id]['prior_question_elapsed_time_cumsum'] += prior_question_elapsed_time

            # else create user with default values
            else:
                dict_keys = ['user_mean', 'user_sum', 'user_count',
                             'user_L10', 'user_L10_mean', 'user_streak',
                             'n_attempts',
                             'timestamp', 'prior_question_elapsed_time_cumsum', 'tb']
                dict_default_vals = [0, 0, 1, deque(maxlen=10), 0, 0, dict(zip([content_id],[0])),
                                     timestamp, prior_question_elapsed_time, 0]
                hist[user_id] = dict(zip(dict_keys, dict_default_vals))

            # add user data to lists
            n_attempts.append(hist[user_id]['n_attempts'][content_id])
            user_mean.append(hist[user_id]['user_mean'])
            user_sum.append(hist[user_id]['user_sum'])
            user_count.append(hist[user_id]['user_count'])
            user_L10_mean.append(hist[user_id]['user_L10_mean'])
            user_streak.append(hist[user_id]['user_streak'])

            prior_question_elapsed_time_cumsum.append(hist[user_id]['prior_question_elapsed_time_cumsum'])

            if timestamp > hist[user_id]['timestamp']:
                time_between.append(timestamp - hist[user_id]['timestamp'])
                hist[user_id]['tb'] = timestamp - hist[user_id]['timestamp']
                hist[user_id]['timestamp'] = timestamp
            elif timestamp == hist[user_id]['timestamp']:
                # Same timestamp as the stored one: reuse the stored gap.
                # .get guards entries that were built without a 'tb' key.
                time_between.append(hist[user_id].get('tb', 0))
            else: # This should not happen
                raise ValueError('Current timestamp is lower then previous')

        return (user_mean, user_count, user_sum, n_attempts,
                time_between, prior_question_elapsed_time_cumsum,
                user_L10_mean, user_streak)

    def update_hist_data(self, prev_test_df):
        """Fold the now-known answers of the previous batch into the user state.

        Lecture rows are skipped. Every user of a question row must already
        have an entry in ``Riiid.user_hist_dict`` (``_scan_user_data`` creates
        it when the batch is transformed).

        Args:
            prev_test_df: previous batch with ``user_id``, ``content_id``,
                ``content_type_id`` and the revealed ``answered_correctly``.
        """
        for (user_id, content_id,
             content_type_id, answered_correctly) in prev_test_df[['user_id', 'content_id',
                                                                 'content_type_id','answered_correctly']].values:
            # skip lectures
            if content_type_id:
                continue

            state = Riiid.user_hist_dict[user_id]

            # update user features
            state['user_sum'] += answered_correctly
            # Increment the count *before* taking the mean so that
            # user_mean == user_sum / user_count, the same definition used by
            # the training branch (target_cumsum / target_cumcount) and by
            # _get_history_dict.
            state['user_count'] += 1
            state['user_mean'] = np.float16(state['user_sum'] / state['user_count'])
            state['user_L10'].append(answered_correctly)
            state['user_L10_mean'] = np.mean(state['user_L10'])

            if answered_correctly:
                # extend a winning streak or start a new one
                if state['user_streak'] > 0:
                    state['user_streak'] += 1
                else:
                    state['user_streak'] = 1
            else:
                # extend a losing streak or start a new one
                if state['user_streak'] < 0:
                    state['user_streak'] -= 1
                else:
                    state['user_streak'] = -1

    def transform_data(self, df, inference=True, verbose=False):
        """Build the model's feature frame from raw interaction rows.

        Args:
            df: raw rows. The caller's frame is not modified (the first
                ``fillna`` produces a new frame that all later steps work on).
            inference: if True, historical features are read row by row from
                ``Riiid.user_hist_dict`` (which is updated as a side effect)
                and lecture rows are kept. If False, lecture rows are dropped
                and the features are computed with group-by operations over
                ``df`` itself; ``answered_correctly`` is then replaced by its
                per-user shifted value.
            verbose: print a line after each step.

        Returns:
            A new DataFrame with the historical features, the joined
            ``tpq`` / ``tb`` / ``questions_df`` columns and the derived ratio
            columns.
        """

        if not inference: # we need questions and lectures for inference
            # step 0 = keep questions only
            df = df.loc[df['content_type_id']==0]
            if verbose: print('step 0 (keep questions only) - Done')
        gc.collect()

        # step 1 = fillna for prior_question_elapsed_time and prior_question_had_explanation
        df = df.fillna({'prior_question_elapsed_time':0.,
                        'prior_question_had_explanation':False})
        if verbose: print('step 1 (fillna: prior_question_elapsed_time & prior_question_had_explanation) - Done')

        # timestamp is divided by the number of milliseconds in a day
        df['days'] = np.floor(df['timestamp']/(1000*60*60*24))

        # step 3 add historical features
        if inference:

            (user_mean, user_count, user_sum, n_attempts,
             time_between, prior_question_elapsed_time_cumsum,
             user_L10_mean, user_streak) = self._scan_user_data(Riiid.user_hist_dict, df)

            # prev
            df['target_cumcount'] = user_count
            df['target_cumsum'] = user_sum
            df['user_mean'] = user_mean
            df['user_L10_mean'] = user_L10_mean
            df['user_streak'] = user_streak

            # current
            df['n_attempts'] = n_attempts
            df['time_between'] = time_between
            df['prior_question_elapsed_time_cumsum'] = prior_question_elapsed_time_cumsum

            df['lag_time'] = df['time_between'] - df['prior_question_elapsed_time']

        else:

            # shift target
            df['answered_correctly'] = df[['user_id', 'answered_correctly']].groupby('user_id').shift()
            df['answered_correctly'] = df['answered_correctly'].fillna(0).astype(np.int8)
            gc.collect()

            # user_streak
            def f(df):
                # 'c-' counts the 1s seen so far and 'c+' the 0s; the position
                # inside each constant stretch of a counter is the length of
                # the run of the opposite answer.
                df['c-'] = (df['answered_correctly'] == 1).cumsum()
                df['c+'] = (df['answered_correctly'] == 0).cumsum()

                df['user_streak'] = (-((df['c-'] == 0).astype(int) + df.groupby('c-').cumcount())
                    + (df['c+'] == 0).astype(int) + df.groupby('c+').cumcount())

                return df
            # group_keys=False keeps the original index. Without it pandas >= 2.0
            # prepends user_id as an index level, and every later
            # groupby('user_id') fails because the name is then both an index
            # level and a column.
            df = df.groupby('user_id', group_keys=False).apply(f)
            df = df.drop(columns=['c-', 'c+'])
            gc.collect()

            # NOTE(review): the rolling result is ordered by group and is
            # assigned positionally via .values, which presumably relies on df
            # being sorted by user_id - confirm for the input data.
            df['user_L10_mean'] = df[['user_id', 'answered_correctly']].groupby('user_id')\
                .rolling(10, min_periods=1).mean()['answered_correctly'].values.astype(np.float16)

            df['time_between'] = df[['user_id', 'timestamp']]\
                .groupby('user_id').transform('diff').fillna(0.).astype(int)
            # rows sharing (user_id, timestamp) all take the value of the first of them
            df['time_between'] = df[['user_id', 'timestamp', 'time_between']]\
                .groupby(['user_id', 'timestamp']).transform('first')
            gc.collect()

            df['lag_time'] = df['time_between'] - df['prior_question_elapsed_time']

            df['target_cumcount'] = df[['user_id', 'answered_correctly']]\
                .groupby(['user_id']).transform('cumcount').astype(np.int16) + 1
            df['target_cumsum'] = df[['user_id', 'answered_correctly']]\
                .groupby('user_id').transform('cumsum').astype(np.int16)
            df['user_mean']=(df['target_cumsum'] / df['target_cumcount']).astype(np.float16)

            df['prior_question_elapsed_time_cumsum'] = df[['user_id', 'prior_question_elapsed_time']]\
                .groupby('user_id').transform('cumsum').astype(int)

            df['n_attempts'] = df[['user_id', 'content_id', 'answered_correctly']]\
                            .groupby(['user_id', 'content_id']).transform('cumcount').astype('uint8')
            df['n_attempts'] = df['n_attempts'].clip(lower=None, upper=4)
            gc.collect()

        if verbose: print('step 3 (add historical features) - Done')

        # step 4 add ratios
        df['time_per_question']=np.float32(df['prior_question_elapsed_time_cumsum'] /
                                           df['target_cumcount'])
        # rounded to the nearest 1000 and clipped: this is the join key into tpq
        df['time_per_question_cat'] = np.int32(np.round(df['time_per_question'],-3).clip(lower=0,
                                                                                upper=50000))
        df = df.join(self.tpq, on='time_per_question_cat')

        df['lag_time_cat'] = np.int32(np.round(df['lag_time'],-3).clip(lower=-100000,
                                                                       upper=500000))

        df['timestamp_prior_time_cumsum_diff']=df['timestamp']-df['prior_question_elapsed_time_cumsum']

        # rounded to the nearest 100 and clipped: this is the join key into tb
        df['time_between_cat'] = np.int32(np.round(df['time_between'],-2).clip(upper=600000))
        df = df.join(self.tb, on='time_between_cat')

        if verbose: print('step 4 (add ratios) - Done')

        # step 2 merge question
        df = df.join(self.questions_df.loc[self.questions_df.index.isin(df['content_id'])], on='content_id')

        # fillna fillna mainly for lectures
        df = df.fillna({'prior_question_elapsed_time':0., 'prior_question_had_explanation':False,
                        'bundle_id':0, 'part':0, 'tags':0, 'num_of_tags':0, 'bundle_size':0,
                        'question_mean':0., 'question_std':0., 'question_skew':0.
                        })
        # change dtype
        df = df.astype({'bundle_id':'int16', 'part':'int8', 'tags':'int16', 'num_of_tags':'int8',
                        'bundle_size':'int8', 'prior_question_had_explanation':'bool',
                        })
        if verbose: print('step 2 (join questions_df) - Done')

        # the 0.0001 offsets keep the products non-zero when a mean is exactly 0
        df['user_content_hmean'] = np.float16(2*((df['user_mean'] + 0.0001) * (df['question_mean'] + 0.0001)) /
                                        (df['user_mean'] + df['question_mean']))

        df['all_hmean'] = np.float16(5*((df['user_mean'] + 0.0001) *
                                        (df['question_mean'] + 0.0001) *
                                        (df['tags_mean'] + 0.0001) * df['tpq_mean'] * df['tb_mean']) /
                                     np.sum(df[['user_mean', 'question_mean', 'tags_mean',
                                                'tpq_mean', 'tb_mean']], axis=1))
        df['all_hsum'] = np.float16(np.sum(df[['user_mean', 'question_mean', 'tags_mean',
                                               'tpq_mean', 'tb_mean']], axis=1))

        return df

    def save_data(self, df, name):
        """Write ``df`` to ``<path>/<name>.feather``."""
        df.to_feather(self.path/(name + '.feather'))

    def load_data(self, name):
        """Read ``<path>/<name>.feather`` and return it as a DataFrame."""
        return pd.read_feather(self.path/(name + '.feather'))

In [None]:
# Create the pipeline object; this stores Path('./data') on the class as Riiid.path.
r = Riiid(path=r'./data')

In [None]:
%%time
# Load questions_df / tb / tpq and data_min.pickle from ./data and build
# Riiid.user_hist_dict from the pickled interaction log.
r.setup('./')

In [None]:
# Sanity check: number of users with stored history and the shape of the questions table.
len(r.user_hist_dict), Riiid.questions_df.shape

In [None]:
# Load the interaction log from <r.path>/data.feather. The commented-out lines
# are alternative sources (raw csv, other feather / pickle snapshots).
# train_df = pd.read_csv(r.path/'train.csv', nrows=None, dtype=r.dtype, usecols=r.usecols)
train_df = r.load_data('data') # 101 230 332
# train_df = r.load_data('data_q') # 99 271 300

# with open(r.path/'data_q.pickle', mode='rb') as file:
#     train_df = pickle.load(file)

# with open(r.path/'data_qr.pickle', mode='rb') as file:
#     train_df = pickle.load(file)

In [None]:
# Build questions_df from train once (commented-out lines), afterwards only load
# the pickled result. Only unpickle files you created yourself.
# do it once and then only load.
# Riiid._set_up_questions_df(train_df)
# with open(r.path/'questions_df.pickle', mode='wb') as file:
#     pickle.dump(Riiid.questions_df, file)
with open(r.path/'questions_df.pickle', mode='rb') as file:
    Riiid.questions_df = pickle.load(file)

In [None]:
# Load the lookup tables that transform_data joins on 'time_between_cat' (tb)
# and 'time_per_question_cat' (tpq).
with open(r.path/'tb.pickle', mode='rb') as file:
    Riiid.tb = pickle.load(file)
with open(r.path/'tpq.pickle', mode='rb') as file:
    Riiid.tpq = pickle.load(file)
# with open(r.path/'tptc.pickle', mode='rb') as file:
#     Riiid.tptc = pickle.load(file)

In [None]:
# with open(r.path/'tptc.pickle', mode='wb') as file:
#     pickle.dump(tptc, file)

In [None]:
# (rows, columns) of the loaded interaction log.
train_df.shape

In [None]:
# First rows of the raw interaction log, before any transformation.
train_df.head()

In [None]:
# First rows of the questions lookup table (a class attribute, reached through the instance).
r.questions_df.head()

Transform the training data

In [None]:
# Load the row_id collections that define the training and validation folds.
with open(r.path/'train_gr2_row_id.pickle', mode='rb') as file:
    train_gr_row_id = pickle.load(file)
with open(r.path/'val_gr2_row_id.pickle', mode='rb') as file:
    val_gr_row_id = pickle.load(file)

In [None]:
# Keep only the rows that belong to either fold.
# NOTE(review): `+` is meant to concatenate the two id collections, which
# assumes they are Python lists; with numpy arrays it would add element-wise - confirm.
train_df=train_df.loc[train_df['row_id'].isin(train_gr_row_id+val_gr_row_id)]
gc.collect()

In [None]:
%%time
# Training path: lecture rows are dropped and the historical features are
# computed with group-by operations over train_df itself.
train_df = r.transform_data(train_df, inference=False, verbose=True)

In [None]:
# First rows of the transformed frame.
train_df.head()

In [None]:
# Column dtypes after the transformation.
train_df.dtypes

In [None]:
# Number of columns and their names after the transformation.
print(len(train_df.columns), train_df.columns)

In [None]:
# Total number of missing values left in the transformed frame.
train_df.isna().sum().sum()

In [None]:
# Split the transformed frame back into the two folds by row_id; each fold
# gets a fresh 0..n-1 index (to_feather needs a default index).
train_gr, val_gr = (
    train_df.loc[train_df['row_id'].isin(fold_row_ids)].reset_index(drop=True)
    for fold_row_ids in (train_gr_row_id, val_gr_row_id)
)

In [None]:
# Persist both folds as feather files under r.path.
r.save_data(train_gr, 'train_gr2_transformed')
r.save_data(val_gr, 'val_gr2_transformed')

Submission test

In [None]:
# Stored state of one user before the inference loop runs (raises KeyError if the user is unknown).
r.user_hist_dict[275030867]

In [None]:
# Example test batches and the matching sample submission shipped with the data.
test_df_ = pd.read_csv(r.path/'example_test.csv')
submission_df = pd.read_csv(r.path/'example_sample_submission.csv')

In [None]:
# Load the trained LightGBM booster from <r.path>/models.
import lightgbm as lgb
bst = lgb.Booster(model_file = str(r.path) + '/models/lgb_g1-33f-l31-perfect')

In [None]:
# Column to predict, and the feature names stored in the loaded booster.
target = 'answered_correctly'
features = bst.feature_name()

In [None]:
# Show the feature list stored in the booster.
print(features)

In [None]:
# Accumulator for the per-batch prediction frames produced by the inference loop.
sub = pd.DataFrame()

In [None]:
# Emulate the competition iterator: one independent copy of the example test
# rows per group_num, served in order 0..3.
gr0, gr1, gr2, gr3 = (
    test_df_.loc[test_df_['group_num'] == group_num].copy()
    for group_num in range(4)
)
iter_test = [gr0, gr1, gr2, gr3]

In [None]:
# Replay the example batches the way the competition API serves them:
# each batch carries the answers of the previous one, which are folded into
# the user history before the current batch is transformed and scored.
prev_test_df = None
batch_predictions = []
for test_df in iter_test:

    # from 2nd iteration, update user data
    if prev_test_df is not None:
        # The answers arrive as the text of a Python list. literal_eval parses
        # literals only, unlike eval, which would execute arbitrary code read
        # from the csv.
        prev_test_df[target] = ast.literal_eval(test_df["prior_group_answers_correct"].iat[0])
        r.update_hist_data(prev_test_df)

    # save previous test_df
    prev_test_df = test_df.copy()

    test_df = r.transform_data(test_df)

    test_df[target] =  bst.predict(test_df[features])
#     test_df[target] =  bst_cat.predict(test_df[features])

    batch_predictions.append(test_df)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# copied the whole accumulator on every iteration. Building `sub` from this
# run's batches only also makes the cell safe to re-run.
sub = pd.concat(batch_predictions)

In [None]:
# features to check
# Columns shown when inspecting single users in `sub` below.
f=['user_id', 'bundle_id', 'timestamp', 'time_between', 'timestamp_prior_time_cumsum_diff', 'lag_time',
   'prior_question_elapsed_time','prior_question_elapsed_time_cumsum',
   'time_per_question','content_id', 'n_attempts','target_cumsum', 'target_cumcount', 'user_mean', 'answered_correctly'
   ]

In [None]:
# Inference-time features and predictions of one user across the batches.
sub.loc[sub['user_id']==554169193,f]

In [None]:
# Same view for the user whose stored state was displayed before the loop.
sub.loc[sub['user_id']==275030867,f]

In [None]:
# Timestamp now stored for this user; the inference loop advances it via _scan_user_data.
r.user_hist_dict[554169193]['timestamp']

In [None]:
# Enable the %lprun magic used by the profiling cells below.
%load_ext line_profiler

In [None]:
# %lprun -f function_name_only function_call_with_arguments

In [None]:
# Line-profile transform_data on the full example test set.
# Note: the inference path calls _scan_user_data, which updates
# Riiid.user_hist_dict, so this run changes the stored user state.
%lprun -f r.transform_data r.transform_data(test_df_, inference=True)

In [None]:
# Line-profile the row-by-row history scan on its own.
# Note: _scan_user_data mutates the history dict passed to it.
%lprun -f r._scan_user_data r._scan_user_data(Riiid.user_hist_dict, test_df_)

In [None]:
# Load a submission file from r.path (nothing in this notebook writes it).
submission = pd.read_csv(r.path/'submission.csv')