In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import pickle

In [None]:
class Riiid:
    """Feature pipeline for the user-interaction log (questions and lectures).

    All state (data path, metadata frames, target statistics, per-user
    history dictionaries, fill values and dtypes) is stored on the class
    itself, so every instance shares it.

    Order of use in this notebook: ``set_data_path`` ->
    ``load_and_process_questions`` / ``load_and_process_lectures`` ->
    ``setup_data_stats`` -> ``transform_data``.
    """

    # Directory holding the CSV / feather / pickle files; set by set_data_path().
    path = None

    # Column dtypes and column subset offered for reading the raw interaction CSV.
    dtype = {'row_id': 'int64', 'timestamp': 'int64',
             'user_id': 'int32', 'content_id': 'int16',
             'content_type_id': 'int8', 'task_container_id': 'int16',
             'user_answer': 'int8', 'answered_correctly': 'int8',
             'prior_question_elapsed_time': 'float32',
             'prior_question_had_explanation': 'boolean',
             }

    usecols = ['timestamp', 'user_id', 'content_id',
               'content_type_id', 'task_container_id',
               'answered_correctly', 'user_answer',
               'prior_question_elapsed_time', 'prior_question_had_explanation']

    # Metadata frames filled by load_and_process_questions / _lectures.
    questions_df = None
    lectures_df = None

    # Per-user and per-question statistics of `answered_correctly`.
    train_user_target_stats = None
    train_question_target_stats = None

    # Per-user running state ({user_id: value}) carried between test batches.
    # These must be dicts before transform_data(test=True) is called.
    question_cumcount_hist = None
    time_between_hist = None
    prior_question_elapsed_time_cumsum_hist = None

    # Feature column names, NA fill values and column dtypes captured from training data.
    features = None
    _na_dict = None
    _dtype_dict = None

    def __init__(self):
        """Create a handle; nothing is initialised because state lives on the class."""

    @staticmethod
    def _proc_question_tags(df):
        """Replace the space-separated ``tags`` column by one indicator column per tag.

        Args:
            df: frame with a string column ``tags``.

        Returns:
            A new frame: ``df`` without ``tags`` followed by the indicator columns.
        """
        tag_dummies = df['tags'].str.get_dummies(sep=" ")
        # Keyword arguments: positional `axis` is no longer accepted by recent pandas.
        return pd.concat([df.drop(columns='tags'), tag_dummies], axis=1)

    @staticmethod
    def _count_tags(tags):
        """Return the number of whitespace-separated tags; a missing value counts as 0."""
        if pd.isna(tags):
            # str(nan) would otherwise be split into the single token 'nan'.
            return 0
        return len(str(tags).split())

    @classmethod
    def load_and_process_questions(cls):
        """Read ``questions.csv`` from ``cls.path`` and store it in ``cls.questions_df``.

        Drops ``correct_answer`` and adds ``num_of_tags`` (tags per question)
        and ``bundle_size`` (questions sharing the same ``bundle_id``).
        Requires ``set_data_path`` to have been called first.
        """
        questions = pd.read_csv(cls.path/'questions.csv')
        questions = questions.drop(columns=['correct_answer'])

        questions['num_of_tags'] = questions['tags'].map(cls._count_tags)

        bundle_sizes = questions.groupby('bundle_id')['question_id'] \
            .count() \
            .rename('bundle_size')

        # An optional `n_answer_options` table and one-hot tag encoding
        # (_proc_question_tags) were used in earlier experiments and are disabled.
        cls.questions_df = questions.join(bundle_sizes, on='bundle_id')

    @classmethod
    def load_and_process_lectures(cls):
        """Read ``lectures.csv`` from ``cls.path`` into ``cls.lectures_df``.

        ``type_of`` is converted to a categorical column.
        """
        lectures = pd.read_csv(cls.path/'lectures.csv')
        lectures['type_of'] = lectures['type_of'].astype('category')
        cls.lectures_df = lectures

    @classmethod
    def set_data_path(cls, path):
        """Set the directory used by all load/save helpers."""
        cls.path = Path(path)

    @staticmethod
    def _target_stats(train, key, prefix):
        """Mean/std/skew of ``answered_correctly`` per ``key`` over question rows.

        Args:
            train: interaction frame with ``content_type_id``, ``key`` and
                ``answered_correctly`` columns.
            key: grouping column (``'user_id'`` or ``'content_id'``).
            prefix: prefix for the three output column names.

        Returns:
            float32 frame indexed by ``key``; undefined statistics are set to 0.
        """
        # Boolean-mask selection; rows with content_type_id != 0 are excluded.
        question_rows = train.loc[train['content_type_id'] == 0,
                                  [key, 'answered_correctly']]
        stats = question_rows.groupby(key)['answered_correctly'] \
            .agg(['mean', 'std', 'skew'])
        stats.columns = [f'{prefix}_mean', f'{prefix}_std', f'{prefix}_skew']
        # std/skew are NaN for groups that are too small; treat them as 0.
        return stats.astype('float32').fillna(0.0)

    @classmethod
    def _get_user_target_stats(cls, train):
        """Compute per-user target statistics into ``cls.train_user_target_stats``."""
        cls.train_user_target_stats = cls._target_stats(train, 'user_id', 'user')

    @classmethod
    def _get_question_target_stats(cls, train):
        """Compute per-question target statistics into ``cls.train_question_target_stats``."""
        cls.train_question_target_stats = cls._target_stats(train, 'content_id', 'question')

    @staticmethod
    def _make_question_cumcount(df):
        """For test/validation datasets only.

        Adds ``question_cumcount``: the 1-based running row count per
        (user, content type) inside ``df`` plus the value carried over in
        ``Riiid.question_cumcount_hist`` (0 for unseen users). The history is
        then updated with each user's maximum over question rows.

        ``df`` is modified in place and returned.
        """
        in_batch = df.groupby(['user_id', 'content_type_id']).cumcount() + 1
        # Direct lookup instead of joining the whole frame for a single column.
        carried = df['user_id'].map(Riiid.question_cumcount_hist) \
            .fillna(0).astype('int16')
        df['question_cumcount'] = in_batch + carried

        question_rows = df[df['content_type_id'] == 0]
        Riiid.question_cumcount_hist.update(
            question_rows.groupby('user_id')['question_cumcount'].max().to_dict())

        return df

    @staticmethod
    def _make_prior_question_elapsed_time_cumsum(df):
        """For test/validation datasets only.

        Adds ``prior_question_elapsed_time_cumsum``: the running sum per
        (user, content type) inside ``df`` plus the total carried over in
        ``Riiid.prior_question_elapsed_time_cumsum_hist`` (0 for unseen
        users). The history is then updated with each user's maximum over
        question rows.

        ``df`` is modified in place and returned.
        """
        in_batch = df.groupby(['user_id', 'content_type_id'])['prior_question_elapsed_time'] \
            .cumsum()
        # Carried totals are truncated to whole numbers, as in the training path.
        carried = df['user_id'].map(Riiid.prior_question_elapsed_time_cumsum_hist) \
            .fillna(0).astype('int64')
        df['prior_question_elapsed_time_cumsum'] = in_batch + carried

        question_rows = df[df['content_type_id'] == 0]
        Riiid.prior_question_elapsed_time_cumsum_hist.update(
            question_rows.groupby('user_id')['prior_question_elapsed_time_cumsum']
            .max().to_dict())

        return df

    @staticmethod
    def _make_time_between(df):
        """For test/validation datasets only.

        Adds ``time_between``: the timestamp difference to the previous row of
        the same (user, content type) inside ``df``. For the first such row the
        difference is taken against the timestamp stored in
        ``Riiid.time_between_hist`` (0 for unseen users). The history is then
        updated with each user's latest question timestamp.

        ``df`` is modified in place and returned.
        """
        in_batch = df.groupby(['user_id', 'content_type_id'])['timestamp'].diff()
        previous = df['user_id'].map(Riiid.time_between_hist).fillna(0)

        df['time_between'] = in_batch.where(in_batch.notna(),
                                            df['timestamp'] - previous) \
            .astype('int64')

        question_rows = df[df['content_type_id'] == 0]
        Riiid.time_between_hist.update(
            question_rows.groupby('user_id')['timestamp'].max().to_dict())

        return df

    def setup_data_stats(self, df):
        """Populate whichever class-level statistics are still ``None``.

        Computes, from ``df``: the user and question target statistics, the
        column dtypes (``_dtype_dict``) and the NA fill values (``_na_dict``).
        Attributes that are already set are left untouched, so reset them to
        ``None`` to force a recomputation.
        """
        if Riiid.train_user_target_stats is None:
            Riiid._get_user_target_stats(df)
        print('train_user_target_stats - Done')

        if Riiid.train_question_target_stats is None:
            Riiid._get_question_target_stats(df)
        print('train_question_target_stats - Done')

        if Riiid._dtype_dict is None:
            Riiid._dtype_dict = df.dtypes.to_dict()
        print('_dtype_dict - Done')

        if Riiid._na_dict is None:
            question_stats = Riiid.train_question_target_stats
            # Questions without statistics fall back to the average over all questions.
            Riiid._na_dict = {
                'part': 0,
                'num_of_tags': 0,
                'bundle_size': 0,
                'question_mean': question_stats['question_mean'].mean(axis=0),
                'question_std': question_stats['question_std'].mean(axis=0),
                'question_skew': question_stats['question_skew'].mean(axis=0),
            }
        print('_na_dict - Done')

    @classmethod
    def get_features(cls, df):
        """Store the feature names (all columns of ``df`` except the target).

        Does nothing when ``Riiid.features`` is already set. A frame without
        an ``answered_correctly`` column is accepted.
        """
        if Riiid.features is None:
            Riiid.features = [col for col in df.columns if col != 'answered_correctly']

    def transform_data(self, df, test=False, verbose=False):
        """Build the model features for an interaction frame.

        Args:
            df: raw interactions (see ``usecols``).
            test: ``False`` for training data (lecture rows are dropped and the
                running features are computed within ``df`` alone). ``True``
                for validation/test batches: lecture rows are kept, running
                features continue from the ``*_hist`` dictionaries (which are
                updated), and ``_na_dict`` / ``_dtype_dict`` are applied.
            verbose: print a line after every step.

        Returns:
            A new frame with the added feature columns.
        """
        if not test:  # we need questions and lectures for test
            # step 0 = keep questions only
            df = df.loc[df['content_type_id'] == 0]
            if verbose:
                print('step 0 (keep questions only) - Done')

        # step 1 = fillna for prior_question_elapsed_time and prior_question_had_explanation
        df = df.fillna({'prior_question_elapsed_time': 0.,
                        'prior_question_had_explanation': False})
        if verbose:
            print('step 1 (fillna: prior_question_elapsed_time & prior_question_had_explanation) - Done')

        # step 2 = merge question metadata (without question_id and tags).
        # NOTE(review): the join matches content_id against the row position of
        # questions_df, which presumably equals question_id - confirm. Lecture
        # rows whose content_id falls inside that range also pick up question
        # metadata.
        df = df.join(self.questions_df, on='content_id') \
               .drop(columns=['question_id', 'tags'])

        # rows without a matching question (mainly lectures) get neutral values
        df = df.fillna({'prior_question_elapsed_time': 0.,
                        'prior_question_had_explanation': False,
                        'bundle_id': 0, 'num_of_tags': 0, 'bundle_size': 0,
                        'part': 0, 'n_answer_options': 0})
        df = df.astype({'bundle_id': 'int16', 'num_of_tags': 'int8',
                        'bundle_size': 'int8', 'prior_question_had_explanation': 'bool',
                        'part': 'int8',
                        })
        if verbose:
            print('step 2 (merge questions_df) - Done')

        # step 3 = merge question target stats
        df = df.join(self.train_question_target_stats, on='content_id')
        if verbose:
            print('step 3 (merge train_question_target_stats) - Done')

        # step 4 (merge train_user_target_stats) is intentionally disabled.

        # step 4a = add time_between
        if test:
            df = self._make_time_between(df)
        else:
            df['time_between'] = df.groupby('user_id')['timestamp'] \
                .diff().fillna(0.).astype('int64')
        if verbose:
            print('step 4a (add time_between) - Done')

        # step 4b = add question_cumcount
        if test:
            df = self._make_question_cumcount(df)
        else:
            df['question_cumcount'] = df.groupby('user_id').cumcount().astype('int16')
        if verbose:
            print('step 4b (add question_cumcount) - Done')

        # step 4c = add prior_question_elapsed_time_cumsum
        if test:
            df = self._make_prior_question_elapsed_time_cumsum(df)
        else:
            df['prior_question_elapsed_time_cumsum'] = \
                df.groupby('user_id')['prior_question_elapsed_time'] \
                .cumsum().astype('int64')
        if verbose:
            print('step 4c (add prior_question_elapsed_time_cumsum) - Done')

        # step 4d = add time_per_question (0 where the ratio is undefined or infinite)
        df['time_per_question'] = (df['prior_question_elapsed_time_cumsum'] / df['question_cumcount']) \
            .fillna(0.).replace(np.inf, 0.).astype('float32')
        if verbose:
            print('step 4d (add time_per_question) - Done')

        # step 4e = add tb_int: time_between rounded to the nearest 100000
        df['tb_int'] = df['time_between'].round(-5).astype('int64')
        if verbose:
            print('step 4e (add tb_int) - Done')

        # step 5 = fill remaining NAs (using _na_dict)
        if test and self._na_dict is not None:
            df = df.fillna(self._na_dict)
        if verbose:
            print('step 5 (fill remaining NAs) - Done')

        # step 6 = convert dtypes (using _dtype_dict)
        if test and self._dtype_dict is not None:
            # Only cast columns that exist: a test batch may lack training-only
            # columns, and astype() raises on unknown keys.
            present = {col: dtype for col, dtype in self._dtype_dict.items()
                       if col in df.columns}
            df = df.astype(present)
        if verbose:
            print('step 6 (convert dtypes) - Done')

        return df

    def split_data(self, df, n_iter=30):
        """Split into train and validation datasets.

        Repeats ``n_iter`` times: for every user still present, move the
        row(s) carrying that user's latest remaining timestamp to validation.

        Args:
            df: frame with ``user_id`` and ``timestamp`` columns.
            n_iter: number of peeling rounds.

        Returns:
            ``(train_labels, val_labels)``: two lists of index labels of
            ``df``; the validation labels are sorted.
        """
        remaining = df[['user_id', 'timestamp']]
        val_chunks = []

        for _ in range(n_iter):
            if remaining.empty:
                break  # every row already moved to validation
            latest = remaining.groupby('user_id')['timestamp'].transform('max')
            is_latest = (latest == remaining['timestamp']).to_numpy()
            val_chunks.append(remaining.index[is_latest])
            remaining = remaining.loc[~is_latest]

        if val_chunks:
            val_labels = val_chunks[0].append(val_chunks[1:]).sort_values().to_list()
        else:
            val_labels = []

        return remaining.index.to_list(), val_labels

    def save_data(self, df, name):
        """Write ``df`` to ``<path>/<name>.feather``."""
        df.to_feather(self.path/(name + '.feather'))

    def load_data(self, name):
        """Read ``<path>/<name>.feather`` and return it as a DataFrame."""
        return pd.read_feather(self.path/(name + '.feather'))

In [None]:
r = Riiid()

In [None]:
# Point the shared Riiid state at the data directory, then load questions.csv
# and lectures.csv from it into Riiid.questions_df / Riiid.lectures_df.
r.set_data_path(path=r'./data')
r.load_and_process_questions()
r.load_and_process_lectures()

In [None]:
# r.save_data(data_df, name='data')

In [None]:
# train_idx, val_idx = r.split_data(train_df, n_iter=120)

In [None]:
# len(train_idx), len(val_idx)

In [None]:
# r.save_data(train_df.iloc[train_idx].reset_index(drop=True), name='train_p1')
# r.save_data(train_df.iloc[val_idx].reset_index(drop=True), name='train_p2')

In [None]:
# train_df = pd.read_csv(r.path/'train.csv', nrows=200000, dtype=r.dtype, usecols=r.usecols)
# train_df = r.load_data('data')
# train_df = r.load_data('train')
# train_df = r.load_data('train_p1')
# Active choice: the 'train_p2' / 'val_p2' feather files (untransformed frames).
train_df = r.load_data('train_p2')
# val_df = r.load_data('val') # we load transformed below
# val_df_p1 = r.load_data('val_p1')
val_df_p2 = r.load_data('val_p2')

In [None]:
train_df.head(2)

In [None]:
train_df.shape

In [None]:
# val_df.head(2)

In [None]:
# val_df.shape

In [None]:
r.questions_df.head(2)

In [None]:
r.lectures_df.head(2)

In [None]:
r.lectures_df['type_of'].cat.categories

In [None]:
r.setup_data_stats(train_df)

In [None]:
# with open(r.path/'train_user_target_stats.pickle', mode='rb') as file:
#     Riiid.train_user_target_stats = pickle.load(file)
# with open(r.path/'train_question_target_stats.pickle', mode='rb') as file:
#     Riiid.train_question_target_stats = pickle.load(file)

In [None]:
# Overwrite the target statistics computed by setup_data_stats() above with
# previously pickled ones (presumably built from the full data set, judging by
# the 'data_' prefix - confirm).
# NOTE(review): Riiid._na_dict was already derived from the earlier statistics
# and is not refreshed here; it is only rebuilt after being reset to None.
# pickle.load executes arbitrary code - only load files you created yourself.
with open(r.path/'data_user_target_stats.pickle', mode='rb') as file:
    Riiid.train_user_target_stats = pickle.load(file)
with open(r.path/'data_question_target_stats.pickle', mode='rb') as file:
    Riiid.train_question_target_stats = pickle.load(file)

In [None]:
# with open(r.path/'train_user_target_stats.pickle', mode='wb') as file:
#     pickle.dump(r.train_user_target_stats, file)
# with open(r.path/'train_question_target_stats.pickle', mode='wb') as file:
#     pickle.dump(r.train_question_target_stats, file)

In [None]:
# with open(r.path/'data_user_target_stats.pickle', mode='wb') as file:
#     pickle.dump(r.train_user_target_stats, file)
# with open(r.path/'data_question_target_stats.pickle', mode='wb') as file:
#     pickle.dump(r.train_question_target_stats, file)

In [None]:
r.train_user_target_stats.head(2)

In [None]:
r.train_question_target_stats.head(2)

In [None]:
# (train_df.shape,
# val_df.shape)

### Transform the training data

In [None]:
train_df = r.transform_data(train_df, verbose=True)
# train_df = r.load_data('train_p2_transformed')

In [None]:
# with open(r.path/'train_time_between_hist.pickle', mode='rb') as file:
#     Riiid.time_between_hist = pickle.load(file)
# with open(r.path/'train_question_cumcount_hist.pickle', mode='rb') as file:
#     Riiid.question_cumcount_hist = pickle.load(file)
# with open(r.path/'train_prior_question_elapsed_time_cumsum_hist.pickle', mode='rb') as file:
#     Riiid.prior_question_elapsed_time_cumsum_hist = pickle.load(file)

In [None]:
# with open(r.path/'train_p1_time_between_hist.pickle', mode='rb') as file:
#     Riiid.time_between_hist = pickle.load(file)
# with open(r.path/'train_p1_question_cumcount_hist.pickle', mode='rb') as file:
#     Riiid.question_cumcount_hist = pickle.load(file)
# with open(r.path/'train_p1_prior_question_elapsed_time_cumsum_hist.pickle', mode='rb') as file:
#     Riiid.prior_question_elapsed_time_cumsum_hist = pickle.load(file)

In [None]:
# Load the per-user history dictionaries that transform_data(test=True)
# continues from: last timestamp, running question count and running
# elapsed-time sum. A commented-out cell below shows how they are derived
# from a transformed training frame.
# pickle.load executes arbitrary code - only load files you created yourself.
with open(r.path/'train_p2_time_between_hist.pickle', mode='rb') as file:
    Riiid.time_between_hist = pickle.load(file)
with open(r.path/'train_p2_question_cumcount_hist.pickle', mode='rb') as file:
    Riiid.question_cumcount_hist = pickle.load(file)
with open(r.path/'train_p2_prior_question_elapsed_time_cumsum_hist.pickle', mode='rb') as file:
    Riiid.prior_question_elapsed_time_cumsum_hist = pickle.load(file)

In [None]:
# Riiid.time_between_hist = train_df[['user_id', 'timestamp']].groupby(['user_id']).max().to_dict()['timestamp']
# Riiid.question_cumcount_hist = train_df[['user_id', 'question_cumcount']]\
#             .groupby(['user_id']).max().to_dict()['question_cumcount']
# Riiid.prior_question_elapsed_time_cumsum_hist = train_df[['user_id', 'prior_question_elapsed_time_cumsum']]\
#             .groupby(['user_id']).max().to_dict()['prior_question_elapsed_time_cumsum']

In [None]:
# with open(r.path/'train_p1_time_between_hist.pickle', mode='wb') as file:
#     pickle.dump(Riiid.time_between_hist, file)
# with open(r.path/'train_p1_question_cumcount_hist.pickle', mode='wb') as file:
#     pickle.dump(Riiid.question_cumcount_hist, file)
# with open(r.path/'train_p1_prior_question_elapsed_time_cumsum_hist.pickle', mode='wb') as file:
#     pickle.dump(Riiid.prior_question_elapsed_time_cumsum_hist, file)

In [None]:
# with open(r.path/'data_time_between_hist.pickle', mode='wb') as file:
#     pickle.dump(Riiid.time_between_hist, file)
# with open(r.path/'data_question_cumcount_hist.pickle', mode='wb') as file:
#     pickle.dump(Riiid.question_cumcount_hist, file)
# with open(r.path/'data_prior_question_elapsed_time_cumsum_hist.pickle', mode='wb') as file:
#     pickle.dump(Riiid.prior_question_elapsed_time_cumsum_hist, file)

In [None]:
len(r.time_between_hist), len(r.question_cumcount_hist), len(r.prior_question_elapsed_time_cumsum_hist)

In [None]:
train_df.head(2)

In [None]:
# transform_data() dropped the lecture rows, leaving gaps in the index;
# renumber the rows before writing the feather file.
train_df = train_df.reset_index(drop=True)

# r.save_data(train_df, 'data_transformed')
r.save_data(train_df, 'train_p2_transformed')

In [None]:
train_df.isna().sum().sum()

In [None]:
train_df.nunique()

In [None]:
# setup_data_stats() only fills attributes that are None, so clear these two
# to have them rebuilt from the transformed training frame in the next cell.
Riiid._dtype_dict = None
Riiid._na_dict = None

In [None]:
r.setup_data_stats(train_df) # refactor to setup_dtype_na_dict or similar

In [None]:
r._na_dict

In [None]:
r._dtype_dict

In [None]:
# with open(r.path/'train_p2_na_dict.pickle', mode='wb') as file:
#     pickle.dump(Riiid._na_dict, file)
# with open(r.path/'train_p2_dtype_dict.pickle', mode='wb') as file:
#     pickle.dump(Riiid._dtype_dict, file)

In [None]:
# Replace the fill values and dtypes just computed by setup_data_stats() with
# the pickled versions.
# pickle.load executes arbitrary code - only load files you created yourself.
with open(r.path/'train_p2_na_dict.pickle', mode='rb') as file:
    Riiid._na_dict = pickle.load(file)
with open(r.path/'train_p2_dtype_dict.pickle', mode='rb') as file:
    Riiid._dtype_dict = pickle.load(file)

In [None]:
# val_df.head(2)

In [None]:
# val_df = r.transform_data(val_df, test=True, verbose=True)
val_df = r.transform_data(val_df_p2, test=True, verbose=True)
# val_df = r.load_data('val_transformed')

In [None]:
# r.save_data(val_df, 'val_transformed')
r.save_data(val_df, 'val_p2_transformed_base_p2')

In [None]:
val_df.isna().sum()

In [None]:
val_df.shape

In [None]:
r.get_features(val_df)

In [None]:
print(r.features)
print(len(r.features))

In [None]:
# r.features.remove('user_id')
r.features.remove('content_type_id')

In [None]:
# Sanity check: per user, the latest training timestamp next to the earliest
# validation timestamp (outer join, so users present in only one set show NaN).
train_df[['timestamp', 'user_id']].groupby('user_id').max().join(
    val_df[['timestamp', 'user_id']].groupby('user_id').min(), how='outer',lsuffix='_train_max', rsuffix='_val_min').head()

### Modelling

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import lightgbm as lgb

In [None]:
train_df = r.load_data('train_p2_transformed')

In [None]:
# Load the transformed validation partitions.
# NOTE(review): the validation frame produced earlier in this notebook is
# saved as 'val_p2_transformed_base_p2'; neither file read here is written by
# this notebook - confirm they exist and come from the same pipeline.
val_df_p1 = r.load_data('val_p1_transformed')
val_df_p2 = r.load_data('val_p2_transformed')

In [None]:
# get_features() is a no-op once Riiid.features is set, and the list built
# earlier from val_df already had 'content_type_id' removed - the removals in
# the next cell would then raise ValueError. Reset so the list is rebuilt from
# the training frame.
Riiid.features = None
r.get_features(train_df)

In [None]:
# Exclude the user identifier and the content-type flag from the model inputs.
# list.remove raises ValueError if the name is not in the list.
r.features.remove('user_id')
r.features.remove('content_type_id')

In [None]:
features= r.features
target = 'answered_correctly'

RandomForestClassifier

In [None]:
# Restore a previously saved forest. The file is only written further below,
# after fitting, so it may not exist on a first run; `rf` is also rebound to a
# fresh RandomForestClassifier two cells further down.
# pickle.load executes arbitrary code - only load files you created yourself.
with open(r.path/'models/rf-train-small-model.sav', 'rb') as f:
    rf = pickle.load(f)

In [None]:
# RandomForestClassifier settings. Every tree sees at most `max_samples`
# bootstrap rows and considers all features at each split (max_features=1.0).
# 'min_impurity_split' is deliberately not listed: it was set to None (the
# old default) and recent scikit-learn releases no longer accept the argument.
params = {
    'n_estimators': 40,
    'criterion': 'entropy',
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'min_weight_fraction_leaf': 0.0,
    'max_features': 1.0,
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
    'bootstrap': True,
    'oob_score': False,
    'n_jobs': -1,
    'random_state': 37,
    'verbose': 1,
    'warm_start': False,
    'class_weight': None,
    'ccp_alpha': 0.0,
    'max_samples': 200000,
}

In [None]:
rf = RandomForestClassifier(**params)

In [None]:
rf.fit(train_df[features].values, train_df[target].values)

In [None]:
# Persist the fitted forest so later sessions can load it instead of refitting.
with open(r.path/'models/rf-train-small-model.sav', 'wb') as f:
    pickle.dump(rf, f)

In [None]:
# Keep column 1 of predict_proba (the second entry of rf.classes_) for each
# validation partition.
# NOTE(review): the forest was fitted on a bare array (.values) but predicts
# on DataFrames here; scikit-learn may warn about feature names.
val_p1_preds = rf.predict_proba(val_df_p1[features])[:,1]
val_p2_preds = rf.predict_proba(val_df_p2[features])[:,1]

In [None]:
val_p1_preds

In [None]:
roc_auc_score(val_df_p2[target].values.squeeze(), val_p2_preds)

In [None]:
# AUC on the training frame. The previous version read an array `preds` that
# is not defined anywhere in this notebook; the forest's own predict_proba
# (the average over its trees) is used instead.
roc_auc_score(train_df[target], rf.predict_proba(train_df[features].values)[:, 1])

In [None]:
preds_estimators = np.stack([t.predict_proba(val_df_p1[features]) for t in rf.estimators_])

In [None]:
preds_estimators.shape

In [None]:
preds_estimators[:0+1,:,1].shape

In [None]:
# Validation AUC (val_df_p1) when averaging only the first i+1 trees, plotted
# against i - shows how quickly adding trees stops helping.
plt.plot([roc_auc_score(val_df_p1[target], preds_estimators[:i+1,:,1].mean(0)) for i in range(len(rf.estimators_))]);

In [None]:
def rf_feat_importance(m, df):
    """Tabulate a fitted model's feature importances, largest first.

    Args:
        m: object exposing ``feature_importances_`` (one value per column of ``df``).
        df: frame whose columns are the model's features, in training order.

    Returns:
        DataFrame with columns ``cols`` and ``imp`` sorted by ``imp`` descending.
    """
    importance_table = pd.DataFrame({'cols': df.columns,
                                     'imp': m.feature_importances_})
    return importance_table.sort_values(by='imp', ascending=False)

In [None]:
fi = rf_feat_importance(rf, val_df_p1[features])
fi

In [None]:
def plot_fi(fi):
    """Draw the importance table as a horizontal bar chart.

    Args:
        fi: frame with ``cols`` (feature names) and ``imp`` (importances).

    Returns:
        Whatever ``fi.plot`` returns (the matplotlib axes for a DataFrame).
    """
    chart_options = {'figsize': (12, 7), 'legend': False}
    return fi.plot(x='cols', y='imp', kind='barh', **chart_options)

plot_fi(fi);

In [None]:
val_preds = rf.predict_proba(val_df_p1.loc[:,features])[:,1]

In [None]:
# Dot plot of the ten largest feature importances in ascending order, with the
# matching feature names as x tick labels (assumes at least ten features).
plt.figure(figsize = (16,5))
plt.margins(x=0.01, y=0.1)
plt.plot(rf.feature_importances_[np.argsort(rf.feature_importances_)][-10:], 'bo')
plt.xticks(np.arange(10),
           np.array(features)[np.argsort(rf.feature_importances_)][-10:],
           fontsize = 'small', rotation = 90);

In [None]:
val_preds.max()

In [None]:
# XGBoost input for the training frame; all optional arguments are spelled out
# with neutral values.
dtrain = xgb.DMatrix(data=train_df[features], label=train_df[target], weight=None,
                     base_margin=None, missing=None,
                     silent=False, feature_names=features,
                     feature_types=None, nthread=None,)

In [None]:
# XGBoost input for validation partition 1 (same construction as dtrain).
dval1 = xgb.DMatrix(data=val_df_p1[features], label=val_df_p1[target], weight=None,
                     base_margin=None, missing=None,
                     silent=False, feature_names=features,
                     feature_types=None, nthread=None,)

In [None]:
# XGBoost input for validation partition 2 (same construction as dtrain).
dval2 = xgb.DMatrix(data=val_df_p2[features], label=val_df_p2[target], weight=None,
                     base_margin=None, missing=None,
                     silent=False, feature_names=features,
                     feature_types=None, nthread=None,)

In [None]:
# `dval` is not defined anywhere in this notebook (only dval1 / dval2 are).
# NOTE(review): dval1 is assumed to be the intended matrix - confirm.
dval1.save_binary(r.path/'dval.xgboost', silent=False)

In [None]:
# `dval` is not defined anywhere in this notebook (only dval1 / dval2 are).
# NOTE(review): dval1 is assumed to be the intended matrix - confirm.
dval1.get_base_margin()

In [None]:
# XGBoost booster settings.
# NOTE: this rebinds `params`, which held the RandomForestClassifier settings
# earlier in the notebook; re-running the forest cell afterwards would pick up
# these values instead.
params={'learning_rate':0.1,
        'max_depth':5,
        'eval_metric': 'auc',
        'objective':'binary:logistic'
       }

In [None]:
# Train 50 boosting rounds, reporting the eval metric on the training matrix
# and both validation matrices every 10 rounds; no early stopping.
bst = xgb.train(params, dtrain=dtrain, num_boost_round=50, evals=[(dtrain,'train'), (dval1,'val_p1'), (dval2,'val_p2')], obj=None, feval=None,
          maximize=False, early_stopping_rounds=None, evals_result=None,
          verbose_eval=10, xgb_model=None, callbacks=None)

In [None]:
xgb.plot_importance(bst)

In [None]:
train_df = r.load_data('data_transformed')

In [None]:
train_df[train_df['bundle_size']==4]

In [None]:
train_df[train_df['user_id'] == 2147470777].to_csv('2147470777.csv')

In [None]:
# time_between rounded to the nearest 100000 - the same expression as step 4e
# of Riiid.transform_data, recomputed here on the loaded frame.
train_df['tb_int'] = train_df['time_between'].round(decimals=-5).astype(int)

In [None]:
tmp = train_df[['tb_int', 'answered_correctly']].groupby('tb_int').mean()

In [None]:
tmp.reset_index()['tb_int'].value_counts()

In [None]:
tmp[tmp['answered_correctly']<0.1].reset_index().mean()

In [None]:
plt.hist(tmp['answered_correctly'], bins=10)        

In [None]:
plt.hist(tmp[tmp['answered_correctly']>0.9].index, bins=20);

In [None]:
plt.hist(tmp[tmp['answered_correctly']<0.1].index, bins=20);

In [None]:
train_p2 = r.load_data('train_p2_transformed')

In [None]:
val_p1, val_p2 = r.load_data('val_p1_transformed'), r.load_data('val_p2_transformed')

In [None]:
train_p2.head()