In [None]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from kaggle import api
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from fastai.tabular.all import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from dtreeviz.trees import *
from IPython.display import Image, display_svg, SVG

pd.options.display.max_columns=30

import gc

In [None]:
path=Path('./data')

In [None]:
train = pd.read_feather(path/'train_gr1_transformed.feather')
val = pd.read_feather(path/'val_gr1_transformed.feather')

In [None]:
# Remove every row whose content_type_id equals 1, drop the two columns that are
# not needed afterwards, and renumber the rows from 0. Same treatment for both frames.
train_rows_to_drop = train.index[(train['content_type_id'] == 1).to_numpy()]
train = train.drop(index=train_rows_to_drop,
                   columns=['row_id', 'content_type_id']).reset_index(drop=True)

val_rows_to_drop = val.index[(val['content_type_id'] == 1).to_numpy()]
val = val.drop(index=val_rows_to_drop,
               columns=['row_id', 'content_type_id']).reset_index(drop=True)

# Reclaim the memory held by the pre-filter frames.
gc.collect()

In [None]:
train.shape, val.shape

In [None]:
target = 'answered_correctly'
cols = list(val.columns)
cols.remove(target)

In [None]:
print(len(cols))
print()
print(cols)

In [None]:
to_remove = ['timestamp',
             'user_id',
             'target_cumsum',
             'prior_question_elapsed_time_cumsum',
             'time_per_question_cat',
             'bundle_id',
             'tb_mean',
            ]

In [None]:
# Column names excluded when the `features` list is built.
to_remove = [
    'timestamp', 'user_id', 'content_id', 'task_container_id',
    'prior_question_elapsed_time', 'prior_question_had_explanation',
    'days', 'user_L10_mean', 'target_cumcount', 'target_cumsum', 'user_mean',
    'prior_question_elapsed_time_cumsum', 'n_attempts',
    'time_per_question', 'time_per_question_cat', 'tpq_mean',
    'lag_time_cat', 'timestamp_prior_time_cumsum_diff', 'time_between_cat',
    'tb_mean', 'bundle_id', 'part', 'tags', 'num_of_tags', 'bundle_size',
    'question_mean', 'question_std', 'question_skew',
    'bundle_mean', 'bundle_std', 'bundle_skew',
    'tags_mean', 'tags_std', 'tags_skew',
    'user_content_hmean', 'all_hmean', 'all_hsum',
]

In [None]:
features = [i for i in cols if i not in to_remove]

In [None]:
print(len(features))
print()
print(features)

In [None]:
# Columns to hand to LightGBM as categorical features.
# Only names that are actually part of `features` are kept: LightGBM raises when
# `categorical_feature` contains a column that is absent from the training data,
# which happens as soon as one of these names is also listed in `to_remove`.
cat_candidates = ['content_id', 'prior_question_had_explanation', 'n_attempts',
                  'lag_time_cat', 'time_between_cat', 'part', 'tags', 'num_of_tags',
                  'bundle_size']
cat = [col for col in cat_candidates if col in features]

RandomForest Classifier

In [None]:
# Random forest of 10 trees. Each tree sees a bootstrap sample of 1% of the
# training rows (max_samples=0.01) and may use every feature at each split
# (max_features=1.0); the remaining arguments spell out scikit-learn defaults.
# `min_impurity_split` is intentionally not passed: it was removed in
# scikit-learn 1.0 and its former default (None) is what was used here anyway.
rf = RandomForestClassifier(n_estimators=10,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=1.0,
    max_leaf_nodes=None, #default - None
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=-1,
    random_state=37,
    verbose=1,
    warm_start=False,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=0.01)

In [None]:
rf.fit(train[features].values, train[target].values)

In [None]:
# Probability of the positive class for every train / validation row.
# Plain arrays are passed because `rf` was fitted on `train[features].values`;
# feeding DataFrames here makes scikit-learn warn about mismatching feature names.
rf_preds_train = rf.predict_proba(train[features].values)[:,1]
rf_preds_val = rf.predict_proba(val[features].values)[:,1]

In [None]:
# Peek at the last few validation predictions (`rf_preds_p2` was never defined).
rf_preds_val[-5:]

In [None]:
# ROC AUC of the random forest predictions computed above: (train, validation)
(roc_auc_score(train[target], rf_preds_train),
 roc_auc_score(val[target], rf_preds_val))

In [None]:
fi = rf.feature_importances_

In [None]:
fi[fi.argsort()]

In [None]:
pd.DataFrame(zip(features, fi), columns=['features', 'importance']).sort_values(by='importance', ascending=False)

In [None]:
# Per-tree positive-class probabilities on the validation set, one row per tree.
# The feature matrix is converted to an array once (not once per tree), and an
# array rather than a DataFrame is used because the individual trees were
# fitted without feature names.
val_feature_values = val[features].values
preds_estimators = np.stack([t.predict_proba(val_feature_values)[:,1] for t in rf.estimators_])

In [None]:
plt.plot([roc_auc_score(val[target], preds_estimators[:i+1].mean(0)) for i in range(len(rf.estimators_))]);

XGBoost

In [None]:
import xgboost as xgb

In [None]:
print(dir(xgb))

In [None]:
%%time
dtrain = xgb.DMatrix(data=train[features], label=train[target], weight=None, base_margin=None,
                        missing=None, silent=True, feature_names=features,
                        feature_types=None, nthread=-1,)

In [None]:
%%time
dval = xgb.DMatrix(data=val[features], label=val[target], weight=None, base_margin=None,
                        missing=None, silent=True, feature_names=features,
                        feature_types=None, nthread=-1,)

In [None]:
params = {
    # Parameters for Tree Booster
    'learning_rate':0.6, 'min_split_loss':0,
    'max_depth':4, 'subsample':0.1, 'colsample_bytree':1.0,
    
    # Learning Task Parameters
    'objective':'binary:logistic',
    'eval_metric':'auc'
}

In [None]:
%%time
xgb_booster=xgb.train(params=params, dtrain=dtrain, num_boost_round=1,
          evals=((dtrain, 'train'), (dval, 'val')),
          obj=None, feval=None, maximize=False,
          early_stopping_rounds=None, evals_result=None,
          verbose_eval=1, xgb_model=None, callbacks=None)

In [None]:
# Save under <path>/models. The location is derived from `path` itself rather
# than `path.stem`, which would silently drop any parent directories of `path`.
xgb_booster.save_model(str(path/'models'/'xgb_g1-3f-d4-perfect'))

In [None]:
xgb.plot_importance(xgb_booster, importance_type='weight')

In [None]:
xgb.plot_importance(xgb_booster, importance_type='gain')

LightGBM

In [None]:
import lightgbm as lgb

In [None]:
print(dir(lgb))

In [None]:
# use cat
gtrain = lgb.Dataset(data=train[features], label=train[target], reference=None,
                        weight=None, group=None, init_score=None,
                        silent=False, feature_name=features,
                        categorical_feature=cat, params=None, free_raw_data=True)

gval = lgb.Dataset(data=val[features], label=val[target], reference=gtrain,
                        weight=None, group=None, init_score=None,
                        silent=False, feature_name=features,
                        categorical_feature=cat, params=None, free_raw_data=True)
gc.collect()

In [None]:
# cat auto
gtrain = lgb.Dataset(data=train[features], label=train[target], reference=None,
                        weight=None, group=None, init_score=None,
                        silent=False, feature_name='auto',
                        categorical_feature='auto', params=None, free_raw_data=True)

gval = lgb.Dataset(data=val[features], label=val[target], reference=gtrain,
                        weight=None, group=None, init_score=None,
                        silent=False, feature_name='auto',
                        categorical_feature='auto', params=None, free_raw_data=True)
gc.collect()

In [None]:
# LightGBM training parameters.
lgb_params={
    #Core Parameters
    'objective':'binary', #cross_entropy
    'learning_rate':0.4, #0.05
    'num_leaves':31, #127
    'num_threads':4,
    'device_type':'cpu',
    
    #Learning Control Parameters
    'max_depth':-1, # -1 = no depth limit (LightGBM's documented default)
#     'feature_pre_filter':False, # to change the min_data_in_leaf
    'min_data_in_leaf':20, # 20
    'bagging_fraction':0.1,#0.05 increase
    'bagging_freq':1, # bagging_fraction is ignored unless bagging_freq > 0
    'feature_fraction':1.0,
    
    #Metric Parameters
    'metric':'auc'
}

In [None]:
%%time
# use cat
lgb_booster = lgb.train(params=lgb_params, train_set=gtrain, num_boost_round=1000,
                        valid_sets=[gtrain, gval], valid_names=['train_df', 'val_df'],
                        fobj=None, feval=None,
                        init_model=None, feature_name=features, categorical_feature=cat,
                        early_stopping_rounds=10, evals_result=None, verbose_eval=100,
                        learning_rates=None,
                        keep_training_booster=False, callbacks=None)

In [None]:
# Save under <path>/models. The location is derived from `path` itself rather
# than `path.stem`, which would silently drop any parent directories of `path`.
lgb_booster.save_model(str(path/'models'/'lgb_g4-30f-l127-cat-396-77636'))

In [None]:
%%time
# cat auto
#list(np.arange(0.6, 0.02, -(0.6-0.02)/1000))
lgb_booster = lgb.train(params=lgb_params, train_set=gtrain, num_boost_round=1,
                        valid_sets=[gtrain, gval], valid_names=['train_df', 'val_df'],
                        fobj=None, feval=None,
                        init_model=None, feature_name='auto', categorical_feature='auto',
                        early_stopping_rounds=None, evals_result=None, verbose_eval=1,
                        learning_rates=None,
                        keep_training_booster=False, callbacks=None)

In [None]:
# Save under <path>/models. The location is derived from `path` itself rather
# than `path.stem`, which would silently drop any parent directories of `path`.
lgb_booster.save_model(str(path/'models'/'lgb_g1-3f-l31-perfect'))

In [None]:
lgb.plot_importance(lgb_booster, importance_type='split')

In [None]:
lgb.plot_importance(lgb_booster, importance_type='gain')

In [None]:
# Reload the saved model from <path>/models (location built from `path`, not
# `path.stem`, so parent directories of `path` are honoured).
lgb_booster_1 = lgb.Booster(model_file=str(path/'models'/'lgb_g1-3f-l31-perfect'))
# lgb_booster_1c = lgb.Booster(model_file=path.stem + '/models/lgb_g1-30f-l127-cat-329-77551')
# lgb_booster_2 = lgb.Booster(model_file=path.stem + '/models/lgb_g2-30f-l127-1646-77602')
# lgb_booster_2c = lgb.Booster(model_file=path.stem + '/models/lgb_g2-30f-l127-cat-318-77595')
# lgb_booster_3 = lgb.Booster(model_file=path.stem + '/models/lgb_g3-30f-l127-1338-77631')
# lgb_booster_3c = lgb.Booster(model_file=path.stem + '/models/lgb_g3-30f-l127-cat-391-77672')
# lgb_booster_4 = lgb.Booster(model_file=path.stem + '/models/lgb_g4-30f-l127-1802-77642')
# lgb_booster_4c = lgb.Booster(model_file=path.stem + '/models/lgb_g4-30f-l127-cat-396-77636')

In [None]:
lgb_booster_1.num_trees(), #lgb_booster_1c.num_trees()

In [None]:
val1 = pd.read_feather(path/'val_gr1_transformed.feather')
# val2 = pd.read_feather(path/'val_gr2_transformed.feather')
# val3 = pd.read_feather(path/'val_gr3_transformed.feather')
# val4 = pd.read_feather(path/'val_gr4_transformed.feather')

In [None]:
val=val1
gc.collect()

In [None]:
%%time
lgb_preds_1 = lgb_booster_1.predict(val[features])
# lgb_preds_1c = lgb_booster_1c.predict(val[features])
# lgb_preds_2 = lgb_booster_2.predict(val[features])
# lgb_preds_2c = lgb_booster_2c.predict(val[features])
# lgb_preds_3 = lgb_booster_3.predict(val[features])
# lgb_preds_3c = lgb_booster_3c.predict(val[features])
# lgb_preds_4 = lgb_booster_4.predict(val[features])
# lgb_preds_4c = lgb_booster_4c.predict(val[features])

In [None]:
lgb_preds_1[:7]

In [None]:
# lgb_preds_1c  # disabled: only defined when the commented-out lgb_booster_1c predict call is re-enabled

In [None]:
val[target].values[:7]

In [None]:
w=0.5
print('lgb 1')
print("val: {:.5}".format(roc_auc_score(val[target], lgb_preds_1)))
# print("val cat: {:.5}".format(roc_auc_score(val[target], lgb_preds_1c)))
# print("val avg: {:.5}".format(roc_auc_score(val[target], w*lgb_preds_1+(1-w)*lgb_preds_1c)))
# print()
# print('lgb 2')
# print("val: {:.5}".format(roc_auc_score(val[target], lgb_preds_2)))
# print("val cat: {:.5}".format(roc_auc_score(val[target], lgb_preds_2c)))
# print("val avg: {:.5}".format(roc_auc_score(val[target], w*lgb_preds_2+(1-w)*lgb_preds_2c)))
# print()
# print('lgb 3')
# print("val: {:.5}".format(roc_auc_score(val[target], lgb_preds_3)))
# print("val cat: {:.5}".format(roc_auc_score(val[target], lgb_preds_3c)))
# print("val avg: {:.5}".format(roc_auc_score(val[target], w*lgb_preds_3+(1-w)*lgb_preds_3c)))
# print()
# print('lgb 4')
# print("val: {:.5}".format(roc_auc_score(val[target], lgb_preds_4)))
# print("val cat: {:.5}".format(roc_auc_score(val[target], lgb_preds_4c)))
# print("val avg: {:.5}".format(roc_auc_score(val[target], w*lgb_preds_4+(1-w)*lgb_preds_4c)))
# print()
# print('lgb avg all')
# print("val avg: {:.5}".format(roc_auc_score(val[target], 0.25*lgb_preds_1+0.25*lgb_preds_2
#                                             +0.25*lgb_preds_3 + 0.25*lgb_preds_4)))
# print()
# print('lgb avg cat all')
# print("val avg: {:.5}".format(roc_auc_score(val[target], 0.25*lgb_preds_1c+0.25*lgb_preds_2c
#                                             +0.25*lgb_preds_3c+ 0.25*lgb_preds_4c)))
# print()
# print('lgb avg all')
# print("val avg: {:.5}".format(roc_auc_score(val[target], (1-w)/4*lgb_preds_1c+(1-w)/4*lgb_preds_2c
#                                             +(1-w)/4*lgb_preds_3c+ (1-w)/4*lgb_preds_4c + w/4*lgb_preds_1c+w/4*lgb_preds_2c
#                                             +w/4*lgb_preds_3c+ w/4*lgb_preds_4c)))

In [None]:
preds = lgb_booster_1.predict(train[features])

In [None]:
np.unique(preds)

In [None]:
roc_auc_score(train[target], preds)

In [None]:
# NOTE(review): `train_p1` is not assigned in any cell visible in this notebook;
# this cell presumably relies on leftover kernel state - confirm or remove.
train_p1

In [None]:
train_p1[train_p1['user_id']==13134].tail(3)

In [None]:
train_p2[train_p2['user_id']==13134].tail(3)

In [None]:
val_p1[val_p1['user_id']==13134].tail(3)

In [None]:
val_p2[val_p2['user_id']==13134].head(3)

In [None]:
# Cap `time_between` at 600000 and store the result in a new column.
# NOTE(review): `data` is only loaded by a later cell (pd.read_feather of
# 'data_q.feather'), so this cell fails on a fresh top-to-bottom run; it also
# assumes the loaded frame has a `time_between` column - confirm.
data['time_between_clipped']= data['time_between'].clip(upper=600000)

In [None]:
s = np.round(data['time_between_clipped'],-2).sort_values().values

In [None]:
np.round(data['time_between_clipped'],-2).value_counts()

In [None]:
data = pd.read_feather(path/'data_q.feather')
# data = pd.read_feather(path/'data_q_transformed.feather')

In [None]:
data.shape

In [None]:
# One row per user: number of answers (a_count) and mean correctness (a_mean).
user_ = (data.groupby('user_id')['answered_correctly']
             .agg(a_count='count', a_mean='mean')
             .reset_index())
# class_count = how many users share this user's a_count value.
user_['class_count'] = user_['a_count'].map(user_['a_count'].value_counts())

In [None]:
user_

In [None]:
val_users=[]

In [None]:
# add 50 users with class_count 1 (362072 rows)
seed = 2020
val_users.extend(user_[user_['class_count']==1].sample(n=50, random_state=seed)['user_id'].to_list())

In [None]:
data.loc[data['user_id'].isin(val_users)].shape

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# set 0.5% as new users
_, new_user_df = train_test_split(user_[user_['class_count']!=1],
                              test_size=0.005,
                              stratify=user_[user_['class_count']!=1]['class_count'],
                              random_state=seed)

In [None]:
new_user_df.shape

In [None]:
data.loc[data['user_id'].isin(new_user_df['user_id'])].shape

In [None]:
# add 1959 users as new users with stratified class_count (432282 rows)
val_users.extend(new_user_df['user_id'].to_list())
val_users.sort()

In [None]:
len(val_users), 362072 + 432282 

In [None]:
data.loc[data['user_id'].isin(val_users)].shape

In [None]:
# set 30% from remaining as existing users
# Candidates: users whose class_count is not 1 and who are not already in val_users.
split_candidates = user_[(user_['class_count']!=1)&(~user_['user_id'].isin(val_users))]
_, users_to_split_df = train_test_split(split_candidates,
                                        test_size=0.3,
                                        stratify=split_candidates['class_count'],
                                        random_state=2020)

In [None]:
users_to_split_df

In [None]:
len(users_to_split_df['user_id'].unique())

In [None]:
data_ = data.loc[data['user_id'].isin(users_to_split_df['user_id'])].copy()
gc.collect()

In [None]:
data_.shape

In [None]:
len(data_['user_id'].unique())

In [None]:
def get_val_idx(df, n_iter=17):
    """Return the index labels of each eligible user's most recent rows.

    A user is eligible when they have more than ``n_iter`` distinct
    ``task_container_id`` values. For every eligible user, all rows that carry
    one of that user's ``n_iter`` latest distinct ``timestamp`` values are
    selected, so rows sharing a timestamp always stay together.

    ``df`` is left untouched: no helper column is added to the caller's frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id``, ``task_container_id`` and ``timestamp``.
    n_iter : int, default 17
        Number of latest distinct timestamps taken per eligible user.

    Returns
    -------
    list
        Sorted index labels of the selected rows.
    """
    per_user = df.groupby('user_id')

    # Users with too few distinct task containers contribute no validation rows.
    eligible = per_user['task_container_id'].transform('nunique') > n_iter

    # Dense rank per user: 1 = latest timestamp, 2 = second latest, ...
    # Taking ranks 1..n_iter equals peeling off the per-user maximum n_iter times.
    recency_rank = per_user['timestamp'].rank(method='dense', ascending=False)

    selected = (eligible & (recency_rank <= n_iter)).to_numpy()
    return sorted(df.index[selected].tolist())

In [None]:
val_users_from_split = get_val_idx(data_, n_iter=12)

In [None]:
val_users_from_split[:5]

In [None]:
len(val_users_from_split)

In [None]:
794354 + 1832472

In [None]:
val_df = data.loc[val_users_from_split]

In [None]:
val_df.shape

In [None]:
len(val_df['user_id'].unique())

In [None]:
# Training rows: every interaction of the users chosen for the within-user
# split, minus the rows that get_val_idx moved to validation.
# The names `users_to_split` / `existing_users_idx` used here before are never
# defined in this notebook; `users_to_split_df` and `val_users_from_split` are
# the objects built in the cells above.
# NOTE(review): users that are neither in `users_to_split_df` nor in `val_users`
# end up in no split with this filter - confirm that this is intended.
train_df = (data[data['user_id'].isin(users_to_split_df['user_id'])]
            .drop(index=val_users_from_split)
            .reset_index(drop=True))

In [None]:
common_users = set(train_df['user_id'].unique()).intersection(val_df['user_id'].unique())

In [None]:
only_in_train = set(train_df['user_id'].unique()).difference(common_users)

In [None]:
only_in_val = set(val_df['user_id'].unique()).difference(common_users)

In [None]:
len(common_users), len(only_in_train), len(only_in_val)

In [None]:
len(common_users) + len(only_in_train) + len(only_in_val)

In [None]:
train_df.shape, val_df.shape

In [None]:
check = train_df.loc[train_df['user_id'].isin(common_users),['user_id', 'timestamp']].groupby('user_id').max().join(
    val_df.loc[val_df['user_id'].isin(common_users), ['user_id', 'timestamp']].groupby('user_id').min(),rsuffix='_val')

In [None]:
check.shape

In [None]:
check

In [None]:
(check['timestamp'] > check['timestamp_val']).sum()

In [None]:
# Add every row of the held-out `val_users` to the validation frame, then order
# by user and time. pd.concat is used because DataFrame.append was removed in
# pandas 2.0.
val_df = (pd.concat([val_df, data[data['user_id'].isin(val_users)]])
          .sort_values(by=['user_id', 'timestamp'])
          .reset_index(drop=True))

In [None]:
val_df.shape

In [None]:
questions_df = pd.read_csv(path/'questions.csv')

In [None]:
len(questions_df['question_id'].unique())

In [None]:
len(data['content_id'].unique())

In [None]:
len(train_df['content_id'].unique())

In [None]:
len(val_df['content_id'].unique())

In [None]:
train_df.head()

In [None]:
train_df.to_feather(path/'train_m.feather')
val_df.to_feather(path/'val_m.feather')

In [None]:
import random

seed = 2020
random.seed(seed)

In [None]:
train = pd.read_csv(path/'train.csv',
                   dtype={'row_id': 'int64',
                          'timestamp': 'int64',
                          'user_id': 'int32',
                          'content_id': 'int16',
                          'content_type_id': 'int8',
                          'task_container_id': 'int16',
                          'user_answer': 'int8',
                          'answered_correctly':'int8',
                          'prior_question_elapsed_time': 'float32',
                          'prior_question_had_explanation': 'boolean'}
                   )

In [None]:
train = train.drop(index=train[train['content_type_id']==1].index,
                   columns = ['content_type_id', 'user_answer']).reset_index(drop=True)

In [None]:
train.shape

In [None]:
max_timestamp_u = train[['user_id','timestamp']].groupby(['user_id']).agg(['max']).reset_index()
max_timestamp_u.columns = ['user_id', 'max_time_stamp']
MAX_TIME_STAMP = max_timestamp_u.max_time_stamp.max()

In [None]:
def rand_time(max_time_stamp):
    """Draw a random integer offset between 0 and ``MAX_TIME_STAMP - max_time_stamp``.

    Both bounds are inclusive (``random.randint``). ``MAX_TIME_STAMP`` is read
    from module scope, and the draw uses the global ``random`` generator.
    """
    return random.randint(0, MAX_TIME_STAMP - max_time_stamp)

In [None]:
max_timestamp_u['rand_time_stamp'] = max_timestamp_u.max_time_stamp.apply(rand_time)
train = train.merge(max_timestamp_u, on='user_id', how='left')
train['viretual_time_stamp'] = train.timestamp + train['rand_time_stamp']

In [None]:
len(train['user_id'].unique())

In [None]:
users = train[['user_id', 'timestamp']].groupby('user_id').first().reset_index()

In [None]:
users

In [None]:
user_gr1 = users['user_id'].sample(n=100000, random_state=2020).to_list()
user_gr1.sort()

In [None]:
user_gr2 = users.loc[~users['user_id'].isin(user_gr1)].sample(n=100000, random_state=2020)['user_id'].to_list()
user_gr2.sort()

In [None]:
user_gr3 = users.loc[~users['user_id'].isin(user_gr1+user_gr2)].sample(n=100000, random_state=2020)['user_id'].to_list()
user_gr3.sort()

In [None]:
user_gr4 = users.loc[~users['user_id'].isin(user_gr1+user_gr2+user_gr3)]['user_id'].to_list()
user_gr4.sort()

In [None]:
assert (len(user_gr1 + user_gr2 + user_gr3 + user_gr4) == users.shape[0])

In [None]:
# train.loc[train['user_id'].isin(user_gr4)]

In [None]:
train_gr1 = train.loc[train['user_id'].isin(user_gr1)]
train_gr2 = train.loc[train['user_id'].isin(user_gr2)]
train_gr3 = train.loc[train['user_id'].isin(user_gr3)]
train_gr4 = train.loc[train['user_id'].isin(user_gr4)]

In [None]:
train_gr1.shape

In [None]:
train_gr2.shape

In [None]:
train_gr3.shape

In [None]:
train_gr4.shape

In [None]:
train = train.sort_values(['viretual_time_stamp', 'row_id']).reset_index(drop=True)

In [None]:
train_gr1 = train_gr1.sort_values(['viretual_time_stamp', 'row_id']).reset_index(drop=True)
train_gr2 = train_gr2.sort_values(['viretual_time_stamp', 'row_id']).reset_index(drop=True)
train_gr3 = train_gr3.sort_values(['viretual_time_stamp', 'row_id']).reset_index(drop=True)
train_gr4 = train_gr4.sort_values(['viretual_time_stamp', 'row_id']).reset_index(drop=True)

In [None]:
# For each group, the last 2,500,000 rows (in the frame's current row order)
# become the validation part and everything before them stays in train.
n_valid_rows = 2500000
train_gr1, valid_gr1 = train_gr1[:-n_valid_rows], train_gr1[-n_valid_rows:]
train_gr2, valid_gr2 = train_gr2[:-n_valid_rows], train_gr2[-n_valid_rows:]
train_gr3, valid_gr3 = train_gr3[:-n_valid_rows], train_gr3[-n_valid_rows:]
train_gr4, valid_gr4 = train_gr4[:-n_valid_rows], train_gr4[-n_valid_rows:]

In [None]:
train_gr1 = train_gr1.sort_values(by=['user_id', 'timestamp']).reset_index(drop=True)
valid_gr1 = valid_gr1.sort_values(by=['user_id', 'timestamp']).reset_index(drop=True)
train_gr2 = train_gr2.sort_values(by=['user_id', 'timestamp']).reset_index(drop=True)
valid_gr2 = valid_gr2.sort_values(by=['user_id', 'timestamp']).reset_index(drop=True)
train_gr3 = train_gr3.sort_values(by=['user_id', 'timestamp']).reset_index(drop=True)
valid_gr3 = valid_gr3.sort_values(by=['user_id', 'timestamp']).reset_index(drop=True)
train_gr4 = train_gr4.sort_values(by=['user_id', 'timestamp']).reset_index(drop=True)
valid_gr4 = valid_gr4.sort_values(by=['user_id', 'timestamp']).reset_index(drop=True)

In [None]:
tmp1 = train_gr1[['user_id', 'timestamp']].groupby('user_id').max().join(valid_gr1[['user_id', 'timestamp']].groupby('user_id').min(),
                                                                 rsuffix='_val', how='inner')
tmp2 = train_gr2[['user_id', 'timestamp']].groupby('user_id').max().join(valid_gr2[['user_id', 'timestamp']].groupby('user_id').min(),
                                                                 rsuffix='_val', how='inner')
tmp3 = train_gr3[['user_id', 'timestamp']].groupby('user_id').max().join(valid_gr3[['user_id', 'timestamp']].groupby('user_id').min(),
                                                                 rsuffix='_val', how='inner')
tmp4 = train_gr4[['user_id', 'timestamp']].groupby('user_id').max().join(valid_gr4[['user_id', 'timestamp']].groupby('user_id').min(),
                                                                 rsuffix='_val', how='inner')

In [None]:
tmp2[tmp2['timestamp'] == tmp2['timestamp_val']]

In [None]:
train_gr2[(train_gr2['user_id'] == 1558331816) & (train_gr2['timestamp'] == 16484595726)]

In [None]:
valid_gr2[(valid_gr2['user_id'] == 1558331816) & (valid_gr2['timestamp'] == 16484595726)]

In [None]:
# train_gr1['row_id'].to_pickle(path/'train_gr1_row_id.pickle')
# valid_gr1['row_id'].to_pickle(path/'val_gr1_row_id.pickle')
# train_gr2['row_id'].to_pickle(path/'train_gr2_row_id.pickle')
# valid_gr2['row_id'].to_pickle(path/'val_gr2_row_id.pickle')
# train_gr3['row_id'].to_pickle(path/'train_gr3_row_id.pickle')
# valid_gr3['row_id'].to_pickle(path/'val_gr3_row_id.pickle')
# train_gr4['row_id'].to_pickle(path/'train_gr4_row_id.pickle')
# valid_gr4['row_id'].to_pickle(path/'val_gr4_row_id.pickle')

In [None]:
train_gr1_list = train_gr1['row_id'].to_list()
valid_gr1_list = valid_gr1['row_id'].to_list()
train_gr2_list = train_gr2['row_id'].to_list()
valid_gr2_list = valid_gr2['row_id'].to_list()
train_gr3_list = train_gr3['row_id'].to_list()
valid_gr3_list = valid_gr3['row_id'].to_list()
train_gr4_list = train_gr4['row_id'].to_list()
valid_gr4_list = valid_gr4['row_id'].to_list()

In [None]:
73251571

In [None]:
# Move row 73251571 from the group-2 validation ids to the group-2 train ids.
# NOTE(review): presumably this is the tied-timestamp row inspected in the
# cells above - confirm.
# The membership check makes the cell safe to re-run: without it a second run
# would add the id to the train list again and then fail on `remove`.
boundary_row_id = 73251571
if boundary_row_id in valid_gr2_list:
    valid_gr2_list.remove(boundary_row_id)
    train_gr2_list.append(boundary_row_id)
    train_gr2_list.sort()

In [None]:
# Persist the row-id list of every train / validation group as its own pickle
# inside `path`, in the same order as before (train then val, groups 1 to 4).
row_id_files = [
    ('train_gr1_row_id.pickle', train_gr1_list),
    ('val_gr1_row_id.pickle', valid_gr1_list),
    ('train_gr2_row_id.pickle', train_gr2_list),
    ('val_gr2_row_id.pickle', valid_gr2_list),
    ('train_gr3_row_id.pickle', train_gr3_list),
    ('val_gr3_row_id.pickle', valid_gr3_list),
    ('train_gr4_row_id.pickle', train_gr4_list),
    ('val_gr4_row_id.pickle', valid_gr4_list),
]
for file_name, row_ids in row_id_files:
    with open(path/file_name, mode='wb') as file:
        pickle.dump(row_ids, file)

In [None]:
with open(path/'train_gr2_row_id.pickle', mode='rb') as file:
    tr2 = pickle.load(file)

In [None]:
train.loc[train['row_id'].isin(tr2)]

In [None]:
tr2

In [None]:
train.head()

In [None]:
val_size = 2500000

# Build 5 folds by repeatedly cutting the last `val_size` rows off the frame:
# fold k validates on that slice and trains on everything before it.
# The slicing runs on a local `remaining` frame so the global `train` is not
# overwritten; re-running the cell therefore reproduces the same folds.
# After the loop, `remaining` holds the rows left once all 5 slices are removed.
remaining = train
for cv in range(5):
    valid = remaining[-val_size:]
    remaining = remaining[:-val_size]

    valid['row_id'].to_pickle(f'cv{cv+1}_valid_rows.pickle')
    remaining['row_id'].to_pickle(f'cv{cv+1}_train_rows.pickle')

In [None]:
train.shape