In [1]:
import pandas as pd
import random
import gc

# Seed Python's `random` module so the per-user offsets drawn later via
# random.randint (inside rand_time) come out the same on every run.
random.seed(1)

In [2]:
# Read the training table from its Feather file into a DataFrame.
train = pd.read_feather('../data/input/train.feather')

In [3]:
# Display the (rows, columns) size of the loaded frame.
train.shape

(101230332, 10)

In [4]:
# Baseline split for comparison: hold out the final five rows of every user,
# keep everything else as training data, then print the mean of
# answered_correctly on each side (train first, validation second).
valid_split1 = train.groupby('user_id').tail(5)
train_split1 = train.loc[~train['row_id'].isin(valid_split1['row_id'])]
train_mean = train_split1['answered_correctly'].mean()
valid_mean = valid_split1['answered_correctly'].mean()
print(f'{train_mean:.3f} {valid_mean:.3f}')

0.627 0.520


In [5]:
# The comparison frames were only needed for the printout above; drop them
# and run a collection pass before the heavier steps.
del valid_split1
del train_split1
gc.collect()

22

In [6]:
# One row per user holding that user's largest timestamp, plus the largest
# such value over all users (MAX_TIME_STAMP), which rand_time reads as a global.
max_timestamp_u = (
    train.groupby('user_id')['timestamp']
    .max()
    .rename('max_time_stamp')
    .reset_index()
)
MAX_TIME_STAMP = max_timestamp_u['max_time_stamp'].max()

In [7]:
def rand_time(max_time_stamp):
    """Draw a random integer offset for a single user.

    Args:
        max_time_stamp: the user's largest timestamp.

    Returns:
        An integer in ``[0, MAX_TIME_STAMP - max_time_stamp]`` (both ends
        inclusive), drawn with ``random.randint``. ``MAX_TIME_STAMP`` is read
        from the module-level global at call time.
    """
    return random.randint(0, MAX_TIME_STAMP - max_time_stamp)

# Draw one random offset per user, attach it to every row of that user via a
# left join on user_id, and shift each row's timestamp by the offset.
# The column name 'viretual_time_stamp' (sic) is kept as-is because later
# cells sort on it and it is written to the output files.
max_timestamp_u['rand_time_stamp'] = max_timestamp_u['max_time_stamp'].apply(rand_time)
train = pd.merge(train, max_timestamp_u, how='left', on='user_id')
train['viretual_time_stamp'] = train['timestamp'] + train['rand_time_stamp']

In [8]:
# The two helper columns and the per-user table have served their purpose
# once the shifted timestamp column exists; remove them and collect.
del train['max_time_stamp'], train['rand_time_stamp']
del max_timestamp_u
gc.collect()

35

In [9]:
# Order rows by the shifted timestamp, using row_id as the secondary sort key
# for rows that share the same shifted timestamp.
train = train.sort_values(['viretual_time_stamp', 'row_id'])

In [10]:
# Show the first rows of the re-ordered frame.
train.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,viretual_time_stamp
32933156,32933156,0,705741139,128,0,0,0,1,,,0
32933157,32933157,20666,705741139,7860,0,1,0,1,16000.0,False,20666
32933158,32933158,39172,705741139,7922,0,2,1,1,19000.0,False,39172
32933159,32933159,58207,705741139,156,0,3,2,1,17000.0,False,58207
32933160,32933160,75779,705741139,51,0,4,0,1,17000.0,False,75779


In [11]:
val_size = 2500000

# Peel validation folds off the tail of the time-ordered frame, newest first.
# The slicing is done on a local view (`remaining`) and `train` itself is NOT
# rebound: the row_id fold cell further down expects to start from the full
# frame (its recorded output begins at 98,730,332 training rows), which only
# holds on a top-to-bottom run if this loop leaves `train` untouched.
remaining = train
for cv in range(5):
    valid = remaining.iloc[-val_size:]
    remaining = remaining.iloc[:-val_size]

    # check new users and new contents:
    # users present in this validation fold but absent from the rows before it
    unseen_user_rows = valid[~valid.user_id.isin(remaining.user_id)]
    new_users = len(unseen_user_rows.user_id.unique())
    # same check for question content ids (rows with content_type_id == 0)
    valid_question = valid[valid.content_type_id == 0]
    train_question = remaining[remaining.content_type_id == 0]
    unseen_question_rows = valid_question[~valid_question.content_id.isin(train_question.content_id)]
    new_contents = len(unseen_question_rows.content_id.unique())

    # Note: the log line labels folds from 0, the file names from 1.
    print(f'cv{cv} {remaining.answered_correctly.mean():.3f} {valid.answered_correctly.mean():.3f} {new_users} {new_contents}')
    print(f'train:{remaining.shape} valid:{valid.shape}')

    # Feather output: reset_index(drop=True) replaces the sorted frame's
    # shuffled index with a fresh default one before writing.
    valid.reset_index(drop=True).to_feather(f'../data/train_valid/cv{cv+1}_valid_all.feather')
    remaining.reset_index(drop=True).to_feather(f'../data/train_valid/cv{cv+1}_train_all.feather')

cv0 0.626 0.611 15119 0
train:(98730332, 11) valid:(2500000, 11)
cv1 0.626 0.619 11198 0
train:(96230332, 11) valid:(2500000, 11)
cv2 0.626 0.615 10159 0
train:(93730332, 11) valid:(2500000, 11)
cv3 0.626 0.619 9687 3
train:(91230332, 11) valid:(2500000, 11)
cv4 0.626 0.622 9184 0
train:(88730332, 11) valid:(2500000, 11)


In [32]:
val_size = 2500000

# Same tail-peeling scheme as the full-frame export, but for 40 folds and
# writing only the row_id column of each side.
# The slicing is done on a local view (`remaining`) instead of rebinding
# `train`, so re-running this cell produces the same folds rather than
# continuing to eat into an already shortened frame.
remaining = train
for cv in range(40):
    valid = remaining.iloc[-val_size:]
    remaining = remaining.iloc[:-val_size]

    # check new users and new contents:
    # users present in this validation fold but absent from the rows before it
    unseen_user_rows = valid[~valid.user_id.isin(remaining.user_id)]
    new_users = len(unseen_user_rows.user_id.unique())
    # same check for question content ids (rows with content_type_id == 0)
    valid_question = valid[valid.content_type_id == 0]
    train_question = remaining[remaining.content_type_id == 0]
    unseen_question_rows = valid_question[~valid_question.content_id.isin(train_question.content_id)]
    new_contents = len(unseen_question_rows.content_id.unique())

    # Note: the log line labels folds from 0, the file names from 1.
    print(f'cv{cv} {remaining.answered_correctly.mean():.3f} {valid.answered_correctly.mean():.3f} {new_users} {new_contents}')
    print(f'train:{remaining.shape} valid:{valid.shape}')

    # Only the row ids are stored; reset_index(drop=True) gives each file a
    # fresh default index before writing.
    valid[['row_id']].reset_index(drop=True).to_feather(f'../data/train_valid/cv{cv+1}_valid.feather')
    remaining[['row_id']].reset_index(drop=True).to_feather(f'../data/train_valid/cv{cv+1}_train.feather')

cv0 0.626 0.611 15119 0
train:(98730332, 11) valid:(2500000, 11)
cv1 0.626 0.619 11198 0
train:(96230332, 11) valid:(2500000, 11)
cv2 0.626 0.615 10159 0
train:(93730332, 11) valid:(2500000, 11)
cv3 0.626 0.619 9687 3
train:(91230332, 11) valid:(2500000, 11)
cv4 0.626 0.622 9184 0
train:(88730332, 11) valid:(2500000, 11)
cv5 0.626 0.623 9030 0
train:(86230332, 11) valid:(2500000, 11)
cv6 0.627 0.620 8923 0
train:(83730332, 11) valid:(2500000, 11)
cv7 0.627 0.621 8963 0
train:(81230332, 11) valid:(2500000, 11)
cv8 0.627 0.624 8652 0
train:(78730332, 11) valid:(2500000, 11)
cv9 0.627 0.623 8725 0
train:(76230332, 11) valid:(2500000, 11)
cv10 0.627 0.626 8758 0
train:(73730332, 11) valid:(2500000, 11)
cv11 0.627 0.627 8398 0
train:(71230332, 11) valid:(2500000, 11)
cv12 0.627 0.630 8555 0
train:(68730332, 11) valid:(2500000, 11)
cv13 0.627 0.627 8094 0
train:(66230332, 11) valid:(2500000, 11)
cv14 0.627 0.628 8222 1
train:(63730332, 11) valid:(2500000, 11)
cv15 0.627 0.630 8464 0
train:(6

In [31]:
# Scratch arithmetic: rows left after removing 39 chunks of 2,500,000 from the
# 101,230,332-row table (the row count shown by train.shape).
101230332 - (2500000*39)

3730332

In [25]:
# Scratch arithmetic: 4% of the 101,230,332 total rows.
# NOTE(review): purpose not evident from this notebook — presumably sizing a
# validation fraction; confirm or remove.
101230332 * 0.04

4049213.2800000003

In [None]:
# NOTE(review): bare scratch values in a cell that was never executed; no
# other cell in view reads them. Presumably leftover fold-size arithmetic —
# confirm, then delete this cell.
6230332
6230332
2500000
38730332