testとtest_supplementの分離とpkl化

In [1]:
import gc
import os
import pickle
import time
from contextlib import contextmanager

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
@contextmanager
def timer(name):
    """Context manager that prints how long the enclosed block took.

    Args:
        name: Label shown in square brackets in the printed message.

    Yields:
        None; control passes to the body of the ``with`` statement.

    Prints ``[<name>] done in <seconds> s`` (rounded to whole seconds) once
    the body finishes normally. If the body raises, nothing is printed and
    the exception propagates unchanged.
    """
    # perf_counter() is monotonic, so the interval cannot be skewed (or go
    # negative) when the system clock is adjusted, unlike time.time().
    t0 = time.perf_counter()
    yield
    print(f'[{name}] done in {time.perf_counter() - t0:.0f} s')

# Root directory of the competition data (kept as a string with a trailing
# slash because other cells build file names with `path + "..."`).
# '~' is expanded once here: the built-in open() does not expand it, so a
# literal '~/...' path only works with readers that expand it themselves.
path = os.path.expanduser('~/talkingdata/')

# Explicit compact dtypes handed to pd.read_csv for these columns.
dtypes = {
    'ip'            : 'uint32',
    'app'           : 'uint16',
    'device'        : 'uint16',
    'os'            : 'uint16',
    'channel'       : 'uint16',
    'is_attributed' : 'uint8',
    'click_id'      : 'uint32',
}

In [5]:
# Both files are read with the same column subset. click_time has no entry
# in dtypes, so its type is left to pandas' inference.
usecols = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'click_id']

# Timer labels name the file actually being read (test.csv, then
# test_supplement.csv).
with timer("load test data"):
    test_df = pd.read_csv(path + "input/test.csv", dtype=dtypes, usecols=usecols)

with timer("load test supplement data"):
    testsup_df = pd.read_csv(path + "input/test_supplement.csv", dtype=dtypes, usecols=usecols)

[load training data] done in 13 s
[load test data] done in 42 s


In [18]:
# Serialise the loaded test frame to 'test.csv.pkl'; the relative name is
# resolved against the current working directory. Pickle protocol 2 is used.
with open('test.csv.pkl', 'wb') as pkl_out:
    pickle.dump(test_df, pkl_out, protocol=2)

FileNotFoundError: [Errno 2] No such file or directory: '~/talkingdata/test.csv.pkl'

In [37]:
# Non-null count per column of the test frame (quick row-count check).
test_df.count()

click_id      18790469
ip            18790469
app           18790469
device        18790469
os            18790469
channel       18790469
click_time    18790469
dtype: int64

In [38]:
# Non-null count per column of the supplement frame.
# NOTE(review): the recorded output lists a 'click_id.test' column, which the
# load cell's usecols does not include; the merge cell further down adds it,
# so this cell appears to have been executed after that merge. Confirm, and
# move it below the merge so a top-to-bottom run reproduces the output.
testsup_df.count()

click_id         57845224
ip               57845224
app              57845224
device           57845224
os               57845224
channel          57845224
click_time       57845224
click_id.test    18790469
dtype: int64

In [26]:
# Relation table with columns 'click_id.test' and 'click_id.testsup'
# (presumably pairing each test.csv click_id with its test_supplement.csv
# click_id; verify against whatever produced the file).
# The location is built from the shared `path` so the data root is
# configured in a single place.
corr = pd.read_csv(path + "data/test_click_id_relation.csv")
print(corr.head())

   click_id.test  click_id.testsup
0              0          21290878
1              1          21290876
2              2          21290877
3              3          21290879
4              4          21290880


In [27]:
# Rename the supplement-side id column to 'click_id', the key name used when
# merging with testsup_df.
corr = corr.rename({'click_id.testsup': 'click_id'}, axis='columns')
print(corr.head(5))

   click_id.test  click_id
0              0  21290878
1              1  21290876
2              2  21290877
3              3  21290879
4              4  21290880


In [34]:
# Non-null count per column of the relation table.
corr.count()

click_id.test    18790469
click_id         18790469
dtype: int64

In [28]:
# Attach the test-side click_id to every supplement row that has a match in
# corr; rows without a match get NaN in 'click_id.test'.
#
# This cell reassigns testsup_df from itself, so a 'click_id.test' column
# left by an earlier run is dropped first. Without that, re-running would
# produce 'click_id.test_x' / 'click_id.test_y' instead of 'click_id.test'.
#
# NOTE(review): a left merge repeats a testsup row once per matching corr
# row, so repeated click_id values in corr would inflate the row count.
# Confirm corr's click_id is unique, or that such duplication is acceptable.
if 'click_id.test' in testsup_df.columns:
    testsup_df = testsup_df.drop(columns=['click_id.test'])

testsup_df = pd.merge(testsup_df, corr, on='click_id', how='left', sort=False)
print(testsup_df.head())

   click_id      ip  app  device  os  channel           click_time  \
0         0   43570    3       1  18      379  2017-11-09 14:23:39   
1         1   80528    3       1  13      379  2017-11-09 14:23:51   
2         2   32323    3       1  13      379  2017-11-09 14:25:57   
3         3   42887    3       1  17      379  2017-11-09 14:26:03   
4         4  119289   58       1  30      120  2017-11-09 14:26:41   

   click_id.test  
0            NaN  
1            NaN  
2            NaN  
3            NaN  
4            NaN  


In [31]:
# Keep only the supplement rows whose merged 'click_id.test' value is
# missing, i.e. rows with no matching entry in corr.
testsup_without_test = testsup_df.loc[testsup_df['click_id.test'].isna()]

In [35]:
# Non-null count per column of the supplement-only rows; 'click_id.test' is
# expected to be 0 here because the frame was selected on that column being null.
testsup_without_test.count()

click_id         39054755
ip               39054755
app              39054755
device           39054755
os               39054755
channel          39054755
click_time       39054755
click_id.test           0
dtype: int64

In [39]:
# Serialise the supplement-only rows to 'testsup_without_test.csv.pkl' in the
# current working directory, using pickle protocol 2.
with open('testsup_without_test.csv.pkl', 'wb') as pkl_out:
    pickle.dump(testsup_without_test, pkl_out, protocol=2)

In [36]:
# Sanity check: test rows plus supplement-only rows, computed from the frames
# themselves rather than from numbers copied out of the outputs above (which
# go stale when the data changes). Compare the displayed total with the row
# count reported by testsup_df.count().
len(test_df) + len(testsup_without_test)

57845224

In [None]:
# with timer("load training data"):
#     train_df = pd.read_csv(path+"input/train.csv", dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'click_id'])