In [1]:
import gc
import os

import numpy as np
import pandas as pd

In [29]:
# Column -> dtype mapping handed to the pd.read_csv calls below as `dtype=`.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='int8',
)

# The only columns loaded from each CSV (passed as `usecols=`).
to_read = ['ip', 'click_time']

Train: earliest click time per IP

In [20]:
# Chunked reader over the training file: yields frames of up to 10**6 rows,
# restricted to the `to_read` columns, with click_time parsed as a datetime.
train_tmp = pd.read_csv(
    './data/train.csv.zip',
    chunksize=10**6,
    parse_dates=['click_time'],
    usecols=to_read,
    dtype=dtypes,
)

In [21]:
# Reduce every chunk to its earliest click per IP. The per-chunk results are
# collected in a list and concatenated once: calling pd.concat inside the loop
# re-copies the whole accumulated frame on every chunk (quadratic overall).
chunk_minima = []

for df in train_tmp:
    agg = df.groupby('ip')['click_time'].min().reset_index()
    chunk_minima.append(agg)

# These are per-chunk minima only: an IP that appears in several chunks still
# has several rows here, so a final groupby over this frame is required.
# pd.concat raises on an empty list, hence the empty-frame fallback.
min_ip_time = pd.concat(chunk_minima) if chunk_minima else pd.DataFrame()

In [22]:
# Collapse the per-chunk minima to one row per IP: its earliest click in train.
min_ip_time_train = min_ip_time.groupby('ip', as_index=False)['click_time'].min()

In [35]:
# Display (rows, columns) of the per-IP train aggregate; one row per distinct IP.
min_ip_time_train.shape

(277396, 2)

In [27]:
# Drop the train chunk reader and the per-chunk minima; only
# min_ip_time_train is needed from here on. gc.collect() returns the number
# of unreachable objects it found, which the cell displays.
del train_tmp
del min_ip_time
gc.collect()

734

Test supplement: earliest click time per IP

In [28]:
# Chunked reader over the test supplement file: yields frames of up to 10**6
# rows, restricted to the `to_read` columns, with click_time parsed as a datetime.
test_tmp = pd.read_csv(
    './data/test_supplement.csv.zip',
    chunksize=10**6,
    parse_dates=['click_time'],
    usecols=to_read,
    dtype=dtypes,
)

In [30]:
# Reduce every test-supplement chunk to its earliest click per IP. The
# per-chunk results are collected in a list and concatenated once: calling
# pd.concat inside the loop re-copies the whole accumulated frame on every
# chunk (quadratic overall).
chunk_minima = []

for df in test_tmp:
    agg = df.groupby('ip')['click_time'].min().reset_index()
    chunk_minima.append(agg)

# These are per-chunk minima only: an IP that appears in several chunks still
# has several rows here, so a final groupby over this frame is required.
# pd.concat raises on an empty list, hence the empty-frame fallback.
min_ip_time = pd.concat(chunk_minima) if chunk_minima else pd.DataFrame()

In [32]:
# Collapse the per-chunk minima to one row per IP: its earliest click in the
# test supplement.
min_ip_time_test = min_ip_time.groupby('ip', as_index=False)['click_time'].min()

In [34]:
# Display (rows, columns) of the per-IP test aggregate; one row per distinct IP.
min_ip_time_test.shape

(126414, 2)

In [36]:
# Stack the two per-IP tables and keep, for each IP, the earlier of its train
# and test-supplement first-click times.
min_ip_time = (
    pd.concat([min_ip_time_train, min_ip_time_test])
    .groupby('ip', as_index=False)['click_time']
    .min()
)

In [37]:
# Display (rows, columns) of the combined table; one row per IP seen in either file.
min_ip_time.shape

(364779, 2)

In [38]:
# The test chunk reader is no longer needed; drop it and run a collection.
# gc.collect() returns the number of unreachable objects found, which the cell displays.
del test_tmp
gc.collect()

350

In [42]:
# Rename the aggregated column to say what it holds: the earliest click time
# recorded for each IP.
min_ip_time = min_ip_time.rename(columns={'click_time': 'ip_first_seen'})

### Use the first-seen time as a feature

In [41]:
# Load a slice of the training file: file rows 1..144,903,890 are skipped
# (row 0, the header, is kept) and at most 40,000,000 rows are read after that.
# NOTE(review): presumably this selects the most recent rows of train.csv;
# confirm against the file's total row count.
train_df = pd.read_csv(
    './data/train.csv.zip',
    usecols=to_read,
    dtype=dtypes,
    parse_dates=['click_time'],
    skiprows=range(1, 144903891),
    nrows=40000000,
)

In [43]:
# Load the full test file, restricted to the `to_read` columns, with
# click_time parsed as a datetime.
test_df = pd.read_csv(
    './data/test.csv.zip',
    dtype=dtypes,
    parse_dates=['click_time'],
    usecols=to_read,
)

In [44]:
# Stack the train rows on top of the test rows. Each frame keeps its own row
# labels, so the combined index is not unique.
concat_df = pd.concat([train_df, test_df], axis=0)

In [50]:
# Both inputs now live in concat_df; drop the separate frames and run a
# collection. gc.collect() returns the number of unreachable objects found,
# which the cell displays.
del train_df
del test_df
gc.collect()

21

In [49]:
# Attach each IP's first-seen timestamp. The left join keeps every click row;
# an IP absent from min_ip_time would get NaT in ip_first_seen.
concat_df = pd.merge(concat_df, min_ip_time, how='left', on='ip')

### Calculate Time to First Seen

In [54]:
# Seconds elapsed between each click and the first time its IP was seen.
# .dt.total_seconds() yields float64 on every pandas version. The previous
# .astype('timedelta64[s]') returns float64 only on pandas < 2.0; on newer
# versions it produces a timedelta64[s] column, which breaks the numeric log2
# transform applied to this column afterwards.
# NOTE(review): for whole-second timestamps the values are identical to the
# old float64 result; sub-second inputs would now keep their fraction.
concat_df['time_diff_first_seen'] = (
    concat_df['click_time'] - concat_df['ip_first_seen']
).dt.total_seconds()

In [64]:
# Bucket the elapsed seconds on a log2 scale: log2(1 + seconds), truncated to
# an integer. The +1 keeps a zero-second gap finite (it maps to bucket 0).
concat_df['log2_time_diff_first_seen'] = np.log2(
    concat_df['time_diff_first_seen'].values + 1
).astype(int)

In [76]:
# Display the column dtypes of the assembled feature frame.
concat_df.dtypes

ip                                   uint32
click_time                   datetime64[ns]
ip_first_seen                datetime64[ns]
time_diff_first_seen                float64
log2_time_diff_first_seen             int64
dtype: object

In [77]:
# Create the output directory first: to_csv does not create missing parent
# directories, and failing here would throw away the work done above.
os.makedirs('tmp', exist_ok=True)

# Write the feature table as gzip-compressed CSV, without the index, in
# 10**6-row chunks.
concat_df.to_csv('tmp/time_diff.csv.gzip', index=False, chunksize=10**6, compression='gzip')