In [1]:
import pandas as pd
import numpy as np
import time
import gc

In [2]:
# Directory containing the competition CSV files.
path = '/home/kai/data/kaggle/talkingdata/data/'

# Explicit unsigned integer dtypes for the id/label columns
# (presumably chosen to keep the very large frames small in memory).
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
    click_id='uint32',
)

# Columns that are not listed in `dtypes` are left to pandas' type inference.
train = pd.read_csv('{}train.csv'.format(path), dtype=dtypes)
test = pd.read_csv('{}test.csv'.format(path), dtype=dtypes)

# Check NA

In [3]:
# Report the row counts, then the number of missing values per column.
print('training length is {}'.format(len(train)))
print('testing length is {}'.format(len(test)))

n_train = train.isnull().sum()
n_test = test.isnull().sum()

for report in (n_train, '-------------------', n_test):
    print(report)

### Per the output: the only column with missing values is `attributed_time`
### in train; every other column in train and test has zero N/A.

training length is 184903890
testing length is 18790469
ip                         0
app                        0
device                     0
os                         0
channel                    0
click_time                 0
attributed_time    184447044
is_attributed              0
dtype: int64
-------------------
click_id      0
ip            0
app           0
device        0
os            0
channel       0
click_time    0
dtype: int64


# add time features

In [9]:
# get timestamp

import multiprocessing

def _apply_df(args):
    df, func, kwargs = args
    return df.apply(func, **kwargs)

def apply_by_multiprocessing(df, func, **kwargs):
    """Apply ``func`` to ``df`` in parallel and return the concatenated result.

    ``df`` is split into ``workers`` pieces with ``np.array_split``; every piece
    is handed to ``_apply_df`` in a process pool, and the per-piece results are
    concatenated with ``pd.concat`` in the original order.

    Parameters
    ----------
    df : pandas object
        Data to process; each piece must provide an ``apply`` method.
    func : callable
        Function passed to ``apply``. It is sent to worker processes, so it
        has to be picklable (e.g. a module-level function, not a lambda).
    **kwargs
        ``workers`` (int) sets the number of pieces and pool processes; when
        omitted it defaults to ``multiprocessing.cpu_count()``. All remaining
        keyword arguments are forwarded to ``apply``.

    Returns
    -------
    The ``pd.concat`` of the per-piece results.
    """
    workers = kwargs.pop('workers', multiprocessing.cpu_count())
    tasks = [(chunk, func, kwargs) for chunk in np.array_split(df, workers)]

    # The context manager tears the pool down even when a worker raises, so no
    # child processes are left behind. `map` is synchronous, so every result
    # has already been collected by the time the pool is terminated.
    with multiprocessing.Pool(processes=workers) as pool:
        result = pool.map(_apply_df, tasks)

    return pd.concat(list(result))

def get_timestamp(x):
    """Return the result of ``x.timestamp()`` for a datetime-like value ``x``."""
    seconds = x.timestamp()
    return seconds





# Pool size for apply_by_multiprocessing; kept as a module-level name because
# other cells may reference it.
workers = 30

# Reference point for the `timestamp` feature (seconds since the Unix epoch).
epoch = pd.Timestamp('1970-01-01')

# Add the time-derived feature columns to both frames in place.
for df in [train, test]:
    clicks = pd.to_datetime(df.click_time)
    print('get clicks')

    # Vectorized equivalent of calling `.timestamp()` on every element: pandas
    # treats naive timestamps as UTC, so this is the same value without the
    # cost of shipping the whole column to worker processes.
    df['timestamp'] = (clicks - epoch).dt.total_seconds().astype('uint32')
    print('timestamping is done')

    dt = clicks.dt

    df['year'] = dt.year.astype('uint16')
    print('year is done')

    df['month'] = dt.month.astype('uint8')
    print('month is done')

    # `dt.week` was deprecated in pandas 1.1 and removed in 2.0; use the ISO
    # calendar week when available and fall back to `dt.week` on old pandas.
    week = dt.isocalendar().week if hasattr(dt, 'isocalendar') else dt.week
    df['week'] = week.astype('uint8')
    print('week is done')

    df['day'] = dt.day.astype('uint8')
    print('day is done')

    df['hour'] = dt.hour.astype('uint8')
    print('hour is done')

    df['minute'] = dt.minute.astype('uint8')
    print('minute is done')

    df['second'] = dt.second.astype('uint8')
    print('second is done')
    print('================================================================')

get clicks
timestamping is done
year is done
month is done
week is done
day is done
hour is done
minute is done
second is done
get clicks
timestamping is done
year is done
month is done
week is done
day is done
hour is done
minute is done
second is done


In [11]:
# Label column; it appears in the training data only.
target = 'is_attributed'

# Columns written to the cleaned files: the id columns read from the CSV plus
# the time columns added to the frames earlier in the notebook.
feature_col = [
    'ip', 'app', 'device', 'os', 'channel',
    'year', 'month', 'week', 'day', 'hour',
    'timestamp', 'minute', 'second',
]

# Training output is the features followed by the label; test is features only.
train_cols = feature_col + [target]
df_train = train[train_cols]
df_test = test[feature_col]

print('saving')
for frame, filename, message in (
    (df_train, 'train_cleaned.csv', 'training done'),
    (df_test, 'test_cleaned.csv', 'testing done'),
):
    frame.to_csv(path + filename, index=False)
    print(message)

saving
training done
testing done


# processing on sample

In [12]:
train_sample = pd.read_csv(path + 'train_sample.csv', dtype=dtypes)

# Reference point for the `timestamp` feature (seconds since the Unix epoch).
epoch = pd.Timestamp('1970-01-01')

# Add the same time-derived feature columns to the sample frame in place.
for df in [train_sample]:
    clicks = pd.to_datetime(df.click_time)
    print('get clicks')

    # Vectorized equivalent of calling `.timestamp()` on every element: pandas
    # treats naive timestamps as UTC, so this is the same value without
    # spinning up a process pool.
    df['timestamp'] = (clicks - epoch).dt.total_seconds().astype('uint32')
    print('timestamping is done')

    dt = clicks.dt

    df['year'] = dt.year.astype('uint16')
    print('year is done')

    df['month'] = dt.month.astype('uint8')
    print('month is done')

    # `dt.week` was deprecated in pandas 1.1 and removed in 2.0; use the ISO
    # calendar week when available and fall back to `dt.week` on old pandas.
    week = dt.isocalendar().week if hasattr(dt, 'isocalendar') else dt.week
    df['week'] = week.astype('uint8')
    print('week is done')

    df['day'] = dt.day.astype('uint8')
    print('day is done')

    df['hour'] = dt.hour.astype('uint8')
    print('hour is done')

    df['minute'] = dt.minute.astype('uint8')
    print('minute is done')

    df['second'] = dt.second.astype('uint8')
    print('second is done')
    print('================================================================')

get clicks
timestamping is done
year is done
month is done
week is done
day is done
hour is done
minute is done
second is done


In [15]:
# Select the same columns used for the full training export and write them out.
sample_out = path + 'train_sample_cleaned.csv'
df_train_sample = train_sample[train_cols]
df_train_sample.to_csv(sample_out, index=False)