In [15]:
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
%matplotlib inline
%reload_ext autotime

In [16]:
dtypes = {'ip': np.uint32, 'app': np.uint16, 'device': np.uint8, 'os': np.uint8, 'channel': np.uint8, 'is_attributed': np.bool}
train_df = pd.read_csv('../data/raw/train_sample.csv', sep=',', dtype=dtypes, parse_dates=['click_time', 'attributed_time'])

time: 216 ms


### Extract time information
Extract day, minute, hour, second from the click_time.

In [17]:
train_df['day']  = train_df.click_time.dt.day.astype('uint8')
train_df['hour'] = train_df.click_time.dt.hour.astype('uint8')
train_df['minute'] = train_df.click_time.dt.minute.astype('uint8')
train_df['second'] = train_df.click_time.dt.second.astype('uint8')
train_df.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,hour,day,minute,second
0,87540,12,1,13,241,2017-11-07 09:30:38,NaT,False,9,7,30,38
1,105560,25,1,17,3,2017-11-07 13:40:27,NaT,False,13,7,40,27
2,101424,12,1,19,212,2017-11-07 18:05:24,NaT,False,18,7,5,24
3,94584,13,1,13,221,2017-11-07 04:58:08,NaT,False,4,7,58,8
4,68413,12,1,1,178,2017-11-09 09:00:09,NaT,False,9,9,0,9


time: 53.4 ms


In [7]:
# Number of clicks for each ip+app combination
gp = train_df[['ip','app', 'channel']].groupby(by=['ip', 'app'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_app_count'})
train_df = train_df.merge(gp, on=['ip','app'], how='left')
del gp
gc.collect()
train_df.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,hour,day,minute,second,ip_app_count_x,ip_app_count_y,ip_app_count
0,87540,12,1,13,241,2017-11-07 09:30:38,NaT,False,9,7,30,38,3,3,3
1,105560,25,1,17,3,2017-11-07 13:40:27,NaT,False,13,7,40,27,4,4,4
2,101424,12,1,19,212,2017-11-07 18:05:24,NaT,False,18,7,5,24,1,1,1
3,94584,13,1,13,221,2017-11-07 04:58:08,NaT,False,4,7,58,8,1,1,1
4,68413,12,1,1,178,2017-11-09 09:00:09,NaT,False,9,9,0,9,2,2,2


time: 178 ms


In [13]:
# Number of clicks for each ip+app+os combination
gp = train_df[['ip','app', 'os', 'channel']].groupby(by=['ip', 'app', 'os'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_app_os_count'})
train_df = train_df.merge(gp, on=['ip','app', 'os'], how='left')
del gp
gc.collect()
train_df[['ip','app', 'os', 'channel', 'ip_app_os_count']].head()

Unnamed: 0,ip,app,os,channel,ip_app_os_count
0,87540,12,13,241,2
1,105560,25,17,3,1
2,101424,12,19,212,1
3,94584,13,13,221,1
4,68413,12,1,178,1


time: 185 ms


In [34]:
# Credits: https://www.kaggle.com/nanomathias/feature-engineering-importance-testing
def generateAggregateFeatures(train_df, aggregateFeatures):
    for spec in aggregateFeatures:
        print("Generating aggregate feature {} group by {}, and aggregating {} with {}".format(spec['name'], spec['groupBy'], spec['select'], spec['agg']))
        gp = train_df[spec['groupBy'] + [spec['select']]] \
            .groupby(by=spec['groupBy'])[spec['select']] \
            .agg(spec['agg']) \
            .reset_index() \
            .rename(index=str, columns={spec['select']: spec['name']})
        train_df = train_df.merge(gp, on=spec['groupBy'], how='left')
        del gp
        gc.collect()
     
    return train_df

aggregateFeatures = [
    # Number of clickes for ip-app
    {'name': 'ip-app-count', 'groupBy': ['ip', 'app'], 'select': 'channel', 'agg': 'count'}, 
    # Number of clicks for each ip-app-os
    {'name': 'ip-app-os-count', 'groupBy': ['ip','app', 'os'], 'select': 'channel', 'agg': 'count' },
    # Number of clicks for ip-day-hour
    {'name': 'ip-day-hour-count', 'groupBy': ['ip','day','hour'], 'select': 'channel', 'agg': 'count'},
    # Number of clicks for ip-app-day-hour
    {'name': 'ip-app-day-hour-count', 'groupBy': ['ip','app','day','hour'], 'select': 'channel', 'agg': 'count'},
    # Clicks variance in day, for ip-app-channel
    {'name': 'ip-app-channel-var', 'groupBy': ['ip','app','channel'], 'select': 'day', 'agg': 'var'},
    # Clicks variance in hour, for ip-app-os
    {'name': 'ip-app-os-var', 'groupBy': ['ip','app','os'], 'select': 'hour', 'agg': 'var'},
    # Clicks variance in hour, for ip-day-channel
    {'name': 'ip-day-channel-var', 'groupBy': ['ip','day','channel'], 'select': 'hour', 'agg': 'var'},
    # Mean clicks in an hour, for ip-app-channel
    {'name': 'ip-app-channel-var', 'groupBy': ['ip','app','channel'], 'select': 'hour', 'agg': 'mean'},
    # How popular is the app in channel?
    {'name': 'app-popularity', 'groupBy': ['app'], 'select': 'channel', 'agg': 'count'},
    # How popular is the channel in app?
    {'name': 'channel-popularity', 'groupBy': ['channel'], 'select': 'app', 'agg': 'count'},
    # Average clicks on app by distinct users; is it an app they return to?
    {'name': 'avg-clicks-on-app', 'groupBy': ['app'], 'select': 'ip', 'agg': lambda x: float(len(x)) / len(x.unique())}
]
train_df = generateAggregateFeatures(train_df, aggregateFeatures)
train_df.head()

Generating aggregate feature ip-app-count group by ['ip', 'app'], and aggregating channel with count
Generating aggregate feature ip-app-os-count group by ['ip', 'app', 'os'], and aggregating channel with count
Generating aggregate feature ip-day-hour-count group by ['ip', 'day', 'hour'], and aggregating channel with count
Generating aggregate feature ip-app-day-hour-count group by ['ip', 'app', 'day', 'hour'], and aggregating channel with count
Generating aggregate feature ip-app-channel-var group by ['ip', 'app', 'channel'], and aggregating day with var
Generating aggregate feature ip-app-os-var group by ['ip', 'app', 'os'], and aggregating hour with var
Generating aggregate feature ip-day-channel-var group by ['ip', 'day', 'channel'], and aggregating hour with var
Generating aggregate feature ip-app-channel-var group by ['ip', 'app', 'channel'], and aggregating hour with mean
Generating aggregate feature app-popularity group by ['app'], and aggregating channel with count
Generating 

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,hour,day,...,ip-app-os-count,ip-day-hour-count,ip-app-day-hour-count,ip-app-channel-var_x,ip-app-os-var,ip-day-channel-var,ip-app-channel-var_y,app-popularity,channel-popularity,avg-clicks-on-app
0,87540,12,1,13,241,2017-11-07 09:30:38,NaT,False,9,7,...,2,1,1,,24.5,,9.0,13198,238,1.436126
1,105560,25,1,17,3,2017-11-07 13:40:27,NaT,False,13,7,...,1,4,1,0.25,,35.066667,11.5,804,3618,1.086486
2,101424,12,1,19,212,2017-11-07 18:05:24,NaT,False,18,7,...,1,1,1,,,,18.0,13198,635,1.436126
3,94584,13,1,13,221,2017-11-07 04:58:08,NaT,False,4,7,...,1,1,1,,,,4.0,2422,3960,1.111519
4,68413,12,1,1,178,2017-11-09 09:00:09,NaT,False,9,9,...,1,1,1,,,,9.0,13198,2936,1.436126


time: 2.32 s


### Time till next click
How long it takes for a given ip-app-channel before they perform the next click.

In [41]:
def generateNextClickFeatures(train_df, nextClickAggregateFeatures):
    for spec in nextClickAggregateFeatures:
        feature_name = '{}-next-click'.format('_'.join(spec['groupBy']))   
        train_df[feature_name] = train_df[spec['groupBy'] + ['click_time']].groupby(['ip']).click_time.transform(lambda x: x.diff().shift(-1)).dt.seconds
    return train_df


nextClickAggregateFeatures = [
    {'groupBy': ['ip']},
    {'groupBy': ['ip', 'app']},
    {'groupBy': ['ip', 'channel']},
    {'groupBy': ['ip', 'os']}
]
train_df = generateNextClickFeatures(train_df, nextClickAggregateFeatures)
train_df.head()


Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,hour,day,...,ip-day-channel-var,ip-app-channel-var_y,app-popularity,channel-popularity,avg-clicks-on-app,ip-nextclick,ip-next-click,ip_app-next-click,ip_channel-next-click,ip_os-next-click
0,87540,12,1,13,241,2017-11-07 09:30:38,NaT,False,9,7,...,,9.0,13198,238,1.436126,61540.0,61540.0,61540.0,61540.0,61540.0
1,105560,25,1,17,3,2017-11-07 13:40:27,NaT,False,13,7,...,35.066667,11.5,804,3618,1.086486,12747.0,12747.0,12747.0,12747.0,12747.0
2,101424,12,1,19,212,2017-11-07 18:05:24,NaT,False,18,7,...,,18.0,13198,635,1.436126,57936.0,57936.0,57936.0,57936.0,57936.0
3,94584,13,1,13,221,2017-11-07 04:58:08,NaT,False,4,7,...,,4.0,2422,3960,1.111519,19577.0,19577.0,19577.0,19577.0,19577.0
4,68413,12,1,1,178,2017-11-09 09:00:09,NaT,False,9,9,...,,9.0,13198,2936,1.436126,51097.0,51097.0,51097.0,51097.0,51097.0


time: 57.3 s
