In [11]:
import pandas as pd
import time
import numpy as np
from sklearn.cross_validation import train_test_split
import lightgbm as lgb
import gc
import matplotlib.pyplot as plt
import os



In [138]:
# Run a full garbage-collection pass before the heavy loading starts.
# As the last expression of the cell, gc.collect()'s return value (the number
# of unreachable objects it found) is what the notebook displays below.
gc.collect()

40

In [134]:
# Debug switch: any truthy value makes the later cells use a small slice of
# the train/test files instead of the full data. 0 means a full run.
debug=0
if debug:
    print('*** debug parameter set: this is a test run for debugging purposes ***')

In [139]:
# Row count used as a reference for the training file (not read again in the
# cells shown here).
nrows = 184903891 - 1

# frm      : first row offset of the training slice to read
# nchunk   : number of training rows to read starting at frm
# val_size : number of rows reserved for validation
if debug:
    frm, nchunk, val_size = 0, 100000, 10000
else:
    frm, nchunk, val_size = 144903891, 40000000, 2500000

# Exclusive upper bound of the training slice.
to = frm + nchunk

In [140]:
# Compact column dtypes handed to pd.read_csv for both the train and test files.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
    click_id='uint32',
)

In [141]:
print('loading train data...', frm, to)

# Read the training slice lazily in one-million-row chunks, then stitch the
# chunks into a single frame with a fresh 0..n-1 index.
train_df_tmp = pd.read_csv(
    "data//train.csv.zip",
    usecols=['ip','app','device','os', 'channel', 'click_time', 'is_attributed'],
    dtype=dtypes,
    parse_dates=['click_time'],
    skiprows=range(1, frm),  # keep line 0 (the header), skip lines 1..frm-1
    nrows=to - frm,
    chunksize=10**6,
)

train_df = pd.concat(train_df_tmp, ignore_index=True)

loading train data... 144903891 184903891


In [143]:
# The chunk reader has been fully consumed by pd.concat; drop the reference
# and collect garbage right away.
del train_df_tmp
gc.collect()

465

In [147]:
print('loading test data...')

# A debug run reads only the first 100000 test rows; nrows=None (the pandas
# default) reads the whole file.
test_df = pd.read_csv(
    "data/test.csv.zip",
    nrows=100000 if debug else None,
    parse_dates=['click_time'],
    dtype=dtypes,
    usecols=['ip','app','device','os', 'channel', 'click_time', 'click_id'],
)

loading test data...


In [148]:
# Number of rows that came from the training file; rows at position
# len_train and beyond in the stacked frame come from the test file.
len_train = len(train_df)

# Stack test rows under the train rows so every feature below is computed over
# both sets at once. DataFrame.append was removed in pandas 2.0; pd.concat
# (already used when loading the train chunks) is the supported equivalent and,
# like append, keeps each frame's original index labels.
train_df = pd.concat([train_df, test_df])

In [149]:
# test_df's rows now live inside train_df; release the standalone copy.
del test_df
gc.collect()

219

In [150]:
print('Extracting new features...')

# Convert click_time once and derive both calendar features from the result.
click_ts = pd.to_datetime(train_df.click_time)
train_df['hour'] = click_ts.dt.hour.astype('uint8')
train_df['day'] = click_ts.dt.day.astype('uint8')
del click_ts

gc.collect()

Extracting new features...


14

### Aggregate features (per-group nunique / cumcount)

In [152]:
# Build one aggregate feature per entry of `feature_specs`.
#
# Each spec is (selcols, QQ):
#   * selcols[:-1] are the group-by keys and selcols[-1] is the aggregated column;
#   * QQ indexes into `agg` and selects the aggregate to apply.
# 'cumcount' yields one value per row and is assigned directly as a column;
# every other aggregate yields one value per group and is merged back on the keys.
#
# With save_tmp set, each result is cached as tmp/<filename> and reused on the
# next run. The cache is looked up at the same path it is written to.
save_tmp = 0
agg = ['count', 'mean', 'var', 'skew', 'nunique', 'cumcount']

feature_specs = [
    (['ip', 'channel'], 4),
    (['ip', 'device', 'os', 'app'], 5),
    (['ip', 'day', 'hour'], 4),
    (['ip', 'app'], 4),
    (['ip', 'app', 'os'], 4),
    (['ip', 'device'], 4),
    (['app', 'channel'], 4),
    (['ip', 'os'], 5),
    (['ip', 'device', 'os', 'app'], 4),
]
naddfeat = len(feature_specs)

# Aggregates whose output column gets a descriptive prefix; the remaining ones
# (mean / var / skew) are named 'X<i>' after their position in feature_specs.
named_prefix = {0: 'cnt_', 4: 'nuniq_', 5: 'cumcnt_'}

for i in range(0, naddfeat):
    selcols, QQ = feature_specs[i]
    group_cols, target_col = selcols[:-1], selcols[-1]
    # The format arguments must be a tuple: '%d' % [a, b] raises TypeError.
    print('selcols', selcols, '-', agg[QQ], '[%d of %d]' % (i + 1, naddfeat))

    prefix = named_prefix.get(QQ)
    out_col = prefix + '_'.join(selcols) if prefix is not None else 'X' + str(i)

    filename = 'agg_%s.csv.gz' % (agg[QQ] + '_' + '_'.join(selcols))
    cache_path = 'tmp/' + filename

    if os.path.exists(cache_path):
        if QQ == 5:
            # Cached cumcount is a single header-less column with one value per
            # row. Assign positionally so a cache of the wrong length fails
            # loudly instead of being index-aligned into NaNs.
            gp = pd.read_csv(cache_path, header=None)
            train_df[out_col] = gp.iloc[:, 0].values
        else:
            gp = pd.read_csv(cache_path)
            train_df = train_df.merge(gp, on=group_cols, how='left')
    else:
        grouped = train_df[selcols].groupby(by=group_cols)[target_col]
        if QQ == 5:
            gp = grouped.cumcount()
            train_df[out_col] = gp.values
        else:
            # agg[QQ] is the name of the SeriesGroupBy method to call
            # (count / mean / var / skew / nunique).
            gp = getattr(grouped, agg[QQ])()\
                .reset_index()\
                .rename(columns={target_col: out_col})
            train_df = train_df.merge(gp, on=group_cols, how='left')

        if save_tmp:
            os.makedirs('tmp', exist_ok=True)
            if QQ == 5:
                # Written without a header so it matches the header=None read above.
                gp.to_frame().to_csv(cache_path, index=False, header=False, compression='gzip')
            else:
                gp.to_csv(cache_path, index=False, compression='gzip')

selcols ['ip', 'channel'] - nunique
selcols ['ip', 'device', 'os', 'app'] - cumcount
selcols ['ip', 'day', 'hour'] - nunique
selcols ['ip', 'app'] - nunique
selcols ['ip', 'app', 'os'] - nunique
selcols ['ip', 'device'] - nunique
selcols ['app', 'channel'] - nunique
selcols ['ip', 'os'] - cumcount
selcols ['ip', 'device', 'os', 'app'] - nunique


In [153]:
# Drop the last aggregate left over from the feature loop.
del gp
gc.collect()  

235

In [176]:
print('grouping by ip-day-hour combination...')

# ip_tcount: count of non-null channel values per (ip, day, hour) group,
# joined back onto every row of that group.
gp = (
    train_df[['ip','day','hour','channel']]
    .groupby(by=['ip','day','hour'])[['channel']]
    .count()
    .reset_index()
    .rename(index=str, columns={'channel': 'ip_tcount'})
)
train_df = train_df.merge(gp, on=['ip','day','hour'], how='left')

del gp
gc.collect()

grouping by ip-day-hour combination...


61

In [177]:
print('grouping by ip-app combination...')

# ip_app_count: count of non-null channel values per (ip, app) group.
gp = (
    train_df[['ip', 'app', 'channel']]
    .groupby(by=['ip', 'app'])[['channel']]
    .count()
    .reset_index()
    .rename(index=str, columns={'channel': 'ip_app_count'})
)
train_df = train_df.merge(gp, on=['ip','app'], how='left')

del gp
gc.collect()

grouping by ip-app combination...


101

In [178]:
print('grouping by ip-app-os combination...')

# ip_app_os_count: count of non-null channel values per (ip, app, os) group.
gp = (
    train_df[['ip','app', 'os', 'channel']]
    .groupby(by=['ip', 'app', 'os'])[['channel']]
    .count()
    .reset_index()
    .rename(index=str, columns={'channel': 'ip_app_os_count'})
)
train_df = train_df.merge(gp, on=['ip','app', 'os'], how='left')

del gp
gc.collect()

grouping by ip-app-os combination...


117

In [179]:
# Adding features with var and mean hour (inspired from nuhsikander's script)
print('grouping by : ip_day_chl_var_hour')
gp = train_df[['ip','day','hour','channel']].groupby(by=['ip','day','channel'])[['hour']]\
                                            .var()\
                                            .reset_index()\
                                            .rename(index=str, columns={'hour': 'ip_day_chl_var_hour'})
train_df = train_df.merge(gp, on=['ip','day','channel'], how='left')
del gp
gc.collect()

grouping by : ip_day_chl_var_hour


117

In [180]:
print('grouping by : ip_app_os_var_hour')

# ip_app_os_var: variance of hour within each (ip, app, os) group.
gp = (
    train_df[['ip','app', 'os', 'hour']]
    .groupby(by=['ip', 'app', 'os'])[['hour']]
    .var()
    .reset_index()
    .rename(index=str, columns={'hour': 'ip_app_os_var'})
)
train_df = train_df.merge(gp, on=['ip','app', 'os'], how='left')

del gp
gc.collect()

grouping by : ip_app_os_var_hour


116

In [181]:
print('grouping by : ip_app_channel_var_day')

# ip_app_channel_var_day: variance of day within each (ip, app, channel) group.
gp = (
    train_df[['ip','app', 'channel', 'day']]
    .groupby(by=['ip', 'app', 'channel'])[['day']]
    .var()
    .reset_index()
    .rename(index=str, columns={'day': 'ip_app_channel_var_day'})
)
train_df = train_df.merge(gp, on=['ip','app', 'channel'], how='left')

del gp
gc.collect()

grouping by : ip_app_channel_var_day


116

In [182]:
print('grouping by : ip_app_chl_mean_hour')

# ip_app_channel_mean_hour: mean of hour within each (ip, app, channel) group.
gp = (
    train_df[['ip','app', 'channel','hour']]
    .groupby(by=['ip', 'app', 'channel'])[['hour']]
    .mean()
    .reset_index()
    .rename(index=str, columns={'hour': 'ip_app_channel_mean_hour'})
)

print("merging...")
train_df = train_df.merge(gp, on=['ip','app', 'channel'], how='left')

del gp
gc.collect()

grouping by : ip_app_chl_mean_hour
merging...


116

In [183]:
print("vars and data type: ")
train_df.info()

# Shrink the three count columns to save memory.
# An unconditional astype('uint16') silently wraps any count above 65535, so
# uint16 is used only when the column's maximum fits; otherwise the column
# falls back to uint32.
uint16_max = np.iinfo(np.uint16).max
for count_col in ['ip_tcount', 'ip_app_count', 'ip_app_os_count']:
    target_dtype = 'uint16' if train_df[count_col].max() <= uint16_max else 'uint32'
    train_df[count_col] = train_df[count_col].astype(target_dtype)

vars and data type: 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 58790469 entries, 0 to 58790468
Data columns (total 26 columns):
app                         uint16
channel                     uint16
click_id                    float64
click_time                  datetime64[ns]
device                      uint16
ip                          uint32
is_attributed               float64
os                          uint16
hour                        uint8
day                         uint8
nuniq_ip_channel            int64
cumcnt_ip_device_os_app     int64
nuniq_ip_day_hour           int64
nuniq_ip_app                int64
nuniq_ip_app_os             int64
nuniq_ip_device             int64
nuniq_app_channel           int64
cumcnt_ip_os                int64
nuniq_ip_device_os_app      int64
ip_tcount                   int64
ip_app_count                int64
ip_app_os_count             int64
ip_tchan_count              float64
ip_app_os_var               float64
ip_app_channel_var_day    

In [192]:
# Every column from position 10 onward is exported; the first ten positions
# are skipped.
to_write = train_df.columns[10:].tolist()
train_df_write = train_df.loc[:, to_write]

In [193]:
# Write the selected feature columns as a gzip-compressed CSV, three million
# rows at a time, with floats formatted to eight decimal places.
train_df_write.to_csv(
    'tmp/agg_features.csv.gz',
    index=False,
    compression='gzip',
    chunksize=3*10**6,
    float_format='%.8f',
)