In [3]:
!ls data/fraud-detection

models	test.csv  tmp  train.csv  train_sample.csv


In [30]:
"""
A non-blending lightGBM model that incorporates portions and ideas from various public kernels.
"""
DEBUG = True
WHERE = 'kaggle'
FILENO = 4
NCHUNK = 32000000
OFFSET = 75000000
VAL_RUN = False

MISSING32 = 999999999
MISSING8 = 255
PUBLIC_CUTOFF = 4032690

if WHERE=='kaggle':
	inpath = 'data/fraud-detection/'
	pickle_path ='data/fraud-detection/'
	suffix = ''
	outpath = ''
	savepath = ''
	oofpath = ''
	cores = 4
elif WHERE=='gcloud':
	inpath = '../.kaggle/competitions/talkingdata-adtracking-fraud-detection/'
	pickle_path = '../data/'
	suffix = '.zip'
	outpath = '../sub/'
	oofpath = '../oof/'
	savepath = '../data/'
	cores = 7

import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import gc
import matplotlib.pyplot as plt
import os

def do_rollingstd(df, group_cols, agg_name, agg_type='float32', window_size=5, show_max=False, show_agg=True):
    # suggest trying ['app', 'channel', 'day', 'hour'] to start
    if show_agg:
        print( "Doing rolling std dev of ", group_cols , '...')
    c_gp = df[group_cols].groupby(group_cols).size().rename('gp_count').to_frame().reset_index()
    df = df.merge(c_gp, on=group_cols, how='left')
    del c_gp
    df['ma_gp'] = df[group_cols+['gp_count']].groupby(group_cols)['gp_count'].unique().\
    rolling(window_size).mean().rename('ma_gp').to_frame().reset_index(drop=True)
    df['res'] = df['gp_count'] - df['ma_gp'] #might need to use sub method here to deal with subtraction of NAs
    df.drop(['ma_gp', 'gp_count'], axis=1, inplace=True)
    gc.collect()
    df[agg_name] = df[group_cols+['res']].groupby(group_cols)['res'].rolling(window_size).\
    std().rename(agg_name).to_frame().reset_index(drop=True)
    df[agg_name].fillna(-999, inplace=True)
    df[agg_name] = df[agg_name].astype(agg_type)
    print(df[agg_name].notnull().mean())
    print(df[agg_name].iloc[70000:70010])
    gc.collect()
    return( df )
    
def do_count( df, group_cols, agg_name, agg_type='uint32', show_max=False, show_agg=True ):
    if show_agg:
        print( "Aggregating by ", group_cols , '...' )
    gp = df[group_cols].groupby(group_cols).size().rename(agg_name).to_frame().reset_index()
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return( df )

def do_countuniq( df, group_cols, counted, agg_name, agg_type='uint32', show_max=False, show_agg=True ):
    if show_agg:
        print( "Counting unqiue ", counted, " by ", group_cols , '...' )
    gp = df[group_cols+[counted]].groupby(group_cols)[counted].nunique().reset_index().rename(columns={counted:agg_name})
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return( df )
    
def do_cumcount( df, group_cols, counted, agg_name, agg_type='uint32', show_max=False, show_agg=True ):
    if show_agg:
        print( "Cumulative count by ", group_cols , '...' )
    gp = df[group_cols+[counted]].groupby(group_cols)[counted].cumcount()
    df[agg_name]=gp.values
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return( df )

def do_mean( df, group_cols, counted, agg_name, agg_type='float32', show_max=False, show_agg=True ):
    if show_agg:
        print( "Calculating mean of ", counted, " by ", group_cols , '...' )
    gp = df[group_cols+[counted]].groupby(group_cols)[counted].mean().reset_index().rename(columns={counted:agg_name})
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return( df )

def do_var( df, group_cols, counted, agg_name, agg_type='float32', show_max=False, show_agg=True ):
    if show_agg:
        print( "Calculating variance of ", counted, " by ", group_cols , '...' )
    gp = df[group_cols+[counted]].groupby(group_cols)[counted].var().reset_index().rename(columns={counted:agg_name})
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return( df )

debug = DEBUG
if debug:
    print('*** debug parameter set: this is a test run for debugging purposes ***')

    
if VAL_RUN:
    nrows=122071522
    outpath = oofpath
else:
    nrows=184903890
nchunk=NCHUNK
val_size=2500000
frm=nrows-OFFSET
if debug:
    frm=0
    nchunk=100000
    val_size=10000
to=frm+nchunk
fileno = FILENO

dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        'click_id'      : 'uint32',
        }

if VAL_RUN:
    print('loading train data...',frm,to)
    train_df = pd.read_pickle( pickle_path+"training.pkl.gz" )[frm:to]
    train_df['click_time'] = pd.to_datetime( train_df.click_time )
    print('loading test data...')
    if debug:
        public_cutoff = 10000
        test_df = pd.read_pickle( pickle_path+"validation.pkl.gz" )[:30000]
        test_df['click_time'] = pd.to_datetime( test_df.click_time )
        y_test = test_df['is_attributed'].values
        test_df.drop(['is_attributed'],axis=1,inplace=True)
    else:
        public_cutoff = PUBLIC_CUTOFF
        test_df = pd.read_pickle( pickle_path+"validation.pkl.gz" )
        test_df['click_time'] = pd.to_datetime( test_df.click_time )
        y_test = test_df['is_attributed'].values
        test_df.drop(['is_attributed'],axis=1,inplace=True)
else:
    print('loading train data...',frm,to)
    train_df = pd.read_csv(inpath+"train.csv", parse_dates=['click_time'], skiprows=range(1,frm), nrows=to-frm, dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'is_attributed'])
    print('loading test data...')
    if debug:
        test_df = pd.read_csv(inpath+"test.csv", nrows=100000, parse_dates=['click_time'], dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'click_id'])
    else:
        test_df = pd.read_csv(inpath+"test.csv", parse_dates=['click_time'], dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'click_id'])
    train_df['click_id'] = MISSING32
    train_df['click_id'] = train_df.click_id.astype('uint32')


len_train = len(train_df)
test_df['is_attributed'] = MISSING8
test_df['is_attributed'] = test_df.is_attributed.astype('uint8')
train_df=train_df.append(test_df)

del test_df
gc.collect()

print('Extracting new features...')
train_df['hour'] = pd.to_datetime(train_df.click_time).dt.hour.astype('uint8')
train_df['day'] = pd.to_datetime(train_df.click_time).dt.day.astype('uint8')

print('Extracting aggregation features...')

train_df = do_rollingstd(train_df, ['day', 'hour'], 'rolling', window_size=5, show_max=False, show_agg=True); gc.collect()
train_df = do_cumcount( train_df, ['ip', 'device', 'os'], 'app', 'X1', show_max=True ); gc.collect()
#train_df = do_cumcount( train_df, ['ip'], 'os', 'X7', show_max=True ); gc.collect()
train_df = do_countuniq( train_df, ['ip'], 'channel', 'X0', 'uint8', show_max=True ); gc.collect()
#train_df = do_countuniq( train_df, ['ip', 'day'], 'hour', 'X2', 'uint8', show_max=True ); gc.collect()
train_df = do_countuniq( train_df, ['ip'], 'app', 'X3', 'uint8', show_max=True ); gc.collect()
#train_df = do_countuniq( train_df, ['ip', 'app'], 'os', 'X4', 'uint8', show_max=True ); gc.collect()
#train_df = do_countuniq( train_df, ['ip'], 'device', 'X5', 'uint16', show_max=True ); gc.collect()
train_df = do_countuniq( train_df, ['app'], 'channel', 'X6', show_max=True ); gc.collect()
train_df = do_countuniq( train_df, ['ip', 'device', 'os'], 'app', 'X8', show_max=True ); gc.collect()
train_df = do_count( train_df, ['ip', 'day', 'hour'], 'ip_tcount', show_max=True ); gc.collect()
train_df = do_count( train_df, ['ip', 'app'], 'ip_app_count', show_max=True ); gc.collect()
train_df = do_count( train_df, ['ip', 'app', 'os'], 'ip_app_os_count', 'uint16', show_max=True ); gc.collect()
#train_df = do_var( train_df, ['ip', 'day', 'channel'], 'hour', 'ip_tchan_count', show_max=True ); gc.collect()
train_df = do_var( train_df, ['ip', 'app', 'os'], 'hour', 'ip_app_os_var', show_max=True ); gc.collect()
#train_df = do_var( train_df, ['ip', 'app', 'channel'], 'day', 'ip_app_channel_var_day', show_max=True ); gc.collect()
#train_df = do_mean( train_df, ['ip', 'app', 'channel'], 'hour', 'ip_app_channel_mean_hour', show_max=True ); gc.collect()

print('Doing nextClick...')
predictors=[]
new_feature = 'nextClick'
D=2**26
train_df['category'] = (train_df['ip'].astype(str) + "_" + train_df['app'].astype(str) + "_" + train_df['device'].astype(str) \
        + "_" + train_df['os'].astype(str)).apply(hash) % D
click_buffer= np.full(D, 3000000000, dtype=np.uint32)
train_df['epochtime']= train_df['click_time'].astype(np.int64) // 10 ** 9
next_clicks= []
for category, t in zip(reversed(train_df['category'].values), reversed(train_df['epochtime'].values)):
    next_clicks.append(click_buffer[category]-t)
    click_buffer[category]= t
del(click_buffer)
QQ= list(reversed(next_clicks))
train_df.drop(['epochtime','category','click_time'], axis=1, inplace=True)
train_df[new_feature] = pd.Series(QQ).astype('float32')
predictors.append(new_feature)


*** debug parameter set: this is a test run for debugging purposes ***
loading train data... 0 100000
loading test data...
Extracting new features...
Extracting aggregation features...
Doing rolling std dev of  ['app', 'channel', 'day', 'hour'] ...
1.0
70000   -999.0
70001   -999.0
70002   -999.0
70003   -999.0
70004   -999.0
70005   -999.0
70006   -999.0
70007   -999.0
70008   -999.0
70009   -999.0
Name: rolling, dtype: float32
Cumulative count by  ['ip', 'device', 'os'] ...
X1 max value =  231
Counting unqiue  channel  by  ['ip'] ...
X0 max value =  80
Counting unqiue  app  by  ['ip'] ...
X3 max value =  34
Counting unqiue  channel  by  ['app'] ...
X6 max value =  32
Counting unqiue  app  by  ['ip', 'device', 'os'] ...
X8 max value =  22
Aggregating by  ['ip', 'day', 'hour'] ...
ip_tcount max value =  610
Aggregating by  ['ip', 'app'] ...
ip_app_count max value =  212
Aggregating by  ['ip', 'app', 'os'] ...
ip_app_os_count max value =  62
Calculating variance of  hour  by  ['ip', 'ap

In [36]:
hr =[1,1,1,2,2,2,2]
id = ['a', 'b', 'a', 'b', 'b', 'a', 'b']
for i in zip(id, hr):
    print(i)
df = pd.DataFrame(zip(id, hr), columns = ['id', 'hr'])

('a', 1)
('b', 1)
('a', 1)
('b', 2)
('b', 2)
('a', 2)
('b', 2)


In [42]:
data = np.array([['id','hr'],
                ['a', 1],
                ['b', 1],
                ['a', 1],
                ['b', 2],
                ['b', 2],
                ['a', 2],
                ['b', 2]
                ])
                
print(pd.DataFrame(data=data[:,1:],
                  index=data[:,0],
                  ))

  index   0
0    id  hr
1     a   1
2     b   1
3     a   1
4     b   2
5     b   2
6     a   2
7     b   2


In [44]:
from collections import OrderedDict
data = OrderedDict([ ('id', ['a', 'b', 'a', 'b', 'b', 'a', 'b']),
          ('hr', [1, 1, 1, 2, 2, 2, 2])
           ] )
df = pd.DataFrame.from_dict(data)

In [45]:
df

Unnamed: 0,id,hr
0,a,1
1,b,1
2,a,1
3,b,2
4,b,2
5,a,2
6,b,2


In [60]:
def do_rollingstd(df, group_cols, agg_name, agg_type='float32', window_size=5, show_max=False, show_agg=True):
    # suggest trying ['app', 'channel', 'day', 'hour'] to start
    if show_agg:
        print( "Doing rolling std dev of ", group_cols , '...')
    c_gp = df[group_cols].groupby(group_cols).size().rename('gp_count').to_frame().reset_index()
    df = df.merge(c_gp, on=group_cols, how='left')
    del c_gp
    df['ma_gp'] = df[group_cols+['gp_count']].groupby(group_cols)['gp_count'].mean().\
    rolling(window_size).sum().rename('ma_gp').to_frame().reset_index(drop=True)
    df['res'] = df['gp_count'] - df['ma_gp'] #might need to use sub method here to deal with subtraction of NAs
    #df.drop(['ma_gp', 'gp_count'], axis=1, inplace=True)
    gc.collect()
    df[agg_name] = df[group_cols+['res']].groupby(group_cols)['res'].rolling(window_size).\
    std().rename(agg_name).to_frame().reset_index(drop=True)
    df[agg_name].fillna(-999, inplace=True)
    df[agg_name] = df[agg_name].astype(agg_type)
    print(df[agg_name].notnull().mean())
    print(df[agg_name].iloc[70000:70010])
    gc.collect()
    return( df )

In [61]:
do_rollingstd(df, ['id', 'hr'], 'rolling', agg_type='float32', window_size=2)

Doing rolling std dev of  ['id', 'hr'] ...
1.0
Series([], Name: rolling, dtype: float32)


Unnamed: 0,id,hr,gp_count,ma_gp,res,rolling
0,a,1,2,,,-999.0
1,b,1,1,3.0,-2.0,-999.0
2,a,1,2,2.0,0.0,-999.0
3,b,2,3,4.0,-1.0,-999.0
4,b,2,3,,,-999.0
5,a,2,1,,,-999.0
6,b,2,3,,,-999.0


In [62]:
group_cols=['id', 'hr']
agg_name = 'rolling'
agg_type='float32'
window_size=2

In [63]:
c_gp = df[group_cols].groupby(group_cols).size().rename('gp_count').to_frame().reset_index()
df = df.merge(c_gp, on=group_cols, how='left')
del c_gp
df

Unnamed: 0,id,hr,gp_count
0,a,1,2
1,b,1,1
2,a,1,2
3,b,2,3
4,b,2,3
5,a,2,1
6,b,2,3


In [82]:
df[group_cols+['gp_count']].drop_duplicates()

Unnamed: 0,id,hr,gp_count
0,a,1,2
1,b,1,1
3,b,2,3
5,a,2,1


In [89]:
df2 = df[group_cols+['gp_count']].drop_duplicates()
df2['ma']=df2.groupby('id')['gp_count'].rolling(2).sum().reset_index(0,drop=True)
#.groupby(group_cols)['gp_count'].unique()#.reset_index(drop=True)#.\
    #rolling(window_size).sum().rename('ma_gp').to_frame().reset_index(drop=True)

In [91]:
df.merge(df2, on=group_cols, how='left')

Unnamed: 0,id,hr,gp_count_x,gp_count_y,ma
0,a,1,2,2,
1,b,1,1,1,
2,a,1,2,2,
3,b,2,3,3,4.0
4,b,2,3,3,4.0
5,a,2,1,1,3.0
6,b,2,3,3,4.0


In [140]:
from collections import OrderedDict
data = OrderedDict([ ('id', ['a', 'b', 'a', 'b', 'b', 'a', 'b', 'a', 'b']),
          ('hr', [1, 1, 1, 2, 2, 2, 2, 3, 3])
           ] )
df = pd.DataFrame.from_dict(data)

In [141]:
df

Unnamed: 0,id,hr
0,a,1
1,b,1
2,a,1
3,b,2
4,b,2
5,a,2
6,b,2
7,a,3
8,b,3


In [142]:
group_cols=['id', 'hr']
agg_name = 'rolling'
agg_type='float32'
window_size=2

### THIS WORKS

In [143]:
c_gp = df[group_cols].groupby(group_cols).size().rename('gp_count').to_frame().reset_index()
df = df.merge(c_gp, on=group_cols, how='left')
del c_gp
gc.collect()
df2 = df[group_cols+['gp_count']].drop_duplicates()
df2['ma']=df2.groupby('id')['gp_count'].rolling(2).mean().reset_index(0,drop=True)
df2['res'] = df2['gp_count'] - df2['ma']
df2.drop(['gp_count', 'res'], axis=1, inplace=True)
df2[agg_name]=df2.groupby('id')['ma'].rolling(2).std().reset_index(0,drop=True)
df2.drop('ma', axis=1, inplace=True)
df = df.merge(df2, on=group_cols, how='left')
del df2
gc.collect()
df

Unnamed: 0,id,hr,gp_count,rolling
0,a,1,2,
1,b,1,1,
2,a,1,2,
3,b,2,3,
4,b,2,3,
5,a,2,1,
6,b,2,3,
7,a,3,1,0.353553
8,b,3,1,0.0


In [162]:
"""
A non-blending lightGBM model that incorporates portions and ideas from various public kernels.
"""
DEBUG = True
WHERE = 'kaggle'
FILENO = 4
NCHUNK = 32000000
OFFSET = 75000000
VAL_RUN = False

MISSING32 = 999999999
MISSING8 = 255
PUBLIC_CUTOFF = 4032690

if WHERE=='kaggle':
	inpath = 'data/fraud-detection/'
	pickle_path ='data/fraud-detection/'
	suffix = ''
	outpath = ''
	savepath = ''
	oofpath = ''
	cores = 4
elif WHERE=='gcloud':
	inpath = '../.kaggle/competitions/talkingdata-adtracking-fraud-detection/'
	pickle_path = '../data/'
	suffix = '.zip'
	outpath = '../sub/'
	oofpath = '../oof/'
	savepath = '../data/'
	cores = 7

import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import gc
import matplotlib.pyplot as plt
import os

def do_rollingstd(df, group_cols, agg_name, agg_type='float32', window_size=5, show_max=False, show_agg=True):
    # suggest trying ['app', 'channel', 'day', 'hour'] to start
    if show_agg:
        print( "Doing rolling std dev of ", group_cols , '...')
    c_gp = df[group_cols].groupby(group_cols).size().rename('gp_count').to_frame().reset_index()
    df = df.merge(c_gp, on=group_cols, how='left')
    del c_gp
    gc.collect()
    df2 = df[group_cols+['gp_count']].drop_duplicates()
    df2['ma']=df2.groupby('channel')['gp_count'].rolling(2).mean().reset_index(0,drop=True)
    df2['res'] = df2['gp_count'] - df2['ma']
    df2.drop(['gp_count', 'res'], axis=1, inplace=True)
    df2[agg_name]=df2.groupby('channel')['ma'].rolling(2).std().reset_index(0,drop=True)
    df2.drop('ma', axis=1, inplace=True)
    df = df.merge(df2, on=group_cols, how='left')
    del df2
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    gc.collect()
    #print(df[agg_name].count())
    #print(df[[agg_name]+['channel', 'hour']][df['channel']==379])#.iloc[60000:60010])
    df[agg_name].fillna(-999, inplace=True)
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return(df)
    

debug = DEBUG
if debug:
    print('*** debug parameter set: this is a test run for debugging purposes ***')

    
if VAL_RUN:
    nrows=122071522
    outpath = oofpath
else:
    nrows=184903890
nchunk=NCHUNK
val_size=2500000
frm=nrows-OFFSET
if debug:
    frm=0
    nchunk=100000
    val_size=10000
to=frm+nchunk
fileno = FILENO

dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        'click_id'      : 'uint32',
        }

if VAL_RUN:
    print('loading train data...',frm,to)
    train_df = pd.read_pickle( pickle_path+"training.pkl.gz" )[frm:to]
    train_df['click_time'] = pd.to_datetime( train_df.click_time )
    print('loading test data...')
    if debug:
        public_cutoff = 10000
        test_df = pd.read_pickle( pickle_path+"validation.pkl.gz" )[:30000]
        test_df['click_time'] = pd.to_datetime( test_df.click_time )
        y_test = test_df['is_attributed'].values
        test_df.drop(['is_attributed'],axis=1,inplace=True)
    else:
        public_cutoff = PUBLIC_CUTOFF
        test_df = pd.read_pickle( pickle_path+"validation.pkl.gz" )
        test_df['click_time'] = pd.to_datetime( test_df.click_time )
        y_test = test_df['is_attributed'].values
        test_df.drop(['is_attributed'],axis=1,inplace=True)
else:
    print('loading train data...',frm,to)
    train_df = pd.read_csv(inpath+"train.csv", parse_dates=['click_time'], skiprows=range(1,frm), nrows=to-frm, dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'is_attributed'])
    print('loading test data...')
    if debug:
        test_df = pd.read_csv(inpath+"test.csv", nrows=100000, parse_dates=['click_time'], dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'click_id'])
    else:
        test_df = pd.read_csv(inpath+"test.csv", parse_dates=['click_time'], dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'click_id'])
    train_df['click_id'] = MISSING32
    train_df['click_id'] = train_df.click_id.astype('uint32')


len_train = len(train_df)
test_df['is_attributed'] = MISSING8
test_df['is_attributed'] = test_df.is_attributed.astype('uint8')
train_df=train_df.append(test_df)

del test_df
gc.collect()

print('Extracting new features...')
train_df['hour'] = pd.to_datetime(train_df.click_time).dt.hour.astype('uint8')
train_df['day'] = pd.to_datetime(train_df.click_time).dt.day.astype('uint8')

print('Extracting aggregation features...')

train_df = do_rollingstd(train_df, ['channel', 'hour'], 'rolling', window_size=3, show_max=True, show_agg=True); gc.collect()
train_df = do_cumcount( train_df, ['ip', 'device', 'os'], 'app', 'X1', show_max=True ); gc.collect()
#train_df = do_cumcount( train_df, ['ip'], 'os', 'X7', show_max=True ); gc.collect()
train_df = do_countuniq( train_df, ['ip'], 'channel', 'X0', 'uint8', show_max=True ); gc.collect()
#train_df = do_countuniq( train_df, ['ip', 'day'], 'hour', 'X2', 'uint8', show_max=True ); gc.collect()
train_df = do_countuniq( train_df, ['ip'], 'app', 'X3', 'uint8', show_max=True ); gc.collect()
#train_df = do_countuniq( train_df, ['ip', 'app'], 'os', 'X4', 'uint8', show_max=True ); gc.collect()
#train_df = do_countuniq( train_df, ['ip'], 'device', 'X5', 'uint16', show_max=True ); gc.collect()
train_df = do_countuniq( train_df, ['app'], 'channel', 'X6', show_max=True ); gc.collect()
train_df = do_countuniq( train_df, ['ip', 'device', 'os'], 'app', 'X8', show_max=True ); gc.collect()
train_df = do_count( train_df, ['ip', 'day', 'hour'], 'ip_tcount', show_max=True ); gc.collect()
train_df = do_count( train_df, ['ip', 'app'], 'ip_app_count', show_max=True ); gc.collect()
train_df = do_count( train_df, ['ip', 'app', 'os'], 'ip_app_os_count', 'uint16', show_max=True ); gc.collect()
#train_df = do_var( train_df, ['ip', 'day', 'channel'], 'hour', 'ip_tchan_count', show_max=True ); gc.collect()
train_df = do_var( train_df, ['ip', 'app', 'os'], 'hour', 'ip_app_os_var', show_max=True ); gc.collect()
#train_df = do_var( train_df, ['ip', 'app', 'channel'], 'day', 'ip_app_channel_var_day', show_max=True ); gc.collect()
#train_df = do_mean( train_df, ['ip', 'app', 'channel'], 'hour', 'ip_app_channel_mean_hour', show_max=True ); gc.collect()

print('Doing nextClick...')
predictors=[]
new_feature = 'nextClick'
D=2**26
train_df['category'] = (train_df['ip'].astype(str) + "_" + train_df['app'].astype(str) + "_" + train_df['device'].astype(str) \
        + "_" + train_df['os'].astype(str)).apply(hash) % D
click_buffer= np.full(D, 3000000000, dtype=np.uint32)
train_df['epochtime']= train_df['click_time'].astype(np.int64) // 10 ** 9
next_clicks= []
for category, t in zip(reversed(train_df['category'].values), reversed(train_df['epochtime'].values)):
    next_clicks.append(click_buffer[category]-t)
    click_buffer[category]= t
del(click_buffer)
QQ= list(reversed(next_clicks))
train_df.drop(['epochtime','category','click_time'], axis=1, inplace=True)
train_df[new_feature] = pd.Series(QQ).astype('float32')
predictors.append(new_feature)


*** debug parameter set: this is a test run for debugging purposes ***
loading train data... 0 100000
loading test data...
Extracting new features...
Extracting aggregation features...
Doing rolling std dev of  ['channel', 'hour'] ...
rolling max value =  1078.337841309485
10872
           rolling  channel  hour
0              NaN      379    14
1              NaN      379    14
2              NaN      379    14
4              NaN      379    14
5              NaN      379    14
6              NaN      379    14
7              NaN      379    14
8              NaN      379    14
10             NaN      379    14
11             NaN      379    14
12             NaN      379    14
13             NaN      379    14
14             NaN      379    14
15             NaN      379    14
16             NaN      379    14
17             NaN      379    14
18             NaN      379    14
19             NaN      379    14
20             NaN      379    14
21             NaN      379    14
23    