In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
from fastai.imports import *
from fastai.structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display

from sklearn import metrics
import gc

In [3]:
PATH = 'data/fraud-detection/'

# Validation Set Creation Plan
<ol>
    <li> Read in data sets, generating date time columns </li>
    <li> Determine day, hours, ips of test set </li>
    <li> Valid: Select data from day before test day, limit to hours, ips of test set </li>
    <li> Train: Select one or two days before test set </li>
</ol>

## 1. Read in data sets, generating date time columns 

In [3]:
!ls data/fraud-detection/tmp/

df_raw.feather	test_df.feather  train_df.feather  val_idxs.npy


In [4]:
PATH = 'data/fraud-detection/'

In [5]:
dtypes = {
            'ip'            : 'uint32',
            'app'           : 'uint16',
            'device'        : 'uint16',
            'os'            : 'uint16',
            'channel'       : 'uint16',
            'is_attributed' : 'uint8',
            'click_id'      : 'uint32',
            }

In [6]:
test_df = pd.read_csv(f"{PATH}test.csv", parse_dates=['click_time'], dtype=dtypes, \
                      usecols=['ip','app','device','os', 'channel', 'click_time', 'click_id'])

In [6]:
train_df = pd.read_csv(f'{PATH}train.csv', parse_dates=['click_time'], dtype=dtypes, \
                     usecols=['ip','app','device','os', 'channel', 'click_time', 'is_attributed'])

In [7]:
train_df['hour'] = pd.to_datetime(train_df.click_time).dt.hour.astype('uint8')
train_df['minute'] = pd.to_datetime(train_df.click_time).dt.minute.astype('uint8')

In [None]:
test_df['hour'] = pd.to_datetime(test_df.click_time).dt.hour.astype('uint8')
test_df['minute'] = pd.to_datetime(test_df.click_time).dt.minute.astype('uint8')

In [8]:
gc.collect()

154

In [19]:
train_df.to_feather(f'{PATH}tmp/df_raw.feather')

In [19]:
test_df.to_feather(f'{PATH}tmp/test_df.feather')

In [6]:
PATH = 'data/fraud-detection/'
train_df = pd.read_feather(f'{PATH}tmp/df_raw.feather')

In [12]:
test_df = pd.read_feather(f'{PATH}tmp/test_df.feather')

## 2. Determine day, hours, ips of test set 

In [9]:
len_train = len(train_df)

In [13]:
len_test = len(test_df)

In [10]:
train_df['day'] = pd.to_datetime(train_df.click_time).dt.day.astype('uint8')

In [14]:
test_df['day'] = pd.to_datetime(test_df.click_time).dt.day.astype('uint8')

In [10]:
def display_all(df):
    with pd.option_context("display.max_rows", 1000):
        with pd.option_context("display.max_columns", 1000):
            display(df)

In [23]:
display_all(test_df.describe(include='all').transpose())

Unnamed: 0,count,unique,top,freq,first,last,mean,std,min,25%,50%,75%,max
click_id,18790500.0,,,,,,9395230.0,5424340.0,0.0,4697620.0,9395230.0,14092900.0,18790500.0
ip,18790500.0,,,,,,63069.2,36886.0,0.0,31558.0,63936.0,95316.0,126413.0
app,18790500.0,,,,,,12.2148,11.6492,0.0,3.0,12.0,18.0,521.0
device,18790500.0,,,,,,1.73051,25.9704,0.0,1.0,1.0,1.0,3031.0
os,18790500.0,,,,,,18.7331,11.3506,0.0,13.0,18.0,19.0,604.0
channel,18790500.0,,,,,,264.806,135.525,0.0,135.0,236.0,401.0,498.0
click_time,18790469.0,21603.0,2017-11-10 09:00:10,1466.0,2017-11-10 04:00:00,2017-11-10 15:00:00,,,,,,,
hour,18790500.0,,,,,,9.21995,3.75193,4.0,5.0,10.0,13.0,15.0
minute,18790500.0,,,,,,29.3203,17.3159,0.0,14.0,29.0,44.0,59.0
day,18790500.0,,,,,,10.0,0.0,10.0,10.0,10.0,10.0,10.0


test hour runs from 4 through 15, day is 10

In [24]:
display_all(train_df.describe(include='all').transpose())

Unnamed: 0,count,unique,top,freq,first,last,mean,std,min,25%,50%,75%,max
ip,184904000.0,,,,,,90876.0,69527.9,1.0,40245.0,79622.0,118247.0,364778.0
app,184904000.0,,,,,,12.0113,14.8052,0.0,3.0,12.0,15.0,768.0
device,184904000.0,,,,,,21.7233,259.333,0.0,1.0,1.0,1.0,4227.0
os,184904000.0,,,,,,22.677,55.2528,0.0,13.0,18.0,19.0,956.0
channel,184904000.0,,,,,,268.579,129.588,0.0,140.0,258.0,379.0,500.0
click_time,184903890.0,259620.0,2017-11-07 14:00:11,1502.0,2017-11-06 14:32:21,2017-11-09 16:00:00,,,,,,,
is_attributed,184904000.0,,,,,,0.00247072,0.0496449,0.0,0.0,0.0,0.0,1.0
hour,184904000.0,,,,,,9.29878,6.17164,0.0,4.0,9.0,14.0,23.0
minute,184904000.0,,,,,,29.3264,17.381,0.0,14.0,29.0,44.0,59.0
day,184904000.0,,,,,,7.86353,0.889935,6.0,7.0,8.0,9.0,9.0


train day runs from 6 through 9

In [32]:
for day in range(6,10):
    display_all(train_df[['day', 'hour']][train_df.day == day].describe(include='uint8').transpose())

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
day,9308568.0,6.0,0.0,6.0,6.0,6.0,6.0,6.0
hour,9308568.0,19.467074,2.852707,14.0,17.0,19.0,23.0,23.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
day,59633310.0,7.0,0.0,7.0,7.0,7.0,7.0,7.0
hour,59633310.0,9.209491,6.225496,0.0,4.0,9.0,14.0,23.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
day,62945075.0,8.0,0.0,8.0,8.0,8.0,8.0,8.0
hour,62945075.0,9.42701,6.170658,0.0,4.0,9.0,14.0,23.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
day,53016937.0,9.0,0.0,9.0,9.0,9.0,9.0,9.0
hour,53016937.0,7.461635,4.588471,0.0,4.0,7.0,12.0,16.0


So the train data runs from day 6:hour 14 continuously through day 9: hour 23

In [36]:
len(test_df.ip.unique())

93936

In [37]:
len(train_df.ip.unique())

277396

In [9]:
gc.collect()

107

In [16]:
test_ips = test_df.ip.unique()

## 3. Valid: Select data from day before test day, limit to hours, ips of test set 

In [17]:
# note this creates a view of the dataframe, not a new dataframe
val_df = train_df[train_df.ip.isin(test_ips)][train_df.day == 9]

  


In [18]:
val_df = val_df[val_df.hour.isin(range(4,16))]

In [22]:
val_df.reset_index().to_feather(f'{PATH}tmp/val_df.feather')

In [17]:
len(val_df)

32715929

In [61]:
val_idxs = val_df.index.values

In [62]:
len(val_idxs)

32715929

In [73]:
np.save(f'{PATH}tmp/val_idxs', val_idxs)

In [9]:
val_idxs = np.load(f'{PATH}tmp/val_idxs.npy')

In [70]:
!ls {PATH}tmp

df_raw.feather	test_df.feather  val_idxs.feather.npy  val_idxs.npy


In [74]:
val_df = val_df.copy()

## 4. Train: Select one or two days before test set 

In [20]:
train_df = train_df[train_df.day.isin([7,8])]
gc.collect()

474

In [23]:
train_df.reset_index().to_feather(f'{PATH}tmp/train_df.feather')

In [10]:
train_df = pd.read_feather(f'{PATH}tmp/train_df.feather')

In [25]:
# sizes to sample, twice as many for train since train is twice as many days
train_sample = 10000000
val_sample = train_sample/2

In [26]:
len(train_df)

122578385

In [28]:
train_df = train_df.sample(n=train_sample, random_state=42)
gc.collect()

518

In [29]:
val_df = val_df.sample(n=val_sample, random_state=42)
gc.collect()

331

In [32]:
train_df = train_df.drop('day', axis=1)
val_df = val_df.drop('day', axis=1)

In [34]:
len(val_df)

5000000

In [33]:
train_df = train_df.append(val_df)

In [35]:
del val_df
gc.collect()

804

# Modeling 

## Final data subset preprocessing before feature engineering 

In [91]:
len(val_df)

32715929

In [39]:
del train_df
del test_df
gc.collect()

1129

In [36]:
df = train_df.copy()
del train_df
gc.collect()

228

In [40]:
del val_df
gc.collect()

160

In [37]:
df = df.reset_index()

In [38]:
y = df['is_attributed'].values

In [44]:
df.drop('is_attributed', axis = 1, inplace=True)

In [45]:
len(df)

72527535

In [46]:
len(y)

72527535

In [47]:
df.head()

Unnamed: 0,index,ip,app,device,os,channel,click_time,hour,minute
0,82259195,58991,12,1,26,259,2017-11-08 04:00:00,4,0
1,82259196,184626,18,1,19,121,2017-11-08 04:00:00,4,0
2,82259197,151871,3,1,9,280,2017-11-08 04:00:00,4,0
3,82259198,4180,3,1,19,424,2017-11-08 04:00:00,4,0
4,82259199,231270,9,1,15,466,2017-11-08 04:00:00,4,0


In [49]:
df.drop('index', axis=1, inplace=True)

In [50]:
df.head(2)

Unnamed: 0,ip,app,device,os,channel,click_time,hour,minute
0,58991,12,1,26,259,2017-11-08 04:00:00,4,0
1,184626,18,1,19,121,2017-11-08 04:00:00,4,0


## Feature Definitions 

In [39]:
def do_skew( df, group_cols, counted, agg_name, agg_type='float32', show_max=False, show_agg=True ):
    if show_agg:
        print( "Calculating skew of ", counted, " by ", group_cols , '...' )
    gp = df[group_cols+[counted]].groupby(group_cols)[counted].skew().reset_index().rename(columns={counted:agg_name})
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return( df )

In [40]:
def do_var( df, group_cols, counted, agg_name, agg_type='float32', show_max=False, show_agg=True ):
    if show_agg:
        print( "Calculating variance of ", counted, " by ", group_cols , '...' )
    gp = df[group_cols+[counted]].groupby(group_cols)[counted].var().reset_index().rename(columns={counted:agg_name})
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return( df )

In [41]:
def do_count( df, group_cols, agg_name, agg_type='uint32', show_max=False, show_agg=True ):
    if show_agg:
        print( "Aggregating by ", group_cols , '...' )
    gp = df[group_cols].groupby(group_cols).size().rename(agg_name).to_frame().reset_index()
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return( df )

In [42]:
def do_countuniq( df, group_cols, counted, agg_name, agg_type='uint32', show_max=False, show_agg=True ):
    if show_agg:
        print( "Counting unqiue ", counted, " by ", group_cols , '...' )
    gp = df[group_cols+[counted]].groupby(group_cols)[counted].nunique().reset_index().rename(columns={counted:agg_name})
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return( df )

In [43]:
def do_cumcount( df, group_cols, counted, agg_name, agg_type='uint32', show_max=False, show_agg=True ):
    if show_agg:
        print( "Cumulative count by ", group_cols , '...' )
    gp = df[group_cols+[counted]].groupby(group_cols)[counted].cumcount()
    df[agg_name]=gp.values
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return( df )

In [44]:
def do_mean( df, group_cols, counted, agg_name, agg_type='float32', show_max=False, show_agg=True ):
    if show_agg:
        print( "Calculating mean of ", counted, " by ", group_cols , '...' )
    gp = df[group_cols+[counted]].groupby(group_cols)[counted].mean().reset_index().rename(columns={counted:agg_name})
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return( df )

In [45]:
def do_var( df, group_cols, counted, agg_name, agg_type='float32', show_max=False, show_agg=True ):
    if show_agg:
        print( "Calculating variance of ", counted, " by ", group_cols , '...' )
    gp = df[group_cols+[counted]].groupby(group_cols)[counted].var().reset_index().rename(columns={counted:agg_name})
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return( df )

In [46]:
def do_click_shift(df, group_cols, shift_name, agg_type='float32', shift_val=-1, show_max=False, \
                   show_agg=True):
    if show_agg:
        print( "Aggregating by ", group_cols , '...' )
    #gp = 
    df[shift_name] = df[group_cols + ['click_time']].groupby(group_cols).click_time.transform(lambda x: x.diff().shift(shift_val)).dt.seconds#.rename(shift_name).to_frame().reset_index()
    #add fillna code from above here
    #df = df.merge(gp, on=group_cols, how='left')
    #del gp
    if show_max:
        print( shift_name + " max value = ", df[shift_name].max() )
    df[shift_name] = df[shift_name].astype(agg_type)
    gc.collect()
    return( df )

In [47]:
# creating 5 min buckets
ns5min=5*60*1000000000
df['fives'] = pd.to_datetime(((df.click_time.astype(np.int64) // ns5min + 1 ) * ns5min)).dt.minute.astype('uint8')

In [69]:
df.head(2)

Unnamed: 0,ip,app,device,os,channel,click_time,hour,minute,fives
0,58991,12,1,26,259,2017-11-08 04:00:00,4,0,5
1,184626,18,1,19,121,2017-11-08 04:00:00,4,0,5


In [68]:
df.tail(2)

Unnamed: 0,ip,app,device,os,channel,click_time,hour,minute,fives
72527533,103239,11,1,15,173,2017-11-09 15:59:59,15,59,0
72527534,90379,18,1,17,107,2017-11-09 15:59:59,15,59,0


In [48]:
train_df = df.copy()

In [49]:
del df
gc.collect()

413

In [50]:
train_df = do_skew( train_df, ['ip', 'app', 'os'], 'hour', 'ip_app_os_skew_h', show_max=True ); gc.collect()
train_df = do_var( train_df, ['ip', 'app', 'os'], 'hour', 'ip_app_os_var_h', show_max=True ); gc.collect()
train_df = do_skew( train_df, ['ip', 'app', 'os', 'hour'], 'fives', 'ip_app_os_skew_f', show_max=True ); gc.collect()
train_df = do_var( train_df, ['ip', 'app', 'os', 'hour'], 'fives', 'ip_app_os_var_f', show_max=True ); gc.collect()
train_df = do_skew( train_df, ['ip', 'device', 'os'], 'hour', 'ip_dev_os_skew_h', show_max=True ); gc.collect()
train_df = do_var( train_df, ['ip', 'device', 'os'], 'hour', 'ip_dev_os_var_h', show_max=True ); gc.collect()
train_df = do_skew( train_df, ['ip', 'device', 'os', 'hour'], 'fives', 'ip_dev_os_skew_f', show_max=True ); gc.collect()
train_df = do_var( train_df, ['ip', 'device', 'os', 'hour'], 'fives', 'ip_dev_os_var_f', show_max=True ); gc.collect()
train_df = do_skew( train_df, ['ip', 'app', 'os'], 'minute', 'ip_app_os_skew_m', show_max=True ); gc.collect()
train_df = do_var( train_df, ['ip', 'app', 'os'], 'minute', 'ip_app_os_var_m', show_max=True ); gc.collect()
train_df = do_skew( train_df, ['ip', 'app', 'os', 'hour'], 'minute', 'ip_app_os_h_skew_m', show_max=True ); gc.collect()
train_df = do_var( train_df, ['ip', 'app', 'os', 'hour'], 'minute', 'ip_app_os_h_var_m', show_max=True ); gc.collect()
train_df = do_countuniq( train_df, ['ip'], 'channel', 'X0', 'uint8', show_max=True ); gc.collect()
train_df = do_cumcount( train_df, ['ip', 'device', 'os'], 'app', 'X1', show_max=True ); gc.collect()
train_df = do_countuniq( train_df, ['ip', 'hour'], 'minute', 'X2', 'uint8', show_max=True ); gc.collect()
train_df = do_countuniq( train_df, ['ip'], 'app', 'X3', 'uint8', show_max=True ); gc.collect()
train_df = do_countuniq( train_df, ['ip', 'app'], 'os', 'X4', 'uint8', show_max=True ); gc.collect()
train_df = do_countuniq( train_df, ['ip'], 'device', 'X5', 'uint16', show_max=True ); gc.collect()
train_df = do_countuniq( train_df, ['app'], 'channel', 'X6', show_max=True ); gc.collect()
train_df = do_cumcount( train_df, ['ip'], 'os', 'X7', show_max=True ); gc.collect()
train_df = do_countuniq( train_df, ['ip', 'device', 'os'], 'app', 'X8', show_max=True ); gc.collect()
train_df = do_count( train_df, ['ip', 'hour'], 'ip_tcount', show_max=True ); gc.collect()
train_df = do_count( train_df, ['ip', 'hour', 'minute'], 'ip_t_mcount', show_max=True ); gc.collect()
train_df = do_count( train_df, ['ip', 'app'], 'ip_app_count', show_max=True ); gc.collect()
train_df = do_count( train_df, ['ip', 'app', 'os'], 'ip_app_os_count', 'uint16', show_max=True ); gc.collect()
train_df = do_mean( train_df, ['ip', 'app', 'channel'], 'hour', 'ip_app_channel_mean_hour', show_max=True ); gc.collect()
train_df = do_mean( train_df, ['ip', 'app', 'channel', 'hour'], 'minute', 'ip_app_channel_h_mean_m', show_max=True ); gc.collect()
train_df = do_count( train_df, ['ip'], 'ip_count', 'uint16', show_max=True ); gc.collect()
train_df = do_click_shift( train_df, ['ip'], 'i_next_click'); gc.collect()
train_df = do_click_shift( train_df, ['ip', 'app'], 'ia_next_click'); gc.collect()
train_df = do_click_shift( train_df, ['ip', 'channel'], 'ic_next_click'); gc.collect()
train_df = do_click_shift( train_df, ['ip', 'os'], 'io_next_click'); gc.collect()
train_df = do_click_shift( train_df, ['ip', 'os', 'device'], 'iod_next_click')


Calculating skew of  hour  by  ['ip', 'app', 'os'] ...
ip_app_os_skew_h max value =  7.338100515521791
Calculating variance of  hour  by  ['ip', 'app', 'os'] ...
ip_app_os_var_h max value =  264.5
Calculating skew of  fives  by  ['ip', 'app', 'os', 'hour'] ...
ip_app_os_skew_f max value =  6.794849630746133
Calculating variance of  fives  by  ['ip', 'app', 'os', 'hour'] ...
ip_app_os_var_f max value =  1512.5
Calculating skew of  hour  by  ['ip', 'device', 'os'] ...
ip_dev_os_skew_h max value =  8.124038404635966
Calculating variance of  hour  by  ['ip', 'device', 'os'] ...
ip_dev_os_var_h max value =  264.5
Calculating skew of  fives  by  ['ip', 'device', 'os', 'hour'] ...
ip_dev_os_skew_f max value =  6.335950539104124
Calculating variance of  fives  by  ['ip', 'device', 'os', 'hour'] ...
ip_dev_os_var_f max value =  1512.5
Calculating skew of  minute  by  ['ip', 'app', 'os'] ...
ip_app_os_skew_m max value =  5.092073653557608
Calculating variance of  minute  by  ['ip', 'app', 'os'] 

In [51]:
gc.collect()

174

In [53]:
train_df.to_feather(f'{PATH}tmp/feature_df.feather')

y still needs to be dropped from train_df before modeling, by the way...

In [54]:
train_df.head()

Unnamed: 0,index,ip,app,device,os,channel,click_time,is_attributed,hour,minute,...,ip_app_count,ip_app_os_count,ip_app_channel_mean_hour,ip_app_channel_h_mean_m,ip_count,i_next_click,ia_next_click,ic_next_click,io_next_click,iod_next_click
0,46430117,100929,2,2,20,18,2017-11-07 11:40:03,0,11,40,...,235,2,12.142858,25.0,3493,52821.0,32441.0,10414.0,59601.0,84281.0
1,115447639,66638,15,2,8,245,2017-11-08 14:06:01,0,14,6,...,104,19,12.666667,29.714285,884,60856.0,3223.0,3223.0,77674.0,84989.0
2,62753353,240419,12,1,1,265,2017-11-07 17:29:33,0,17,29,...,11,3,14.5,30.333334,67,26876.0,26876.0,131.0,126.0,126.0
3,24690703,148460,2,1,13,469,2017-11-07 04:35:50,0,4,35,...,15,5,7.0,35.0,208,80276.0,21619.0,19055.0,74206.0,74206.0
4,89013219,121359,2,1,19,477,2017-11-08 06:00:49,0,6,0,...,117,28,11.45,27.0,1228,59569.0,47262.0,47262.0,59569.0,59569.0


## Feature Importance 

In [21]:
df = pd.read_feather(f'{PATH}tmp/feature_df.feather', nthreads=4)

ArrowIOError: Failed to open local file: data/fraud-detection/tmp/feature_df.feather , error: No such file or directory

In [55]:
del test_df
gc.collect()

3252

In [5]:
n_trn = 10000000

In [57]:
train_df.dtypes

index                                int64
ip                                  uint32
app                                 uint16
device                              uint16
os                                  uint16
channel                             uint16
click_time                  datetime64[ns]
is_attributed                        uint8
hour                                 uint8
minute                               uint8
fives                                uint8
ip_app_os_skew_h                   float32
ip_app_os_var_h                    float32
ip_app_os_skew_f                   float32
ip_app_os_var_f                    float32
ip_dev_os_skew_h                   float32
ip_dev_os_var_h                    float32
ip_dev_os_skew_f                   float32
ip_dev_os_var_f                    float32
ip_app_os_skew_m                   float32
ip_app_os_var_m                    float32
ip_app_os_h_skew_m                 float32
ip_app_os_h_var_m                  float32
X0         

In [10]:
is_number = np.vectorize(lambda x: np.issubdtype(x, np.number))
is_number(df.dtypes)

array([ True,  True,  True,  True,  True, False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True])

### Dealing with NaNs 

In [61]:
def display_all(df):
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000): 
        display(df)

<b>our data got out of order...probably during sampling</b>

In [63]:
train_df.head(3)

Unnamed: 0,index,ip,app,device,os,channel,click_time,is_attributed,hour,minute,...,ip_app_count,ip_app_os_count,ip_app_channel_mean_hour,ip_app_channel_h_mean_m,ip_count,i_next_click,ia_next_click,ic_next_click,io_next_click,iod_next_click
0,46430117,100929,2,2,20,18,2017-11-07 11:40:03,0,11,40,...,235,2,12.142858,25.0,3493,52821.0,32441.0,10414.0,59601.0,84281.0
1,115447639,66638,15,2,8,245,2017-11-08 14:06:01,0,14,6,...,104,19,12.666667,29.714285,884,60856.0,3223.0,3223.0,77674.0,84989.0
2,62753353,240419,12,1,1,265,2017-11-07 17:29:33,0,17,29,...,11,3,14.5,30.333334,67,26876.0,26876.0,131.0,126.0,126.0


In [65]:
# could also sort by click_time but here they are equivalent and betting index is faster
train_df = train_df.sort_values(by=['index'])

In [68]:
display_all(train_df.tail(3).transpose())

Unnamed: 0,11559824,10811585,12641883
index,184903427,184903435,184903436
ip,48240,209,92190
app,3,18,12
device,1,1,1
os,32,19,23
channel,30,107,340
click_time,2017-11-09 15:59:59,2017-11-09 15:59:59,2017-11-09 15:59:59
is_attributed,0,0,0
hour,15,15,15
minute,59,59,59


In [62]:
display_all(train_df.isnull().sum().sort_index()/len(train_df))

X0                          0.000000
X1                          0.000000
X2                          0.000000
X3                          0.000000
X4                          0.000000
X5                          0.000000
X6                          0.000000
X7                          0.000000
X8                          0.000000
app                         0.000000
channel                     0.000000
click_time                  0.000000
device                      0.000000
fives                       0.000000
hour                        0.000000
i_next_click                0.009178
ia_next_click               0.093217
ic_next_click               0.196380
index                       0.000000
io_next_click               0.093894
iod_next_click              0.100657
ip                          0.000000
ip_app_channel_h_mean_m     0.000000
ip_app_channel_mean_hour    0.000000
ip_app_count                0.000000
ip_app_os_count             0.000000
ip_app_os_h_skew_m          0.753509
i

In [60]:
train_df.columns

Index(['index', 'ip', 'app', 'device', 'os', 'channel', 'click_time',
       'is_attributed', 'hour', 'minute', 'fives', 'ip_app_os_skew_h',
       'ip_app_os_var_h', 'ip_app_os_skew_f', 'ip_app_os_var_f',
       'ip_dev_os_skew_h', 'ip_dev_os_var_h', 'ip_dev_os_skew_f',
       'ip_dev_os_var_f', 'ip_app_os_skew_m', 'ip_app_os_var_m',
       'ip_app_os_h_skew_m', 'ip_app_os_h_var_m', 'X0', 'X1', 'X2', 'X3', 'X4',
       'X5', 'X6', 'X7', 'X8', 'ip_tcount', 'ip_t_mcount', 'ip_app_count',
       'ip_app_os_count', 'ip_app_channel_mean_hour',
       'ip_app_channel_h_mean_m', 'ip_count', 'i_next_click', 'ia_next_click',
       'ic_next_click', 'io_next_click', 'iod_next_click'],
      dtype='object')

If we want to not use medians like proc df we can use the below list...

In [79]:
# preparing lists of columns to perform fillna on
cat_fill = ['ip', 'app', 'device', 'os', 'channel']

num_fill = ['ip_app_os_skew_h',
       'ip_app_os_var_h', 'ip_app_os_skew_f', 'ip_app_os_var_f',
       'ip_dev_os_skew_h', 'ip_dev_os_var_h', 'ip_dev_os_skew_f',
       'ip_dev_os_var_f', 'ip_app_os_skew_m', 'ip_app_os_var_m',
       'ip_app_os_h_skew_m', 'ip_app_os_h_var_m','ip_app_count',
       'ip_app_os_count', 'i_next_click', 'ia_next_click',
       'ic_next_click', 'io_next_click', 'iod_next_click']

other_fill = ['ip_app_channel_mean_hour', 'ip_app_channel_h_mean_m' ]

In [None]:
for column in num_fill:
    train_df[column].fillna()

In [85]:
??train_cats(train_df)

<b>proc_df did not properly fill nas</b>

In [127]:
hm_df.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,hour,minute,fives,ip_app_os_skew_h,...,ip_app_count,ip_app_os_count,ip_app_channel_mean_hour,ip_app_channel_h_mean_m,ip_count,i_next_click,ia_next_click,ic_next_click,io_next_click,iod_next_click
1344910,20173,18,1,13,134,0,0,0,5,-0.492789,...,47,13,8.5,0.0,468,32871.0,32871.0,764.0,28761.0,28761.0
695655,75722,12,1,13,178,0,0,0,5,-1.01781,...,12,7,4.8,0.0,106,10621.0,32811.0,26371.0,52827.0,52827.0
4127104,195610,9,1,16,134,0,0,0,5,,...,3,2,0.0,0.0,34,40490.0,39765.0,46731.0,,
5400380,126230,14,1,53,134,0,0,0,5,,...,15,1,5.0,0.0,215,63717.0,36386.0,36386.0,342.0,342.0
9758181,148473,20,1,25,259,0,0,0,5,,...,5,1,4.2,0.0,89,,,,,


In [144]:
for column in hm_df.columns:
    if column in num_fill:
        print(column)
        for n,c in hm_df.items():
            c.fillna(c.median(), inplace=True)

ip_app_os_skew_h
ip_app_os_var_h
ip_app_os_skew_f
ip_app_os_var_f
ip_dev_os_skew_h
ip_dev_os_var_h
ip_dev_os_skew_f
ip_dev_os_var_f
ip_app_os_skew_m
ip_app_os_var_m
ip_app_os_h_skew_m
ip_app_os_h_var_m
ip_app_count
ip_app_os_count
i_next_click
ia_next_click
ic_next_click
io_next_click
iod_next_click


In [145]:
display_all(hm_df.head(3).transpose())

Unnamed: 0,1344910,695655,4127104
ip,20173.0,75722.0,195610.0
app,18.0,12.0,9.0
device,1.0,1.0,1.0
os,13.0,13.0,16.0
channel,134.0,178.0,134.0
is_attributed,0.0,0.0,0.0
hour,0.0,0.0,0.0
minute,0.0,0.0,0.0
fives,5.0,5.0,5.0
ip_app_os_skew_h,-0.492789,-1.01781,0.144139


In [146]:
gc.collect()

802

Let's see if we caught all the nulls

In [148]:
display_all(hm_df.isnull().sum().sort_index()/len(hm_df))

X0                          0.0
X1                          0.0
X2                          0.0
X3                          0.0
X4                          0.0
X5                          0.0
X6                          0.0
X7                          0.0
X8                          0.0
app                         0.0
channel                     0.0
device                      0.0
fives                       0.0
hour                        0.0
i_next_click                0.0
ia_next_click               0.0
ic_next_click               0.0
io_next_click               0.0
iod_next_click              0.0
ip                          0.0
ip_app_channel_h_mean_m     0.0
ip_app_channel_mean_hour    0.0
ip_app_count                0.0
ip_app_os_count             0.0
ip_app_os_h_skew_m          0.0
ip_app_os_h_var_m           0.0
ip_app_os_skew_f            0.0
ip_app_os_skew_h            0.0
ip_app_os_skew_m            0.0
ip_app_os_var_f             0.0
ip_app_os_var_h             0.0
ip_app_o

### Base Model 

In [168]:
def print_score(m):
    res = [metrics.roc_auc_score(y_train, m.predict(X_train)), metrics.roc_auc_score(y_valid, m.predict(X_valid)),
          m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print(res)

In [186]:
columns = [
    #'ip', 
           'app', 'device', 'os', 'channel','hour', 'minute', 'fives', 'ip_app_os_skew_h',
       'ip_app_os_var_h', 'ip_app_os_skew_f', 'ip_app_os_var_f',
#       'ip_dev_os_skew_h', 'ip_dev_os_var_h', 'ip_dev_os_skew_f',
 #      'ip_dev_os_var_f', 'ip_app_os_skew_m', 'ip_app_os_var_m',
       'ip_app_os_h_skew_m', 'ip_app_os_h_var_m', 'X0', 'X1', 'X2', 'X3', 'X4',
       'X5', 'X6', 'X7', 'X8', 'ip_tcount', 'ip_t_mcount', 'ip_app_count',
       'ip_app_os_count', 'ip_app_channel_mean_hour',
       'ip_app_channel_h_mean_m', 
    #'ip_count', 
    'i_next_click', 'ia_next_click',
       'ic_next_click', 'io_next_click', 'iod_next_click']

In [70]:
target = ['is_attributed']

In [98]:
n_trn = 10000000

In [None]:
y_hm = x_hm['is_attributed'].values

In [151]:
x_hm.shape, y_hm.shape

((15000000, 58), (15000000,))

In [149]:
??proc_df

without ips

In [187]:
def split_vals(a,n): return a[:n].copy(), a[n:].copy()

X_train, X_valid = split_vals(x_hm[columns], n_trn)
y_train, y_valid = split_vals(y_hm, n_trn)

X_train.shape, y_train.shape, X_valid.shape

((10000000, 33), (10000000,), (5000000, 33))

In [177]:
set_rf_samples(1000000)

In [178]:
m=RandomForestClassifier(n_estimators=20, max_features=0.5, min_samples_leaf=5, n_jobs=-1)
%time m.fit(X_train, y_train)
print_score(m)

CPU times: user 21min 54s, sys: 9.22 s, total: 22min 3s
Wall time: 3min 16s
[0.7178181534753137, 0.562553900335945, 0.9983435, 0.9989822]


without ip counts

In [176]:
def split_vals(a,n): return a[:n].copy(), a[n:].copy()

X_train, X_valid = split_vals(x_hm[columns], n_trn)
y_train, y_valid = split_vals(y_hm, n_trn)

X_train.shape, y_train.shape, X_valid.shape

((10000000, 40), (10000000,), (5000000, 40))

In [181]:
gc.collect()

4080

In [182]:
m=RandomForestClassifier(n_estimators=20, max_features=0.5, min_samples_leaf=5, n_jobs=-1)
%time m.fit(X_train, y_train)
print_score(m)

CPU times: user 20min 52s, sys: 7.53 s, total: 20min 59s
Wall time: 3min 7s
[0.7177396504834318, 0.5621909699425857, 0.9983454, 0.998979]


without ip or ip count

In [181]:
gc.collect()

4080

In [185]:
m=RandomForestClassifier(n_estimators=20, max_features=0.5, min_samples_leaf=5, n_jobs=-1)
%time m.fit(X_train, y_train)
print_score(m)

CPU times: user 19min 29s, sys: 7.95 s, total: 19min 37s
Wall time: 3min 1s
[0.7237564408483589, 0.5620111065171446, 0.9983415, 0.9989806]


without a bunch of features: 'ip_dev_os_skew_h', 'ip_dev_os_var_h', 'ip_dev_os_skew_f',
      'ip_dev_os_var_f', 'ip_app_os_skew_m', 'ip_app_os_var_m',

In [188]:
m=RandomForestClassifier(n_estimators=20, max_features=0.5, min_samples_leaf=5, n_jobs=-1)
%time m.fit(X_train, y_train)
print_score(m)

CPU times: user 15min 46s, sys: 7.53 s, total: 15min 54s
Wall time: 2min 25s
[0.7192675124557447, 0.5625537001145402, 0.9983422, 0.9989818]


In [194]:
%time X_train['ip_app_os_skew_h'].iloc[:30].rolling(5).mean() #-0.229399 4.35s

CPU times: user 505 µs, sys: 0 ns, total: 505 µs
Wall time: 379 µs


1344910         NaN
695655          NaN
4127104         NaN
5400380         NaN
9758181   -0.229399
5887743   -0.106601
2150876    0.147837
8660526    0.265057
4467339    0.265057
5008883    0.344250
6173687    0.557885
8760391    0.577269
3081752    0.460048
5404209    0.555948
3729169    0.476755
3884624    0.410344
5616801    0.364324
6074861    0.469970
2698128    0.333364
8574959    0.820471
2721879    0.588617
8009822    0.288478
2659099    0.074654
1378087    0.292436
4548284   -0.194671
3301348   -0.146643
332957     0.136036
6962398    0.244214
412005    -0.241503
8742481   -0.305800
Name: ip_app_os_skew_h, dtype: float64

In [197]:
def moving_average(data, window_size):
    window = np.ones(int(window_size))/float(window_size)
    return np.convolve(data, window, 'same')

%time moving_average(X_train['ip_app_os_skew_h'].iloc[:30], 5)

CPU times: user 214 µs, sys: 4 µs, total: 218 µs
Wall time: 221 µs


array([-0.27788, -0.25364, -0.2294 , -0.1066 ,  0.14784,  0.26506,  0.26506,  0.34425,  0.55788,  0.57727,
        0.46005,  0.55595,  0.47675,  0.41034,  0.36432,  0.46997,  0.33336,  0.82047,  0.58862,  0.28848,
        0.07465,  0.29244, -0.19467, -0.14664,  0.13604,  0.24421, -0.2415 , -0.3058 , -0.29344, -0.30022])

In [211]:
def get_deviation(y, window_size):
    avg = moving_average(y, window_size).tolist()
    residual = y - avg
    testing_std = pd.rolling_std(residual, window_size)
    testing_std_as_df = pd.DataFrame(testing_std)
    rolling_std = testing_std_as_df.replace(np.nan,
                                  testing_std_as_df.iloc[window_size - 1]).round(3).iloc[:,0].tolist()
    return rolling_std[:50]

%time get_deviation(X_train['ip_app_os_skew_h'].iloc[:500000], 5)

CPU times: user 44.4 ms, sys: 12 ms, total: 56.4 ms
Wall time: 112 ms


	Series.rolling(window=5,center=False).std()
  after removing the cwd from sys.path.


[0.439,
 0.439,
 0.439,
 0.439,
 0.439,
 0.434,
 0.203,
 0.207,
 0.288,
 0.287,
 0.444,
 0.466,
 0.467,
 0.425,
 0.445,
 0.318,
 0.319,
 0.282,
 0.377,
 1.152,
 1.191,
 1.456,
 1.457,
 1.517,
 1.008,
 1.005,
 0.59,
 0.55,
 0.64,
 0.643,
 0.699,
 0.689,
 0.627,
 0.442,
 0.458,
 0.578,
 0.575,
 0.876,
 0.828,
 0.826,
 1.164,
 1.249,
 1.032,
 1.048,
 1.03,
 0.683,
 0.75,
 0.824,
 0.753,
 0.842]

In [214]:
def get_rolling_std(y, window_size):
    avg = moving_average(y, window_size)
    residual = y - avg
    testing_std = pd.rolling_std(residual, window_size)
    testing_std_as_df = pd.DataFrame(testing_std)
    rolling_std = testing_std_as_df.replace(np.nan,
                                  testing_std_as_df.iloc[window_size - 1]).round(3).iloc[:,0].tolist()
    return rolling_std[:50]

%time get_rolling_std(X_train['ip_app_os_skew_h'], 5)

	Series.rolling(window=5,center=False).std()
  after removing the cwd from sys.path.


CPU times: user 545 ms, sys: 340 ms, total: 885 ms
Wall time: 826 ms


[0.439,
 0.439,
 0.439,
 0.439,
 0.439,
 0.434,
 0.203,
 0.207,
 0.288,
 0.287,
 0.444,
 0.466,
 0.467,
 0.425,
 0.445,
 0.318,
 0.319,
 0.282,
 0.377,
 1.152,
 1.191,
 1.456,
 1.457,
 1.517,
 1.008,
 1.005,
 0.59,
 0.55,
 0.64,
 0.643,
 0.699,
 0.689,
 0.627,
 0.442,
 0.458,
 0.578,
 0.575,
 0.876,
 0.828,
 0.826,
 1.164,
 1.249,
 1.032,
 1.048,
 1.03,
 0.683,
 0.75,
 0.824,
 0.753,
 0.842]

In [252]:
def moving_average(data, window_size):
    window = np.ones(int(window_size))/float(window_size)
    return np.convolve(data, window, 'same')

def get_rolling_std(df, group_cols, col, window_size, show_max=False, show_agg=True):
    if show_agg:
        print( "Calculating std deviation of MA of ", col, '...' )
    gp = df[group_cols + [col]].groupby(group_cols)
    avg = gp[col].transform(lambda x: moving_average(x, window_size)).reset_index()
    residual = gp[col] - moving_average(df[group_cols + [col]].groupby(group_cols)[col], window_size)
#     for name, group in gp[col]
#     testing_std = pd.rolling_std(residual, window_size)
#     testing_std_as_df = pd.DataFrame(testing_std)
#     #df[col + '_std'] = 
#     rolling_std = testing_std_as_df.replace(np.nan,
#                                   testing_std_as_df.iloc[window_size - 1]).round(3).iloc[:,0].tolist()
    gc.collect()
    return residual#rolling_std#df

get_rolling_std(X_train.iloc[:50], ['app', 'channel'], 'X6', 5, show_max=True)

Calculating std deviation of MA of  X6 ...


ValueError: Wrong number of items passed 5, placement implies 1

In [242]:
gc.collect()

290

In [243]:
get_rolling_std(X_train, 'ip_app_os_skew_h', 5, show_max=True)

Calculating std deviation of MA of  ip_app_os_skew_h ...


	Series.rolling(window=5,center=False).std()
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,app,device,os,channel,hour,minute,fives,ip_app_os_skew_h,ip_app_os_var_h,ip_app_os_skew_f,...,ip_app_count,ip_app_os_count,ip_app_channel_mean_hour,ip_app_channel_h_mean_m,i_next_click,ia_next_click,ic_next_click,io_next_click,iod_next_click,ip_app_os_skew_h_std
1344910,18,1,13,134,0,0,5,-0.492789,27.000000,0.000000,...,47,13,8.500000,0.000000,32871.0,32871.0,764.0,28761.0,28761.0,0.439
695655,12,1,13,178,0,0,5,-1.017810,15.238095,0.000000,...,12,7,4.800000,0.000000,10621.0,32811.0,26371.0,52827.0,52827.0,0.439
4127104,9,1,16,134,0,0,5,0.121201,0.000000,0.000000,...,3,2,0.000000,0.000000,40490.0,39765.0,46731.0,42520.0,42477.0,0.439
5400380,14,1,53,134,0,0,5,0.121201,22.602299,0.000000,...,15,1,5.000000,0.000000,63717.0,36386.0,36386.0,342.0,342.0,0.439
9758181,20,1,25,259,0,0,5,0.121201,22.602299,0.000000,...,5,1,4.200000,0.000000,43136.0,43025.0,42705.0,42520.0,42477.0,0.439
5887743,20,1,17,478,0,0,5,0.121201,22.602299,0.000000,...,1,1,0.000000,0.000000,29977.0,43025.0,42705.0,80615.0,80615.0,0.434
2150876,12,1,25,328,0,0,5,0.254380,60.700001,0.000000,...,115,5,8.043478,21.500000,81270.0,52205.0,30901.0,13784.0,13784.0,0.203
8660526,15,1,13,245,0,0,5,0.707305,38.590645,2.000000,...,61,19,7.608696,12.500000,29787.0,16048.0,85121.0,85243.0,85243.0,0.207
4467339,3,1,9,130,0,0,5,0.121201,22.602299,0.000000,...,29,1,0.000000,0.000000,26118.0,24173.0,42705.0,42520.0,42477.0,0.288
5008883,2,1,19,469,0,0,5,0.517166,77.619049,0.000000,...,35,7,8.333333,0.000000,4839.0,19610.0,34343.0,18923.0,18923.0,0.287
