In [1]:
import numpy as np
import pandas as pd
import gc
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score

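Background reading (Cross Validated) on whether Isolation Forest needs an anomaly sample during training: https://stats.stackexchange.com/questions/312619/does-isolation-forest-need-an-anomaly-sample-during-training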

In [2]:
# Run a full garbage-collection pass; the cell output is the value gc.collect() returns.
gc.collect()

0

### data 1: concat_df

In [3]:
# Column name -> dtype for every column loaded from tmp/concat_df.csv.gz.
# ip_day_hour_count, ip_app_count and ip_app_os_count are not listed here
# (they were commented out of both the dtype map and the column list).
dtypes = {
    # raw click columns
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    # time parts
    'day': 'uint8',
    'hour': 'uint8',
    'minute': 'uint8',
    # group counts
    'ip_count': 'uint32',
    'ip_day_hour_minute_count': 'uint32',
    'os_device_count': 'uint32',
    'os_app_channel_count': 'uint32',
    'ip_device_count': 'uint32',
    'app_channel_count': 'uint32',
    # click-gap columns
    'next_click': 'int64',
    'prev_click': 'int64',
}

# The columns to read are exactly the typed ones, in the same order as the dict.
to_read = list(dtypes)

In [4]:
# Read the gzip CSV as an iterator of 10**6-row chunks (typed, selected columns only),
# then join the chunks into a single frame.
concat_df_tmp = pd.read_csv(
    'tmp/concat_df.csv.gz',
    compression='gzip',
    dtype=dtypes,
    usecols=to_read,
    chunksize=10**6,
)
concat_df = pd.concat(concat_df_tmp)

In [5]:
# The chunk reader is no longer needed once concat_df is built; drop it and collect garbage.
del concat_df_tmp
gc.collect()

406

### data 2: time feature

In [6]:
# Load only the log2_time_diff_first_seen column from the time-difference file.
time_feature = pd.read_csv('tmp/time_diff.csv.gz', compression='gzip', usecols=['log2_time_diff_first_seen'])

### data 3: agg features

In [7]:
# Column name -> dtype for tmp/agg_features.csv.gz, built one dtype group at a time.
agg_dtype = dict.fromkeys(
    [
        'nuniq_ip_channel',
        'cumcnt_ip_device_os_app',
        'nuniq_ip_day_hour',
        'nuniq_ip_app',
        'nuniq_ip_app_os',
        'nuniq_ip_device',
        'nuniq_app_channel',
        'cumcnt_ip_os',
        'nuniq_ip_device_os_app',
        'ip_tcount',
        'ip_app_count',
        'ip_app_os_count',
    ],
    'uint32',
)
agg_dtype.update(
    dict.fromkeys(
        [
            'ip_tchan_count',
            'ip_app_os_var',
            'ip_app_channel_var_day',
            'ip_app_channel_mean_hour',
        ],
        'float32',
    )
)

# Read exactly the typed columns, in the same order as the dict.
agg_to_read = list(agg_dtype)

In [8]:
# Read the aggregate-feature CSV as an iterator of 10**6-row chunks,
# then join the chunks into a single frame.
agg_feature_tmp = pd.read_csv(
    'tmp/agg_features.csv.gz',
    compression='gzip',
    dtype=agg_dtype,
    usecols=agg_to_read,
    chunksize=10**6,
)
agg_feature = pd.concat(agg_feature_tmp)

In [9]:
# The chunk reader is no longer needed once agg_feature is built; drop it and collect garbage.
del agg_feature_tmp
gc.collect()

406

In [10]:
# Columns declared float32 in agg_dtype; their missing values are replaced
# with the mean of the same column.
fill_agg = ['ip_tchan_count','ip_app_os_var','ip_app_channel_var_day','ip_app_channel_mean_hour']
agg_feature[fill_agg] = agg_feature[fill_agg].fillna(agg_feature[fill_agg].mean())

### Concat 

In [11]:
# Put the three frames side by side (axis=1). Rows are matched on the index,
# so this assumes all three frames share the same row index — TODO confirm.
concat_df = pd.concat([concat_df, time_feature, agg_feature], axis=1)

In [12]:
# Their columns now live in concat_df; drop the separate frames and collect garbage.
del time_feature, agg_feature
gc.collect()

42

In [13]:
# Labels file; later cells read its 'is_attributed' column.
train_y = pd.read_csv('tmp/train_y.csv.gz', compression='gzip')

In [14]:
# nrows: how many leading rows of concat_df are scored and compared with train_y.
# n_sample: how many is_attributed == 0 rows are drawn to fit the forest.
nrows = 4 * 10**7
n_sample = 500000

In [15]:
# Draw n_sample rows (fixed seed) from the rows labelled is_attributed == 0
# and keep their index values as an array.
if_idx = (
    train_y.loc[train_y['is_attributed'] == 0]
    .sample(n_sample, random_state=1)
    .index
    .values
)

### generate log2 feature

In [16]:
def log_bin_feature(df, features):
    """Overwrite each listed column of ``df`` with its integer log2 bin.

    Every column ``fea`` in ``features`` is replaced by
    ``int(log2(1 + value))``, e.g. 0 -> 0, 1 -> 1, 2 -> 1, 3 -> 2, 255 -> 8.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. The selected columns are overwritten in place.
    features : iterable of str
        Names of the columns to transform.

    Returns
    -------
    None
        ``df`` is mutated, so calling this twice on the same frame applies
        the transform twice.
    """
    for fea in features:
        # Widen to float64 BEFORE adding 1. On a narrow unsigned column
        # (uint8/uint16/uint32) ``1 + values`` is computed in that same dtype
        # and wraps to 0 at the dtype's maximum, which made log2 return -inf
        # and the integer cast produce a meaningless value.
        values = df[fea].values.astype(np.float64)
        df[fea] = np.log2(1.0 + values).astype(int)

In [17]:
# Columns passed to log_bin_feature, assembled group by group (order matters
# because this list is later extended into the predictor list).
features = (
    # *count columns
    [
        'ip_count',
        'ip_day_hour_minute_count',
        'os_device_count',
        'os_app_channel_count',
        'ip_tcount',
        'ip_app_count',
        'ip_app_os_count',
        'ip_device_count',
        'app_channel_count',
    ]
    # click columns
    + ['next_click', 'prev_click']
    # nuniq_* / cumcnt_* columns
    + [
        'nuniq_ip_channel',
        'cumcnt_ip_device_os_app',
        'nuniq_ip_day_hour',
        'nuniq_ip_app',
        'nuniq_ip_app_os',
        'nuniq_ip_device',
        'nuniq_app_channel',
        'cumcnt_ip_os',
        'nuniq_ip_device_os_app',
    ]
)

In [18]:
# Overwrites the listed columns of concat_df in place. Re-running this cell
# applies the log2 binning a second time to already-binned values.
log_bin_feature(concat_df, features)

In [19]:
# if_idx holds index labels taken from train_y but is used positionally here (.iloc);
# the two agree only if train_y has the default 0..n-1 index — TODO confirm.
train_if = concat_df.iloc[if_idx]

### Generate feature list

In [20]:
# Model inputs: the log2-binned columns plus six columns used as loaded.
predictors = features + ['hour', 'minute', 'os', 'app', 'device', 'log2_time_diff_first_seen']

### Isolation Forest

In [21]:
# Isolation forest with 110 trees and a fixed seed for repeatability.
# NOTE(review): contamination=0.0025 is presumably meant to approximate the
# share of is_attributed == 1 rows — confirm.
forest = IsolationForest(n_estimators = 110,
                         contamination = 0.0025,
                         random_state = 1)

In [22]:
# Fit on the sampled rows only (train_if), restricted to the predictor columns.
forest.fit(train_if[predictors])

IsolationForest(bootstrap=False, contamination=0.0025, max_features=1.0,
        max_samples='auto', n_estimators=110, n_jobs=1, random_state=1,
        verbose=0)

In [23]:
# First nrows rows of concat_df, predictor columns only.
test = concat_df.iloc[:nrows][predictors]

In [24]:
def batch_pred(model, X, batch_size=100000):
    """Score ``X`` with ``model`` in consecutive row slices.

    Each slice of up to ``batch_size`` rows is passed to
    ``model.decision_function`` and the negated result is appended to the
    output. A progress line is printed every tenth slice.

    Parameters
    ----------
    model : object
        Anything exposing ``decision_function(frame)``.
    X : pandas.DataFrame
        Rows to score; sliced positionally with ``.iloc``.
    batch_size : int, default 100000
        Maximum number of rows per slice.

    Returns
    -------
    list
        One negated decision-function value per row of ``X``, in row order.
    """
    n_full = int(X.shape[0]/batch_size)
    scores = []

    # One pass more than the number of full batches, to pick up a trailing
    # partial batch.
    for step in range(n_full + 1):
        if step > 0 and step % 10 == 0:
            print("finished batch %d/%d" %(step, n_full))

        start = step * batch_size
        chunk = X.iloc[start : start + batch_size]

        # The extra pass is empty when the row count is an exact multiple.
        if chunk.shape[0] == 0:
            continue

        scores += list(-model.decision_function(chunk))

    return scores

In [25]:
# Score the first nrows rows; batch_pred returns the negated decision_function values.
if_pred = batch_pred(forest, test)

finished batch 10/400
finished batch 20/400
finished batch 30/400
finished batch 40/400
finished batch 50/400
finished batch 60/400
finished batch 70/400
finished batch 80/400
finished batch 90/400
finished batch 100/400
finished batch 110/400
finished batch 120/400
finished batch 130/400
finished batch 140/400
finished batch 150/400
finished batch 160/400
finished batch 170/400
finished batch 180/400
finished batch 190/400
finished batch 200/400
finished batch 210/400
finished batch 220/400
finished batch 230/400
finished batch 240/400
finished batch 250/400
finished batch 260/400
finished batch 270/400
finished batch 280/400
finished batch 290/400
finished batch 300/400
finished batch 310/400
finished batch 320/400
finished batch 330/400
finished batch 340/400
finished batch 350/400
finished batch 360/400
finished batch 370/400
finished batch 380/400
finished batch 390/400
finished batch 400/400


In [26]:
# Compare the scores against the labels. The two inputs must have the same length,
# so this assumes train_y has exactly nrows rows — TODO confirm.
auc = roc_auc_score(train_y['is_attributed'], if_pred)

In [27]:
# Display the ROC AUC stored in `auc`.
auc

0.9012363032204544

In [28]:
# The evaluation inputs are no longer needed; drop them and collect garbage
# before scoring the full frame.
del test, if_pred
gc.collect()

37

In [29]:
# Score every row of concat_df (not only the first nrows) with the fitted forest.
if_score = batch_pred(forest, concat_df[predictors], batch_size=100000)

finished batch 10/587
finished batch 20/587
finished batch 30/587
finished batch 40/587
finished batch 50/587
finished batch 60/587
finished batch 70/587
finished batch 80/587
finished batch 90/587
finished batch 100/587
finished batch 110/587
finished batch 120/587
finished batch 130/587
finished batch 140/587
finished batch 150/587
finished batch 160/587
finished batch 170/587
finished batch 180/587
finished batch 190/587
finished batch 200/587
finished batch 210/587
finished batch 220/587
finished batch 230/587
finished batch 240/587
finished batch 250/587
finished batch 260/587
finished batch 270/587
finished batch 280/587
finished batch 290/587
finished batch 300/587
finished batch 310/587
finished batch 320/587
finished batch 330/587
finished batch 340/587
finished batch 350/587
finished batch 360/587
finished batch 370/587
finished batch 380/587
finished batch 390/587
finished batch 400/587
finished batch 410/587
finished batch 420/587
finished batch 430/587
finished batch 440/5

In [30]:
# Number of scores produced (one per scored row).
len(if_score)

58790469

In [31]:
# Empty frame that will hold the score column to be written out.
if_output = pd.DataFrame()

In [32]:
# Store the list of scores as the single column 'if_score'.
if_output['if_score'] = if_score

In [33]:
# Write the scores as a gzip-compressed CSV: 8 decimal places, no index column.
if_output.to_csv('tmp/if_score_v2.csv.gz', float_format='%.8f', index=False, compression='gzip')