In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import gc
from datetime import datetime

In [22]:
start = datetime.now()
#VALIDATE = False
VALIDATE = True
RANDOM_STATE = 50
VALID_SIZE = 0.90
MAX_ROUNDS = 1000
EARLY_STOP = 50
OPT_ROUNDS =  650
skiprows = range(1, 109903891)
nrows = 75000000
output_filename = '../submission/submission.csv'

path = '../input/'

In [3]:
dtypes = {
    'ip'              : 'uint32',
    'app'             : 'uint16',
    'device'          : 'uint16',
    'os'              : 'uint16',
    'channel'         : 'uint16',
    'is_attributed'   : 'uint8',
    'click_id'        : 'uint32'
}

In [4]:
train_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']

In [23]:
train_df = pd.read_csv(path + "train.csv", skiprows=skiprows, nrows=nrows, dtype=dtypes, usecols=train_cols)

In [6]:
train_df.head()

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed
0,143414,3,1,19,280,2017-11-08 12:33:34,0
1,173096,12,1,17,178,2017-11-08 12:33:34,0
2,8210,3,1,42,280,2017-11-08 12:33:34,0
3,5746,3,1,19,130,2017-11-08 12:33:34,0
4,31475,26,1,22,266,2017-11-08 12:33:34,0


In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75000000 entries, 0 to 74999999
Data columns (total 7 columns):
ip               uint32
app              uint16
device           uint16
os               uint16
channel          uint16
click_time       object
is_attributed    uint8
dtypes: object(1), uint16(4), uint32(1), uint8(1)
memory usage: 1.5+ GB


In [8]:
train_df.describe()

Unnamed: 0,ip,app,device,os,channel,is_attributed
count,75000000.0,75000000.0,75000000.0,75000000.0,75000000.0,75000000.0
mean,94582.79,11.5676,13.00055,21.22303,266.4763,0.002355027
std,79299.51,13.215,210.0925,49.27564,131.0557,0.04847144
min,1.0,0.0,0.0,0.0,0.0,0.0
25%,39958.0,3.0,1.0,13.0,137.0,0.0
50%,78702.0,11.0,1.0,18.0,245.0,0.0
75%,115951.0,15.0,1.0,19.0,379.0,0.0
max,364778.0,768.0,4227.0,956.0,500.0,1.0


In [9]:
len_train = len(train_df)

In [10]:
len_train

75000000

In [11]:
gc.collect()

7

In [12]:
most_freq_hours_in_test_data = [4, 5, 9, 10, 13, 14]

In [13]:
least_freq_hours_in_test_data = [6, 11, 15]

In [24]:
def prep_data(df):
    
    df['hour'] = pd.to_datetime(df.click_time).dt.hour.astype('uint8')
    df['day']  =pd.to_datetime(df.click_time).dt.day.astype('uint8')
    df.drop(['click_time'], axis=1, inplace=True)
    gc.collect()
    
    df['in_test_hh'] = (
                          3
                        - 2*df['hour'].isin(most_freq_hours_in_test_data)
                        - 1*df['hour'].isin(least_freq_hours_in_test_data)
                        ).astype('uint8')
    
    gp = df[['ip', 'day', 'in_test_hh', 'channel']].groupby(by=['ip', 'day', 'in_test_hh'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'nip_day_test_hh'})
    df = df.merge(gp, on=['ip', 'day', 'in_test_hh'],  how='left')
    df.drop(['in_test_hh'], axis=1, inplace=True)
    df['nip_day_test_hh'] = df['nip_day_test_hh'].astype('uint32')
    del gp
    gc.collect()
    
    gp = df[['ip', 'day', 'hour', 'channel']].groupby(by=['ip', 'day', 'hour'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'nip_day_hh'})
    df = df.merge(gp, on=['ip', 'day', 'hour'], how='left')
    df['nip_day_hh'] = df['nip_day_hh'].astype('uint16')
    del gp
    gc.collect()
    
    gp = df[['ip', 'os', 'hour', 'channel']].groupby(by=['ip', 'os', 'hour'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'nip_hh_os'})
    df = df.merge(gp, on=['ip', 'os', 'hour'], how='left')
    df['nip_hh_os'] = df['nip_hh_os'].astype('uint16')
    del gp
    gc.collect()
    
    gp = df[['ip', 'app', 'hour', 'channel']].groupby(by=['ip', 'app', 'hour'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'nip_hh_app'})
    df = df.merge(gp, on=['ip', 'app', 'hour'], how='left')
    df['nip_hh_app'] = df['nip_hh_app'].astype('uint16')
    del gp
    gc.collect()
    
    gp  =df[['ip', 'device', 'hour', 'channel']].groupby(by=['ip', 'device', 'hour'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'nip_hh_dev'})
    df = df.merge(gp, on=['ip', 'device', 'hour'], how='left')
    df['nip_hh_dev'] = df['nip_hh_dev'].astype('uint32')
    del gp
    gc.collect()
    
    df.drop(['ip', 'day'], axis=1, inplace=True)
    gc.collect()
    return df

In [25]:
train_df = prep_data(train_df)
gc.collect()

53

In [16]:
train_df.head()

Unnamed: 0,app,device,os,channel,is_attributed,hour,nip_day_test_hh,nip_day_hh,nip_hh_os,nip_hh_app,nip_hh_dev
0,3,1,19,280,0,12,193,43,29,21,93
1,12,1,17,178,0,12,18,18,4,3,18
2,3,1,42,280,0,12,456,42,4,26,135
3,3,1,19,130,0,12,236,19,21,6,32
4,26,1,22,266,0,12,65,65,11,3,65


In [17]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 75000000 entries, 0 to 74999999
Data columns (total 11 columns):
app                uint16
device             uint16
os                 uint16
channel            uint16
is_attributed      uint8
hour               uint8
nip_day_test_hh    uint32
nip_day_hh         uint16
nip_hh_os          uint16
nip_hh_app         uint16
nip_hh_dev         uint32
dtypes: uint16(7), uint32(2), uint8(2)
memory usage: 2.2 GB


In [18]:
train_df.describe()

Unnamed: 0,app,device,os,channel,is_attributed,hour,nip_day_test_hh,nip_day_hh,nip_hh_os,nip_hh_app,nip_hh_dev
count,75000000.0,75000000.0,75000000.0,75000000.0,75000000.0,75000000.0,75000000.0,75000000.0,75000000.0,75000000.0,75000000.0
mean,11.5676,13.00055,21.22303,266.4763,0.002355027,10.03923,4691.08,1096.161,191.1268,174.7109,1250.369
std,13.215,210.0925,49.27564,131.0557,0.04847144,5.869191,16119.98,3833.781,952.4891,786.8014,5170.955
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
25%,3.0,1.0,13.0,137.0,0.0,5.0,264.0,64.0,11.0,6.0,65.0
50%,11.0,1.0,18.0,245.0,0.0,11.0,655.0,146.0,28.0,16.0,150.0
75%,15.0,1.0,19.0,379.0,0.0,14.0,1917.0,388.0,72.0,53.0,406.0
max,768.0,4227.0,956.0,500.0,1.0,23.0,174383.0,43958.0,16031.0,12338.0,64083.0


In [19]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.1,
    'num_leaves': 9,
    'max_depth': 5,
    'min_child_samples': 100,
    'max_bin': 100,
    'subsample': 0.9,
    'subsample_freq': 1,
    'colsample_bytree': 0.7,
    'min_child_weight': 0,
    'min_split_gain': 0,
    'nthread': 8,
    'verbose': 0,
    'scale_pos_weight': 99.7,
}

In [20]:
target = 'is_attributed'
predictors = ['app', 'device', 'os', 'channel', 'hour', 'nip_day_test_hh', 'nip_day_hh', 'nip_hh_os', 'nip_hh_app', 'nip_hh_dev']
categorical = ['app', 'device', 'os', 'channel', 'hour']

In [26]:
if VALIDATE:
    
    train_df, val_df = train_test_split(train_df, test_size=VALID_SIZE, random_state=RANDOM_STATE, shuffle=True)
    
    dtrain = lgb.Dataset(train_df[predictors].values,
                         label=train_df[target].values,
                         feature_name=predictors,
                         categorical_feature=categorical)
    
    del train_df
    gc.collect()
    
    dvalid = lgb.Dataset(val_df[predictors].values,
                         label=val_df[target].values,
                         feature_name=predictors,
                         categorical_feature=categorical)
    
    del val_df
    gc.collect()
    
    evals_results = {}
    
    model = lgb.train(params
                     ,dtrain
                     ,valid_sets=[dtrain, dvalid]
                     ,valid_names=['train', 'valid']
                     ,evals_result=evals_results
                     ,num_boost_round=MAX_ROUNDS
                     ,early_stopping_rounds=EARLY_STOP
                     ,verbose_eval=50
                     ,feval=None)
    
    del dvalid

else:
    
    gc.collect()
    dtrain = lgb.Dataset(train_df[predictors].values
                        ,label=train_df[target].values
                        ,feature_name=predictors
                        ,categorical_feature=categorical)
    
    del train_df
    gc.collect()
    
    evals_results = {}
    
    model = lgb.train(params
                     ,dtrain
                     ,valid_sets=[dtrain]
                     ,valid_names=['train']
                     ,evals_result=evals_results
                     ,num_boost_round=OPT_ROUNDS
                     ,verbose_eval=50
                     ,feval=None)
    
del dtrain
gc.collect()



Training until validation scores don't improve for 50 rounds.
[50]	train's auc: 0.972803	valid's auc: 0.968848
[100]	train's auc: 0.978679	valid's auc: 0.971129
[150]	train's auc: 0.981197	valid's auc: 0.971634
[200]	train's auc: 0.982914	valid's auc: 0.971791
[250]	train's auc: 0.984409	valid's auc: 0.971845
[300]	train's auc: 0.985611	valid's auc: 0.97191
[350]	train's auc: 0.986583	valid's auc: 0.971931
Early stopping, best iteration is:
[333]	train's auc: 0.986259	valid's auc: 0.971964


32

In [27]:
test_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'click_id']
test_df = pd.read_csv(path + "test.csv", dtype=dtypes, usecols=test_cols)
test_df = prep_data(test_df)
gc.collect()

53

In [28]:
sub = pd.DataFrame()
sub['click_id'] = test_df['click_id']
sub['is_attributed'] = model.predict(test_df[predictors])
sub.to_csv(output_filename, index=False, float_format='%.9f')

In [30]:
print('=='*35)
print('=='*10 + 'Final Report' + '=='*10)
print('=='*35)
print(datetime.now(), '\n')
print('{:^17} : {:}'.format('train time', datetime.now()-start))
print('{:^17} : {:}'.format('output file', output_filename))
print('{:^17} : {: .5f}'.format('train auc', model.best_score['train']['auc']))
if VALIDATE:
    print('{:^17} : {:.5f}\n'.format('valid auc', model.best_score['valid']['auc']))
    print('{:^17} : {:}\n{:^17} : {}\n{:^17} : {}'.format('VALIDATE', VALIDATE, 'VALID_SIZE', VALID_SIZE, 'RANDOM_STATE', RANDOM_STATE))
print('{:^17} : {:}\n{:^17} : {}\n{:^17} : {}\n'.format('MAX_ROUNDS', MAX_ROUNDS, 'EARLY_STOP', EARLY_STOP, 'OPT_ROUNDS', model.best_iteration))
print('{:^17} : {:}\n{:^17} : {}\n'.format('skiprows', skiprows, 'nrows', nrows))
print('{:^17} : {:}\n{:^17} : {}\n'.format('variables', predictors, 'categorical', categorical))
print('{:^17} : {:}\n'.format('model params',params))
print('=='*35)

2018-04-16 01:02:01.177624 

   train time     : 1:36:45.989075
   output file    : ../submission/submission.csv
    train auc     :  0.98626
    valid auc     : 0.97196

    VALIDATE      : True
   VALID_SIZE     : 0.9
  RANDOM_STATE    : 50
   MAX_ROUNDS     : 1000
   EARLY_STOP     : 50
   OPT_ROUNDS     : 333

    skiprows      : range(1, 109903891)
      nrows       : 75000000

    variables     : ['app', 'device', 'os', 'channel', 'hour', 'nip_day_test_hh', 'nip_day_hh', 'nip_hh_os', 'nip_hh_app', 'nip_hh_dev']
   categorical    : ['app', 'device', 'os', 'channel', 'hour']

  model params    : {'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 'learning_rate': 0.1, 'num_leaves': 9, 'max_depth': 5, 'min_child_samples': 100, 'max_bin': 100, 'subsample': 0.9, 'subsample_freq': 1, 'colsample_bytree': 0.7, 'min_child_weight': 0, 'min_split_gain': 0, 'nthread': 8, 'verbose': 0, 'scale_pos_weight': 99.7, 'categorical_column': [0, 1, 2, 3, 4]}

