In [1]:
import pandas as pd
import numpy as np
import gc
import random

In [2]:
# Load the training data.  The header row (row 0) is kept and data rows
# 1 .. rows_to_skip - 1 are skipped, so only the tail of train.csv is read.
rows_to_skip = 140000000

# Narrow integer dtypes for the columns listed here; columns not listed are
# left to pandas' own type inference.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
)

# An alternative considered earlier: random sampling with a callable
# `skiprows` that keeps each data row with probability p = 0.1.
train = pd.read_csv(
    'train.csv',
    dtype=dtypes,
    skiprows=range(1, rows_to_skip),
)
train.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44903891 entries, 0 to 44903890
Data columns (total 8 columns):
ip                 uint32
app                uint16
device             uint16
os                 uint16
channel            uint16
click_time         object
attributed_time    object
is_attributed      uint8
dtypes: object(2), uint16(4), uint32(1), uint8(1)
memory usage: 1.2+ GB


In [3]:
# Load the full test set, again with narrow integer dtypes for the id and
# categorical columns (this rebinds the `dtypes` mapping used for train).
dtypes = dict(
    click_id='uint32',
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
)

test = pd.read_csv('test.csv', dtype=dtypes)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18790469 entries, 0 to 18790468
Data columns (total 7 columns):
click_id      uint32
ip            uint32
app           uint16
device        uint16
os            uint16
channel       uint16
click_time    object
dtypes: object(1), uint16(4), uint32(2)
memory usage: 430.1+ MB


In [4]:
# Give train and test the same set of columns:
#   * click_id is set aside (it is needed again when writing the submission)
#     and removed from test,
#   * test receives an all-zero uint8 placeholder for is_attributed,
#   * attributed_time is removed from train.
click_id = test['click_id']
test = test.drop('click_id', axis=1)
test['is_attributed'] = np.zeros(len(test), dtype='uint8')
train = train.drop('attributed_time', axis=1)

In [5]:
# Concatenate train and test (train rows first, fresh 0..n-1 index) so the
# features below are generated over both sets together.
frames = [train, test]
total = pd.concat(frames, ignore_index=True)
total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63694360 entries, 0 to 63694359
Data columns (total 7 columns):
ip               uint32
app              uint16
device           uint16
os               uint16
channel          uint16
click_time       object
is_attributed    uint8
dtypes: object(1), uint16(4), uint32(1), uint8(1)
memory usage: 1.2+ GB


In [6]:
# Remember how many leading rows of `total` came from train, then release
# the two source frames and force a garbage-collection pass.
N = len(train)
del train
del test
gc.collect()

38

In [7]:
# Pull day and hour out of the click_time strings by fixed character
# position (characters 8-9 -> day, 11-12 -> hour), store them as uint8,
# and discard the original string column.
total['day'] = total['click_time'].str[8:10].astype('uint8')
total['hour'] = total['click_time'].str[11:13].astype('uint8')
total = total.drop('click_time', axis=1)
total.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,day,hour
0,32511,1,1,6,153,0,9,2
1,154267,12,1,19,328,0,9,2
2,34897,3,1,13,280,0,9,2
3,191691,14,1,20,480,0,9,2
4,56105,15,1,19,278,0,9,2


In [8]:
# Total number of clicks per ip, counted over the combined train + test frame.
# groupby(...).transform('count') yields one value per original row, already
# aligned to total's index, so the count can be assigned directly instead of
# building a lookup table and left-merging it back (which copies the frame).
total['total_click'] = (
    total.groupby('ip')['is_attributed']
         .transform('count')
         .astype('uint32')
)
total.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,day,hour,total_click
0,32511,1,1,6,153,0,9,2,2486
1,154267,12,1,19,328,0,9,2,1210
2,34897,3,1,13,280,0,9,2,5901
3,191691,14,1,20,480,0,9,2,1234
4,56105,15,1,19,278,0,9,2,3640


In [9]:
# Number of clicks per (ip, day, hour) combination, counted over the combined
# train + test frame.  transform('count') broadcasts each group's size back
# onto its rows, so no intermediate table or merge is required.
total['click_per_hour'] = (
    total.groupby(['ip', 'day', 'hour'])['is_attributed']
         .transform('count')
         .astype('uint32')
)
total.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,day,hour,total_click,click_per_hour
0,32511,1,1,6,153,0,9,2,2486,76
1,154267,12,1,19,328,0,9,2,1210,42
2,34897,3,1,13,280,0,9,2,5901,140
3,191691,14,1,20,480,0,9,2,1234,126
4,56105,15,1,19,278,0,9,2,3640,72


In [10]:
# Number of clicks per (ip, channel) pair, counted over the combined
# train + test frame.  transform('count') broadcasts each group's size back
# onto its rows, so no intermediate table or merge is required.
total['click_per_channel'] = (
    total.groupby(['ip', 'channel'])['is_attributed']
         .transform('count')
         .astype('uint32')
)
total.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,day,hour,total_click,click_per_hour,click_per_channel
0,32511,1,1,6,153,0,9,2,2486,76,70
1,154267,12,1,19,328,0,9,2,1210,42,2
2,34897,3,1,13,280,0,9,2,5901,140,254
3,191691,14,1,20,480,0,9,2,1234,126,18
4,56105,15,1,19,278,0,9,2,3640,72,11


In [11]:
# Number of clicks per (ip, app) pair, counted over the combined
# train + test frame.  transform('count') broadcasts each group's size back
# onto its rows, so no intermediate table or merge is required.
total['click_per_app'] = (
    total.groupby(['ip', 'app'])['is_attributed']
         .transform('count')
         .astype('uint32')
)
total.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,day,hour,total_click,click_per_hour,click_per_channel,click_per_app
0,32511,1,1,6,153,0,9,2,2486,76,70,66
1,154267,12,1,19,328,0,9,2,1210,42,2,131
2,34897,3,1,13,280,0,9,2,5901,140,254,901
3,191691,14,1,20,480,0,9,2,1234,126,18,68
4,56105,15,1,19,278,0,9,2,3640,72,11,250


In [12]:
# Number of clicks per (ip, device) pair, counted over the combined
# train + test frame.  transform('count') broadcasts each group's size back
# onto its rows, so no intermediate table or merge is required.
total['click_per_device'] = (
    total.groupby(['ip', 'device'])['is_attributed']
         .transform('count')
         .astype('uint32')
)
total.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,day,hour,total_click,click_per_hour,click_per_channel,click_per_app,click_per_device
0,32511,1,1,6,153,0,9,2,2486,76,70,66,2475
1,154267,12,1,19,328,0,9,2,1210,42,2,131,1203
2,34897,3,1,13,280,0,9,2,5901,140,254,901,5878
3,191691,14,1,20,480,0,9,2,1234,126,18,68,1054
4,56105,15,1,19,278,0,9,2,3640,72,11,250,3613


In [13]:
# Number of clicks per (ip, os) pair, counted over the combined
# train + test frame.  (The previous header comment said "per channel",
# but the grouping key here is os.)  transform('count') broadcasts each
# group's size back onto its rows, so no intermediate table or merge is
# required.
total['click_per_os'] = (
    total.groupby(['ip', 'os'])['is_attributed']
         .transform('count')
         .astype('uint32')
)
total.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,day,hour,total_click,click_per_hour,click_per_channel,click_per_app,click_per_device,click_per_os
0,32511,1,1,6,153,0,9,2,2486,76,70,66,2475,44
1,154267,12,1,19,328,0,9,2,1210,42,2,131,1203,451
2,34897,3,1,13,280,0,9,2,5901,140,254,901,5878,1242
3,191691,14,1,20,480,0,9,2,1234,126,18,68,1054,38
4,56105,15,1,19,278,0,9,2,3640,72,11,250,3613,708


In [14]:
# Bucket the hour of day into three levels:
#   18-21 -> low (1), 0-14 -> high (3), everything else -> medium (2).
# np.select picks the choice of the first condition that holds for each row
# and falls back to `default`; the result is cast to uint8 so the column
# keeps its compact dtype instead of becoming int64.
# NOTE: 'hour' is overwritten in place, so this cell must run exactly once --
# a second run would see the codes 1/2/3 as hours and map them all to 3.
total['hour'] = np.select(
    [total['hour'].between(18, 21), total['hour'].between(0, 14)],
    [1, 3],
    default=2,
).astype('uint8')
total.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,day,hour,total_click,click_per_hour,click_per_channel,click_per_app,click_per_device,click_per_os
0,32511,1,1,6,153,0,9,3,2486,76,70,66,2475,44
1,154267,12,1,19,328,0,9,3,1210,42,2,131,1203,451
2,34897,3,1,13,280,0,9,3,5901,140,254,901,5878,1242
3,191691,14,1,20,480,0,9,3,1234,126,18,68,1054,38
4,56105,15,1,19,278,0,9,3,3640,72,11,250,3613,708


In [15]:
# ip and day were only needed as grouping keys for the count features;
# remove them so they are not part of the model inputs.
total = total.drop(labels=['ip', 'day'], axis=1)
total.head()

Unnamed: 0,app,device,os,channel,is_attributed,hour,total_click,click_per_hour,click_per_channel,click_per_app,click_per_device,click_per_os
0,1,1,6,153,0,3,2486,76,70,66,2475,44
1,12,1,19,328,0,3,1210,42,2,131,1203,451
2,3,1,13,280,0,3,5901,140,254,901,5878,1242
3,14,1,20,480,0,3,1234,126,18,68,1054,38
4,15,1,19,278,0,3,3640,72,11,250,3613,708


In [16]:
# Undo the earlier concatenation: the first N rows are the training rows,
# everything after them is the test set.  Then release the combined frame.
train, test = total.iloc[:N], total.iloc[N:]
del total
gc.collect()

127

In [19]:
# Class balance check: print the per-column non-null counts of the positive
# rows (is_attributed == 1) first, then of the negative rows (== 0).
for label in (1, 0):
    print(train[train['is_attributed'] == label].count())

In [20]:
# Downsampling negative data
#down_neg_train = neg_train.sample(5000000, random_state = 0)
#total_train = pd.concat([down_neg_train, pos_train], axis=0)
#del train, pos_train, neg_train, down_neg_train
#gc.collect()

In [21]:
# Convert time info
#total_train['hour'] = total_train['click_time'].map(lambda x: x[11:13]).astype('uint8')
#test['hour'] = test['click_time'].map(lambda x: x[11:13]).astype('uint8')
#total_train = total_train.drop(['click_time'], axis=1)
#test = test.drop(['click_time'], axis=1)

In [22]:
#total_train.head()

In [23]:
#test.head()

In [17]:
# Convert the frames to plain numpy arrays for LightGBM: every column except
# the target becomes a feature; the target is taken from train only.
X_train = train.drop('is_attributed', axis=1).values
X_test = test.drop('is_attributed', axis=1).values
Y_train = train['is_attributed'].values

In [18]:
# The numpy arrays above are all that is used from here on; drop the frames.
del train
del test
gc.collect()

0

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import auc
import lightgbm as lgbm

# 5-fold stratified cross-validation with LightGBM.  Each fold trains on its
# training split, early-stops on the held-out split (AUC), and adds its
# test-set predictions to `result`; a later cell divides `result` by the
# number of folds to obtain the average.

# Names for the positional columns of X_train / X_test, and the subset that
# LightGBM should treat as categorical.  Defined once and shared by the
# train and validation Datasets of every fold.
feature_names = ['app', 'device', 'os', 'channel', 'hour', 'total_click',
                 'click_per_hour', 'click_per_channel', 'click_per_app',
                 'click_per_device', 'click_per_os']
categorical_features = ['app', 'device', 'os', 'channel', 'hour']

# Hyper-parameters are the same for every fold, so they are built once.
params = {
    # Task based parameter
    'application': 'binary',
    'learning_rate': 0.1,
    'num_iterations': 1000,
    'boosting': 'gbdt',

    # Deal with overfitting
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'min_data_in_leaf': 100,
    'feature_fraction': 0.8,
    'num_leaves': 11,
    'max_depth': -1,
    'max_bin': 100,

    # Others
    'metric': 'auc',
    'num_threads': 8,
    'scale_pos_weight': 200
}

sss = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
result = np.zeros(X_test.shape[0])

for fold, (train_index, cv_index) in enumerate(sss.split(X_train, Y_train)):

    XX_train, YY_train = X_train[train_index], Y_train[train_index]
    XX_cv, YY_cv = X_train[cv_index], Y_train[cv_index]
    gbm_train = lgbm.Dataset(XX_train, YY_train,
                             feature_name=feature_names,
                             categorical_feature=categorical_features)
    gbm_cv = lgbm.Dataset(XX_cv, YY_cv,
                          feature_name=feature_names,
                          categorical_feature=categorical_features)

    # Hand train() a fresh copy of the parameters each time, exactly as the
    # per-fold dict literal did before -- NOTE(review): some LightGBM
    # releases may modify the dict they receive; the copy keeps every fold
    # on identical settings either way.
    bst = lgbm.train(dict(params), gbm_train, valid_sets=[gbm_cv],
                     early_stopping_rounds=10)

    # One file per fold.  Previously every fold wrote to 'model.txt', so only
    # the last fold's booster was left on disk.
    bst.save_model('model_fold{}.txt'.format(fold),
                   num_iteration=bst.best_iteration)

    ypred = bst.predict(X_test, num_iteration=bst.best_iteration)
    result += ypred



[1]	valid_0's auc: 0.948377
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's auc: 0.952952
[3]	valid_0's auc: 0.9543
[4]	valid_0's auc: 0.955007
[5]	valid_0's auc: 0.955706
[6]	valid_0's auc: 0.955958
[7]	valid_0's auc: 0.956443
[8]	valid_0's auc: 0.956931
[9]	valid_0's auc: 0.960141
[10]	valid_0's auc: 0.960329
[11]	valid_0's auc: 0.960499
[12]	valid_0's auc: 0.960622


KeyboardInterrupt: 

In [19]:
# Turn the summed per-fold test predictions into their mean.  The divisor is
# read from the splitter so it cannot drift from the n_splits used for CV.
# NOTE: the division is in place -- run this cell exactly once, and only
# after the cross-validation loop has completed every fold.
result /= sss.get_n_splits()

In [20]:
# Assemble the submission table (click_id, is_attributed) and write it to
# disk without the index column.
submit = pd.DataFrame({'click_id': click_id}).assign(is_attributed=result)
submit.to_csv('submit.csv', index=False)