In [1]:
import pandas as pd
import numpy as np
import gc

In [2]:
train=pd.read_csv('train.csv')

In [3]:
train.columns

Index(['id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21'],
      dtype='object')

In [4]:
test=pd.read_csv('test.csv')

In [5]:
test.columns

Index(['id', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21'],
      dtype='object')

In [6]:
train.shape

(40428967, 24)

In [7]:
test.shape

(4577464, 23)

In [8]:
train = pd.concat([train, test], ignore_index=True, sort=False)

In [9]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [10]:
train=reduce_mem_usage(train)

Mem. usage decreased to 4463.83 Mb (45.8% reduction)


In [11]:
train['time']=pd.to_datetime(train['hour'],format='%y%m%d%H')
train['day']=train['time'].dt.day
train['hour']=train['time'].dt.hour

train['user']=train['device_id'].astype(str)+ '_' + train['device_ip'].astype(str)+ '_' +train['device_model'].astype(str)
train['app']=train['app_id'].astype(str)+ '_' + train['app_domain'].astype(str)+ '_' +train['app_category'].astype(str)
train['site']=train['site_id'].astype(str)+ '_' + train['site_domain'].astype(str)+ '_' +train['site_category'].astype(str)

In [12]:
train['day'].value_counts()

22    5337126
28    5287222
31    4577464
30    4218938
21    4122995
23    3870752
26    3835892
29    3832608
25    3363122
24    3335302
27    3225010
Name: day, dtype: int64

In [13]:
train.shape

(45006431, 29)

In [14]:
#选27-31日的数据训练
train = train[(train.day==27)|(train.day==28)|(train.day==29)|(train.day==30)|(train.day==31)]

In [15]:
train.shape

(21141242, 29)

In [16]:
train['day'].value_counts()

28    5287222
31    4577464
30    4218938
29    3832608
27    3225010
Name: day, dtype: int64

In [17]:
matrix=train
del train,test
gc.collect()

5085

In [18]:
matrix['day'].value_counts()

28    5287222
31    4577464
30    4218938
29    3832608
27    3225010
Name: day, dtype: int64

In [19]:
matrix['device_ip_count']=matrix.groupby(['device_ip'])['device_ip'].transform('count')
matrix['device_id_count']=matrix.groupby(['device_id'])['device_id'].transform('count')
matrix['hourly_device_ip_count']=matrix.groupby(['device_ip','hour'])['device_ip'].transform('count')
matrix['hourly_device_id_count']=matrix.groupby(['device_id','hour'])['device_id'].transform('count')

matrix['user_count']=matrix.groupby(['user'])['user'].transform('count')
matrix['hourly_user_count']=matrix.groupby(['user','hour'])['user'].transform('count')
matrix['hourly_count']=matrix.groupby(['hour'])['hour'].transform('count')

matrix['app_count']=matrix.groupby(['app'])['app'].transform('count')
matrix['site_count']=matrix.groupby(['site'])['site'].transform('count')

matrix['hourly_app_count']=matrix.groupby(['app','hour'])['app'].transform('count')
matrix['hourly_site_count']=matrix.groupby(['site','hour'])['site'].transform('count')


matrix['C14_count']=matrix.groupby(['C14'])['C14'].transform('count')
matrix['C21_count']=matrix.groupby(['C21'])['C21'].transform('count')
matrix['C19_count']=matrix.groupby(['C19'])['C19'].transform('count')
matrix['C20_count']=matrix.groupby(['C20'])['C20'].transform('count')

In [20]:
matrix.columns

Index(['id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'time', 'day', 'user',
       'app', 'site', 'device_ip_count', 'device_id_count',
       'hourly_device_ip_count', 'hourly_device_id_count', 'user_count',
       'hourly_user_count', 'hourly_count', 'app_count', 'site_count',
       'hourly_app_count', 'hourly_site_count', 'C14_count', 'C21_count',
       'C19_count', 'C20_count'],
      dtype='object')

In [21]:
#对分类特征进行hash处理,某种程度上相当于label-encode
D=2**20
for feat in ['hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'time',  'user',
       'app', 'site']:
    
    matrix.loc[:,feat]=matrix.loc[:][feat].apply(lambda x:abs(hash((feat+'_'+str(x))))% D)

In [22]:
#测试时用

val=matrix[(matrix.day==30)] 
tra=matrix[(matrix.day==27)|(matrix.day==28)|(matrix.day==29)]
test=matrix[(matrix.day==31)]

del matrix
gc.collect()

22

In [23]:
tra.shape

(12344840, 44)

In [24]:
val.shape

(4218938, 44)

In [25]:
test.shape

(4577464, 44)

In [26]:
tra.columns

Index(['id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'time', 'day', 'user',
       'app', 'site', 'device_ip_count', 'device_id_count',
       'hourly_device_ip_count', 'hourly_device_id_count', 'user_count',
       'hourly_user_count', 'hourly_count', 'app_count', 'site_count',
       'hourly_app_count', 'hourly_site_count', 'C14_count', 'C21_count',
       'C19_count', 'C20_count'],
      dtype='object')

In [27]:
features=[ 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'time', 'day', 'user',
       'app', 'site', 'device_ip_count', 'device_id_count',
       'hourly_device_ip_count', 'hourly_device_id_count', 'user_count',
       'hourly_user_count', 'hourly_count', 'app_count', 'site_count',
       'hourly_app_count', 'hourly_site_count', 'C14_count', 'C21_count',
       'C19_count', 'C20_count']

In [28]:
#测试时用
y_val=val['click']
x_val=val[features]
y_train=tra['click']
x_train=tra[features]
x_test=test[features]

In [30]:
import lightgbm as lgb

lgb_params = {
    'n_estimators' : 2000,
    'learning_rate' : 0.02,
    'colsample_bytree' : 0.3,
    'subsample' : 0.7,
    'subsample_freq' : 2,
    'num_leaves' : 16,
    'seed' : 99
}

lgb_model = lgb.LGBMClassifier(**lgb_params)

In [31]:
lgb_model.fit(x_train, y_train, eval_set=[(x_train, y_train),(x_val, y_val)],  eval_metric='logloss',verbose=True,early_stopping_rounds=50)

[1]	training's binary_logloss: 0.440442	valid_1's binary_logloss: 0.453775
Training until validation scores don't improve for 50 rounds
[2]	training's binary_logloss: 0.439232	valid_1's binary_logloss: 0.452561
[3]	training's binary_logloss: 0.438123	valid_1's binary_logloss: 0.451397
[4]	training's binary_logloss: 0.437123	valid_1's binary_logloss: 0.450689
[5]	training's binary_logloss: 0.436066	valid_1's binary_logloss: 0.449724
[6]	training's binary_logloss: 0.435	valid_1's binary_logloss: 0.448944
[7]	training's binary_logloss: 0.434014	valid_1's binary_logloss: 0.447971
[8]	training's binary_logloss: 0.43292	valid_1's binary_logloss: 0.446842
[9]	training's binary_logloss: 0.431907	valid_1's binary_logloss: 0.445873
[10]	training's binary_logloss: 0.431034	valid_1's binary_logloss: 0.445234
[11]	training's binary_logloss: 0.43019	valid_1's binary_logloss: 0.444531
[12]	training's binary_logloss: 0.42939	valid_1's binary_logloss: 0.44381
[13]	training's binary_logloss: 0.428698	va

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.3,
               importance_type='split', learning_rate=0.02, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=2000, n_jobs=-1, num_leaves=16, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, seed=99,
               silent=True, subsample=0.7, subsample_for_bin=200000,
               subsample_freq=2)

In [32]:
#生成提交用的submission
submission=pd.read_csv('sampleSubmission.csv')
test_predict = lgb_model.predict_proba(x_test)[:, 1]

In [33]:
submission['click']=test_predict
submission.to_csv("submission1.csv", index=False)

In [34]:
submission.head()

Unnamed: 0,id,click
0,10000174058809263569,0.068204
1,10000182526920855428,0.219892
2,10000554139829213984,0.210251
3,10001094637809798845,0.042106
4,10001377041558670745,0.220065
