# 数据说明


Data field:

- id: ad identifier
- click: 0/1 for non-click/click
- hour: format is YYMMDDHH, so 14091123 means 23:00 on Sept. 11, 2014 UTC.
- C1 -- anonymized categorical variable
- banner_pos
- site_id
- site_domain
- site_category
- app_id
- app_domain
- app_category
- device_id
- device_ip
- device_model
- device_type
- device_conn_type
- C14-C21 -- anonymized categorical variables

In [1]:
import pandas as pd
import numpy as np
from meanencoder import *

## 数据预处理

In [2]:
raw_data_path="/mnt/wc/wc/kaggle/data/avazu-ctr-prediction/"
tmp_data_path = raw_data_path + '/tmp_data/'

In [3]:
train_set = pd.read_csv(raw_data_path+"train.csv")
test_set = pd.read_csv(raw_data_path+"test.csv")

In [4]:
(train_set.shape, test_set.shape)

((40428967, 24), (4577464, 23))

In [5]:
list(zip(train_set.columns, train_set.dtypes))

[('id', dtype('float64')),
 ('click', dtype('int64')),
 ('hour', dtype('int64')),
 ('C1', dtype('int64')),
 ('banner_pos', dtype('int64')),
 ('site_id', dtype('O')),
 ('site_domain', dtype('O')),
 ('site_category', dtype('O')),
 ('app_id', dtype('O')),
 ('app_domain', dtype('O')),
 ('app_category', dtype('O')),
 ('device_id', dtype('O')),
 ('device_ip', dtype('O')),
 ('device_model', dtype('O')),
 ('device_type', dtype('int64')),
 ('device_conn_type', dtype('int64')),
 ('C14', dtype('int64')),
 ('C15', dtype('int64')),
 ('C16', dtype('int64')),
 ('C17', dtype('int64')),
 ('C18', dtype('int64')),
 ('C19', dtype('int64')),
 ('C20', dtype('int64')),
 ('C21', dtype('int64'))]

我们发现 C1的类型是int，其实应该转换为object

然后对于 banner_pos，device_type，device_conn_type 也应该转换为 object

In [6]:
list(train_set.columns)

['id',
 'click',
 'hour',
 'C1',
 'banner_pos',
 'site_id',
 'site_domain',
 'site_category',
 'app_id',
 'app_domain',
 'app_category',
 'device_id',
 'device_ip',
 'device_model',
 'device_type',
 'device_conn_type',
 'C14',
 'C15',
 'C16',
 'C17',
 'C18',
 'C19',
 'C20',
 'C21']

In [15]:
# columns = ['C1','banner_pos','device_type','device_conn_type','C14','C15','C16','C17','C18',
#                'C19','C20','C21']
columns = [
 'C1',
 'banner_pos',
 'site_id',
 'site_domain',
 'site_category',
 'app_id',
 'app_domain',
 'app_category',
 'device_id',
 'device_ip',
 'device_model',
 'device_type',
 'device_conn_type',
 'C14',
 'C15',
 'C16',
 'C17',
 'C18',
 'C19',
 'C20',
 'C21']
# for column in columns:
#     test_set[column] = test_set[column].astype('category')

In [18]:
# list(zip(test_set.columns, test_set.dtypes))
cat_sz = [(c, len(test_set[c].cat.categories)+1) for c in columns]
cat_sz
# 460w的数据量，其中每个类别的个数，发现 device_ip 和 device_id 是高基数的特征，
# 因此我们将其通过target encode进行编码，在分类问题中，其值就是一个

[('C1', 8),
 ('banner_pos', 7),
 ('site_id', 2826),
 ('site_domain', 3367),
 ('site_category', 23),
 ('app_id', 3953),
 ('app_domain', 202),
 ('app_category', 29),
 ('device_id', 291760),
 ('device_ip', 1077200),
 ('device_model', 5439),
 ('device_type', 5),
 ('device_conn_type', 5),
 ('C14', 1258),
 ('C15', 9),
 ('C16', 10),
 ('C17', 241),
 ('C18', 5),
 ('C19', 48),
 ('C20', 163),
 ('C21', 40)]

In [13]:
len(train_set['device_id'].unique()), len(train_set['device_ip'].unique())
# 根据这个数据，我们总共有4000

(2686408, 6729486)

In [16]:
train_set.groupby(by='device_id')['click'].agg({'size':'size'}).mean() 
# 平均没给device_id的个数

is deprecated and will be removed in a future version
  """Entry point for launching an IPython kernel.


size    15.049452
dtype: float64

In [17]:
train_set.groupby(by='device_ip')['click'].agg({'size':'size'}).mean() 
# 平均没给device_ip的个数

is deprecated and will be removed in a future version
  """Entry point for launching an IPython kernel.


size    6.007735
dtype: float64

In [7]:
encoder_id = MeanEncoder(['device_id'],prior_weight_func={'k':15,'f':1})
encoder_ip = MeanEncoder(['device_ip'],prior_weight_func={'k':6,'f':1})

In [10]:
pd.DataFrame(train_set.head()['device_id'])

Unnamed: 0,device_id
0,a99f214a
1,a99f214a
2,a99f214a
3,a99f214a
4,a99f214a


In [11]:
train_set_id = encoder_id.fit_transform(pd.DataFrame(train_set['device_id']),train_set['click'])

is deprecated and will be removed in a future version
  col_avg_y = X_train.groupby(by=variable, axis=0)['pred_temp'].agg({'mean': 'mean', 'beta': 'size'})
is deprecated and will be removed in a future version
  col_avg_y = X_train.groupby(by=variable, axis=0)['pred_temp'].agg({'mean': 'mean', 'beta': 'size'})


In [12]:
train_set_id.head()

Unnamed: 0,device_id,device_id_pred_0,device_id_pred_1
0,a99f214a,0.824867,0.175133
1,a99f214a,0.824867,0.175133
2,a99f214a,0.824867,0.175133
3,a99f214a,0.824867,0.175133
4,a99f214a,0.824867,0.175133


In [15]:
train_set['device_id_pred_0'] = train_set_id['device_id_pred_0']

In [17]:
del train_set_id

In [18]:
train_set_ip = encoder_ip.fit_transform(pd.DataFrame(train_set['device_ip']),train_set['click'])

is deprecated and will be removed in a future version
  col_avg_y = X_train.groupby(by=variable, axis=0)['pred_temp'].agg({'mean': 'mean', 'beta': 'size'})


In [19]:
train_set_ip.head()

Unnamed: 0,device_ip,device_ip_pred_0,device_ip_pred_1
0,ddd2926e,0.79149,0.20851
1,96809ac8,0.830194,0.169806
2,b3cf8def,0.815262,0.184738
3,e8275b8f,0.825,0.175
4,9644d0bf,0.830194,0.169806


In [20]:
train_set['device_ip_pred_0'] = train_set_ip['device_ip_pred_0']

In [21]:
del train_set_ip

In [23]:
train_set.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,C14,C15,C16,C17,C18,C19,C20,C21,device_id_pred_0,device_ip_pred_0
0,1.000009e+18,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,15706,320,50,1722,0,35,-1,79,0.824867,0.79149
1,1.000017e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,15704,320,50,1722,0,35,100084,79,0.824867,0.830194
2,1.000037e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,15704,320,50,1722,0,35,100084,79,0.824867,0.815262
3,1.000064e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,15706,320,50,1722,0,35,100084,79,0.824867,0.825
4,1.000068e+19,0,14102100,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,...,18993,320,50,2161,0,35,-1,157,0.824867,0.830194


In [28]:
# 处理到这，我需要将几个数据都存储下来
import pickle
# import cPickle as pickle
import sklearn.externals.joblib as jl

In [26]:
train_set.to_csv(tmp_data_path+"train_set.csv")

In [30]:
# jl.dump(encoder_id,tmp_data_path+"encoder_id")

In [31]:
test_id =  encoder_id.transform(pd.DataFrame(test_set['device_id']))

In [33]:
test_set['device_id_pred_0'] = test_id['device_id_pred_0']

In [34]:
del test_id

In [36]:
test_ip = encoder_ip.transform(pd.DataFrame(test_set['device_ip']))
test_set['device_ip_pred_0'] = test_ip['device_ip_pred_0']
del test_ip

In [37]:
test_set.head()

Unnamed: 0,id,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,...,C14,C15,C16,C17,C18,C19,C20,C21,device_id_pred_0,device_ip_pred_0
0,1.000017e+19,14103100,1005,0,235ba823,f6ebf28e,f028772b,ecad2386,7801e8d9,07d7df22,...,8330,320,50,761,3,175,100075,23,0.825848,0.725741
1,1.000018e+19,14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,22676,320,50,2616,0,35,100083,51,0.825848,0.821387
2,1.000055e+19,14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,22676,320,50,2616,0,35,100083,51,0.825848,0.76874
3,1.000109e+19,14103100,1005,0,85f751fd,c4e18dd6,50e219e0,51cedd4e,aefc06bd,0f2161f8,...,18648,320,50,1092,3,809,100156,61,0.825848,0.966039
4,1.000138e+19,14103100,1005,0,85f751fd,c4e18dd6,50e219e0,9c13b419,2347f47a,f95efa07,...,23160,320,50,2667,0,47,-1,221,0.825848,0.830194


In [38]:
jl.dump(test_set,tmp_data_path+"test_set.pkl")

['/mnt/wc/wc/kaggle/data/avazu-ctr-prediction//tmp_data/test_set.pkl']

# 时间字段处理
之前已经用平均数编码对 device_ip 和 device_id 进行了编码，下一步我们开始对hour字段进行处理

In [39]:
def to_datetime(time):
    return "20" + str(time) + "00"

In [40]:
from fastai.structured import *

In [49]:
# a = pd.DataFrame(train_set.head()['hour'].apply(to_datetime))
# add_datepart(a,'hour',drop=False)
# a
train_set['date'] = train_set['hour'].apply(to_datetime)

In [50]:
add_datepart(train_set,'date',drop=False)

In [51]:
jl.dump(train_set,tmp_data_path+"train_set.pkl")

['/mnt/wc/wc/kaggle/data/avazu-ctr-prediction//tmp_data/train_set.pkl']

In [52]:
train_set.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
0,1.000009e+18,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,21,1,294,False,False,False,False,False,False,1413849600
1,1.000017e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,21,1,294,False,False,False,False,False,False,1413849600
2,1.000037e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,21,1,294,False,False,False,False,False,False,1413849600
3,1.000064e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,21,1,294,False,False,False,False,False,False,1413849600
4,1.000068e+19,0,14102100,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,...,21,1,294,False,False,False,False,False,False,1413849600


In [53]:
test_set['date'] = test_set['hour'].apply(to_datetime)
add_datepart(test_set,'date',drop=False)

In [54]:
jl.dump(test_set,tmp_data_path+"test_set.pkl")

['/mnt/wc/wc/kaggle/data/avazu-ctr-prediction//tmp_data/test_set.pkl']

In [55]:
train_set.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
0,1.000009e+18,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,21,1,294,False,False,False,False,False,False,1413849600
1,1.000017e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,21,1,294,False,False,False,False,False,False,1413849600
2,1.000037e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,21,1,294,False,False,False,False,False,False,1413849600
3,1.000064e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,21,1,294,False,False,False,False,False,False,1413849600
4,1.000068e+19,0,14102100,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,...,21,1,294,False,False,False,False,False,False,1413849600


In [None]:
# columns = ['C1','banner_pos','device_type','device_conn_type','C14','C15','C16','C17','C18',
#                'C19','C20','C21']
# train_set[columns] = train_set[columns].astype(str)

In [None]:
# test_set['click'] = 0
# data = pd.concat([train_set,test_set])
# print("after contat, data.shape is ", data.shape)

## 算法
我们先来计算目前几个特征的 基数

In [61]:
columns = [
 'C1',
 'banner_pos',
 'site_id',
 'site_domain',
 'site_category',
 'app_id',
 'app_domain',
 'app_category',
#  'device_id',
#  'device_ip',
 'device_model',
 'device_type',
 'device_conn_type',
 'C14',
 'C15',
 'C16',
 'C17',
 'C18',
 'C19',
 'C20',
 'C21']
emb_szs = []
for column in columns:
#     print(column + ":" + str(len(train_set[column].unique())))
    c = len(train_set[column].unique())
    emb_szs.append((c, min(50, (c+1)//2)))

In [63]:
emb_szs_test = []
for column in columns:
#     print(column + ":" + str(len(train_set[column].unique())))
    c = len(test_set[column].unique())
    emb_szs_test.append((c, min(50, (c+1)//2)))

In [71]:
for column in columns:
    test_c_set = set(test_set[column].unique())
    train_c_set = set(train_set[column].unique())
    if not test_c_set.issubset(train_c_set):
        # 输出不包含在 train_set 中的值，发现不同的还挺多。那这个怎么解决呢？
        print(column,len(test_c_set.difference(train_c_set)))

site_id 105
site_domain 167
app_id 584
app_domain 21
device_model 52
C14 259
C17 39
C19 1
C21 2


通过上面简单的分析，我们发现测试集中的数据不完全都在训练集合中，因此采取 one-hot encode的时候，对于未出现的值，采取 ignore 的策略，即都是0

In [77]:
# 发现数据很好啊，都没有任何缺失
train_set.isna().any()

id                  False
click               False
hour                False
C1                  False
banner_pos          False
site_id             False
site_domain         False
site_category       False
app_id              False
app_domain          False
app_category        False
device_id           False
device_ip           False
device_model        False
device_type         False
device_conn_type    False
C14                 False
C15                 False
C16                 False
C17                 False
C18                 False
C19                 False
C20                 False
C21                 False
device_id_pred_0    False
device_ip_pred_0    False
date                False
Year                False
Month               False
Week                False
Day                 False
Dayofweek           False
Dayofyear           False
Is_month_end        False
Is_month_start      False
Is_quarter_end      False
Is_quarter_start    False
Is_year_end         False
Is_year_star

In [107]:
test_set.isna().any().any()

False

In [86]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

为了方便处理，此处我们将 train 和 test 一起处理

In [109]:
test_set['click'] = 0
# all_data = pd.concat([train_set,test_set])

In [110]:
list(train_set.columns)

['id',
 'click',
 'hour',
 'C1',
 'banner_pos',
 'site_id',
 'site_domain',
 'site_category',
 'app_id',
 'app_domain',
 'app_category',
 'device_id',
 'device_ip',
 'device_model',
 'device_type',
 'device_conn_type',
 'C14',
 'C15',
 'C16',
 'C17',
 'C18',
 'C19',
 'C20',
 'C21',
 'device_id_pred_0',
 'device_ip_pred_0',
 'date',
 'Year',
 'Month',
 'Week',
 'Day',
 'Dayofweek',
 'Dayofyear',
 'Is_month_end',
 'Is_month_start',
 'Is_quarter_end',
 'Is_quarter_start',
 'Is_year_end',
 'Is_year_start',
 'Elapsed']

In [111]:
# 想先来一个最简单的 逻辑回归，要做这个模型的话，我们还缺少什么呢？ 我们还要将我们的变量做one-hot编码。
# 或者我们现在也不在做one-hot编码，而是直接采用树模型来做的

In [112]:
train_set.head()['Is_month_end'].astype(int)

0    0
1    0
2    0
3    0
4    0
Name: Is_month_end, dtype: int64

In [115]:
from sklearn.ensemble import RandomForestClassifier

In [116]:
clf = RandomForestClassifier(n_estimators=10, max_depth=40, min_samples_split=100, min_samples_leaf=10, random_state=0, criterion='entropy',
                             max_features=8, verbose = 1, n_jobs=-1, bootstrap=False)

In [118]:
from sklearn.model_selection import train_test_split

In [119]:
train_set.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
0,1.000009e+18,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,21,1,294,False,False,False,False,False,False,1413849600
1,1.000017e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,21,1,294,False,False,False,False,False,False,1413849600
2,1.000037e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,21,1,294,False,False,False,False,False,False,1413849600
3,1.000064e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,21,1,294,False,False,False,False,False,False,1413849600
4,1.000068e+19,0,14102100,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,...,21,1,294,False,False,False,False,False,False,1413849600


In [None]:
train, test = train_test_split(train_set, test_size=0.1,random_state=1)