In [1]:
from tqdm import tqdm
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from scipy import sparse
import os
import numpy as np
import time
import random
import warnings
warnings.filterwarnings("ignore")

# 读取数据集

In [2]:
data_org_dir = 'data/data_tencent/'
data_prep_dir = 'data/preprocess/'
sample_test = False

In [3]:
if sample_test:
    df_data = pd.read_csv(data_prep_dir + 'train_feat_merge_mini.csv')
else:
    df_data = pd.read_csv(data_prep_dir + 'train_feat_merge.csv')

train_idx_lst = list(df_data[df_data['n_parts'] != 1].index)
valid_idx_lst = list(df_data[df_data['n_parts'] == 1].index)


In [4]:
df_data.columns

Index(['n_parts', 'aid', 'uid', 'label', 'LBS', 'age', 'appIdAction',
       'appIdInstall', 'carrier', 'consumptionAbility', 'ct', 'education',
       'gender', 'house', 'interest1', 'interest2', 'interest3', 'interest4',
       'interest5', 'kw1', 'kw2', 'kw3', 'marriageStatus', 'os', 'topic1',
       'topic2', 'topic3', 'advertiserId', 'campaignId', 'creativeId',
       'creativeSize', 'adCategoryId', 'productId', 'productType'],
      dtype='object')

# 特征工程

In [5]:
# 缺失情况统计
total = len(df_data)
for col_name in list(df_data.columns):
    df_missing = (df_data[df_data[col_name] == '-1'])
    
    print(col_name, ':', round(len(df_missing)/total, 2) * 100, '%')

n_parts : 0.0 %
aid : 0.0 %
uid : 0.0 %
label : 0.0 %
LBS : 0.0 %
age : 0.0 %
appIdAction : 98.0 %
appIdInstall : 98.0 %
carrier : 0.0 %
consumptionAbility : 0.0 %
ct : 0.0 %
education : 0.0 %
gender : 0.0 %
house : 0.0 %
interest1 : 9.0 %
interest2 : 34.0 %
interest3 : 97.0 %
interest4 : 98.0 %
interest5 : 25.0 %
kw1 : 10.0 %
kw2 : 3.0 %
kw3 : 95.0 %
marriageStatus : 0.0 %
os : 0.0 %
topic1 : 9.0 %
topic2 : 4.0 %
topic3 : 95.0 %
advertiserId : 0.0 %
campaignId : 0.0 %
creativeId : 0.0 %
creativeSize : 0.0 %
adCategoryId : 0.0 %
productId : 0.0 %
productType : 0.0 %


## 删除特征

1. 删除category特别大特别稀疏特征: 'appIdInstall', 'appIdAction', 'marriageStatus';
2. 删除缺失情况严重特征: 'interest3', 'interest4', 'kw3', 'topic3';


In [6]:
feat2drop = ['appIdInstall', 'appIdAction', 'marriageStatus', 'interest3', 'interest4', 'kw3', 'topic3']
df_data.drop(feat2drop, axis=1, inplace=True)


In [7]:
df_data.columns

Index(['n_parts', 'aid', 'uid', 'label', 'LBS', 'age', 'carrier',
       'consumptionAbility', 'ct', 'education', 'gender', 'house', 'interest1',
       'interest2', 'interest5', 'kw1', 'kw2', 'os', 'topic1', 'topic2',
       'advertiserId', 'campaignId', 'creativeId', 'creativeSize',
       'adCategoryId', 'productId', 'productType'],
      dtype='object')

## 定长离散特征
- 用户类: 'LBS', 'age', 'carrier', 'consumptionAbility', 'education', 'gender', 'house', 'os'
- 广告类: 'aid', 'advertiserId', 'campaignId', 'creativeId', 'creativeSize', 'adCategoryId','productId', 'productType'



In [8]:
discrete_feat_lst = ['LBS', 'age', 'carrier', 'consumptionAbility', 'education', 'gender', 'house', 'os',
                     'aid', 'advertiserId', 'campaignId', 'creativeId', 'creativeSize', 'adCategoryId',
                     'productId', 'productType']
t0 = time.time()
for discrete_feat in tqdm(discrete_feat_lst):
    enc = LabelEncoder()
    try:
        df_data[discrete_feat] = enc.fit_transform(df_data[discrete_feat].apply(int))
    except:
        df_data[discrete_feat] = enc.fit_transform(df_data[discrete_feat])


100%|██████████| 16/16 [01:01<00:00,  3.84s/it]


In [9]:
def one_hot_encoding(df_feat2enc):
    """

    :param df_feat2enc:
    :type df_feat2enc:
    :return:
    :rtype:
    """
    
    one_hot_enc.fit(df_feat2enc.values.reshape(-1, 1))
    feat_enc_arr = one_hot_enc.transform(df_feat2enc.values.reshape(-1, 1))#.toarray()

    return feat_enc_arr

# 数据量太大了, 一下子肝不完, 必须分开
discrete_feat_lst1 = ['LBS', 'age', 'carrier', 'consumptionAbility', 'education', 'gender']
discrete_feat_lst2 = ['aid', 'advertiserId', 'campaignId', 'creativeId', 'creativeSize']
discrete_feat_lst3 = ['adCategoryId', 'productId', 'productType', 'house', 'os']

discrete_feat = [discrete_feat_lst1, discrete_feat_lst2, discrete_feat_lst3]
# 初始化
one_hot_enc = OneHotEncoder()

x_train = pd.DataFrame()
x_valid = pd.DataFrame()
###### 数据量很大的时候用这个 ######
for feat_lst in discrete_feat:
    
    for feat in tqdm(feat_lst):
        # 合并训练
        df_feat2enc = df_data[feat]
        one_hot_enc.fit(df_feat2enc.values.reshape(-1, 1))
        # 训练集
        train2enc = df_data.loc[train_idx_lst][feat]
        train_enc_arr = one_hot_enc.transform(train2enc.values.reshape(-1, 1))
        x_train = sparse.hstack([x_train, train_enc_arr])
        # 验证集
        valid2enc = df_data.loc[valid_idx_lst][feat]
        valid_enc_arr = one_hot_enc.transform(valid2enc.values.reshape(-1, 1))
        x_valid = sparse.hstack([x_valid, valid_enc_arr])
    
train_f_dir = data_prep_dir + 'train_x_sparse_onehot.npz'
np.save(train_f_dir, x_train)

valid_f_dir = data_prep_dir + 'valid_x_sparse_onehot.npz'
np.save(valid_f_dir, x_valid)

print(train_f_dir, 'is saved! Array shape is', x_train.shape)
print(valid_f_dir, 'is saved! Array shape is', x_valid.shape)

###### 测试数据用这个 ######
# train_x = np.empty([len(train_idx_lst), 1], dtype=int)
# valid_x = np.empty([len(valid_idx_lst), 1], dtype=int) 
# for feat in tqdm(discrete_feat_lst):
    
#     # 训练集
#     df_feat2enc_train = df_data.loc[train_idx_lst][feat]
#     train_enc_arr = one_hot_encoding(df_feat2enc_train)
#     train_x = sparse.hstack([train_x, train_enc_arr])
#     # 验证集
#     df_feat2enc_valid = df_data.loc[valid_idx_lst][feat]
#     valid_enc_arr = one_hot_encoding(df_feat2enc_valid)
#     valid_x = sparse.hstack([valid_x, valid_enc_arr])
    
# train_f_dir = data_prep_dir + 'train_x_sparse_onehot.npz'
# sparse.save_npz(train_f_dir, train_enc_arr)

# valid_f_dir = data_prep_dir + 'valid_x_sparse_onehot.npz'
# sparse.save_npz(valid_f_dir, valid_enc_arr)

# print(train_f_dir, 'is saved! Array shape is', train_x.shape)
# print(valid_f_dir, 'is saved! Array shape is', valid_x.shape)




100%|██████████| 6/6 [00:50<00:00,  8.35s/it]
100%|██████████| 5/5 [00:46<00:00,  9.36s/it]
100%|██████████| 5/5 [00:47<00:00,  9.57s/it]


data/preprocess/train_x_sparse_onehot.npz is saved! Array shape is (7038840, 1526)
data/preprocess/valid_x_sparse_onehot.npz is saved! Array shape is (1759974, 1526)


## 序列型特征
- 用户类: 'ct', 'interest1', 'interest2', 'interest5', 'kw1', 'kw2', 'os'，'topic1', 'topic2', 

In [10]:
sequence_feat_lst = ['ct', 'interest1', 'interest2', 'interest5', 'kw1', 'kw2', 'topic1', 'topic2']

### 构建序列长度特征
len_static_features = [x+'_len' for x in sequence_feat_lst]

In [22]:
# 初始化
df_feat_len = pd.DataFrame()
for sequence_feat in tqdm(sequence_feat_lst):
    
    df_feat2len = df_data[sequence_feat].apply(lambda x: len(x.split(' ')))
    df_feat_len = pd.concat([df_feat_len, df_feat2len], axis=1)

# 构建训练集&验证集
x_train = df_feat_len.iloc[train_idx_lst].values
x_valid = df_feat_len.iloc[valid_idx_lst].values

train_f_dir = data_prep_dir + 'train_x_sparse_seq_len.npz'
np.save(train_f_dir, x_train)

valid_f_dir = data_prep_dir + 'valid_x_sparse_seq_len.npz'
np.save(valid_f_dir, x_valid)

print(train_f_dir, 'is saved! Array shape is', x_train.shape)
print(valid_f_dir, 'is saved! Array shape is', x_valid.shape)

100%|██████████| 8/8 [01:08<00:00,  8.51s/it]


data/preprocess/train_x_sparse_seq_len.npz is saved! Array shape is (7038840, 8)
data/preprocess/valid_x_sparse_seq_len.npz is saved! Array shape is (1759974, 8)


### 构建序列count特征

In [30]:
# 初始化
x_train = pd.DataFrame()
x_valid = pd.DataFrame()
cnt_enc = CountVectorizer()

for sequence_feat in tqdm(sequence_feat_lst[1:]):
    
    # 合并训练
    df_feat2cnt = df_data[sequence_feat]
    cnt_enc.fit(df_feat2cnt.values)  #.reshape(-1, 1)
    # 训练集
    train2enc = df_data.loc[train_idx_lst][sequence_feat]
    train_enc_arr = cnt_enc.transform(train2enc.values)  # .reshape(-1, 1)
    x_train = sparse.hstack([x_train, train_enc_arr])
    # 验证集
    valid2enc = df_data.loc[valid_idx_lst][sequence_feat]
    valid_enc_arr = cnt_enc.transform(valid2enc.values)
    x_valid = sparse.hstack([x_valid, valid_enc_arr])
    
train_f_dir = data_prep_dir + 'train_x_sparse_cntv.npz'
# np.save(train_f_dir, x_train)

valid_f_dir = data_prep_dir + 'valid_x_sparse_cntv.npz'
# np.save(valid_f_dir, x_valid)

print(train_f_dir, 'is saved! Array shape is', x_train.shape)
print(valid_f_dir, 'is saved! Array shape is', x_valid.shape)


100%|██████████| 7/7 [18:12<00:00, 156.09s/it]

data/preprocess/train_x_sparse_cntv.npz is saved! Array shape is (7038840, 323142)
data/preprocess/valid_x_sparse_cntv.npz is saved! Array shape is (1759974, 323142)





## 用户id类特征
- 用户类: 'uid'
另外，因为也存在量大稀疏的问题，但是比赛的问题就是针对用户对广告的点击，因此uid对模型来说还是很有必要进行建模的，但是uid不使用category的方式建模，而是使用count和转化率建模：
- uid_count : 对uid进行出现频次的建模。
- uid_pos_count : 对uid进行正样本中的出现频次的建模。
- uid_ad_features_pos_count : 对uid组合所有广告特征'ad_static_feature'进行正样本中的出现频次的建模。
- 对uid进行如此多的建模方式是为了能对uid进行更加详细的表述，因为category的每一个特征就是一对一的，而count特征是多对一的，存在大量的信息损失，因此需要进行更多不同角度的建模，才能更好地表述uid。
特征方面还有一些长尾处理，未出现id的统一映射，长度和count特征的未出现次数的取临近值等trick。

## 特征选择

In [None]:
feat_fname_train = ['']

# 模型构建

In [14]:
def model_evaluation(y_train, y_train_pred, y_test, y_test_pred, model_name=''):
    """

    :param y_train:
    :type y_train:
    :param y_train_pred:
    :type y_train_pred:
    :param y_test:
    :type y_test:
    :param y_test_pred:
    :type y_test_pred:
    :param model_name:
    :type model_name:
    :return:
    :rtype:
    """
    print("{} Train AUC: {}, logloss: {}".format(model_name, roc_auc_score(y_train, y_train_pred), log_loss(y_train, y_train_pred)))
    # print("{} Train confusion matrix: {}".format(model_name, confusion_matrix(y_train, y_train_pred)))

    print("{} Test AUC: {}, logloss: {}".format(model_name, roc_auc_score(y_test, y_test_pred), log_loss(y_test, y_test_pred)))
    # print("{} Test confusion matrix: {}".format(model_name, confusion_matrix(y_test, y_test_pred)))

In [12]:
# construct y vector
y_train = np.array(df_data[df_data['n_parts'] != 1]['label'])
y_valid = np.array(df_data[df_data['n_parts'] == 1]['label'])


## Baseline版本
- 只使用离散特征，进行onehot编码
- 线性模型：LogisticRegression

In [13]:
print(x_train.shape, x_valid.shape)
# baseline版本
turned_param = {
        'penalty': 'l2',
        'C': 10,
        'solver': 'lbfgs',
        'tol': 1e-4,
        'max_iter': 10000
    }

clf = LogisticRegression(random_state=1, **turned_param)
clf.fit(x_train, y_train)

y_train_pred = clf.predict(x_train)
y_test_pred = clf.predict(x_valid)

model_evaluation(y_train, y_train_pred, y_valid, y_test_pred, model_name='LogisticRegression')


(7038840, 1526) (1759974, 1526)


NameError: name 'model_evaluation' is not defined

In [15]:
# 基于相似度


LogisticRegression Train AUC: 0.5000014816351326, logloss: 1.6558969428673103
LogisticRegression Test AUC: 0.5, logloss: 1.6581997519647336


In [15]:
for i, j in enumerate([1,1,1]):
    print(i, j)

0 1
1 1
2 1
