In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## 数据处理

In [3]:
# Load the sampled Criteo dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('../data/criteo_sampled_data.csv')
# Preview the first rows (rendered as the cell output because it is the last expression).
data.head()

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,1.0,1,5.0,0.0,1382.0,4.0,15.0,2.0,181.0,...,e5ba7672,f54016b9,21ddcdc9,b1252a9d,07b5194c,,3a171ecb,c5c50484,e8b83407,9727dd16
1,0,2.0,0,44.0,1.0,102.0,8.0,2.0,2.0,4.0,...,07c540c4,b04e4670,21ddcdc9,5840adea,60f6221e,,3a171ecb,43f13e8b,e8b83407,731c3655
2,0,2.0,0,1.0,14.0,767.0,89.0,4.0,2.0,245.0,...,8efede7f,3412118d,,,e587c466,ad3062eb,3a171ecb,3b183c5c,,
3,0,,893,,,4392.0,,0.0,0.0,0.0,...,1e88c74f,74ef3502,,,6b3a5ca6,,3a171ecb,9117a34a,,
4,0,3.0,-1,,0.0,2.0,0.0,3.0,0.0,0.0,...,1e88c74f,26b3c7a7,,,21c9516a,,32c7478e,b34f3128,,


In [4]:
# Split the columns by naming convention: names starting with "I" are treated as dense
# (numeric) features and names starting with "C" as sparse (categorical) features.
# Any other column (e.g. `label`) is left out of both lists.
# str.startswith is used instead of f[0] so an empty column label cannot raise IndexError.
cols = data.columns.values
dense_feats = [f for f in cols if f.startswith("I")]
sparse_feats = [f for f in cols if f.startswith("C")]

In [5]:
def process_dense_feats(data, feats):
    """Fill missing values and log-scale the dense (numeric) feature columns.

    Each selected value ``x`` is transformed as follows:

    * missing (NaN) values are first replaced by ``0.0``;
    * ``x > -1`` becomes ``log(x + 1)``;
    * ``x <= -1`` becomes the sentinel ``-1``, since ``log(x + 1)`` is undefined there.

    Args:
        data: DataFrame containing at least the columns listed in ``feats``.
            It is not modified.
        feats: List of numeric column names to transform.

    Returns:
        A new float DataFrame holding only the ``feats`` columns, with the same
        row index as ``data``.
    """
    # Selecting with a list already yields a new frame, so the input needs no explicit copy.
    dense = data[feats].fillna(0.0)
    in_domain = dense > -1
    # Swap out-of-domain entries for 0 before taking the log so np.log never receives a
    # non-positive argument (which would emit warnings and produce NaN / -inf).
    logged = np.log(dense.where(in_domain, 0.0) + 1)
    # Put the -1 sentinel back on the entries that were outside the log's domain.
    return logged.where(in_domain, -1.0)

In [6]:
# Fill and log-transform the dense ("I*") columns selected above.
data_dense = process_dense_feats(data, dense_feats)

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
def process_sparse_feats(data, feats):
    """Integer-encode the sparse (categorical) feature columns.

    Missing values are replaced by the string "-1" so that they form a category of
    their own; every column is then encoded independently with its own ``LabelEncoder``.

    Args:
        data: DataFrame containing at least the columns listed in ``feats``.
            It is not modified.
        feats: List of categorical column names to encode.

    Returns:
        A new DataFrame holding only the ``feats`` columns, each replaced by the
        integer codes produced by its encoder.
    """
    encoded = data[feats].fillna("-1")
    for column in feats:
        # A fresh encoder per column: codes are only meaningful within their own column.
        encoded[column] = LabelEncoder().fit_transform(encoded[column])
    return encoded

In [9]:
# Integer-encode the sparse ("C*") columns selected above.
data_sparse = process_sparse_feats(data, sparse_feats)

In [10]:
# Column-wise concatenation (axis=1): dense features first, then the encoded sparse features.
# Both frames are derived from `data`, so they share its row index and line up row-for-row.
X = pd.concat([data_dense, data_sparse], axis=1)
X.head()

Unnamed: 0,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0.693147,0.693147,1.791759,0.0,7.23201,1.609438,2.772589,1.098612,5.204007,0.693147,...,9,3439,213,3,4954,0,3,24768,52,14364
1,1.098612,0.0,3.806662,0.693147,4.634729,2.197225,1.098612,1.098612,1.609438,0.693147,...,0,2465,213,1,60664,0,3,8432,52,10835
2,1.098612,0.0,0.693147,2.70805,6.64379,4.49981,1.609438,1.098612,5.505332,0.693147,...,6,738,0,0,143786,9,3,7344,0,0
3,0.0,6.795706,0.0,0.0,8.387768,0.0,0.0,0.0,0.0,0.0,...,1,1648,0,0,67107,0,3,18107,0,0
4,1.386294,-1.0,0.0,0.0,1.098612,0.0,1.386294,0.0,0.0,0.693147,...,1,556,0,0,21257,0,2,22439,0,0


In [11]:
# Select the target by name rather than by position (`iloc[:, 0]`), so a change in the
# CSV's column order cannot silently turn a feature column into the label.
y = data["label"]
y

0         0
1         0
2         0
3         0
4         0
         ..
599995    1
599996    0
599997    0
599998    0
599999    0
Name: label, Length: 600000, dtype: int64

In [12]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,classification_report,confusion_matrix

In [13]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split reproducible.
train_X5, test_X5, train_y5, test_y5 = train_test_split(X, y, test_size=0.2, random_state=121)

# `silent` has been removed from the scikit-learn wrapper and `verbose` is not a documented
# constructor argument, so recent XGBoost releases only warn that both are unused.
# `verbosity=0` is the supported way to keep training quiet.
# NOTE(review): max_depth=30 is much deeper than typical boosted trees — confirm it is intentional.
xgb = XGBClassifier(
    verbosity=0,
    subsample=0.9,
    objective='binary:logistic',
    n_estimators=400,
    min_child_weight=0.5,
    max_depth=30,
    learning_rate=0.02,
    gamma=0.25,
    colsample_bytree=0.7,
    colsample_bylevel=0.7,
    booster='gbtree',
)
xgb.fit(train_X5, train_y5)

# Hard class predictions, used for the classification report and confusion matrix below.
y_pred5 = xgb.predict(test_X5)
print(y_pred5)

# predict_proba returns one column per class; column 1 is the positive-class probability,
# which is the score AUC is computed from.
y_pred_pro5 = xgb.predict_proba(test_X5)
auc5 = roc_auc_score(test_y5, y_pred_pro5[:, 1])
print("auc", auc5)
print(classification_report(test_y5, y_pred5))
print(confusion_matrix(test_y5, y_pred5))

[0 0 0 ... 0 0 0]
auc 0.7686525986662804
              precision    recall  f1-score   support

           0       0.79      0.95      0.86     89339
           1       0.67      0.26      0.38     30661

    accuracy                           0.78    120000
   macro avg       0.73      0.61      0.62    120000
weighted avg       0.76      0.78      0.74    120000

[[85288  4051]
 [22579  8082]]


In [None]:
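# Judging by AUC, a boosting algorithm such as XGBoost outperforms the deep-learning
# models on this task (the deep-learning results are not shown in this notebook).
# A likely reason: a CTR model is essentially about discovering more (combined) features,
# and a tree structure is, by construction, a combination of features.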