## Description:
这里分别使用调包的方式和亲自造轮子的方式，在 Kaggle 的 Criteo 数据集上进行实战。关于这个数据集的下载和介绍，可以参见 GBDT+LR 的那一节。

In [2]:
# 导入包
import numpy as np
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

from pyfm import pylibfm

## 数据导入与简单处理

In [3]:
# Load the train and test CSV files
path = 'criteo/'
df_train = pd.read_csv(path + 'train.csv')
df_test = pd.read_csv(path + 'test.csv')

# Basic preprocessing:
# drop the Id column from both frames, stack train and test, fill missing values
for frame in (df_train, df_test):
    frame.drop(columns=['Id'], inplace=True)

# Give the test frame a placeholder Label column (-1) before stacking
df_test['Label'] = -1

# Missing values in every column are replaced by -1
data = pd.concat([df_train, df_test]).fillna(-1)

In [3]:
# Display the combined frame for a quick visual check
data

Unnamed: 0,Label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,1,1.0,0,1.0,-1.0,227.0,1.0,173.0,18.0,50.0,...,3486227d,e88ffc9d,c393dc22,b1252a9d,57c90cd9,-1,bcdee96c,4d19a3eb,cb079c2d,456c12a0
1,1,4.0,1,1.0,2.0,27.0,2.0,4.0,2.0,2.0,...,07c540c4,92555263,-1,-1,242bb710,-1,3a171ecb,72c78f11,-1,-1
2,1,0.0,806,-1.0,-1.0,1752.0,142.0,2.0,0.0,50.0,...,07c540c4,25c88e42,21ddcdc9,b1252a9d,a0136dd2,-1,32c7478e,8fc66e78,001f3601,f37f3967
3,0,2.0,-1,42.0,14.0,302.0,38.0,25.0,38.0,90.0,...,e5ba7672,5aed7436,21ddcdc9,b1252a9d,c3abeb21,-1,423fab69,1793a828,e8b83407,5cef228f
4,1,0.0,57,2.0,1.0,2891.0,2.0,35.0,1.0,137.0,...,e5ba7672,642f2610,1d1eb838,b1252a9d,1640d50b,ad3062eb,423fab69,45ab94c8,2bf691b1,c84c4aec
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,-1,1.0,0,1.0,-1.0,149.0,5.0,1.0,0.0,0.0,...,d4bb7bd8,5aed7436,d16737e3,a458ea53,edc49a33,-1,93bad2c0,3fdb382b,e8b83407,80dd0a5b
396,-1,-1.0,-1,-1.0,-1.0,-1.0,-1.0,0.0,0.0,6.0,...,2005abd1,5162930e,-1,-1,12965bb8,-1,32c7478e,71292dbb,-1,-1
397,-1,0.0,300,4.0,-1.0,4622.0,25.0,20.0,6.0,55.0,...,8efede7f,a78bd508,21ddcdc9,5840adea,c2a93b37,-1,3a171ecb,1793a828,e8b83407,2fede552
398,-1,1.0,1,2.0,1.0,5.0,1.0,1.0,1.0,1.0,...,d4bb7bd8,a1d0cc4f,c68db44a,a458ea53,3b1ae854,-1,32c7478e,57e2c6c9,1575c75f,7132fed8


In [4]:
"""下面把特征列分开处理"""
# Column names of the 13 continuous features (I1..I13)
# and the 26 categorical features (C1..C26)
continuous_fea = ['I{}'.format(n) for n in range(1, 14)]
category_fea = ['C{}'.format(n) for n in range(1, 27)]

In [5]:
# Label-encode the categorical features
lab = LabelEncoder()
for col in category_fea:
    # Cast every value to str first so each column is encoded from a uniform
    # string representation, then replace the column with its integer codes.
    data[col] = lab.fit_transform(data[col].astype('str'))

In [6]:
# Split the combined frame back into train and test by row position:
# the first rows are the training rows, the remainder are the test rows.
train_rows = df_train.shape[0]
df_train = data[:train_rows]
# The test part does not keep the Label column
df_test = data[train_rows:].drop(columns='Label')

In [7]:
# Build the datasets: features and label array for training, features only for test
x_test = df_test
y_train = df_train['Label'].values
x_train = df_train.drop('Label', axis=1)

In [8]:
# Min-max scaling: the scaler is fitted on the training features only,
# and the same fitted scaler is then applied to both train and test.
scaler = MinMaxScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [8]:
# Convert the format: each row becomes a dict mapping column position -> value
x_train = [dict(enumerate(row)) for row in x_train]
x_test = [dict(enumerate(row)) for row in x_test]

In [9]:
# Hold out 10% of the training rows as a validation split; random_state is fixed so the split is repeatable
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=2020)

In [10]:
# The per-row dicts must be converted before the pyfm package can use them.
# The vectorizer is fitted on the training split only and reused for the others.
v = DictVectorizer()
x_tr = v.fit_transform(x_tr)
x_val, x_test = (v.transform(part) for part in (x_val, x_test))

In [11]:
# Build the model: a pylibfm factorization machine set up for the
# classification task, with num_factors=200 and num_iter=100.
fm = pylibfm.FM(num_factors=200, num_iter=100, verbose=True, task='classification', initial_learning_rate=0.001, learning_rate_schedule='optimal')

In [12]:
# Train on the training split only; x_val / y_val are not passed to fit
fm.fit(x_tr, y_tr)

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training log loss: 0.53971
-- Epoch 2
Training log loss: 0.52317
-- Epoch 3
Training log loss: 0.51469
-- Epoch 4
Training log loss: 0.50653
-- Epoch 5
Training log loss: 0.50321
-- Epoch 6
Training log loss: 0.49856
-- Epoch 7
Training log loss: 0.49682
-- Epoch 8
Training log loss: 0.49321
-- Epoch 9
Training log loss: 0.49154
-- Epoch 10
Training log loss: 0.48990
-- Epoch 11
Training log loss: 0.48766
-- Epoch 12
Training log loss: 0.48649
-- Epoch 13
Training log loss: 0.48378
-- Epoch 14
Training log loss: 0.48180
-- Epoch 15
Training log loss: 0.47986
-- Epoch 16
Training log loss: 0.47894
-- Epoch 17
Training log loss: 0.47656
-- Epoch 18
Training log loss: 0.47654
-- Epoch 19
Training log loss: 0.47287
-- Epoch 20
Training log loss: 0.47443
-- Epoch 21
Training log loss: 0.47296
-- Epoch 22
Training log loss: 0.47111
-- Epoch 23
Training log loss: 0.46953
-- Epoch 24
Training log loss: 0.469

In [13]:
# Predictions on the training split
train_pre = fm.predict(x_tr)

In [14]:
# Log loss of the training-split predictions
log_loss(y_tr, train_pre)

0.4247870594982894

In [15]:
# Predict on the held-out validation split and compute its log loss
val_pre = fm.predict(x_val)
log_loss(y_val, val_pre)

0.4677241466075124

## 造轮子版

In [17]:
# 导入包
from tensorflow.keras import *
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras.callbacks import *
import tensorflow.keras.backend as K

from tqdm import tqdm

In [24]:
# Log-transform the dense features and label-encode the sparse features
def process_feat(data, dense_feats, sparse_feats):
    """Return a preprocessed copy of the dense and sparse feature columns.

    Args:
        data: DataFrame holding at least the columns named in ``dense_feats``
            and ``sparse_feats``. It is copied first and not modified.
        dense_feats: names of the numeric columns. Missing values become 0.0,
            then each value x becomes log(1 + x) when x > -1 and -1 otherwise.
        sparse_feats: names of the categorical columns. Missing values become
            the string '-1', then each column is label-encoded on its own.

    Returns:
        A new DataFrame with the transformed dense columns followed by the
        encoded sparse columns.
    """
    def _log_or_sentinel(value):
        # The logarithm is only taken for values above -1; anything else maps to -1.
        if value > -1:
            return np.log(1 + value)
        return -1

    frame = data.copy()

    # dense
    dense_part = frame[dense_feats].fillna(0.0)
    for name in tqdm(dense_feats):
        dense_part[name] = dense_part[name].apply(_log_or_sentinel)

    # sparse: a fresh encoder is fitted for every column
    sparse_part = frame[sparse_feats].fillna('-1')
    for name in tqdm(sparse_feats):
        sparse_part[name] = LabelEncoder().fit_transform(sparse_part[name])

    return pd.concat([dense_part, sparse_part], axis=1)

In [34]:
# FM feature-crossing (second-order interaction) layer
class crossLayer(layers.Layer):
    """Second-order term of a factorization machine.

    Holds one weight matrix ``kernel`` of shape ``(input_dim, output_dim)``.
    For an input ``x`` it evaluates the reformulated pairwise-interaction
    expression ``0.5 * ((x . V)^2 - (x^2 . V^2))`` and reduces it over the
    ``output_dim`` axis.

    Args:
        input_dim: number of input features (first dimension of ``kernel``).
        output_dim: second dimension of ``kernel``; defaults to 10.
        **kwargs: forwarded unchanged to the base ``Layer`` constructor.
    """

    def __init__(self, input_dim, output_dim=10, **kwargs):
        super(crossLayer, self).__init__(**kwargs)

        self.input_dim = input_dim
        self.output_dim = output_dim

        # Weight matrix of the crossing term
        self.kernel = self.add_weight(name='kernel', shape=(self.input_dim, self.output_dim), initializer='glorot_uniform', trainable=True)

    def call(self, x):
        """Return the crossing term for ``x``, reduced over axis 1 with keepdims=True."""
        # a: square of the projected input, (x . V)^2
        a = K.pow(K.dot(x, self.kernel), 2)
        # b: projection of the squared input onto the squared weights, x^2 . V^2
        b = K.dot(K.pow(x, 2), K.pow(self.kernel, 2))
        # NOTE: the reduction over the output_dim axis is a mean, not a sum,
        # so the result is scaled by 1 / output_dim relative to a plain sum.
        return 0.5 * K.mean(a-b, 1, keepdims=True)

    def get_config(self):
        """Return the layer config including the constructor arguments.

        Without this the layer cannot be rebuilt from its config
        (e.g. when a model containing it is serialized and reloaded).
        """
        config = super(crossLayer, self).get_config()
        config.update({'input_dim': self.input_dim, 'output_dim': self.output_dim})
        return config
    
# Define the FM model
def FM(feature_dim):
    """Build and compile the factorization-machine model.

    The model adds a first-order term (a single-unit Dense layer with L2
    regularisation of 0.01 on kernel and bias) to the second-order term
    produced by ``crossLayer`` and passes the sum through a sigmoid.

    Args:
        feature_dim: number of input features per sample.

    Returns:
        A compiled Keras ``Model`` using binary cross-entropy loss, the Adam
        optimizer and the ``binary_accuracy`` metric.
    """
    features = Input(shape=(feature_dim, ))

    # First-order (linear) part
    first_order_layer = Dense(units=1, kernel_regularizer=regularizers.l2(0.01), bias_regularizer=regularizers.l2(0.01))
    first_order = first_order_layer(features)

    # Second-order (feature crossing) part
    second_order = crossLayer(feature_dim)(features)

    # FM output before the activation: first-order + second-order
    logit = Add()([first_order, second_order])
    prob = Activation('sigmoid')(logit)

    fm_model = Model(inputs=features, outputs=prob)
    fm_model.compile(loss='binary_crossentropy', optimizer=optimizers.Adam(), metrics=['binary_accuracy'])
    return fm_model

In [22]:
# Load the training data
path = 'criteo/'
data = pd.read_csv(path + 'train.csv')

# Drop the Id column (only the training file is used in this part)
data.drop(columns=['Id'], inplace=True)

In [25]:
# Dense feature names start with 'I', sparse feature names start with 'C';
# Label is the target column.
cols = data.columns.values

dense_feats = [name for name in cols if name[0] == 'I']
sparse_feats = [name for name in cols if name[0] == 'C']

# Data preprocessing: transform the dense columns and encode the sparse columns
feats = process_feat(data, dense_feats, sparse_feats)

100%|█████████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 362.09it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 1086.04it/s]


In [27]:
# Split into training and validation data: 20% of the rows are held out
# (x_tst / y_tst), with a fixed random_state so the split is repeatable.
x_trn, x_tst, y_trn, y_tst = train_test_split(feats, data['Label'], test_size=0.2, random_state=2020)

In [35]:
# Define the model; its input width is the number of preprocessed feature columns
model = FM(feats.shape[1])

Instructions for updating:
Colocations handled automatically by placer.


In [37]:
# Train for 100 epochs with batch size 128; the held-out split is passed as validation data
model.fit(x_trn, y_trn, epochs=100, batch_size=128, validation_data=(x_tst, y_tst))

Train on 1279 samples, validate on 320 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100


Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100


Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x184748251d0>

In [38]:
# Predict on the held-out split
y_pred = model.predict(x_tst)

In [39]:
# Display the predictions.
# NOTE(review): the values shown below are all exactly 0. or 1., which suggests
# the sigmoid output is saturated — worth checking the scale of the inputs. TODO confirm.
y_pred

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],