In [66]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
import progressbar

### dataProcess

In [37]:
from scipy.sparse import coo_matrix
import pickle as pkl

In [29]:
# Read the feature-index file and tally how many features belong to each field.
FIELD_SIZES = [0] * 26
with open('../data/featindex.txt') as fin:
    for raw_line in fin:
        parts = raw_line.strip().split(':')
        if len(parts) <= 1:
            # Lines without a ':' separator carry no field id; ignore them.
            continue
        # The leading token is the field id, shifted down by one to index FIELD_SIZES.
        # NOTE(review): a field id of 0 maps to index -1, i.e. the LAST slot of
        # FIELD_SIZES (Python negative indexing) - confirm this is intended.
        FIELD_SIZES[int(parts[0]) - 1] += 1

# Offset of a field = combined size of all fields that precede it.
FIELD_OFFSETS = []
_running_total = 0
for _field_size in FIELD_SIZES:
    FIELD_OFFSETS.append(_running_total)
    _running_total += _field_size
INPUT_DIM = sum(FIELD_SIZES)
OUTPUT_DIM = 1

print('field sizes:', FIELD_SIZES)
print('INPUT_DIM:', INPUT_DIM)

field sizes: [25, 445852, 36, 371, 4, 11328, 33995, 12, 7, 5, 4, 20, 2, 38, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8]
INPUT_DIM: 491713


In [30]:
# Read libsvm-formatted data into a sparse matrix.
# Example line:
# 0 5:1 9:1 140858:1 445908:1 446177:1 446293:1 449140:1 490778:1 491626:1 491634:1 491641:1 491645:1 491648:1 491668:1 491700:1 491708:1
def read_data(file_name):
    """Parse a libsvm-style text file into a sparse feature matrix and labels.

    Each non-blank line is ``<label> <index>:<value> <index>:<value> ...``;
    the label, indices and values are all parsed with ``int``.

    Args:
        file_name: path of the text file to read.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the CSR matrix produced by
        ``libsvm_2_coo(...).tocsr()`` with shape ``(n_rows, INPUT_DIM)`` and
        ``y`` is a 1-D numpy array holding the labels.

    Raises:
        ValueError: if a label, index or value is not an integer literal.
        IndexError: if a feature token has no ``:`` separator.
    """
    X = []
    D = []
    y = []
    with open(file_name) as fin:
        for line in fin:
            fields = line.split()
            if not fields:
                # Blank / whitespace-only line (e.g. a trailing newline at EOF):
                # skip it instead of crashing on fields[0].
                continue
            y_i = int(fields[0])
            # Split every "index:value" token once and reuse both halves.
            pairs = [token.split(':') for token in fields[1:]]
            X_i = [int(pair[0]) for pair in pairs]
            D_i = [int(pair[1]) for pair in pairs]
            y.append(y_i)
            X.append(X_i)
            D.append(D_i)
    y = np.reshape(np.array(y), [-1])
    X = libsvm_2_coo(zip(X, D), (len(X), INPUT_DIM)).tocsr()
    return X, y

In [31]:
# Utility: convert libsvm-style rows into a COO sparse matrix.
def libsvm_2_coo(libsvm_data, shape):
    """Build a ``coo_matrix`` of the given ``shape`` from per-row (columns, values) pairs.

    Args:
        libsvm_data: iterable of ``(column_indices, values)`` pairs, one pair
            per matrix row, in row order.
        shape: ``(n_rows, n_cols)`` passed to ``coo_matrix``.

    Returns:
        The ``scipy.sparse.coo_matrix`` holding every listed entry.
    """
    row_idx, col_idx, entries = [], [], []
    for row_no, (cols, vals) in enumerate(libsvm_data):
        # Every column listed for this row shares the same row number.
        row_idx += [row_no] * len(cols)
        col_idx += cols
        entries += vals
    coordinates = (np.array(row_idx), np.array(col_idx))
    return coo_matrix((np.array(entries), coordinates), shape=shape)

In [68]:
# Convert CSR matrices into the (indices, values, shape) input form.
def csr_2_input(csr_mat):
    """Turn a sparse matrix (or a list of them) into ``(indices, values, shape)``.

    Args:
        csr_mat: a sparse matrix exposing ``tocoo()``, ``data`` and ``shape``,
            or a list of such matrices (converted element by element).

    Returns:
        For a single matrix, the tuple ``(indices, values, shape)`` where
        ``indices`` is an ``(nnz, 2)`` array of ``[row, col]`` pairs taken from
        the COO view, ``values`` is ``csr_mat.data`` and ``shape`` is
        ``csr_mat.shape``. For a list, a list of such tuples.
    """
    if isinstance(csr_mat, list):
        return [csr_2_input(single) for single in csr_mat]
    as_coo = csr_mat.tocoo()
    # Stack rows over cols, then transpose to get one [row, col] pair per entry.
    coords = np.vstack((as_coo.row, as_coo.col)).transpose()
    return coords, csr_mat.data, csr_mat.shape

In [47]:
# Locations of the libsvm-formatted train / test splits.
train_file, test_file = '../data/train.txt', '../data/test.txt'
input_dim = INPUT_DIM
# Parse both splits (train first, then test); each becomes the tuple returned by read_data.
train_data, test_data = (read_data(split_path) for split_path in (train_file, test_file))
# train_data = shuffle(train_data)
print('read finish')
print('train data size:', train_data[0].shape)
print('test data size:', test_data[0].shape)

read finish
train data size: (1742104, 491713)
test data size: (300928, 491713)


In [95]:
# Load the train / test splits from their pickle caches.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
# The `with` blocks make sure both file handles are closed after loading.
with open('../data/train.pkl', 'rb') as train_pkl:
    train_data = pkl.load(train_pkl)
#train_data = shuffle(train_data)
with open('../data/test.pkl', 'rb') as test_pkl:
    test_data = pkl.load(test_pkl)
# To (re)create the caches from freshly parsed data:
# pkl.dump(train_data, open('../data/train.pkl', 'wb'))
# pkl.dump(test_data, open('../data/test.pkl', 'wb'))
print('read finish')
print('train data size:', train_data[0].shape)
print('test data size:', test_data[0].shape)

read finish
train data size: (1742104, 491713)
test data size: (300928, 491713)


In [48]:
# Row counts of the training / test sets, and the number of feature fields.
train_size, test_size = (split[0].shape[0] for split in (train_data, test_data))
num_feas = len(FIELD_SIZES)

#### 工具函数

In [97]:
# Initialise TensorFlow variables from a declarative spec.
# init_vars = [(var_name, var_shape, init_method, dtype), ...]
STDDEV = 1e-3
MINVAL = -1e-3
MAXVAL = 1e-3
def init_var_map(init_vars, init_path=None):
    """Create one ``tf.Variable`` per entry of ``init_vars``.

    Args:
        init_vars: iterable of ``(var_name, var_shape, init_method, dtype)``.
            ``init_method`` is one of ``'zero'``, ``'one'``, ``'normal'``,
            ``'tnormal'``, ``'uniform'``, ``'xavier'``, a number (constant
            fill), or a key of the variable map loaded from ``init_path``.
        init_path: optional path of a pickled ``{name: array}`` map used to
            initialise variables whose ``init_method`` names one of its keys.

    Returns:
        Dict mapping ``var_name`` to the created ``tf.Variable``. Entries whose
        ``init_method`` is unknown, or whose stored array has a mismatching
        shape, are reported with a ``BadParam`` message and left out.
    """
    # Always define the map so an unrecognised init_method falls through to
    # the 'BadParam' branch instead of raising NameError when init_path is None.
    load_var_map = {}
    if init_path is not None:
        # NOTE: unpickling can execute arbitrary code - only load trusted files.
        with open(init_path, 'rb') as fin:
            load_var_map = pkl.load(fin)
        print('load variable map from', init_path, load_var_map.keys())
    var_map = {}
    for var_name, var_shape, init_method, dtype in init_vars:
        if init_method == 'zero':
            var_map[var_name] = tf.Variable(tf.zeros(var_shape, dtype=dtype), name=var_name, dtype=dtype)
        elif init_method == 'one':
            var_map[var_name] = tf.Variable(tf.ones(var_shape, dtype=dtype), name=var_name, dtype=dtype)
        elif init_method == 'normal':
            var_map[var_name] = tf.Variable(tf.random_normal(var_shape, mean=0.0, stddev=STDDEV, dtype=dtype),
                                            name=var_name, dtype=dtype)
        elif init_method == 'tnormal':
            var_map[var_name] = tf.Variable(tf.truncated_normal(var_shape, mean=0.0, stddev=STDDEV, dtype=dtype),
                                            name=var_name, dtype=dtype)
        elif init_method == 'uniform':
            var_map[var_name] = tf.Variable(tf.random_uniform(var_shape, minval=MINVAL, maxval=MAXVAL, dtype=dtype),
                                            name=var_name, dtype=dtype)
        elif init_method == 'xavier':
            # Uniform range derived from the sum of the variable's dimensions.
            maxval = np.sqrt(6. / np.sum(var_shape))
            minval = -maxval
            var_map[var_name] = tf.Variable(tf.random_uniform(var_shape, minval=minval, maxval=maxval, dtype=dtype),
                                            name=var_name, dtype=dtype)
        elif isinstance(init_method, (int, float)):
            # Numeric init_method: fill the variable with that constant.
            var_map[var_name] = tf.Variable(tf.ones(var_shape, dtype=dtype) * init_method, name=var_name, dtype=dtype)
        elif init_method in load_var_map:
            # Initialise from a previously dumped array, but only if shapes agree.
            if load_var_map[init_method].shape == tuple(var_shape):
                var_map[var_name] = tf.Variable(load_var_map[init_method], name=var_name, dtype=dtype)
            else:
                print('BadParam: init method', init_method, 'shape', var_shape, load_var_map[init_method].shape)
        else:
            print('BadParam: init method', init_method)
    return var_map

In [53]:
# Select an optimizer by name.
def get_optimizer(opt_algo, learning_rate, loss):
    """Return the training op that minimises ``loss`` with the chosen optimizer.

    Args:
        opt_algo: optimizer name: ``'adadelta'`` (the historical misspelling
            ``'adaldeta'`` is still accepted), ``'adagrad'``, ``'adam'``,
            ``'ftrl'``, ``'gd'``, ``'padagrad'``, ``'pgd'`` or ``'rmsprop'``.
            Any other value falls back to plain gradient descent.
        learning_rate: learning rate handed to the optimizer constructor.
        loss: the tensor to minimise.

    Returns:
        The result of ``<Optimizer>(learning_rate).minimize(loss)``.
    """
    # Name -> attribute of tf.train; looked up lazily so only the selected
    # optimizer class is touched.
    optimizer_names = {
        'adadelta': 'AdadeltaOptimizer',
        # Historical misspelling, kept so existing configurations keep working.
        'adaldeta': 'AdadeltaOptimizer',
        'adagrad': 'AdagradOptimizer',
        'adam': 'AdamOptimizer',
        'ftrl': 'FtrlOptimizer',
        'gd': 'GradientDescentOptimizer',
        'padagrad': 'ProximalAdagradOptimizer',
        'pgd': 'ProximalGradientDescentOptimizer',
        'rmsprop': 'RMSPropOptimizer',
    }
    class_name = optimizer_names.get(opt_algo, 'GradientDescentOptimizer')
    optimizer_cls = getattr(tf.train, class_name)
    return optimizer_cls(learning_rate).minimize(loss)

In [64]:
# Cut a batch out of the data set.
def slice(csr_data, start=0, size=-1):
    """Return rows ``[start, start + size)`` of a data set as model input plus labels.

    Args:
        csr_data: pair ``(features, labels)``; ``features`` is either one
            row-sliceable matrix or a list of them (all sliced the same way).
        start: index of the first row to take.
        size: number of rows to take; ``-1`` (or a window running past the
            last row) takes everything from ``start`` to the end.

    Returns:
        ``(csr_2_input(sliced_features), sliced_labels)``.
    """
    features, labels = csr_data[0], csr_data[1]
    multi_input = isinstance(features, list)
    n_rows = features[0].shape[0] if multi_input else features.shape[0]

    # An open-ended stop (None) is the same as slicing with `[start:]`.
    if size == -1 or start + size >= n_rows:
        stop = None
    else:
        stop = start + size

    if multi_input:
        slc_data = [part[start:stop] for part in features]
    else:
        slc_data = features[start:stop]
    slc_labels = labels[start:stop]
    return csr_2_input(slc_data), slc_labels

#### 基类模型

In [50]:
import tensorflow as tf

In [73]:
# Base model definition.
# Module-level float type; FMModel passes it to its placeholders and variable specs.
dtype = tf.float32
class Model:
    """Minimal base class holding a session, its input placeholders and variables.

    Subclasses are expected to fill in the attributes below while building
    their graph.
    """

    def __init__(self):
        self.sess = None             # session used by run()
        self.X = None                # input placeholder, or a list of placeholders
        self.y = None                # label placeholder
        self.layer_keeps = None      # placeholder fed with keep probabilities (optional)
        self.vars = None             # dict: variable name -> variable
        self.keep_prob_train = None  # value fed to layer_keeps when mode == 'train'
        self.keep_prob_test = None   # value fed to layer_keeps when mode == 'test'

    # run model
    def run(self, fetches, X=None, y=None, mode='train'):
        """Evaluate ``fetches`` in the model's session.

        Args:
            fetches: whatever ``self.sess.run`` accepts as fetches.
            X: input value(s). When ``self.X`` is a list, ``X`` is a matching
                sequence fed element-wise. ``None`` feeds no input at all,
                which is what fetching plain variables needs.
            y: optional labels fed to ``self.y``.
            mode: ``'train'`` or ``'test'``; selects which keep probability is
                fed when ``self.layer_keeps`` is set.

        Returns:
            The result of ``self.sess.run(fetches, feed_dict)``.
        """
        # Data is passed in through feed_dict.
        feed_dict = {}
        if X is not None:
            if type(self.X) is list:
                for i in range(len(X)):
                    feed_dict[self.X[i]] = X[i]
            else:
                feed_dict[self.X] = X
        if y is not None:
            feed_dict[self.y] = y
        if self.layer_keeps is not None:
            if mode == 'train':
                feed_dict[self.layer_keeps] = self.keep_prob_train
            elif mode == 'test':
                feed_dict[self.layer_keeps] = self.keep_prob_test
        # Execute the requested ops through session.run.
        return self.sess.run(fetches, feed_dict)

    # Persist model parameters.
    def dump(self, model_path):
        """Pickle the current value of every variable in ``self.vars`` to ``model_path``."""
        # dict.items() instead of the Python-2-only iteritems().
        var_map = {name: self.run(var) for name, var in self.vars.items()}
        with open(model_path, 'wb') as fout:
            pkl.dump(var_map, fout)
        print('model dumped at', model_path)

#### FM Model
$\sum_{i=1}^{n-1}\sum_{j=i+1}^{n}\langle v_i,v_j\rangle x_ix_j = \frac{1}{2}\sum_{f=1}^{k}\left[\left(\sum_{i=1}^{n}v_{i,f}x_i\right)^2 - \sum_{i=1}^{n}v_{i,f}^2x_i^2\right]$

变量：
- xv2: $\left(\sum_{i=1}^{n}v_{i,f}x_i\right)^2$, $f=1,\dots,k$
- x2v2: $\sum_{i=1}^{n}v_{i,f}^2x_i^2$, $f=1,\dots,k$

In [107]:
class FMModel(Model):
    """Factorization-machine model built on its own ``tf.Graph``.

    The constructor creates the variables ``w`` (first-order weights), ``v``
    (factor matrix) and ``b`` (bias), wires up the logits, the sigmoid
    probabilities (``self.y_prob``), the loss (``self.loss``) and the training
    op (``self.optimizer``), then opens a session and initialises all
    variables.

    Args:
        input_dim: number of input features (rows of ``w`` and ``v``).
        output_dim: number of columns of ``w`` and length of ``b``.
        factor_order: number of columns of the factor matrix ``v``.
        init_path: forwarded to ``init_var_map``.
        opt_algo: optimizer name forwarded to ``get_optimizer``.
        learning_rate: learning rate forwarded to ``get_optimizer``.
        l2_w: coefficient of the ``tf.nn.l2_loss(xw)`` term in the loss.
        l2_v: coefficient of the ``tf.nn.l2_loss(xv2)`` term in the loss.
        random_seed: if not None, set as the graph-level random seed.
    """
    def __init__(self, input_dim=None, output_dim=1, factor_order=10, init_path=None, opt_algo='gd', learning_rate=1e-2,
                 l2_w=0, l2_v=0, random_seed=None):
        Model.__init__(self)
        # First-order weights w, second-order interaction factors v, bias b.
        init_vars = [('w', [input_dim, output_dim], 'xavier', dtype),
                     ('v', [input_dim, factor_order], 'xavier', dtype),
                     ('b', [output_dim], 'zero', dtype)]
        self.graph = tf.Graph()
        with self.graph.as_default():
            if random_seed is not None:
                tf.set_random_seed(random_seed)
            # Sparse input placeholder and label placeholder.
            self.X = tf.sparse_placeholder(dtype)
            self.y = tf.placeholder(dtype)
            self.vars = init_var_map(init_vars, init_path)

            w = self.vars['w']
            v = self.vars['v']
            b = self.vars['b']
            # First-order term: X . w
            xw = tf.sparse_tensor_dense_matmul(self.X, w)

            # Second-order term: (squared sum) - (sum of squares).
            # X_square has the same sparsity pattern as X with every value squared.
            X_square = tf.SparseTensor(self.X.indices, tf.square(self.X.values), tf.to_int64(tf.shape(self.X)))
            # xv2 = (X . v)^2, element-wise.
            xv2 = tf.square(tf.sparse_tensor_dense_matmul(self.X, v))
            # x2v2 = X^2 . v^2
            x2v2 = tf.sparse_tensor_dense_matmul(X_square, tf.square(v))
            # Sum the difference over axis 1 and halve it.
            p = 0.5 * tf.reshape(tf.reduce_sum(xv2 - x2v2, 1), [-1, output_dim])

            # Flatten the logits to 1-D before the sigmoid.
            logits = tf.reshape(xw + b + p, [-1])
            self.y_prob = tf.sigmoid(logits)

            # Loss: mean sigmoid cross-entropy plus the two weighted l2_loss terms.
            # NOTE(review): the l2 terms are taken on xw and xv2 (products with the
            # input), not on the parameters w and v themselves - confirm intended.
            self.loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=self.y)) + \
                        l2_w * tf.nn.l2_loss(xw) + \
                        l2_v * tf.nn.l2_loss(xv2)
            self.optimizer = get_optimizer(opt_algo, learning_rate, self.loss)

            # Open a session and initialise every variable.
            config = tf.ConfigProto()
            self.sess = tf.Session(config=config)
            tf.global_variables_initializer().run(session=self.sess)

In [110]:
# Training hyper-parameters.
min_round, num_round = 1, 20
early_stop_round = 5
batch_size = 1024

field_sizes, field_offsets = FIELD_SIZES, FIELD_OFFSETS

# FM hyper-parameters.
fm_params = dict(
    input_dim=input_dim,
    factor_order=10,
    opt_algo='gd',
    learning_rate=0.1,
    l2_w=0,
    l2_v=0,
)
print(fm_params)
fm_model = FMModel(**fm_params)

{'input_dim': 491713, 'factor_order': 10, 'opt_algo': 'gd', 'learning_rate': 0.1, 'l2_w': 0, 'l2_v': 0}


In [111]:
def _num_batches(total, size):
    """Number of batches of ``size`` rows needed to cover ``total`` rows (ceiling division)."""
    return (total + size - 1) // size


def _predict_in_batches(model, data, total, eval_batch_size=10000):
    """Score the first ``total`` rows of ``data`` in test mode, chunk by chunk."""
    preds = []
    bar = progressbar.ProgressBar()
    for j in bar(range(_num_batches(total, eval_batch_size))):
        X_i, _ = slice(data, j * eval_batch_size, eval_batch_size)
        preds.extend(model.run(model.y_prob, X_i, mode='test'))
    return preds


def train(model):
    """Train ``model`` for up to ``num_round`` rounds with AUC-based early stopping.

    Reads the module-level settings ``num_round``, ``min_round``,
    ``early_stop_round``, ``batch_size``, ``train_data``/``train_size`` and
    ``test_data``/``test_size``. ``batch_size > 0`` trains on mini-batches,
    ``batch_size == -1`` trains on the whole training set at once.

    After every round the train and test AUC are printed. Training stops early
    once the best test AUC is ``early_stop_round`` rounds old and the latest
    score is within 1e-5 of the score ``early_stop_round`` entries back.

    Args:
        model: object exposing ``optimizer``, ``loss``, ``y_prob`` and
            ``run(fetches, X, y, mode=...)``.

    Raises:
        ValueError: if ``batch_size`` is neither positive nor -1.
    """
    if batch_size <= 0 and batch_size != -1:
        # Previously this only surfaced as a NameError on `ls` after a full evaluation pass.
        raise ValueError('batch_size must be a positive integer or -1, got %r' % (batch_size,))
    print("training FM...")
    history_score = []
    for i in range(num_round):
        # The two ops fetched per step: the optimizer update and the loss.
        fetches = [model.optimizer, model.loss]
        if batch_size > 0:
            ls = []
            bar = progressbar.ProgressBar()
            print('[%d]\ttraining...' % i)
            # Ceiling division: no trailing empty batch when train_size is an
            # exact multiple of batch_size (an empty batch has no defined mean loss).
            for j in bar(range(_num_batches(train_size, batch_size))):
                X_i, y_i = slice(train_data, j * batch_size, batch_size)
                # One training step.
                _, l = model.run(fetches, X_i, y_i)
                ls.append(l)
        else:
            # batch_size == -1: a single step on the full training set.
            X_i, y_i = slice(train_data)
            _, l = model.run(fetches, X_i, y_i)
            ls = [l]
        print('[%d]\tevaluating...' % i)
        train_preds = _predict_in_batches(model, train_data, train_size)
        test_preds = _predict_in_batches(model, test_data, test_size)
        train_score = roc_auc_score(train_data[1], train_preds)
        test_score = roc_auc_score(test_data[1], test_preds)
        print('[%d]\tloss (with l2 norm):%f\ttrain-auc: %f\teval-auc: %f' % (i, np.mean(ls), train_score, test_score))
        history_score.append(test_score)
        if i > min_round and i > early_stop_round:
            if np.argmax(history_score) == i - early_stop_round and history_score[-1] - history_score[
                        -1 * early_stop_round] < 1e-5:
                print('early stop\nbest iteration:\n[%d]\teval-auc: %f' % (
                    np.argmax(history_score), np.max(history_score)))
                break

# Run the training loop on the FM model; train() reads its settings
# (num_round, batch_size, ...) from the module-level variables.
train(fm_model)

  0% |                                                                        |

training FM...
[0]	training...


100% |########################################################################|
 10% |#######                                                                 |

[0]	evaluating...


100% |########################################################################|
100% |########################################################################|
  0% |                                                                        |

[0]	loss (with l2 norm):0.013319	train-auc: 0.611436	eval-auc: 0.626339
[1]	training...


100% |########################################################################|
 10% |#######                                                                 |

[1]	evaluating...


100% |########################################################################|
100% |########################################################################|
  0% |                                                                        |

[1]	loss (with l2 norm):0.006374	train-auc: 0.625287	eval-auc: 0.648224
[2]	training...


100% |########################################################################|
  9% |######                                                                  |

[2]	evaluating...


100% |########################################################################|
100% |########################################################################|
  0% |                                                                        |

[2]	loss (with l2 norm):0.006258	train-auc: 0.637162	eval-auc: 0.661508
[3]	training...


100% |########################################################################|
 10% |#######                                                                 |

[3]	evaluating...


100% |########################################################################|
100% |########################################################################|
  0% |                                                                        |

[3]	loss (with l2 norm):0.006218	train-auc: 0.647016	eval-auc: 0.672154
[4]	training...


100% |########################################################################|
  8% |#####                                                                   |

[4]	evaluating...


100% |########################################################################|
100% |########################################################################|
  0% |                                                                        |

[4]	loss (with l2 norm):0.006194	train-auc: 0.654740	eval-auc: 0.680905
[5]	training...


100% |########################################################################|
  8% |#####                                                                   |

[5]	evaluating...


100% |########################################################################|
100% |########################################################################|
  0% |                                                                        |

[5]	loss (with l2 norm):0.006176	train-auc: 0.660925	eval-auc: 0.687084
[6]	training...


100% |########################################################################|
 10% |#######                                                                 |

[6]	evaluating...


100% |########################################################################|
100% |########################################################################|
  0% |                                                                        |

[6]	loss (with l2 norm):0.006161	train-auc: 0.665848	eval-auc: 0.692500
[7]	training...


100% |########################################################################|
 11% |########                                                                |

[7]	evaluating...


100% |########################################################################|
100% |########################################################################|
  0% |                                                                        |

[7]	loss (with l2 norm):0.006149	train-auc: 0.669692	eval-auc: 0.696592
[8]	training...


100% |########################################################################|
 11% |########                                                                |

[8]	evaluating...


100% |########################################################################|
100% |########################################################################|
  0% |                                                                        |

[8]	loss (with l2 norm):0.006138	train-auc: 0.672975	eval-auc: 0.699832
[9]	training...


100% |########################################################################|
 10% |#######                                                                 |

[9]	evaluating...


100% |########################################################################|
100% |########################################################################|
  0% |                                                                        |

[9]	loss (with l2 norm):0.006128	train-auc: 0.675740	eval-auc: 0.702804
[10]	training...


100% |########################################################################|
 10% |#######                                                                 |

[10]	evaluating...


100% |########################################################################|
100% |########################################################################|
  0% |                                                                        |

[10]	loss (with l2 norm):0.006119	train-auc: 0.678206	eval-auc: 0.705429
[11]	training...


100% |########################################################################|
 10% |#######                                                                 |

[11]	evaluating...


100% |########################################################################|
100% |########################################################################|
  0% |                                                                        |

[11]	loss (with l2 norm):0.006111	train-auc: 0.680468	eval-auc: 0.707831
[12]	training...


100% |########################################################################|
 10% |#######                                                                 |

[12]	evaluating...


100% |########################################################################|
100% |########################################################################|
  0% |                                                                        |

[12]	loss (with l2 norm):0.006103	train-auc: 0.682616	eval-auc: 0.710113
[13]	training...


100% |########################################################################|
 10% |#######                                                                 |

[13]	evaluating...


100% |########################################################################|
100% |########################################################################|
  0% |                                                                        |

[13]	loss (with l2 norm):0.006096	train-auc: 0.684681	eval-auc: 0.712213
[14]	training...


100% |########################################################################|
 10% |#######                                                                 |

[14]	evaluating...


100% |########################################################################|
100% |########################################################################|
  0% |                                                                        |

[14]	loss (with l2 norm):0.006090	train-auc: 0.686688	eval-auc: 0.714255
[15]	training...


100% |########################################################################|
  5% |####                                                                    |

[15]	evaluating...


100% |########################################################################|
100% |########################################################################|
  0% |                                                                        |

[15]	loss (with l2 norm):0.006083	train-auc: 0.688651	eval-auc: 0.716265
[16]	training...


100% |########################################################################|
 10% |#######                                                                 |

[16]	evaluating...


100% |########################################################################|
100% |########################################################################|
  0% |                                                                        |

[16]	loss (with l2 norm):0.006077	train-auc: 0.690582	eval-auc: 0.718276
[17]	training...


100% |########################################################################|
 10% |#######                                                                 |

[17]	evaluating...


100% |########################################################################|
100% |########################################################################|
  0% |                                                                        |

[17]	loss (with l2 norm):0.006071	train-auc: 0.692485	eval-auc: 0.720228
[18]	training...


100% |########################################################################|
 10% |#######                                                                 |

[18]	evaluating...


100% |########################################################################|
100% |########################################################################|
  0% |                                                                        |

[18]	loss (with l2 norm):0.006065	train-auc: 0.694355	eval-auc: 0.722149
[19]	training...


100% |########################################################################|
  8% |#####                                                                   |

[19]	evaluating...


100% |########################################################################|
100% |########################################################################|


[19]	loss (with l2 norm):0.006060	train-auc: 0.696178	eval-auc: 0.724021


0