In [1]:
from __future__ import print_function
from itertools import count
from collections import defaultdict
from scipy.sparse import csr
import numpy as np

In [2]:
def vectorize_dic(dic, ix = None, p=None, flag='nondebug'):
    if(ix == None):
        d = count(0)
        # 自增字典，每新增一个key，value加1
        ix = defaultdict(lambda:next(d))
    # 每个属性的取值个数最大是多少
    n = len(list(dic.values())[0])
    print('n: ', n)
    # 属性个数
    g = len(list(dic.keys()))
    print('g: ', g)
    # 属性和取值乘积
    nz = n*g
    if flag=='debug':
        print('nz: ', nz)
    col_ix = np.empty(nz, dtype = int)
    if flag=='debug':
        print('col_ix.shape ', col_ix.shape)
    i = 0
    for k, lis in dic.items():
        # ix存放每个属性取值对应的下标，这里是下标放到col_ix数组中，由于一个人有g个属性，所以用col_ix[i::g]的方式
        # 例如age属性，第一个用户col_ix下标是0，第二个用户是3，第三个用户是6，第四个是9
        # sex属性，第一个用户col_ix下标是1，第二个用户是4，第三个用户是7，第四个是10
        col_ix[i::g] = [ix[str(k)+str(el)] for el in lis]
        i += 1
    # row_ix标示了col_ix中那些属性是同一个人的
    row_ix = np.repeat(np.arange(n), g)
    data = np.ones(nz)
    if flag=='debug':
        print('row_ix.shape ', row_ix.shape)
        # 暂时不知道是干什么的
        print('data.shape ', data.shape)
        print('col_ix', col_ix)
        print('row_ix', row_ix)
        print('ix', ix)
        print('len(ix)', len(ix))
    if(p == None):
        p = len(ix)
    ixx = np.where(col_ix < p)
    if flag=='debug':
        print('p', p)
        print('ixx ', ixx)
        print('data[ixx]', data[ixx])
    return csr.csr_matrix(
        (data[ixx], (row_ix[ixx], col_ix[ixx])), shape=(n, p)), ix

In [3]:
dic = {'age':[3,2,3,4],'sex':['boy','girl','boy','boy'],'name':['a','b','c','d']}
ans,ix = vectorize_dic(dic)
ans_dense = ans.todense()
print(ans)
print(ans_dense)

n:  4
g:  3
  (0, 0)	1.0
  (0, 3)	1.0
  (0, 5)	1.0
  (1, 1)	1.0
  (1, 4)	1.0
  (1, 6)	1.0
  (2, 0)	1.0
  (2, 3)	1.0
  (2, 7)	1.0
  (3, 2)	1.0
  (3, 3)	1.0
  (3, 8)	1.0
[[1. 0. 0. 1. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 1. 0. 1. 0. 0.]
 [1. 0. 0. 1. 0. 0. 0. 1. 0.]
 [0. 0. 1. 1. 0. 0. 0. 0. 1.]]


In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
cols = ['user','item','rating','timestamp']
train = pd.read_csv('./ua.base', delimiter='\t', names=cols)
test = pd.read_csv('./ua.test', delimiter='\t', names=cols)
x_train, ix = vectorize_dic({'users':train.user.values,'items':train.item.values})
x_test, ix = vectorize_dic({'users':test.user.values, 'items':test.item.values}, ix, x_train.shape[1])
y_train = train.rating.values
y_test = test.rating.values
# x_train这种数据组织的形式：如
#(0，0)->1.0，(0, 943)->1.0，是指第0条样本user的取值是第0个特征，item的取值是第943个特征
#(1, 0)->1.0，(1, 944)->1.0，是指第0条样本user的取值是第0个特征，item的取值是第944个特征

n:  90570
g:  2
n:  9430
g:  2


In [5]:
print(x_train[:5])

x_train = x_train.todense()
x_test = x_test.todense()

print(x_train.shape)
print(x_test.shape)
print(x_train[:5])

  (0, 0)	1.0
  (0, 943)	1.0
  (1, 0)	1.0
  (1, 944)	1.0
  (2, 0)	1.0
  (2, 945)	1.0
  (3, 0)	1.0
  (3, 946)	1.0
  (4, 0)	1.0
  (4, 947)	1.0
(90570, 2623)
(9430, 2623)
[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [6]:
import tensorflow as tf

n, p = x_train.shape
# number of latent factors
k = 10
# design features of users
X = tf.placeholder('float', shape=[None, p])
# target vector
Y = tf.placeholder('float', shape=[None, 1])
# bias and weight
w0 = tf.Variable(tf.zeros([1]))
W = tf.Variable(tf.zeros([p]))
# matrix factorization factors, randomly initialized
V = tf.Variable(tf.random_normal([p, k], stddev=0.01))
# estimation of y, initialized to 0
Y_hat = tf.Variable(tf.zeros([n, 1]))

In [8]:
# linear part
linear_terms = tf.add(w0, tf.reduce_sum(tf.multiply(W, X), 1, keep_dims=True))
print(W.shape, X.shape)
# fm部分
pair_interactions = tf.multiply(0.5, 
tf.reduce_sum(tf.pow(tf.matmul(X,V), 2) - 
tf.matmul(tf.pow(X, 2), tf.pow(V, 2)),keepdims=True))
print(pair_interactions.shape)
Y_hat = linear_terms + pair_interactions

(2623,) (?, 2623)
(1, 1)


In [9]:
lambda_w = tf.constant(0.001, name='lambda_w')
lambda_v = tf.constant(0.001, name='lambda_v')
print('W',W.shape, 'V', V.shape)
l2_norm = tf.reduce_sum(tf.multiply(lambda_w, tf.pow(W,2)))+tf.reduce_sum(tf.multiply(lambda_v, tf.pow(V,2)))
print('l2_norm', l2_norm.shape)
error = tf.reduce_mean(tf.square(tf.subtract(Y, Y_hat)))
print('Y', Y.shape, 'Y_hat', Y_hat.shape, 'error', error.shape)
loss =  tf.add(error, l2_norm)
print('loss', loss.shape)

W (2623,) V (2623, 10)
l2_norm ()
Y (?, 1) Y_hat (?, 1) error ()
loss ()


In [10]:
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(loss)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [11]:
def batcher(X, Y=None, batch_size=-1):
    n_samples = X.shape[0]
    
    if(batch_size == -1):
        batch_size = n_samples
    if(batch_size < 1):
        raise ValueError('Parameter batch_size={} is unsupported'.format(batch_size))
    for i in range(0, n_samples, batch_size):
        upper_bound = min(i+batch_size, n_samples)
        ret_x = X[i:upper_bound]
        ret_y = Y[i:upper_bound]
        yield(ret_x, ret_y)

In [12]:
from tqdm import tqdm_notebook as tqdm
epoches = 2
batch_size = 100

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    for epoch in tqdm(range(epoches), unit='epoch'):
        perm = np.random.permutation(x_train.shape[0])
        for bx, by in batcher(x_train[perm], y_train[perm], batch_size):
            sess.run(optimizer, feed_dict={
                X:bx.reshape(-1, p),
                Y:by.reshape(-1, 1)
            })
    errors = []
    for bx, by in batcher(x_test, y_test):
        errors.append(sess.run(error, feed_dict={
            X:bx.reshape(-1, p),
            Y:by.reshape(-1, 1)
        }))
    RMSE = np.sqrt(np.array(errors).mean())
    print(RMSE)

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))


1.1053425
