## FM_more_feature

In [3]:

import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
import tensorflow as tf
import pandas as pd
import time
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer

### 1.处理user特征

In [2]:
# Load the user table; the file is pipe-delimited and column names are supplied explicitly.
u_user = pd.read_csv(
    'ml-100k/u.user',
    names=['u_id', 'age', 'gender', 'occupation', 'zip_code'],
    sep='|',
)


In [4]:
# Show the user table as loaded.
u_user

Unnamed: 0,u_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [11]:
# List the distinct occupation labels.
u_user['occupation'].unique()

array(['technician', 'other', 'writer', 'executive', 'administrator',
       'student', 'lawyer', 'educator', 'scientist', 'entertainment',
       'programmer', 'librarian', 'homemaker', 'artist', 'engineer',
       'marketing', 'none', 'healthcare', 'retired', 'salesman', 'doctor'],
      dtype=object)

In [5]:
# OrdinalEncoder maps every distinct category of each column to an integer code
# (0, 1, 2, ...), with an independent code table per column.
# `np.int` was only an alias of the builtin `int` and was removed in NumPy 1.24,
# so the builtin is passed directly; the resulting integer dtype is the same.
encode = OrdinalEncoder(dtype=int).fit_transform(u_user[['gender', 'occupation', 'zip_code']])

In [6]:
# Inspect the integer codes produced by the encoder.
encode

array([[  1,  19, 622],
       [  0,  13, 689],
       [  1,  20, 270],
       ...,
       [  1,  18, 743],
       [  0,  10, 566],
       [  1,  18, 561]])

In [12]:
# Stitch the untouched numeric columns (u_id, age) together with the encoded
# categorical columns and rebuild u_user from the combined matrix.
u_user = pd.DataFrame(
    np.hstack([u_user[['u_id', 'age']].to_numpy(), encode]),
    columns=['u_id', 'age', 'gender', 'occupation', 'zip_code'],
)

In [14]:
# Preview the first rows of the re-assembled user table.
u_user.head()

Unnamed: 0,u_id,age,gender,occupation,zip_code
0,1,24,1,19,622
1,2,53,0,13,689
2,3,23,1,20,270
3,4,24,1,19,331
4,5,33,0,13,133


### 2.处理item特征

In [15]:

# Pipe-separated column names for the movie table, in file order.
names = '''
m_id | m_title | release_date | video_release_date |
              IMDb_URL | unknown | Action | Adventure | Animation |
              Children | Comedy | Crime | Documentary | Drama | Fantasy |
              Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi |
              Thriller | War | Western
              '''
# Split on the delimiter and trim the whitespace/newlines around every name.
names = list(map(str.strip, names.split('|')))

In [33]:
# Read the pipe-delimited, ISO-8859-1 encoded movie table using the parsed column names.
u_item = pd.read_csv(
    'ml-100k/u.item',
    names=names,
    sep='|',
    encoding='iso-8859-1',
)
u_item

Unnamed: 0,m_id,m_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Discard the columns that are not used as model features.
u_item.drop(columns=['m_title', 'video_release_date', 'IMDb_URL'], inplace=True)

In [25]:
# Count how many movies share each release-date string.
u_item['release_date'].value_counts()

01-Jan-1995    215
01-Jan-1994    213
01-Jan-1993    126
01-Jan-1997     98
01-Jan-1992     37
              ... 
25-Sep-1996      1
01-Jan-1930      1
17-Jul-1996      1
4-Feb-1971       1
19-Oct-1996      1
Name: release_date, Length: 240, dtype: int64

In [35]:
# Fill missing release dates with the value from the preceding row.
# The result is assigned back to the column: `fillna(method=...)` is deprecated
# in recent pandas, and `inplace=True` on a selected column is a chained
# assignment that does not reliably update the parent frame (copy-on-write).
u_item['release_date'] = u_item['release_date'].ffill()

In [27]:
# Count how many movies share each release-date string.
u_item['release_date'].value_counts()

01-Jan-1995    215
01-Jan-1994    213
01-Jan-1993    126
01-Jan-1997     98
01-Jan-1992     37
              ... 
25-Sep-1996      1
01-Jan-1930      1
17-Jul-1996      1
4-Feb-1971       1
19-Oct-1996      1
Name: release_date, Length: 240, dtype: int64

In [36]:
def get_year(x):
    """Return the year held in the last four characters of ``x`` as an int.

    Parameters
    ----------
    x : str
        Date string whose final four characters are the year,
        e.g. ``'01-Jan-1995'``.

    Returns
    -------
    int
        The trailing four characters converted with ``int``.
    """
    return int(x[-4:])

# Derive an integer release_year column from the release-date strings.
u_item['release_year'] = u_item['release_date'].map(get_year)

In [30]:
# Show the item table.
u_item

Unnamed: 0,m_id,release_date,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1995,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1995,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,1995,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,1995,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1995,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,1998,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,1998,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,1998,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,1994,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# Re-base the year so that the earliest release year in the table becomes 0.
u_item['release_year'] = u_item['release_year'].sub(u_item['release_year'].min())

In [38]:
# The raw date string is no longer needed once release_year has been derived.
u_item.drop(columns='release_date', inplace=True)

In [39]:
# Show the item table.
u_item

Unnamed: 0,m_id,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,release_year
0,1,0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,73
1,2,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,73
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,73
3,4,0,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,73
4,5,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,1,0,0,73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,76
1678,1679,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,76
1679,1680,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,76
1680,1681,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,72


### 3.提取训练测试数据

In [51]:
# Load the full rating log plus the ua.base / ua.test split; all three files
# are tab-separated and share the same four columns.
cols = ['user', 'item', 'rating', 'timestamp']
data, train, test = (
    pd.read_csv(path, sep='\t', names=cols)
    for path in ('ml-100k/u.data', 'ml-100k/ua.base', 'ml-100k/ua.test')
)

data

Unnamed: 0,user,item,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [52]:
# Number of distinct users / items in the full rating log (used to size the one-hot blocks).
n_user = data['user'].nunique(dropna=False)
n_item = data['item'].nunique(dropna=False)
# Regression targets: the ratings of the train and test splits as NumPy arrays.
y_train = train['rating'].to_numpy()
y_test = test['rating'].to_numpy()

In [53]:
# Preview the first rows of the training split.
train.head()


Unnamed: 0,user,item,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [54]:
# Keep only the identifier columns; the ratings were already extracted as
# y_train / y_test and the timestamp is not used as a feature.
train, test = train[['user', 'item']], test[['user', 'item']]

In [45]:
# Show the training split.
train

Unnamed: 0,user,item
0,1,1
1,1,2
2,1,3
3,1,4
4,1,5
...,...,...
90565,943,1047
90566,943,1074
90567,943,1188
90568,943,1228


In [47]:
# item_info_dict = {}
# value_col = [
#     'release_year',
#  'unknown',
#  'Action',
#  'Adventure',
#  'Animation',
#  'Children',
#  'Comedy',
#  'Crime',
#  'Documentary',
#  'Drama',
#  'Fantasy',
#  'Film-Noir',
#  'Horror',
#  'Musical',
#  'Mystery',
#  'Romance',
#  'Sci-Fi',
#  'Thriller',
#  'War',
#  'Western',
#  ]
# for i in range(len(u_item)):
#     row = u_item.iloc[i]
#     item_info_dict[row['m_id']] = row[value_col].values

In [55]:
# Extend every training row with the attributes of its user (left join on the
# user id); the join key u_id duplicates `user`, so it is dropped afterwards.
train = train.merge(u_user, how='left', left_on='user', right_on='u_id')
train.drop(columns='u_id', inplace=True)
train

Unnamed: 0,user,item,age,gender,occupation,zip_code
0,1,1,24,1,19,622
1,1,2,24,1,19,622
2,1,3,24,1,19,622
3,1,4,24,1,19,622
4,1,5,24,1,19,622
...,...,...,...,...,...,...
90565,943,1047,22,1,18,561
90566,943,1074,22,1,18,561
90567,943,1188,22,1,18,561
90568,943,1228,22,1,18,561


In [56]:
# Extend every training row with the attributes of its movie (left join on the
# movie id); the join key m_id duplicates `item`, so it is dropped afterwards.
train = train.merge(u_item, how='left', left_on='item', right_on='m_id')
train.drop(columns='m_id', inplace=True)
train

Unnamed: 0,user,item,age,gender,occupation,zip_code,unknown,Action,Adventure,Animation,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,release_year
0,1,1,24,1,19,622,0,0,0,1,...,0,0,0,0,0,0,0,0,0,73
1,1,2,24,1,19,622,0,1,1,0,...,0,0,0,0,0,0,1,0,0,73
2,1,3,24,1,19,622,0,0,0,0,...,0,0,0,0,0,0,1,0,0,73
3,1,4,24,1,19,622,0,1,0,0,...,0,0,0,0,0,0,0,0,0,73
4,1,5,24,1,19,622,0,0,0,0,...,0,0,0,0,0,0,1,0,0,73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90565,943,1047,22,1,18,561,0,0,0,0,...,0,0,0,0,0,0,0,0,0,74
90566,943,1074,22,1,18,561,0,0,0,0,...,0,0,0,0,0,0,0,0,0,72
90567,943,1188,22,1,18,561,0,1,0,0,...,0,0,0,0,0,0,0,0,1,68
90568,943,1228,22,1,18,561,0,1,0,0,...,0,0,0,0,0,0,0,0,0,73


In [57]:
# Apply the same user and item joins to the test split, dropping each join key
# once it has been used.
test = test.merge(u_user, how='left', left_on='user', right_on='u_id')
test.drop(columns='u_id', inplace=True)
test = test.merge(u_item, how='left', left_on='item', right_on='m_id')
test.drop(columns='m_id', inplace=True)

In [58]:
# One-hot encode the id columns and the categorical user attributes; every
# other column is passed through unchanged.
#  - user/item use fixed category ranges 1..n_user and 1..n_item, so the width
#    of these blocks does not depend on which ids occur in the fitted data.
#  - gender/occupation/zip_code categories are learned from the fitted data.
# `np.int` was removed in NumPy 1.24; the builtin `int` is the exact equivalent.
# NOTE(review): `sparse=False` was renamed `sparse_output` in scikit-learn 1.2
# and is kept here for the scikit-learn version this notebook targets.
ct = ColumnTransformer(
    [
        ('u_i_onehot',
         OneHotEncoder(categories=[range(1, n_user + 1), range(1, n_item + 1)],
                       sparse=False, dtype=int),
         ['user', 'item']),
        ('gender_onehot',
         OneHotEncoder(dtype=int, sparse=False),
         ['gender', 'occupation', 'zip_code']),
    ],
    remainder='passthrough',
)

In [60]:
# Fit the encoders on the training split only, then encode both splits with
# the fitted transformer.
X_train = ct.fit_transform(train)
X_test = ct.transform(test)

### 将原来的26个特征进行one-hot之后 得到3464的维度

In [62]:
# Shape of the encoded training matrix: (rows, columns).
X_train.shape

(90570, 3464)

In [63]:
# n_feature: number of columns of the encoded design matrix.
# k: number of latent factors used for V.
n_feature, k = X_train.shape[1], 10

In [64]:
# Define the trainable parameters, each initialised from a truncated normal:
#   w0 - bias term, shape [1]
#   w  - linear weights, shape [n_feature]
#   V  - factor matrix, shape [k, n_feature]
w0 = tf.Variable(initial_value=tf.truncated_normal(shape=[1]), name='w0')
w = tf.Variable(initial_value=tf.truncated_normal(shape=[n_feature]), name='w')
V = tf.Variable(initial_value=tf.truncated_normal(shape=[k, n_feature]), name='V')

In [65]:
# Input placeholders; the leading None leaves the batch size unspecified.
# X carries the feature rows, y the targets as a column vector.
X = tf.placeholder(dtype='float',shape=[None, n_feature], name="X")
y = tf.placeholder(dtype='float', shape=[None, 1], name= 'y')

### 按照公式来

$$\hat y = w_0 + \sum_{i=1}^n w_i x_i + \sum_{i=1}^{n-1} \sum_{j=i+1}^n \langle v_i, v_j \rangle x_i x_j$$

推导后，原公式的二阶交叉项部分等于：

$$\frac{1}{2} \sum_{f=1}^k \left( \left( \sum_{i=1}^n v_{i,f} x_i \right)^2 - \sum_{i=1}^n v_{i,f}^2 x_i^2 \right)$$

In [66]:
# First-order part: w0 + sum_i w_i * x_i, kept as a column ([batch, 1]) via keepdims.
linear_terms = tf.add(w0, tf.reduce_sum(tf.multiply(X, w),axis=1,keepdims=True))

In [67]:
# Second-order part, using the factorised form
#   1/2 * sum_f ( (sum_i v_{i,f} x_i)^2 - sum_i v_{i,f}^2 x_i^2 ).
# X @ V^T gives sum_i v_{i,f} x_i per factor f; X^2 @ (V^2)^T gives the
# sum of squared terms. The reduction over axis 1 sums across the k factors.
pair_interactions = 1/2 * tf.reduce_sum(
    tf.square(tf.matmul(X, V, transpose_b=True)) 
    - tf.matmul(tf.square(X), tf.square(V), transpose_b=True),
    axis=1, keepdims=True)

In [68]:
# Model prediction: first-order part plus pairwise-interaction part.
y_hat = linear_terms + pair_interactions

In [70]:
# Mean squared error between targets and predictions over the batch.
error = tf.reduce_mean(tf.square(y - y_hat))

In [71]:
# L2 regularisation applied to the linear weights w and the factor matrix V
# (the bias w0 is not penalised), each with its own coefficient.

lambda_w = tf.constant(0.001, name='lambda_w')
lambda_v = tf.constant(0.001, name='lambda_v')

l2_normal = lambda_w * tf.reduce_sum(tf.square(w)) + lambda_v*tf.reduce_sum(tf.square(V))
 

In [72]:
# Training objective: data-fit error plus the L2 penalty.
loss = error + l2_normal

### 4.进行训练 学习参数 W V

In [73]:
# One Adam update step that minimises the regularised loss.
train_op = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)

In [1]:
# Mini-batch generator
def batcher(X_, y_=None, batch_size=-1):
    """Yield consecutive mini-batches of ``X_`` (and ``y_`` when provided).

    Parameters
    ----------
    X_ : array-like exposing ``.shape`` and slicing
        Samples, batched along the first axis.
    y_ : array-like or None, optional
        Targets aligned with ``X_``. When omitted, the second element of every
        yielded tuple is ``None``.
    batch_size : int, optional
        Rows per batch; ``-1`` (default) means a single batch holding all
        samples. The last batch may be shorter.

    Yields
    ------
    tuple
        ``(X_batch, y_batch)`` for each consecutive slice.

    Raises
    ------
    ValueError
        If the effective batch size is smaller than 1 (raised when iteration
        starts, since this is a generator).
    """
    n_samples = X_.shape[0]

    if batch_size == -1:
        batch_size = n_samples
    if batch_size < 1:
        raise ValueError('Parameter batch_size={} is unsupported'.format(batch_size))

    for i in range(0, n_samples, batch_size):
        upper_bound = min(i + batch_size, n_samples)
        ret_x = X_[i:upper_bound]
        ret_y = y_[i:upper_bound] if y_ is not None else None
        # Yield on every iteration: previously the yield was nested under the
        # `y_ is not None` check, so unlabeled input produced no batches at all.
        yield (ret_x, ret_y)

In [2]:
# Training schedule: total number of epochs and rows per mini-batch.
epochs, batch_size = 1500, 1000

In [None]:
# Train the model. If a checkpoint exists in FM_mf_models, training resumes
# from it; otherwise it starts at epoch 1. Every 50 epochs the model is
# checkpointed, the loss summary is written, and the test split is evaluated.
loss_scalar = tf.summary.scalar('loss', loss)
merged = tf.summary.merge_all()
saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    train_writer = tf.summary.FileWriter(logdir='FM_mf_logs', graph=sess.graph)
    try:
        ckpt = tf.train.get_checkpoint_state(checkpoint_dir='FM_mf_models')
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, save_path=ckpt.model_checkpoint_path)
            # Checkpoints are saved below with global_step=epoch, so the path
            # ends in "-<epoch>". Resume right after that epoch instead of
            # jumping to a hard-coded 1001 whatever checkpoint was restored.
            epoch_start = int(ckpt.model_checkpoint_path.rsplit('-', 1)[-1]) + 1
        else:
            epoch_start = 1

        for epoch in range(epoch_start, epochs + 1):
            s_time = time.time()
            # Reshuffle the training rows at the start of every epoch.
            perm = np.random.permutation(X_train.shape[0])
            for X_batch, y_batch in batcher(X_train[perm], y_train[perm], batch_size):
                _, loss_value = sess.run(
                    [train_op, loss],
                    feed_dict={X: X_batch.reshape(-1, n_feature), y: y_batch.reshape(-1, 1)})
            # loss_value is the loss of the last mini-batch of the epoch.
            print('epoch{}_loss: {}, epoch_running_time: {}'.format(epoch, loss_value, time.time()-s_time))

            if epoch % 50 == 0:
                saver.save(sess, save_path='FM_mf_models/fm', global_step=epoch)
                # The summary is computed on the last mini-batch of the epoch.
                merged_value = sess.run(
                    merged,
                    feed_dict={X: X_batch.reshape(-1, n_feature), y: y_batch.reshape(-1, 1)})
                train_writer.add_summary(merged_value, global_step=epoch)

                # Evaluate on the held-out test split.
                error_test, y_test_pred = sess.run(
                    [error, y_hat],
                    feed_dict={X: X_test.reshape(-1, n_feature), y: y_test.reshape(-1, 1)})
                mmse = mean_squared_error(y_test, y_test_pred)
                print('loss_test: {}, mmse: {}'.format(error_test, mmse))
    finally:
        # Flush and close the event file even if training is interrupted.
        train_writer.close()

In [None]:
# Test: restore the most recent checkpoint and report the error on the test split.
with tf.Session() as sess:
    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state(checkpoint_dir='FM_mf_models')
    if not (ckpt and ckpt.model_checkpoint_path):
        # Raise instead of calling exit(1): inside a notebook exit() targets the
        # kernel rather than cleanly aborting this cell, and evaluating with
        # un-restored variables would fail further down anyway.
        raise RuntimeError('no model')
    saver.restore(sess, save_path=ckpt.model_checkpoint_path)
    test_error, y_test_hat = sess.run(
        [error, y_hat],
        feed_dict={X: X_test.reshape(-1, n_feature), y: y_test.reshape(-1, 1)})

    # y_test_hat is a column vector; take its single column to align with y_test.
    print('mmse: ', mean_squared_error(y_test, y_test_hat[:,0]))
    print('test_error', test_error)