In [8]:
import argparse
import logging
import os
import sys

import numpy as np
import pandas as pd
import tensorflow as tf

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging.INFO)

from util import *
from sklearn.metrics import *

In [23]:
#One-hot encode the labels
def onehot_encoder(labels, NUM_CLASSES):
    """Return a one-hot float32 matrix for a 1-D collection of labels.

    Each label is first replaced by its index among the sorted distinct
    values present in ``labels`` (the same mapping a label encoder fitted on
    ``labels`` produces), then that index selects the hot column.

    The encoding is done with numpy only: no TensorFlow ops are added to the
    default graph and no session is opened just to build a label matrix.

    Args:
        labels: 1-D array-like of labels (list, ndarray, pandas Series, ...).
        NUM_CLASSES: number of columns of the returned matrix.

    Returns:
        ndarray of shape ``(len(labels), NUM_CLASSES)`` and dtype float32.

    Raises:
        ValueError: if ``labels`` holds more distinct values than NUM_CLASSES.

    Note:
        Because the mapping is derived from the values actually present, an
        input that is missing a class shifts the remaining classes to lower
        column indices (e.g. all-ones labels land in column 0).
    """
    flat_labels = np.asarray(labels).ravel()
    # return_inverse yields, for every element, its position in the sorted
    # array of distinct values.
    classes, codes = np.unique(flat_labels, return_inverse=True)
    if classes.size > NUM_CLASSES:
        raise ValueError(
            'labels contain %d distinct values but NUM_CLASSES is %d'
            % (classes.size, NUM_CLASSES))
    # Row i of the identity matrix is the one-hot vector for class i.
    return np.eye(NUM_CLASSES, dtype=np.float32)[codes.reshape(-1)]

In [34]:
#Feature engineering and data preparation
def _load_ratings(path, df_user, df_item):
    """Read one tab-separated rating split and join user and movie features onto it."""
    header = ['user_id', 'item_id', 'rating', 'timestamp']
    df = pd.read_csv(path, sep='\t', names=header)
    # Binary target: 1 when the rating equals 5, otherwise 0.
    df['rating'] = (df['rating'].astype(np.int64) == 5).astype(np.int64)
    df = df.merge(df_user, on='user_id', how='left')
    df = df.merge(df_item, on='item_id', how='left')
    return df


def load_dataset(data_dir='data'):
    """Build the train/test feature matrices and one-hot labels.

    Reads ``u.user``, ``u.item``, ``ua.base`` and ``ua.test`` from
    ``data_dir``. User features are ``user_id`` plus one-hot columns for
    gender, occupation and a 10-year age bucket; movie features are
    ``item_id`` plus the genre indicator columns. The label is 1 when the
    rating equals 5 and 0 otherwise, one-hot encoded into two columns by
    ``onehot_encoder``.

    Args:
        data_dir: directory holding the four data files (default ``'data'``,
            the location the notebook originally hard-coded).

    Returns:
        Tuple ``(train_features, train_labels, test_features, test_labels)``
        -- note the order: features and labels alternate.
    """
    header = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
    df_user = pd.read_csv(os.path.join(data_dir, 'u.user'), sep='|', names=header)
    header = ['item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children',
            'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
            'Thriller', 'War', 'Western']
    df_item = pd.read_csv(os.path.join(data_dir, 'u.item'), sep='|', names=header, encoding="ISO-8859-1")
    df_item = df_item.drop(columns=['title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown'])

    # Bucket ages into 10-year bins.
    age_edges = list(range(0, 101, 10))
    age_names = ['%d-%d' % (lo, hi) for lo, hi in zip(age_edges[:-1], age_edges[1:])]
    df_user['age'] = pd.cut(df_user['age'], age_edges, labels=age_names)
    # One-hot encode gender, occupation and age bucket.
    df_user = pd.get_dummies(df_user, columns=['gender', 'occupation', 'age'])
    df_user = df_user.drop(columns=['zip_code'])
    # Depending on the pandas version get_dummies yields uint8 or bool
    # columns; cast everything to int64 so `.values` below is always a plain
    # integer matrix rather than an object array.
    df_user = df_user.astype(np.int64)

    user_features = df_user.columns.values.tolist()
    movie_features = df_item.columns.values.tolist()
    cols = user_features + movie_features

    df_train = _load_ratings(os.path.join(data_dir, 'ua.base'), df_user, df_item)
    df_test = _load_ratings(os.path.join(data_dir, 'ua.test'), df_user, df_item)

    train_labels = onehot_encoder(df_train['rating'].astype(np.int32), 2)
    test_labels = onehot_encoder(df_test['rating'].astype(np.int32), 2)
    return df_train[cols].values, train_labels, df_test[cols].values, test_labels

In [68]:
# load_dataset() returns (train features, train labels, test features, test labels)
# in that order, so the names must be unpacked in the same order; otherwise
# `test` would hold the training labels and `train_label` the test features.
train, train_label, test, test_label = load_dataset()
print(train)

[[  1   0   1 ...,   0   0   0]
 [  1   0   1 ...,   1   0   0]
 [  1   0   1 ...,   1   0   0]
 ..., 
 [943   0   1 ...,   0   0   1]
 [943   0   1 ...,   0   0   0]
 [943   0   1 ...,   0   0   0]]


In [54]:
#Walk through the data-loading function step by step
# Column names for the pipe-separated user table, handed to read_csv via `names`.
header = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
df_user = pd.read_csv('data/u.user', sep='|', names=header)
print(df_user)

     user_id  age gender     occupation zip_code
0          1   24      M     technician    85711
1          2   53      F          other    94043
2          3   23      M         writer    32067
3          4   24      M     technician    43537
4          5   33      F          other    15213
5          6   42      M      executive    98101
6          7   57      M  administrator    91344
7          8   36      M  administrator    05201
8          9   29      M        student    01002
9         10   53      M         lawyer    90703
10        11   39      F          other    30329
11        12   28      F          other    06405
12        13   47      M       educator    29206
13        14   45      M      scientist    55106
14        15   49      F       educator    97301
15        16   21      M  entertainment    10309
16        17   30      M     programmer    06355
17        18   35      F          other    37212
18        19   40      M      librarian    02138
19        20   42   

In [49]:
# Column names for the pipe-separated movie table: five descriptive columns
# followed by the genre indicator columns.
header = ['item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children',
            'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
            'Thriller', 'War', 'Western']
df_item = pd.read_csv('data/u.item', sep='|', names=header, encoding = "ISO-8859-1")
#print(df_item)
df_item = df_item.drop(columns=['title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown'])# drop the columns that are not used as features
print(df_item)

      item_id  Action  Adventure  Animation  Children  Comedy  Crime  \
0           1       0          0          1         1       1      0   
1           2       1          1          0         0       0      0   
2           3       0          0          0         0       0      0   
3           4       1          0          0         0       1      0   
4           5       0          0          0         0       0      1   
5           6       0          0          0         0       0      0   
6           7       0          0          0         0       0      0   
7           8       0          0          0         1       1      0   
8           9       0          0          0         0       0      0   
9          10       0          0          0         0       0      0   
10         11       0          0          0         0       0      1   
11         12       0          0          0         0       0      1   
12         13       0          0          0         0       1   

In [55]:
#Bucket the age column into 10-year bins (the numeric age is replaced by a bin label)
df_user['age'] = pd.cut(df_user['age'], [0,10,20,30,40,50,60,70,80,90,100], labels=['0-10','10-20','20-30','30-40','40-50','50-60','60-70','70-80','80-90','90-100']) 
print(df_user)
#One-hot encode gender, occupation and the age bucket with pandas get_dummies
df_user = pd.get_dummies(df_user, columns=['gender', 'occupation', 'age'])
print(df_user)
# zip_code is not used as a feature, so drop it
df_user = df_user.drop(columns=['zip_code'])
print(df_user)

     user_id    age gender     occupation zip_code
0          1  20-30      M     technician    85711
1          2  50-60      F          other    94043
2          3  20-30      M         writer    32067
3          4  20-30      M     technician    43537
4          5  30-40      F          other    15213
5          6  40-50      M      executive    98101
6          7  50-60      M  administrator    91344
7          8  30-40      M  administrator    05201
8          9  20-30      M        student    01002
9         10  50-60      M         lawyer    90703
10        11  30-40      F          other    30329
11        12  20-30      F          other    06405
12        13  40-50      M       educator    29206
13        14  40-50      M      scientist    55106
14        15  40-50      F       educator    97301
15        16  20-30      M  entertainment    10309
16        17  20-30      M     programmer    06355
17        18  30-40      F          other    37212
18        19  30-40      M     

In [58]:
# Collect the column names of the user table and of the movie table, then
# concatenate them (user columns first) into the full feature list.
user_features = list(df_user.columns)
print("user_features",user_features)
movie_features = list(df_item.columns)
print("movie_features",movie_features)
cols = [*user_features, *movie_features]
print(cols)

user_features ['user_id', 'gender_F', 'gender_M', 'occupation_administrator', 'occupation_artist', 'occupation_doctor', 'occupation_educator', 'occupation_engineer', 'occupation_entertainment', 'occupation_executive', 'occupation_healthcare', 'occupation_homemaker', 'occupation_lawyer', 'occupation_librarian', 'occupation_marketing', 'occupation_none', 'occupation_other', 'occupation_programmer', 'occupation_retired', 'occupation_salesman', 'occupation_scientist', 'occupation_student', 'occupation_technician', 'occupation_writer', 'age_0-10', 'age_10-20', 'age_20-30', 'age_30-40', 'age_40-50', 'age_50-60', 'age_60-70', 'age_70-80', 'age_80-90', 'age_90-100']
movie_features ['item_id', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
['user_id', 'gender_F', 'gender_M', 'occupation_administrator', 'occupation_artist', 'occupation_doctor', 'o

In [64]:
# Column names for the tab-separated rating files.
header = ['user_id', 'item_id', 'rating', 'timestamp']
df_train = pd.read_csv('data/ua.base', sep='\t', names=header)
print(df_train)
#Binarize the rating (this is not a one-hot encoding): 1 when the rating equals 5, otherwise 0
df_train['rating'] = df_train.rating.apply(lambda x: 1 if int(x) == 5 else 0)
print(df_train)
#Left-join the user features and then the movie features onto the rating rows to form the training data
df_train = df_train.merge(df_user, on='user_id', how='left') 
print(df_train)
df_train = df_train.merge(df_item, on='item_id', how='left')
print(df_train)

#The test data is built the same way
df_test = pd.read_csv('data/ua.test', sep='\t', names=header)
df_test['rating'] = df_test.rating.apply(lambda x: 1 if int(x) == 5 else 0)
df_test = df_test.merge(df_user, on='user_id', how='left') 
df_test = df_test.merge(df_item, on='item_id', how='left')

       user_id  item_id  rating  timestamp
0            1        1       5  874965758
1            1        2       3  876893171
2            1        3       4  878542960
3            1        4       3  876893119
4            1        5       3  889751712
5            1        6       5  887431973
6            1        7       4  875071561
7            1        8       1  875072484
8            1        9       5  878543541
9            1       10       3  875693118
10           1       11       2  875072262
11           1       12       5  878542960
12           1       13       5  875071805
13           1       14       5  874965706
14           1       15       5  875071608
15           1       16       5  878543541
16           1       17       3  875073198
17           1       18       4  887432020
18           1       19       5  875071515
19           1       21       1  878542772
20           1       22       4  875072404
21           1       23       4  875072895
22         

In [66]:
#Build the labels: the binarized rating column is the label, one-hot encoded with onehot_encoder
train_labels = onehot_encoder(df_train['rating'].astype(np.int32), 2) # onehot_encoder is the helper defined earlier in this notebook (not a scikit-learn function); the second argument is the number of one-hot columns
print(df_train['rating'])
print(train_labels)
test_labels = onehot_encoder(df_test['rating'].astype(np.int32), 2)

0        1
1        0
2        0
3        0
4        0
5        1
6        0
7        0
8        1
9        0
10       0
11       1
12       1
13       1
14       1
15       1
16       0
17       0
18       1
19       0
20       0
21       0
22       0
23       0
24       0
25       0
26       0
27       0
28       0
29       0
        ..
90540    0
90541    0
90542    0
90543    1
90544    0
90545    0
90546    0
90547    0
90548    0
90549    0
90550    0
90551    0
90552    0
90553    0
90554    0
90555    0
90556    0
90557    0
90558    0
90559    1
90560    0
90561    1
90562    0
90563    0
90564    0
90565    0
90566    0
90567    0
90568    0
90569    0
Name: rating, Length: 90570, dtype: int64
[[ 0.  1.]
 [ 1.  0.]
 [ 1.  0.]
 ..., 
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]]


In [69]:
#With the inputs prepared, the model is built in TensorFlow. The objective has two parts, a linear part and
#a feature-interaction part; the interaction part uses the final derived (factorized) form directly.
#Inputs: placeholders stand in for the data fed at run time
def add_input(self):
    """Create the graph inputs.

    Sets ``self.X`` (features, ``[None, self.p]``), ``self.y`` (labels,
    ``[None, self.num_classes]``) and ``self.keep_prob`` (no shape given),
    all declared as 'float32' placeholders.
    """
    float_type = 'float32'
    feature_shape = [None, self.p]
    label_shape = [None, self.num_classes]
    self.X = tf.placeholder(float_type, feature_shape)
    self.y = tf.placeholder(float_type, label_shape)
    self.keep_prob = tf.placeholder(float_type)


In [70]:
#Forward pass
def inference(self):
    """Build the factorization-machine forward graph.

    Reads ``self.X``, ``self.p``, ``self.k`` and ``self.num_classes``.

    Sets:
        self.w: linear weights, shape ``[p, num_classes]``.
        self.v: factor matrix, shape ``[p, k]``.
        self.linear_terms: ``X @ w + w0``.
        self.interaction_terms: pairwise interaction term, one column per
            row of ``X`` (it is added to every class logit by broadcasting).
        self.y_out: logits, ``linear_terms + interaction_terms``.
        self.y_out_prob: sigmoid of the logits when ``num_classes == 2``,
            softmax when ``num_classes > 2``; not set for smaller values.
    """
    # variable_scope prefixes the names of the variables created inside it
    # with tf.get_variable.
    with tf.variable_scope('linear_layer'):
        w0 = tf.get_variable('w0', shape=[self.num_classes],
                            initializer=tf.zeros_initializer())
        # The class count lives on the instance; a bare `num_classes` is an
        # undefined name here.
        self.w = tf.get_variable('w', shape=[self.p, self.num_classes],
                             initializer=tf.truncated_normal_initializer(mean=0,stddev=0.01))
        self.linear_terms = tf.add(tf.matmul(self.X, self.w), w0)

    with tf.variable_scope('interaction_layer'):
        self.v = tf.get_variable('v', shape=[self.p, self.k],
                            initializer=tf.truncated_normal_initializer(mean=0, stddev=0.01))
        # 0.5 * sum_f [ (sum_i v_if x_i)^2 - sum_i v_if^2 x_i^2 ].
        # The second term must square X as well as v; using X unsquared is
        # only equivalent for strictly 0/1 features.
        squared_of_sum = tf.pow(tf.matmul(self.X, self.v), 2)
        sum_of_squares = tf.matmul(tf.pow(self.X, 2), tf.pow(self.v, 2))
        self.interaction_terms = tf.multiply(0.5,
                                             tf.reduce_sum(
                                                 tf.subtract(squared_of_sum, sum_of_squares),
                                                 1, keep_dims=True))
    self.y_out = tf.add(self.linear_terms, self.interaction_terms)
    if self.num_classes == 2:
        self.y_out_prob = tf.nn.sigmoid(self.y_out)
    elif self.num_classes > 2:
        self.y_out_prob = tf.nn.softmax(self.y_out)

#Loss
def add_loss(self):
    """Define ``self.loss`` as the mean cross-entropy between ``self.y`` and the logits ``self.y_out``.

    Sigmoid cross-entropy is used when ``self.num_classes == 2`` and softmax
    cross-entropy when it is greater than 2. The loss is also registered as
    the scalar summary 'loss'.

    Raises:
        ValueError: if ``self.num_classes`` is less than 2 (neither loss
            applies, so there would be nothing to average).
    """
    if self.num_classes == 2:
        cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=self.y, logits=self.y_out)
    elif self.num_classes > 2:
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=self.y, logits=self.y_out)
    else:
        raise ValueError('num_classes must be at least 2, got %r' % (self.num_classes,))
    self.loss = tf.reduce_mean(cross_entropy)
    tf.summary.scalar('loss', self.loss)

#Compute accuracy
def add_accuracy(self):
    """Define ``self.correct_prediction`` and ``self.accuracy``.

    A row counts as correct when the arg-max along axis 1 of ``self.y_out``
    equals the arg-max along axis 1 of ``self.y``; accuracy is the mean of
    those matches cast to float32. It is also registered as the scalar
    summary 'accuracy'.
    """
    predicted_class = tf.cast(tf.argmax(self.y_out, 1), tf.float32)
    actual_class = tf.cast(tf.argmax(self.y, 1), tf.float32)
    self.correct_prediction = tf.equal(predicted_class, actual_class)

    matches_as_float = tf.cast(self.correct_prediction, tf.float32)
    self.accuracy = tf.reduce_mean(matches_as_float)

    # expose accuracy as a summary
    tf.summary.scalar('accuracy', self.accuracy)

#Training
def train(self):
    """Create the training op.

    Sets ``self.global_step`` (a non-trainable variable starting at 0) and
    ``self.train_op``, which minimizes ``self.loss`` with an FTRL optimizer
    configured from ``self.lr``, ``self.reg_l1`` and ``self.reg_l2``. The
    minimize op is created under a control dependency on whatever is in the
    UPDATE_OPS collection.
    """
    self.global_step = tf.Variable(0, trainable=False)
    ftrl = tf.train.FtrlOptimizer(
        self.lr,
        l1_regularization_strength=self.reg_l1,
        l2_regularization_strength=self.reg_l2,
    )
    pending_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(pending_updates):
        self.train_op = ftrl.minimize(self.loss, global_step=self.global_step)

#Build the graph
def build_graph(self):
    """Run every graph-construction stage in order: inputs, forward pass, loss, accuracy, training op."""
    stage_names = ('add_input', 'inference', 'add_loss', 'add_accuracy', 'train')
    for stage_name in stage_names:
        # Look each stage up right before calling it, as a plain
        # `self.<stage>()` statement would.
        getattr(self, stage_name)()