In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from keras.layers import Embedding, Lambda, Dense, Input, concatenate, dot, multiply,Flatten
from keras.models import Model
import keras.backend as K

np.random.seed(555)

#### **数据处理**
##### 数据集来源为Movielens-1M
##### 从movies.dat获取电影属性，从ratings.dat获取评分数据，从users.dat获取用户属性
##### 选择属性 "gender"、"age"、"occupation"、"year" 作为输入
##### 对获得的数据划分训练集和测试集，其中训练集占比为0.8，之后进行训练得到结果

In [None]:
 def get_users_attr(df_users):
     """Encode the user table and collect the set of user ids.

     The ``zipcode`` column is dropped and ``gender`` is binarised
     (``'F'`` -> 1, any other value -> 0).  The frame passed in is used as
     the data source and is left unmodified; nothing is read from disk here.

     Parameters
     ----------
     df_users : pandas.DataFrame
         User table containing at least the columns ``uid``, ``gender`` and
         ``zipcode``.

     Returns
     -------
     tuple
         ``(encoded_users, user_ids)`` where ``encoded_users`` is a new
         DataFrame without ``zipcode`` and with the 0/1 ``gender`` column,
         and ``user_ids`` is the set of values found in ``uid``.

     Raises
     ------
     KeyError
         If ``uid``, ``gender`` or ``zipcode`` is missing from ``df_users``.
     """
     user_ids = set(df_users['uid'].values)
     # Vectorised comparison replaces the per-row .loc loop.
     is_female = df_users['gender'] == 'F'
     encoded_users = (
         df_users
         .drop(['zipcode'], axis=1)
         .assign(gender=is_female.astype(int))
     )
     return encoded_users, user_ids

In [None]:
def get_movies_attr(df_movies):
    """Derive the release year and one-hot genre flags for every movie.

    The input frame is not modified.

    Parameters
    ----------
    df_movies : pandas.DataFrame
        Movie table containing at least the columns ``iid``, ``title`` and
        ``genres``.  ``genres`` holds ``'|'``-separated genre names.

    Returns
    -------
    tuple
        ``(movie_attrs, item_ids)``.  ``movie_attrs`` is a new DataFrame with
        ``title`` and ``genres`` removed, a string ``year`` column added, and
        one 0/1 column per genre (columns in sorted genre order).
        ``item_ids`` is the set of values found in ``iid``.

    Raises
    ------
    KeyError
        If ``iid``, ``title`` or ``genres`` is missing from ``df_movies``.
    """
    # The year is the text between the last '(' of the title and the next
    # ')', e.g. "Toy Story (1995)" -> "1995".  Working on a derived Series
    # avoids writing into the caller's ``title`` column.
    year = (
        df_movies['title']
        .str.split('(').str[-1]
        .str.split(')').str[0]
    )

    # Exact '|'-delimited token match; a substring/regex match could flag a
    # movie for a genre whose name is merely contained in another genre.
    genre_flags = df_movies['genres'].str.get_dummies(sep='|')

    item_ids = set(df_movies['iid'].values)
    movie_attrs = pd.concat(
        [
            df_movies.drop(['title', 'genres'], axis=1).assign(year=year),
            genre_flags,
        ],
        axis=1,
    )
    return movie_attrs, item_ids

In [None]:
def get_dict(data):
    """Map each distinct value in ``data`` to a dense integer index.

    Indices are handed out in order of first appearance, so the mapping is
    the same on every run for the same input (iterating a ``set`` of strings
    is not stable across interpreter sessions).

    Parameters
    ----------
    data : iterable of hashable
        Values to index, e.g. a DataFrame column.

    Returns
    -------
    tuple
        ``(k_dict, n)`` where ``k_dict`` maps value -> index in
        ``range(n)`` and ``n`` is the number of distinct values.
    """
    k_dict = {}
    for value in data:
        # setdefault only assigns an index the first time a value is seen.
        k_dict.setdefault(value, len(k_dict))
    return k_dict, len(k_dict)

In [None]:
class DataProcess:
    """Load MovieLens-1M, join ratings with user/movie attributes, encode them.

    Attributes
    ----------
    alpha : float
        Value passed to the constructor; not used by this class itself.
    n_user, n_item : int
        Number of distinct user / movie ids in the attribute tables
        (counted before any sub-sampling).
    n_gender, n_age, n_occupation, n_year : int
        Number of distinct values of each categorical feature in the joined
        table.
    df : pandas.DataFrame
        Joined ratings + user + movie table, every column cast to float64.
    X : pandas.DataFrame
        ``df`` without the ``ratings`` column.
    Y : numpy.ndarray
        The ``ratings`` column of ``df``.
    """

    def __init__(self, ratings_path, users_path, movies_path, alpha=0.8,
                 max_user_idx=100, max_item_idx=500):
        """Read the three ``::``-separated files and build ``df``, ``X``, ``Y``.

        Parameters
        ----------
        ratings_path, users_path, movies_path : str
            Paths of ``ratings.dat``, ``users.dat`` and ``movies.dat``.
        alpha : float, default 0.8
            Stored on the instance only; no split is performed here.
        max_user_idx, max_item_idx : int or None, default 100 / 500
            Keep only ratings whose dense user / movie index is ``<=`` this
            bound.  ``None`` disables the corresponding filter.

        Raises
        ------
        KeyError
            If a rating refers to a user or movie id that is absent from the
            attribute tables.
        """
        print("start load data")
        df_ratings = pd.read_csv(ratings_path, sep="::", names=['uid', 'iid', 'ratings', 'time'],
                                 encoding='utf-8', engine='python')
        df_users = pd.read_csv(users_path, sep="::", names=['uid', 'gender', 'age', 'occupation', 'zipcode'],
                               encoding='utf-8', engine='python')
        df_movies = pd.read_csv(movies_path, sep="::", names=['iid', 'title', 'genres'],
                                encoding='ISO-8859-1', engine='python')
        print("start process data")
        df_ratings = df_ratings.drop('time', axis=1)
        df_users, user_ids = get_users_attr(df_users)
        df_movies, item_ids = get_movies_attr(df_movies)

        self.alpha = alpha
        self.n_user = len(user_ids)
        self.n_item = len(item_ids)
        # Sorted ids give a dense index that does not depend on set ordering.
        u_id2idx = dict(zip(sorted(user_ids), range(self.n_user)))
        i_id2idx = dict(zip(sorted(item_ids), range(self.n_item)))

        df_ratings = df_ratings.assign(
            uid=[u_id2idx[uid] for uid in df_ratings.uid],
            iid=[i_id2idx[iid] for iid in df_ratings.iid],
        )
        # The attribute tables must be re-keyed with the same dense indices;
        # otherwise the merges below would pair index k from the ratings with
        # the raw id k of a different user / movie.
        df_users = df_users.assign(uid=df_users['uid'].map(u_id2idx))
        df_movies = df_movies.assign(iid=df_movies['iid'].map(i_id2idx))

        # Optional sub-sampling to a small block of users and movies.
        if max_user_idx is not None:
            df_ratings = df_ratings.loc[df_ratings['uid'] <= max_user_idx]
        if max_item_idx is not None:
            df_ratings = df_ratings.loc[df_ratings['iid'] <= max_item_idx]

        self.df = pd.merge(pd.merge(df_ratings, df_users, on='uid'), df_movies, on='iid')

        # Replace each categorical feature by a dense index and record its
        # cardinality as self.n_<feature>.
        for column in ('gender', 'age', 'occupation', 'year'):
            value2idx, n_values = get_dict(self.df[column])
            setattr(self, 'n_' + column, n_values)
            self.df = self.df.assign(**{column: [value2idx[k] for k in self.df[column]]})

        self.df = self.df.astype(np.float64)

        self.X = self.df.drop('ratings', axis=1)
        self.Y = self.df['ratings'].values

#### **Nerual_MF_V1**
##### 借鉴了老师给的Nerual_MF_V1

In [None]:
class Nerual_MF_V1:
    """Two-branch network whose output is the dot product of the branches.

    Each of the four categorical inputs is embedded; the gender and age
    embeddings feed one dense branch, the occupation and year embeddings feed
    the other, and the prediction is the dot product of the two branch
    outputs.  The built Keras model is exposed as ``self.model``.

    Parameters
    ----------
    n_gender, n_age, n_occupation, n_year : int
        Vocabulary size (``input_dim``) of the corresponding embedding.
    embed_dim : int, default 100
        Output dimension of every embedding.
    hidden_units : int, default 100
        Width of each branch's dense layer.  Both branches share this width
        because their outputs are combined by a dot product.
    """

    def __init__(self, n_gender, n_age, n_occupation, n_year,
                 embed_dim=100, hidden_units=100):

        # Input layer: one value per sample for each categorical feature.
        a_in = Input(shape=[1], name='gender')
        b_in = Input(shape=[1], name='age')
        c_in = Input(shape=[1], name='occupation')
        d_in = Input(shape=[1], name='year')

        # Layer 2: embedding lookup per feature.
        embedding_a = Embedding(input_dim=n_gender, output_dim=embed_dim, name='embedding_a')(a_in)
        embedding_b = Embedding(input_dim=n_age, output_dim=embed_dim, name='embedding_b')(b_in)
        embedding_c = Embedding(input_dim=n_occupation, output_dim=embed_dim, name='embedding_c')(c_in)
        embedding_d = Embedding(input_dim=n_year, output_dim=embed_dim, name='embedding_d')(d_in)

        # Layer 3: drop axis 1 of each embedding output, then pair them up.
        SqueezeEmbed = Lambda(lambda x: K.squeeze(x, 1))
        Sa = SqueezeEmbed(embedding_a)
        Sb = SqueezeEmbed(embedding_b)
        Sc = SqueezeEmbed(embedding_c)
        Sd = SqueezeEmbed(embedding_d)
        branch_gender_age = concatenate([Sa, Sb])
        branch_occupation_year = concatenate([Sc, Sd])

        # Layer 4: one ReLU dense layer per branch.
        branch_gender_age = Dense(units=hidden_units, activation='relu')(branch_gender_age)
        branch_occupation_year = Dense(units=hidden_units, activation='relu')(branch_occupation_year)

        # Output layer: dot product of the two branch vectors.
        y = dot([branch_gender_age, branch_occupation_year], axes=-1)

        # NOTE(review): the model is named 'WideDeep' although the class is a
        # neural-MF variant; the name is left unchanged.
        self.model = Model(inputs=[a_in, b_in, c_in, d_in], outputs=[y], name='WideDeep')

#### **Nerual_MF_V2**
##### 借鉴了老师给的Nerual_MF_V2

In [None]:
class Nerual_MF_V2:
    """Two-branch network combined element-wise and scored by a dense head.

    Each of the four categorical inputs is embedded; the gender and age
    embeddings feed one dense branch, the occupation and year embeddings feed
    the other.  The branch outputs are multiplied element-wise, passed through
    a ReLU dense layer and reduced to a single output unit named ``'y'``.
    The built Keras model is exposed as ``self.model``.

    Parameters
    ----------
    n_gender, n_age, n_occupation, n_year : int
        Vocabulary size (``input_dim``) of the corresponding embedding.
    embed_dim : int, default 100
        Output dimension of every embedding.
    hidden_units : int, default 100
        Width of the two branch dense layers and of the dense layer applied
        to their element-wise product.
    """

    def __init__(self, n_gender, n_age, n_occupation, n_year,
                 embed_dim=100, hidden_units=100):

        # Input layer: one value per sample for each categorical feature.
        a_in = Input(shape=[1], name='gender')
        b_in = Input(shape=[1], name='age')
        c_in = Input(shape=[1], name='occupation')
        d_in = Input(shape=[1], name='year')

        # Layer 2: embedding lookup per feature.
        embedding_a = Embedding(input_dim=n_gender, output_dim=embed_dim, name='embedding_a')(a_in)
        embedding_b = Embedding(input_dim=n_age, output_dim=embed_dim, name='embedding_b')(b_in)
        embedding_c = Embedding(input_dim=n_occupation, output_dim=embed_dim, name='embedding_c')(c_in)
        embedding_d = Embedding(input_dim=n_year, output_dim=embed_dim, name='embedding_d')(d_in)

        # Layer 3: drop axis 1 of each embedding output, then pair them up.
        SqueezeEmbed = Lambda(lambda x: K.squeeze(x, 1))
        Sa = SqueezeEmbed(embedding_a)
        Sb = SqueezeEmbed(embedding_b)
        Sc = SqueezeEmbed(embedding_c)
        Sd = SqueezeEmbed(embedding_d)
        branch_gender_age = concatenate([Sa, Sb])
        branch_occupation_year = concatenate([Sc, Sd])

        # Layer 4: one ReLU dense layer per branch.
        branch_gender_age = Dense(units=hidden_units, activation='relu')(branch_gender_age)
        branch_occupation_year = Dense(units=hidden_units, activation='relu')(branch_occupation_year)

        # Output layer: element-wise product -> dense ReLU -> single unit.
        interaction = multiply([branch_gender_age, branch_occupation_year])
        h_feat = Dense(units=hidden_units, activation='relu')(interaction)
        y = Dense(units=1, name='y')(h_feat)

        # NOTE(review): the model is named 'WideDeep' although the class is a
        # neural-MF variant; the name is left unchanged.
        self.model = Model(inputs=[a_in, b_in, c_in, d_in], outputs=[y], name='WideDeep')

#### **数据导入及二次处理**
##### 这些本应放在DataProcess里面，但是由于不方便测试而且DataProcess中内容已经很多，所以我把部分数据处理放到后面

In [None]:
# Locations of the three MovieLens-1M files.
ratings_path = "../input/movielens-1m-dataset/ratings.dat"
users_path = "../input/movielens-1m-dataset/users.dat"
movies_path = "../input/movielens-1m-dataset/movies.dat"

# Read, join and encode the data set.
dp = DataProcess(
    ratings_path=ratings_path,
    users_path=users_path,
    movies_path=movies_path,
)

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    dp.X,
    dp.Y,
    test_size=0.2,
    random_state=123,
)

In [None]:
# Feature columns fed to the models, in the order of the models' Input layers
# (gender, age, occupation, year).
FEATURE_COLUMNS = ['gender', 'age', 'occupation', 'year']


def to_model_inputs(features):
    """Split a feature table into one 1-D array per model input.

    Parameters
    ----------
    features : pandas.DataFrame
        Table containing every column listed in ``FEATURE_COLUMNS``.

    Returns
    -------
    list of numpy.ndarray
        The ``.values`` of each feature column, ordered as
        ``FEATURE_COLUMNS``.
    """
    return [features[column].values for column in FEATURE_COLUMNS]


# NOTE: X_train / X_test are rebound from DataFrames to lists of arrays here,
# so the split cell has to be re-run before this cell can be run again.
X_train = to_model_inputs(X_train)
X_test = to_model_inputs(X_test)

#### **训练&预测**
##### 对两个模型结果进行对比，结果显示v2要略优于v1
##### 使用MAE和RMSE进行评估

In [None]:
# Build the dot-product model and train it with Adam on mean squared error.
md = Nerual_MF_V1(dp.n_gender, dp.n_age, dp.n_occupation, dp.n_year)
md.model.compile(optimizer='adam', loss='mse')

# Training
md.model.fit(x=X_train, y=Y_train, epochs=20, batch_size=64)

# Prediction on the held-out rows
Y_predict = md.model.predict(x=X_test)

# MAE and RMSE on the test split
n = len(Y_predict)
residual = Y_predict.flatten() - Y_test
test_mae = np.sum(np.fabs(residual)) / n
test_rmse = np.sqrt(np.sum(residual ** 2) / n)
print('test MAE :{:.4f}|test RMSE :{:.4f}'.format(test_mae, test_rmse))

In [None]:
# Build the element-wise-product model and train it with Adam on mean
# squared error.
md = Nerual_MF_V2(dp.n_gender, dp.n_age, dp.n_occupation, dp.n_year)
md.model.compile(optimizer='adam', loss='mse')

# Training
md.model.fit(x=X_train, y=Y_train, epochs=20, batch_size=64)

# Prediction on the held-out rows
Y_predict = md.model.predict(x=X_test)

# MAE and RMSE on the test split
n = len(Y_predict)
residual = Y_predict.flatten() - Y_test
test_mae = np.sum(np.fabs(residual)) / n
test_rmse = np.sqrt(np.sum(residual ** 2) / n)
print('test MAE :{:.4f}|test RMSE :{:.4f}'.format(test_mae, test_rmse))