In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from keras.layers import Embedding, Lambda, Dense, Input, concatenate, dot, multiply,Flatten
from keras.models import Model
import keras.backend as K

np.random.seed(555)

In [2]:
 def get_users_attr(df_users):
    df_users = pd.read_csv(users_path,sep="::",names=['uid','gender','age','occupation','zipcode'],encoding='utf-8')
    user_ids=set(df_users['uid'].values)
    df_users = df_users.drop(['zipcode'],axis=1)
    for i in range(len(df_users)):
        if df_users.loc[i,'gender'] == 'F':
            df_users.loc[i,'gender'] = 1
        else:
            df_users.loc[i,'gender'] = 0
    return df_users,user_ids

In [3]:
def get_movies_attr(df_movies):
    year=df_movies['title'].values
    for i in range(len(year)):
        year[i]=year[i].split('(')[-1].split(')')[0]
    df_movies['year']=year
    genres_list = set()
    for sstr in df_movies['genres'].str.split('|'):
        genres_list = set().union(sstr, genres_list)
    genres_list = list(genres_list)
    #genres_dict = dict(zip(genres_list,np.zeros(len(genres_list)) ))
    #for sstr in df_movies['genres'].str.split('|'):
    #    for s in sstr:
    #        genres_dict[s]+=1
    #rank=sorted(genres_dict.items(),key=lambda kv:kv[1],reverse=True)
    #for i in range(5):
    #    genres=rank[i][0]
    for genres in genres_list:
        df_movies[genres] = df_movies['genres'].str.contains(genres).apply(lambda x:1 if x else 0)
    item_ids=set(df_movies['iid'].values)
    df_movies = df_movies.drop(['title','genres'],axis=1)
    return df_movies,item_ids

In [4]:
def get_dict(data):
    k_ids = set(data)
    n = len(k_ids)
    k_dict = dict(zip(k_ids, range(n)))
    return k_dict,n

In [5]:
class DataProcess:
    def __init__(self, ratings_path, users_path, movies_path, alpha=0.8):
        print("start load data")
        df_ratings=pd.read_csv(ratings_path,sep="::",names=['uid', 'iid', 'ratings','time'],encoding='utf-8',engine='python')
        df_users=pd.read_csv(users_path,sep="::",names=['uid', 'gender', 'age','occupation','zipcode'],encoding='utf-8',engine='python')
        df_movies=pd.read_csv(movies_path,sep="::",names=['iid', 'title', 'genres'],encoding='ISO-8859-1',engine='python')
        print("start process data")
        df_ratings=df_ratings.drop('time',axis=1)
        df_users,user_ids=get_users_attr(df_users)
        df_movies,item_ids=get_movies_attr(df_movies)
    
        self.n_user=len(user_ids)
        self.n_item=len(item_ids)
        u_id2idx = dict(zip(user_ids, range(self.n_user)))
        i_id2idx = dict(zip(item_ids, range(self.n_item)))
        
        df_ratings = df_ratings.assign(uid=[u_id2idx[uid] for uid in df_ratings.uid])
        df_ratings = df_ratings.assign(iid=[i_id2idx[iid] for iid in df_ratings.iid])
        df_ratings = df_ratings.loc[df_ratings['uid']<=100]
        df_ratings = df_ratings.loc[df_ratings['iid']<=100]
        
        self.df = pd.merge(pd.merge(df_ratings, df_users, on='uid'), df_movies, on='iid')
        '''
        gender_dict,self.n_gender=get_dict(self.df['gender'])
        self.df = self.df.assign(gender=[gender_dict[k] for k in self.df['gender']])
        
        age_dict,self.n_age=get_dict(self.df['age'])
        self.df = self.df.assign(age=[age_dict[k] for k in self.df['age']])
        
        occupation_dict,self.n_occupation=get_dict(self.df['occupation'])
        self.df = self.df.assign(occupation=[occupation_dict[k] for k in self.df['occupation']])
        
        year_dict,self.n_year=get_dict(self.df['year'])
        self.df = self.df.assign(year=[year_dict[k] for k in self.df['year']])
        
        self.df=pd.DataFrame(self.df,dtype=np.float64)
        '''
        self.X = self.df.drop('ratings', axis=1)
        self.Y = self.df['ratings'].values

# FM model

In [6]:
class WideDeep:
    def __init__(self,n_gender,n_age,n_occupation,n_year):
    #,n_uid,n_iid):

        # Input layer
        a_in = Input(shape=[n_gender], name='gender')
        b_in = Input(shape=[n_age], name='age')
        c_in = Input(shape=[n_occupation], name='occupation')
        d_in = Input(shape=[n_year], name='year')
        #e_in = Input(shape=[n_uid], name='uid')
        #f_in = Input(shape=[n_iid], name='iid')
        
        # Layer 2
        embedding_a = Embedding(input_dim=n_gender, output_dim=100, name='embedding_a')(a_in)
        embedding_b = Embedding(input_dim=n_age, output_dim=100, name='embedding_b')(b_in)
        embedding_c = Embedding(input_dim=n_occupation, output_dim=100, name='embedding_c')(c_in)
        embedding_d = Embedding(input_dim=n_year, output_dim=100, name='embedding_d')(d_in)
        #embedding_e = Embedding(input_dim=n_uid, output_dim=100, name='embedding_e')(e_in)
        #embedding_f = Embedding(input_dim=n_iid, output_dim=100, name='embedding_f')(f_in)

        # Layer 3
        f=Flatten()
        Sa=f(embedding_a)
        Sb=f(embedding_b)
        Sc=f(embedding_c)
        Sd=f(embedding_d)
        #Se=f(embedding_e)
        #Sf=f(embedding_f)
        concat_h = concatenate([Sa,Sb,Sc,Sd])
        #concat_h = concatenate([embedding_a,embedding_b,embedding_c,embedding_d])

        # Layer 4
        h_6 = Dense(units=200, activation='tanh',  name='h_6')(concat_h)
        concat_h_6 = concatenate([a_in, b_in, c_in, d_in, h_6])
        
        # Output Layer
        y = Dense(units=1, activation='sigmoid', input_dim=6, name='output')(concat_h_6)

        self.model = Model(inputs=[a_in, b_in, c_in, d_in], outputs=[y], name='WideDeep')

# Train & Predict

In [7]:
ratings_path="../input/movielens-1m-dataset/ratings.dat"
users_path="../input/movielens-1m-dataset/users.dat"
movies_path="../input/movielens-1m-dataset/movies.dat"
dp = DataProcess(ratings_path,users_path,movies_path)
X_train,X_test,Y_train,Y_test=train_test_split(dp.X,dp.Y,test_size=0.2,random_state=123)

start load data
start process data


  return func(*args, **kwargs)


In [8]:
headers=['gender','age','occupation','year','uid','iid']
df = pd.concat([X_train[headers],X_test[headers]])
df = df.reset_index(drop=True)
n_gender=len(set(df['gender'].values))
n_age=len(set(df['age'].values))
n_occupation=len(set(df['occupation'].values))
n_year=len(set(df['year'].values))
n_uid=len(set(df['uid'].values))
n_iid=len(set(df['iid'].values))
df=pd.get_dummies(df,columns=headers)
X_train = df.iloc[:len(X_train),:].values
X_test = df.iloc[len(X_train):,:].values

In [9]:
Y_train[Y_train<=3]=0
Y_train[Y_train>3]=1

In [10]:
Y_test[Y_test<=3]=0
Y_test[Y_test>3]=1

In [11]:
Y_train

array([1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,

In [12]:
X1=X_train[:,:n_gender]
X2=X_train[:,n_gender:n_gender+n_age]
X3=X_train[:,n_gender+n_age:n_gender+n_age+n_occupation]
X4=X_train[:,n_gender+n_age+n_occupation:n_gender+n_age+n_occupation+n_year]
#X5=X_train[:,n_gender+n_age+n_occupation+n_year:n_gender+n_age+n_occupation+n_year+n_uid]
#X6=X_train[:,n_gender+n_age+n_occupation+n_year+n_uid:]
X_train=[X1,X2,X3,X4]
X1=X_test[:,:n_gender]
X2=X_test[:,n_gender:n_gender+n_age]
X3=X_test[:,n_gender+n_age:n_gender+n_age+n_occupation]
X4=X_test[:,n_gender+n_age+n_occupation:n_gender+n_age+n_occupation+n_year]
#X5=X_test[:,n_gender+n_age+n_occupation+n_year:n_gender+n_age+n_occupation+n_year+n_uid]
#X6=X_test[:,n_gender+n_age+n_occupation+n_year+n_uid:]
X_test=[X1,X2,X3,X4]

In [13]:
md = WideDeep(n_gender,n_age,n_occupation,n_year)
md.model.compile(optimizer='adam', loss='binary_crossentropy')
md.model.summary()

Model: "WideDeep"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
gender (InputLayer)             [(None, 2)]          0                                            
__________________________________________________________________________________________________
age (InputLayer)                [(None, 7)]          0                                            
__________________________________________________________________________________________________
occupation (InputLayer)         [(None, 19)]         0                                            
__________________________________________________________________________________________________
year (InputLayer)               [(None, 3)]          0                                            
___________________________________________________________________________________________

2022-07-03 10:11:11.492356: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [14]:
# 模型训练
md.model.fit(x=X_train, y=Y_train, epochs=10, batch_size=64)

Epoch 1/10


2022-07-03 10:11:11.722702: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f73c5b40ad0>

In [15]:
# 模型预测
Y_predict = md.model.predict(x=X_test)

# MAE
n = len(Y_predict)
Y_predict[Y_predict>=0.55]=1
Y_predict[Y_predict<0.55]=0
test_mae = np.sum(np.fabs(Y_predict.flatten() - Y_test)) / n
test_rmse = np.sqrt(np.sum((Y_predict.flatten() - Y_test)**2) / n)
print('test MAE :{:.4f}|test RMSE :{:.4f}'.format(test_mae,test_rmse))

test MAE :0.3953|test RMSE :0.6288
