In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from collections import defaultdict
import matplotlib.pyplot as plt

In [None]:
#这部分学习了蚂蚁学python大佬的内容

In [2]:
df_user= pd.read_csv("../data/ml-1m/users.dat", sep="::", header=None, engine="python", 
                     names="UserID::Gender::Age::Occupation::Zip-code".split("::"))
df_movie= pd.read_csv("../data/ml-1m/movies.dat", sep="::", header=None, engine="python", 
                     names="MovieID::Title::Genres".split("::"))
df_rating= pd.read_csv("../data/ml-1m/ratings.dat", sep="::", header=None, engine="python", 
                     names="UserID::MovieID::Rating::Timestamp".split("::"))

In [3]:
#统计题材频率
genre_dict = dict()
for genres in df_movie["Genres"].str.split("|"):
    for genre in genres:
        if genre not in genre_dict:
            genre_dict[genre] = 1
        genre_dict[genre] += 1
genre_dict

{'Animation': 106,
 "Children's": 252,
 'Comedy': 1201,
 'Adventure': 284,
 'Fantasy': 69,
 'Romance': 472,
 'Drama': 1604,
 'Action': 504,
 'Crime': 212,
 'Thriller': 493,
 'Horror': 344,
 'Sci-Fi': 277,
 'Documentary': 128,
 'War': 144,
 'Musical': 115,
 'Mystery': 107,
 'Film-Noir': 45,
 'Western': 69}

In [4]:
#为每个电影选择频率最高的题材
def get_highrate_genre(x):
    sub_genre = {}
    for genre in x.split("|"):
        sub_genre[genre] = genre_dict[genre]
    return sorted(sub_genre.items(), key=lambda x:x[1], reverse=True)[0][0]
df_movie["Genres"] = df_movie["Genres"].map(get_highrate_genre)

In [5]:
df_movie.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Comedy
1,2,Jumanji (1995),Adventure
2,3,Grumpier Old Men (1995),Comedy
3,4,Waiting to Exhale (1995),Drama
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
def add_index_column(para_dif, column_name):
    values = list(para_dif[column_name].unique())
    value_index_dict = {value:index for index, value in enumerate(values)}
    para_dif[f"{column_name}_idx"] = para_dif[column_name].map(value_index_dict)

In [7]:
add_index_column(df_user, "UserID")
add_index_column(df_user, "Gender")
add_index_column(df_user, "Age")
add_index_column(df_user, "Occupation")
add_index_column(df_movie, "MovieID")
add_index_column(df_movie, "Genres")

In [8]:
df = pd.merge(pd.merge(df_rating, df_user),df_movie)
df.drop(columns=["Timestamp", "Zip-code", "Title"], inplace=True)

In [9]:
num_users = df["UserID_idx"].max() + 1
num_movies = df["MovieID_idx"].max() + 1
num_genders = df["Gender_idx"].max() + 1
num_ages = df["Age_idx"].max() + 1
num_occupations = df["Occupation_idx"].max() + 1
num_genres = df["Genres_idx"].max() + 1
num_users, num_movies, num_genders, num_ages, num_occupations, num_genres

(6040, 3883, 2, 7, 21, 18)

In [10]:
#对频分进行大小归一化
min_value = df["Rating"].min()
max_value = df["Rating"].max()
df["Rating"] = df["Rating"].map(lambda x: (x-min_value)/(max_value - min_value))
df.head()

Unnamed: 0,UserID,MovieID,Rating,Gender,Age,Occupation,UserID_idx,Gender_idx,Age_idx,Occupation_idx,Genres,MovieID_idx,Genres_idx
0,1,1193,1.0,F,1,10,0,0,0,0,Drama,1176,2
1,2,1193,1.0,M,56,16,1,1,1,1,Drama,1176,2
2,12,1193,0.75,M,25,12,11,1,2,7,Drama,1176,2
3,15,1193,0.75,M,25,7,14,1,2,3,Drama,1176,2
4,17,1193,1.0,M,50,1,16,1,4,6,Drama,1176,2


In [11]:
#随机采样10%的数据做训练数据
df_sample=df.sample(frac = 0.1)
X = df_sample[["UserID_idx", "Gender_idx", "Age_idx", "Occupation_idx", "MovieID_idx", "Genres_idx"]]
Y = df_sample.pop("Rating")

In [37]:
#通过函数式API搭建模型
# 输入
user_id = keras.layers.Input(shape=(1,), name="user_id")
gender = keras.layers.Input(shape=(1,), name="gender")
age = keras.layers.Input(shape=(1,), name="age")
occupation = keras.layers.Input(shape=(1,), name="occupation")
movie_id = keras.layers.Input(shape=(1,), name="movie_id")
genre = keras.layers.Input(shape=(1,), name="genre")

# embedding 这里使用离散化，分桶，onehot更好
input_embedding = tf.keras.layers.concatenate([
        layers.Embedding(num_users, 100)(user_id), 
        layers.Embedding(num_genders, 2)(gender), 
        layers.Embedding(num_ages, 2)(age), 
        layers.Embedding(num_occupations, 2)(occupation),
        layers.Embedding(num_movies, 100)(movie_id),
        layers.Embedding(num_genres, 2)(genre)
])
#lr
output = layers.Dense(1, activation='sigmoid')(input_embedding)

lr_model = keras.models.Model(inputs=[user_id, gender, age, occupation, movie_id, genre], outputs=[output])

In [38]:
lr_model.compile(loss=tf.keras.losses.MeanSquaredError(), 
              optimizer=keras.optimizers.RMSprop())

In [39]:
fit_x_train = [
        X["UserID_idx"], 
        X["Gender_idx"],
        X["Age_idx"],
        X["Occupation_idx"],
        X["MovieID_idx"],
        X["Genres_idx"]
    ]
history = lr_model.fit(
    x=fit_x_train,
    y=Y,
    batch_size=32,
    epochs=3,
    verbose=1
)

Train on 100021 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [40]:
lr_model.summary()

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_id (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
gender (InputLayer)             [(None, 1)]          0                                            
__________________________________________________________________________________________________
age (InputLayer)                [(None, 1)]          0                                            
__________________________________________________________________________________________________
occupation (InputLayer)         [(None, 1)]          0                                            
____________________________________________________________________________________________

In [20]:
inputs = df_sample[["UserID_idx", "Gender_idx", "Age_idx", "Occupation_idx", "MovieID_idx", "Genres_idx"]].head(10)
lr_model.predict(
        [inputs["UserID_idx"], 
        inputs["Gender_idx"],
        inputs["Age_idx"],
        inputs["Occupation_idx"],
        inputs["MovieID_idx"],
        inputs["Genres_idx"]])

array([[[0.5564158 ]],

       [[0.51679534]],

       [[0.47350368]],

       [[0.48130527]],

       [[0.7634226 ]],

       [[0.7762455 ]],

       [[0.7143663 ]],

       [[0.70344865]],

       [[0.6972403 ]],

       [[0.4059684 ]]], dtype=float32)

In [17]:
Y[:10]

900536    0.25
697275    0.50
995769    0.50
358202    0.25
733490    0.75
89014     0.75
288483    1.00
5761      1.00
812646    0.75
576595    0.50
Name: Rating, dtype: float64