In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf


In [2]:
# Read the three input tables (movies, ratings, per-user genres) from CSV
# files located in the current working directory.
movieSamples, ratingSamples, userGenres = (
    pd.read_csv(csv_name)
    for csv_name in ('movies.csv', 'ratings.csv', 'userGenres.csv')
)

In [3]:
def addSampleLabel(ratingSamples):
    """Attach a binary ``label`` column to the ratings frame.

    A rating of 3.5 or higher is labelled 1 (positive); anything lower is
    labelled 0. The column is written onto the passed-in frame itself, and
    that same frame is returned.
    """
    is_positive = ratingSamples['rating'] >= 3.5
    ratingSamples['label'] = np.where(is_positive, 1, 0)
    return ratingSamples

def addMovieFeatures(movieSamples, ratingSamplesWithLabel):
    """Enrich labelled rating samples with movie-side features.

    Parameters
    ----------
    movieSamples : DataFrame with ``movieId`` (unique), ``title`` and
        ``genres`` columns; genres are ``|``-separated.
    ratingSamplesWithLabel : DataFrame with at least ``movieId`` and
        ``rating`` columns. It is not modified.

    Returns
    -------
    A new DataFrame holding the rating samples plus ``genres``,
    ``releaseYear``, ``movieGenre1``..``movieGenre3``, ``movieRatingCount``,
    ``movieAvgRating`` and ``movieRatingStddev``. ``title`` is dropped.

    Raises
    ------
    ValueError if ``movieSamples`` contains duplicate ``movieId`` values.
    """
    # Join from the argument (not a notebook global) so the function works on
    # whatever frame the caller passes in.
    samples = ratingSamplesWithLabel.join(
        movieSamples.set_index(['movieId'], verify_integrity=True),
        on=['movieId'], how='left')

    # Extract release year: the four characters before the final character of
    # the title (i.e. "... (1995)" -> "1995"); very short titles fall back to 1990.
    samples['releaseYear'] = np.where(samples['title'].str.len() < 6, 1990,
                                      samples['title'].str.slice(-5, -1))
    samples = samples.drop(['title'], axis=1)

    # Extract the first three genres. reindex() guarantees columns 0..2 exist
    # even when no movie in the data lists that many genres.
    genre_parts = samples['genres'].str.split("|", expand=True).reindex(columns=range(3))
    for position, column in enumerate(['movieGenre1', 'movieGenre2', 'movieGenre3']):
        samples[column] = genre_parts[position]

    # Per-movie count / mean / std of ratings, renamed before joining so the
    # aggregate columns actually reach the returned frame.
    rating_stats = (samples.groupby('movieId')['rating']
                    .agg(['count', 'mean', 'std'])
                    .rename(columns={"count": "movieRatingCount",
                                     "mean": "movieAvgRating",
                                     "std": "movieRatingStddev"}))
    return samples.join(rating_stats, on=['movieId'], how='left')


def addUserFeatures(samplesWithMovieFeatures,userGenres):
    """Enrich samples with user-side features.

    Parameters
    ----------
    samplesWithMovieFeatures : DataFrame with ``userId``, ``movieId``,
        ``rating``, ``timestamp``, ``label`` and ``releaseYear`` columns.
        It is not modified.
    userGenres : DataFrame of per-user genre columns. When it has a
        ``userId`` column, that column is used as the join key.

    Returns
    -------
    A new DataFrame with these additions, restricted to users who have more
    than one rating:
      * ``userRatedMovie1``..``userRatedMovie5``: ids (as strings) of the
        user's most recent positively-labelled movies, newest first; missing
        when the user has fewer positives.
      * ``userRatingCount`` / ``userAvgRating`` / ``userRatingStddev``.
      * ``userAvgReleaseYear`` / ``userReleaseYearStddev``.
      * the columns of ``userGenres``.
    """
    history_columns = ['userRatedMovie1', 'userRatedMovie2', 'userRatedMovie3',
                       'userRatedMovie4', 'userRatedMovie5']

    # Filter to positive samples first, then order by timestamp (newest first).
    # Masking before sorting avoids pandas having to re-align the boolean mask;
    # mergesort keeps ties in a deterministic order.
    positives = samplesWithMovieFeatures[samplesWithMovieFeatures['label'] == 1]
    history = (positives.sort_values(['timestamp'], ascending=False, kind='mergesort')
               .groupby('userId')['movieId']
               .apply(lambda ids: [str(movie_id) for movie_id in ids.tolist()[:len(history_columns)]]))

    # Pad each history to a fixed width so all five columns always exist, even
    # when no user has five positive ratings.
    padded_rows = [ids + [None] * (len(history_columns) - len(ids)) for ids in history]
    history_frame = pd.DataFrame(padded_rows, index=history.index, columns=history_columns)

    samples = samplesWithMovieFeatures.join(history_frame, on='userId', how='left', rsuffix='_right')

    # Per-user count / mean / std of ratings.
    user_rating_stats = (samples.groupby('userId')['rating']
                         .agg(['count', 'mean', 'std'])
                         .rename(columns={"count": "userRatingCount",
                                          "mean": "userAvgRating",
                                          "std": "userRatingStddev"}))
    samples = samples.join(user_rating_stats, on=['userId'], how='left')

    # Per-user mean / std of the release year of the movies they rated.
    samples['releaseYear'] = pd.to_numeric(samples['releaseYear'])
    user_year_stats = (samples.groupby('userId')['releaseYear']
                       .agg(['mean', 'std'])
                       .rename(columns={"mean": "userAvgReleaseYear",
                                        "std": "userReleaseYearStddev"}))
    samples = samples.join(user_year_stats, on=['userId'], how='left')

    # DataFrame.join(on=...) matches against the *index* of the other frame.
    # A frame read straight from CSV has a positional index, so key it by
    # userId first; otherwise genres get attached to the wrong users.
    if 'userId' in userGenres.columns:
        userGenres = userGenres.set_index('userId')
    samples = samples.join(userGenres, on='userId', how='left', rsuffix='_right')

    # Drop users that have a single rating or none.
    return samples[samples.userRatingCount > 1]

def splitAndSaveTrainingTestSamples(samplesWithUserFeatures, file_path):
    """Randomly split the samples 80/20 and write both parts to CSV.

    The larger part is written to ``train.csv`` and the held-out part to
    ``test.csv``, both relative to the current working directory, with a
    header row and without the index.

    NOTE(review): ``file_path`` is accepted but not used; the output names
    are fixed. The split is called without ``random_state``.
    """
    training_part, test_part = train_test_split(samplesWithUserFeatures, test_size=0.2)
    for frame, csv_name in ((training_part, r'train.csv'), (test_part, r'test.csv')):
        frame.to_csv(csv_name, index=False, header=True)

In [4]:
if __name__ == '__main__':
    file_path = '/wyt/users/wang/Desktop/Movie'

    # Read the three input tables from the current working directory.
    movieSamples, ratingSamples, userGenres = (
        pd.read_csv(csv_name)
        for csv_name in ('movies.csv', 'ratings.csv', 'userGenres.csv')
    )

    # Feature pipeline: binary label -> movie-side features -> user-side features.
    ratingSamplesWithLabel = addSampleLabel(ratingSamples)
    samplesWithMovieFeatures = addMovieFeatures(movieSamples, ratingSamplesWithLabel)
    samplesWithUserFeatures = addUserFeatures(samplesWithMovieFeatures, userGenres)

    # Split into train/test parts and save both as CSV.
    splitAndSaveTrainingTestSamples(samplesWithUserFeatures, file_path + "/train_test")



In [5]:
# Resolve local copies of the training and test sample files.
# NOTE(review): these absolute file:// URLs are machine-specific and do not
# match the train.csv / test.csv names written by
# splitAndSaveTrainingTestSamples -- confirm which files are intended.
training_samples_file_path = tf.keras.utils.get_file(
    fname="trainingSamples.csv",
    origin="file:///Users/wang/Desktop/Movie/trainingSamples.csv",
)
test_samples_file_path = tf.keras.utils.get_file(
    fname="testSamples.csv",
    origin="file:///Users/wang/Desktop/Movie/testSamples.csv",
)

Downloading data from file:///Users/wang/Desktop/Movie/trainingSamples.csv
Downloading data from file:///Users/wang/Desktop/Movie/testSamples.csv


In [None]:
def get_dataset(file_path):
    """Build a batched ``tf.data`` dataset from a samples CSV file.

    The file is read for a single epoch in batches of 12 rows, with the
    ``label`` column used as the target. The string "0" is passed as an
    additional NA marker and rows that fail to parse are ignored.
    """
    csv_options = dict(
        batch_size=12,
        label_name='label',
        na_value="0",
        num_epochs=1,
        ignore_errors=True,
    )
    return tf.data.experimental.make_csv_dataset(file_path, **csv_options)


# Load the training and the test sample files as batched datasets.
train_dataset, test_dataset = map(
    get_dataset, (training_samples_file_path, test_samples_file_path))

# Vocabulary shared by every genre-valued feature.
genre_vocab = ['Film-Noir', 'Action', 'Adventure', 'Horror', 'Romance', 'War', 'Comedy', 'Western', 'Documentary',
               'Sci-Fi', 'Drama', 'Thriller',
               'Crime', 'Fantasy', 'Animation', 'IMAX', 'Mystery', 'Children', 'Musical']

# Each genre feature (five user-side, three movie-side) maps to that vocabulary.
GENRE_FEATURES = {
    feature_name: genre_vocab
    for feature_name in ('userGenre1', 'userGenre2', 'userGenre3', 'userGenre4', 'userGenre5',
                         'movieGenre1', 'movieGenre2', 'movieGenre3')
}

# Categorical features: every genre feature becomes a 10-dimensional embedding.
categorical_columns = [
    tf.feature_column.embedding_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key=feature, vocabulary_list=vocab),
        10)
    for feature, vocab in GENRE_FEATURES.items()
]

# Movie id and user id are identity-encoded and embedded as well.
# NOTE(review): num_buckets is hard-coded (1001 movies, 30001 users) --
# confirm every id in the sample files is below these bounds.
movie_col = tf.feature_column.categorical_column_with_identity(key='movieId', num_buckets=1001)
movie_emb_col = tf.feature_column.embedding_column(movie_col, 10)
user_col = tf.feature_column.categorical_column_with_identity(key='userId', num_buckets=30001)
user_emb_col = tf.feature_column.embedding_column(user_col, 10)
categorical_columns += [movie_emb_col, user_emb_col]

# Numerical features are fed to the network as-is.
numerical_columns = [
    tf.feature_column.numeric_column(numeric_name)
    for numeric_name in ('releaseYear',
                         'movieRatingCount', 'movieAvgRating', 'movieRatingStddev',
                         'userRatingCount', 'userAvgRating', 'userRatingStddev')
]

# Cross feature between the current movie and the user's most recent
# positively rated movie, hashed into 10000 buckets and one-hot encoded.
rated_movie = tf.feature_column.categorical_column_with_identity(key='userRatedMovie1', num_buckets=1001)
crossed_feature = tf.feature_column.indicator_column(
    tf.feature_column.crossed_column([movie_col, rated_movie], 10000))

# Keras input placeholders: one scalar-per-example Input for each feature the
# model consumes, keyed by the feature's column name.
inputs = {
    feature_name: tf.keras.layers.Input(name=feature_name, shape=(), dtype=feature_dtype)
    for feature_name, feature_dtype in (
        # numerical statistics
        ('movieAvgRating', 'float32'),
        ('movieRatingStddev', 'float32'),
        ('movieRatingCount', 'int32'),
        ('userAvgRating', 'float32'),
        ('userRatingStddev', 'float32'),
        ('userRatingCount', 'int32'),
        ('releaseYear', 'int32'),
        # integer ids
        ('movieId', 'int32'),
        ('userId', 'int32'),
        ('userRatedMovie1', 'int32'),
        # genre strings
        ('userGenre1', 'string'),
        ('userGenre2', 'string'),
        ('userGenre3', 'string'),
        ('userGenre4', 'string'),
        ('userGenre5', 'string'),
        ('movieGenre1', 'string'),
        ('movieGenre2', 'string'),
        ('movieGenre3', 'string'),
    )
}

# Wide & Deep architecture.
# Deep tower: numerical + embedded categorical features through two
# 128-unit ReLU layers.
deep = tf.keras.layers.DenseFeatures(numerical_columns + categorical_columns)(inputs)
for _ in range(2):
    deep = tf.keras.layers.Dense(128, activation='relu')(deep)

# Wide tower: the one-hot crossed feature, passed through unchanged.
wide = tf.keras.layers.DenseFeatures(crossed_feature)(inputs)

# Both towers are concatenated and mapped to a single sigmoid probability.
both = tf.keras.layers.concatenate([deep, wide])
output_layer = tf.keras.layers.Dense(1, activation='sigmoid')(both)
model = tf.keras.Model(inputs, output_layer)

# Binary cross-entropy with Adam; report accuracy, ROC AUC and PR AUC.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.AUC(curve='ROC'),
        tf.keras.metrics.AUC(curve='PR'),
    ])

# train the model
# Save the weights after each epoch only when validation accuracy improves.
# NOTE(review): the checkpoint path is an absolute, machine-specific location.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath= '/Users/lifan/Desktop/Movie',
    monitor='val_accuracy', save_best_only=True,
    save_weights_only=True, mode='max', save_freq='epoch',
    options=None)

# The callback monitors `val_accuracy`, which only exists when fit() receives
# validation data; without it no checkpoint is ever written.
# NOTE(review): the test split doubles as validation data here because no
# separate validation split exists -- carve one out if the test metrics must
# stay untouched by model selection.
# `shuffle` is omitted: Keras ignores it for tf.data inputs, and
# make_csv_dataset already shuffles by default.
model.fit(train_dataset,
          validation_data=test_dataset,
          epochs=10,
          callbacks=[model_checkpoint_callback])


# evaluate the model
test_loss, test_accuracy, test_roc_auc, test_pr_auc = model.evaluate(test_dataset)
print('\n\nTest Loss {}, Test Accuracy {}, Test ROC AUC {}, Test PR AUC {}'.format(test_loss, test_accuracy,
                                                                                   test_roc_auc, test_pr_auc))

# Predictions for the full test set (kept under the original name).
predictions = model.predict(test_dataset)

# print some predict results
# make_csv_dataset shuffles by default, so two separate passes over
# test_dataset return rows in different orders. Take ONE batch and predict on
# exactly those features so each printed prediction lines up with its label.
sample_features, sample_labels = next(iter(test_dataset))
sample_predictions = model.predict(sample_features)
for prediction, goodRating in zip(sample_predictions[:12], sample_labels[:12]):
    print("Predicted good rating: {:.2%}".format(prediction[0]),
          " | Actual rating label: ",
          ("Good Rating" if bool(goodRating) else "Bad Rating"))


Instructions for updating:
Use `tf.data.Dataset.interleave(map_func, cycle_length, block_length, num_parallel_calls=tf.data.experimental.AUTOTUNE)` instead. If sloppy execution is desired, use `tf.data.Options.experimental_determinstic`.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Epoch 1/5
   5335/Unknown - 281s 53ms/step - loss: 0.7799 - accuracy: 0.6048 - auc: 0.6232 - auc_1: 0.6604