In [50]:
from collections import Counter
import gc
import re

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tffm import TFFMRegressor

In [2]:
movie_df = pd.read_csv('ml-latest-small/movies.csv')
display(movie_df.shape)
movie_df.head(3)

(9742, 3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [3]:
rating_df = pd.read_csv('ml-latest-small/ratings.csv')
display(rating_df.shape)
rating_df.head(3)

(100836, 4)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


## preprocessing

In [4]:
# extract movie year
def rule(title):
    if title == 'Babylon 5':
        return 1994
    elif title == 'Ready Player One':
        return 2018
    elif title == 'Hyena Road':
        return 2015
    elif title == 'The Adventures of Sherlock Holmes and Doctor Watson':
        return 1986
    elif title == 'Nocturnal Animals' \
        or title == 'Paterson' \
        or title == 'Moonlight' \
        or title == 'The OA':
        return 2016
    elif title == 'Cosmos':
        return 1980
    elif title == 'Maria Bamford: Old Baby' \
        or title == 'Generation Iron 2':
        return 2017
    elif title == 'Black Mirror':
        return 2011
    return 2006


def func(title):
    try:
        ex = re.search(r'(\([0-9]{4}\))', title).group()
        return int(re.search(r'[0-9]{4}', ex).group())
    except:
        return rule(title)

In [5]:
# genres
def get_genres(movie_genres):
    genres = set()
    for movie in movie_genres:
        for genre in movie.split('|'):
            if genre == '(no genres listed)':
                continue
            genres.add(genre)
    genres = list(genres)
    genres.sort()
    return genres


genres = get_genres(movie_df['genres'])
def get_genre_vectors(movie_genre):
    return {genre: 1 if genre in movie_genre else 0 for genre in genres}


def get_genres_df(movie_df):
    movie_genres = [get_genre_vectors(genre) for genre in movie_df['genres']]
    df = pd.DataFrame(movie_genres)
    return df

In [6]:
def get_feature_df(movie_df, rating_df):
    # preprocessing movie_df
    movie_df = movie_df.assign(year=movie_df['title'].apply(lambda title: func(title)))
    movie_genres_df = get_genres_df(movie_df)
    ex_movie_df = pd.concat([movie_df, movie_genres_df], axis=1)
    del movie_genres_df
    
    # merge
    merge_df = pd.merge(rating_df, ex_movie_df, on='movieId')
    merge_df.drop(labels=['timestamp', 'title', 'genres'], axis=1, inplace=True)
    del ex_movie_df
    
    user_one_hot_df = pd.get_dummies(merge_df['userId'].apply(lambda x: 'user_{0:03d}'.format(x)))
    movie_one_hot_df = pd.get_dummies(merge_df['movieId'].apply(lambda x: 'movie_{0:03d}'.format(x)))
    df = pd.concat([user_one_hot_df, movie_one_hot_df, merge_df], axis=1)
    df.drop(labels=['userId', 'movieId'], axis=1, inplace=True)
    del merge_df
    del user_one_hot_df
    del movie_one_hot_df
    gc.collect()
    
    return df

In [19]:
feature_df = get_feature_df(movie_df, rating_df.sample(10000))
display(feature_df.shape)
display(feature_df.head(3))

(10000, 4236)

Unnamed: 0,user_001,user_002,user_003,user_004,user_005,user_006,user_007,user_008,user_009,user_010,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0


factorization machine

In [52]:
model = TFFMRegressor(
    order=2,
    rank=7,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.1),
    n_epochs=100,
    batch_size=-1,
    init_std=0.001,
    input_type='dense'
)
display('create model.')

'create model.'

In [51]:
X = feature_df.drop(labels=['rating'], axis=1).values
y = feature_df['rating'].values

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
del X
del y
gc.collect()

70

In [53]:
model.fit(x_train, y_train)

In [54]:
y_preds = model.predict(x_test)

In [55]:
# evaluations
print(f'MSE: {mean_squared_error(y_test, y_preds)}')

MSE: 4002.168850686591


In [56]:
y_preds

array([  -6.428589 ,   -6.3564453,   -4.087494 , ...,  -10.72876  ,
         -1.476532 , -122.93381  ], dtype=float32)