# MovieLens

## Prepare 

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
from __future__ import division
import sys
import os
from os.path import join as pjoin
from glob import glob
from matplotlib import pyplot as plt
sys.path.append('..')

In [2]:
import cv2
import csv
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from utils.commands import unzip, mkdir, call, count_file, KaggleCLI, execute_in, unzip_all
from utils.plot import plot_images, plot_confusion_matrix

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [52]:
import zipfile

from keras import backend as K
from keras import optimizers, initializers, losses, callbacks, regularizers
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential, Model
from keras.layers import Dense, Input, Flatten, InputLayer, BatchNormalization, Dropout, Embedding, dot, add, concatenate
from keras.utils.data_utils import get_file
from keras.regularizers import l2
from keras.optimizers import Adam

In [4]:
# Three working directories, all rooted at the current working directory.
model_path, cal_path, data_path = (
    pjoin(os.getcwd(), sub_dir) for sub_dir in ('models', 'cal', 'data')
)
# Hand each path to the project's mkdir helper (presumably creates the
# directory when it is missing -- confirm in utils.commands).
for p in (model_path, cal_path, data_path):
    mkdir(p)

Steps covered in this notebook:
- Examine the data
- Collaborative filtering


## Examine Data

### Loading data

In [13]:
# Download the MovieLens "latest small" archive into data_path.
zip_path = get_file(pjoin(data_path, 'ml-latest-small.zip'),
                    origin='http://files.grouplens.org/datasets/movielens/ml-latest-small.zip')

# The loading cell reads data_path/ml-latest-small/ratings.csv, which the
# download alone does not create, so unpack the archive here.
# Assumes the archive unpacks into an ml-latest-small/ folder.
# Extraction is skipped when that folder already exists, keeping re-runs cheap.
if not os.path.isdir(pjoin(data_path, 'ml-latest-small')):
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(data_path)

# Keep the archive path as the cell's displayed result.
zip_path

Downloading data from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip


'/home/shared/ZhaoyuWorkSpace/kaggle_fun/movielens/data/ml-latest-small.zip'

In [5]:
# Read the ratings table; the cells below use its userId, movieId and
# rating columns.
ratings_csv = pjoin(data_path, 'ml-latest-small', 'ratings.csv')
ratings = pd.read_csv(ratings_csv)

### Preprocessing Data

In [9]:
# An Embedding(n, ...) layer only accepts indices in [0, n). Raw ids are not
# guaranteed to be zero-based or contiguous, so map each distinct id to a
# dense 0..n-1 code before the ids are fed to the models.
# Factorizing is idempotent, so re-running this cell leaves the codes unchanged.
user_codes = pd.factorize(ratings.userId)[0]
movie_codes = pd.factorize(ratings.movieId)[0]
ratings = ratings.assign(userId=user_codes, movieId=movie_codes)

# Number of distinct users / movies: the row count of each embedding table.
n_users = ratings.userId.nunique()
n_movies = ratings.movieId.nunique()

In [10]:
# Hold out 20% of the ratings for validation. The fixed random_state makes
# the split, and therefore the reported losses, reproducible across runs.
train, valid = train_test_split(ratings, test_size=0.2, random_state=42)

In [11]:
# Show the sizes of both splits. Formatting the string explicitly gives the
# same text on Python 2 and Python 3 (the bare print statement is Python 2 only).
print('{} {}'.format(train.shape, valid.shape))

(80003, 4) (20001, 4)


## Collaborative Filtering

### Simple filtering

In [46]:
def build_model(n_factors=50, reg=1e-4):
    """Build the dot-product collaborative-filtering model with bias terms.

    The prediction for a (user, movie) pair is the dot product of their
    latent-factor embeddings plus a per-user bias and a per-movie bias.
    The embedding table sizes come from the notebook-level ``n_users`` and
    ``n_movies``.

    Args:
        n_factors: Length of each latent-factor vector.
        reg: L2 regularization strength applied to both factor embeddings
            (the bias embeddings are not regularized).

    Returns:
        An uncompiled Keras ``Model`` taking ``[user_in, movie_in]``. Its
        summary is printed as a side effect.
    """
    # User branch: latent factors and a scalar bias looked up by user index.
    user_in = Input(shape=(1,), dtype='int64', name='user_in')
    user_factors = Embedding(n_users, n_factors, input_length=1,
                             embeddings_regularizer=l2(reg))(user_in)
    user_bias = Embedding(n_users, 1, input_length=1)(user_in)

    # Movie branch: same structure, looked up by movie index.
    movie_in = Input(shape=(1,), dtype='int64', name='movie_in')
    movie_factors = Embedding(n_movies, n_factors, input_length=1,
                              embeddings_regularizer=l2(reg))(movie_in)
    movie_bias = Embedding(n_movies, 1, input_length=1)(movie_in)

    # Dot product over the factor axis, flattened before adding the biases.
    score = dot([user_factors, movie_factors], -1)
    score = Flatten()(score)
    user_bias = Flatten()(user_bias)
    score = add([score, user_bias])
    movie_bias = Flatten()(movie_bias)
    score = add([score, movie_bias])

    model = Model([user_in, movie_in], score)
    model.summary()
    return model

model = build_model()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_in (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
movie_in (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_29 (Embedding)        (None, 1, 50)        33550       user_in[0][0]                    
__________________________________________________________________________________________________
embedding_31 (Embedding)        (None, 1, 50)        453300      movie_in[0][0]                   
__________________________________________________________________________________________________
dot_9 (Dot

In [47]:
# Train with Adam (step size 0.001) on mean squared error.
model.compile(optimizer=Adam(0.001), loss='mse')

In [48]:
# First training pass: 5 epochs, reporting loss on the held-out split.
model.fit(
    x=[train.userId, train.movieId],
    y=train.rating,
    batch_size=64,
    epochs=5,
    validation_data=([valid.userId, valid.movieId], valid.rating),
)

Train on 80003 samples, validate on 20001 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fc245c7c110>

In [49]:
# Lower the learning rate for the next training pass. The optimizer's `lr`
# is a backend variable already wired into the compiled training function,
# so it must be updated in place. Assigning a plain float would only rebind
# the attribute and leave training at the old rate.
K.set_value(model.optimizer.lr, 0.0001)

In [51]:
# Second training pass: 10 more epochs on the same data.
model.fit(
    x=[train.userId, train.movieId],
    y=train.rating,
    batch_size=64,
    epochs=10,
    validation_data=([valid.userId, valid.movieId], valid.rating),
)

Train on 80003 samples, validate on 20001 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc245f98610>

### Filtering with a neural net

In [53]:
def build_model(n_factors=50, n_hidden=100, reg=1e-4, dropout_in=0.3, dropout_hidden=0.5):
    """Build the neural-network collaborative-filtering model.

    The user and movie embeddings are concatenated and passed through one
    hidden dense layer to a single linear output unit. The embedding table
    sizes come from the notebook-level ``n_users`` and ``n_movies``.

    Note: this definition replaces the earlier ``build_model`` in the
    notebook namespace.

    Args:
        n_factors: Length of each embedding vector.
        n_hidden: Number of units in the hidden dense layer.
        reg: L2 regularization strength applied to both embeddings.
        dropout_in: Dropout rate applied to the concatenated embeddings.
        dropout_hidden: Dropout rate applied after the hidden layer.

    Returns:
        An uncompiled Keras ``Model`` taking ``[user_in, movie_in]``. Its
        summary is printed as a side effect.
    """
    user_in = Input(shape=(1,), dtype='int64', name='user_in')
    user_factors = Embedding(n_users, n_factors, input_length=1,
                             embeddings_regularizer=l2(reg))(user_in)

    movie_in = Input(shape=(1,), dtype='int64', name='movie_in')
    movie_factors = Embedding(n_movies, n_factors, input_length=1,
                              embeddings_regularizer=l2(reg))(movie_in)

    hidden = concatenate([user_factors, movie_factors], -1)
    hidden = Flatten()(hidden)
    hidden = Dropout(dropout_in)(hidden)
    # The hidden layer needs a nonlinearity. Without one, the two Dense
    # layers collapse into a single linear map of the embeddings.
    hidden = Dense(n_hidden, activation='relu')(hidden)
    hidden = Dropout(dropout_hidden)(hidden)
    rating = Dense(1)(hidden)

    model = Model([user_in, movie_in], rating)
    model.summary()
    return model

model = build_model()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_in (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
movie_in (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_33 (Embedding)        (None, 1, 50)        33550       user_in[0][0]                    
__________________________________________________________________________________________________
embedding_34 (Embedding)        (None, 1, 50)        453300      movie_in[0][0]                   
__________________________________________________________________________________________________
concatenat

In [54]:
# Train the neural-net model with Adam (step size 0.001) on mean squared error.
model.compile(optimizer=Adam(0.001), loss='mse')

In [55]:
# Train for 3 epochs, reporting loss on the held-out split.
model.fit(
    x=[train.userId, train.movieId],
    y=train.rating,
    batch_size=64,
    epochs=3,
    validation_data=([valid.userId, valid.movieId], valid.rating),
)

Train on 80003 samples, validate on 20001 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fc244ff2cd0>