### We are going to do recommendations on MovieLens (netflix) type data.

In [1]:
# install surprise package
# https://surprise.readthedocs.io/en/stable/index.html

!pip install surprise

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/97/37/5d334adaf5ddd65da99fc65f6507e0e4599d092ba048f4302fe8775619e8/scikit-surprise-1.1.1.tar.gz (11.8MB)
[K     |████████████████████████████████| 11.8MB 5.2MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp36-cp36m-linux_x86_64.whl size=1670924 sha256=5677c10838a4d7ccb3e9b2a1abd5641371d8eb46570dbbace3f1b20213ed77ab
  Stored in directory: /root/.cache/pip/wheels/78/9c/3d/41b419c9d2aff5b6e2b4c0fc8d25c538202834058f9ed110d0
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


In [2]:
from surprise import KNNBaseline, SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import accuracy

import io  # needed because of weird encoding of u.item file


## Step 1 - Read Data

In [3]:
# Load the movielens-100k dataset (download it if needed).
data = Dataset.load_builtin('ml-100k')
print ("downloaded data")

Dataset ml-100k could not be found. Do you want to download it? [Y/n] y
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k
downloaded data


## Step 2 - Train

In [4]:
%%time

## Train the algo

trainset = data.build_full_trainset()

sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)


algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
CPU times: user 2.53 s, sys: 70.6 ms, total: 2.6 s
Wall time: 2.61 s


In [5]:
# Run 5-fold cross-validation and print results.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9239  0.9191  0.9141  0.9156  0.9119  0.9169  0.0042  
MAE (testset)     0.7249  0.7212  0.7157  0.7181  0.7158  0.7191  0.0035  
Fit time          1.91    1.92    1.93    2.01    1.94    1.94    0.04    
Test time         5.07    4.75    4.98    4.78    5.1

{'fit_time': (1.9122724533081055,
  1.9188756942749023,
  1.9316704273223877,
  2.0105323791503906,
  1.9439971446990967),
 'test_mae': array([0.72485881, 0.72122852, 0.71572297, 0.71805951, 0.71576478]),
 'test_rmse': array([0.92393415, 0.91908774, 0.91407964, 0.91561212, 0.91189253]),
 'test_time': (5.066446781158447,
  4.753808259963989,
  4.981435060501099,
  4.781335115432739,
  5.130431413650513)}

## Step 3 - Calculate The RMSE 

We want to see how our model does.

Anything less than plus or minus 0.5 star should be considered a success. That means on a scale of one to five we were less than half a star off.

In [6]:
testset = trainset.build_testset()
predictions = algo.test(testset)
# RMSE should be low as we are biased
accuracy.rmse(predictions, verbose=True) 



RMSE: 0.5584


0.558390314020892

## Step 4 - Parse Data

In [7]:
def read_item_names():
    """Read the u.item file from MovieLens 100-k dataset and return two
    mappings to convert raw ids into movie names and movie names into raw ids.
    """

    file_name = get_dataset_dir() + '/ml-100k/ml-100k/u.item'
    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]

    return rid_to_name, name_to_rid

In [8]:
from surprise import get_dataset_dir

# Read the mappings raw id <-> movie name
rid_to_name, name_to_rid = read_item_names()

print ("rid_to_name:")
iterator = iter(rid_to_name.items())
for i in range(10):
    print(next(iterator))

print()
print ("name_to_rid:")
iterator = iter(name_to_rid.items())
for i in range(10):
    print(next(iterator))

rid_to_name:
('1', 'Toy Story (1995)')
('2', 'GoldenEye (1995)')
('3', 'Four Rooms (1995)')
('4', 'Get Shorty (1995)')
('5', 'Copycat (1995)')
('6', 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)')
('7', 'Twelve Monkeys (1995)')
('8', 'Babe (1995)')
('9', 'Dead Man Walking (1995)')
('10', 'Richard III (1995)')

name_to_rid:
('Toy Story (1995)', '1')
('GoldenEye (1995)', '2')
('Four Rooms (1995)', '3')
('Get Shorty (1995)', '4')
('Copycat (1995)', '5')
('Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)', '6')
('Twelve Monkeys (1995)', '7')
('Babe (1995)', '8')
('Dead Man Walking (1995)', '9')
('Richard III (1995)', '10')


## Step 5 - Do Recommendations

Find similar movies


In [9]:
# Retrieve inner id of the movie Toy Story

movie_name = 'Toy Story (1995)'
# movie_name = 'Get Shorty (1995)'

movie_raw_id = name_to_rid[movie_name]
movie_inner_id = algo.trainset.to_inner_iid(movie_raw_id)

# Retrieve inner ids of the nearest neighbors of Toy Story.
neighbors = algo.get_neighbors(movie_inner_id, k=10)

# Convert inner ids of the neighbors into names.
neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in neighbors)
neighbors = (rid_to_name[rid]
                       for rid in neighbors)

print()
print('The 10 nearest neighbors for :  ', movie_name)
for m in neighbors:
    print(m)



The 10 nearest neighbors for :   Toy Story (1995)
Lion King, The (1994)
Raiders of the Lost Ark (1981)
Liar Liar (1997)
Beauty and the Beast (1991)
E.T. the Extra-Terrestrial (1982)
Dragonheart (1996)
Craft, The (1996)
That Thing You Do! (1996)
Aladdin (1992)
Private Parts (1997)
