In [26]:
USER = 'ruvim'

# Load preferences

In [27]:
from _recommend_helper import load_rated, dataset
import pandas as pd

In [28]:
try:
    pref = load_rated(USER)
except FileNotFoundError:
    raise FileNotFoundError(f'no preference file for {USER!r} found, make sure you ran ../rate_movies/rate_movies.ipynb')

In [29]:
pref.head()

Unnamed: 0,movieId,rating,userId
5,2412,4.0,ruvim
7,3248,1.0,ruvim
8,2422,1.5,ruvim
10,2404,1.0,ruvim
19,586,3.0,ruvim


In [30]:
ratings = dataset.load('ratings.csv').drop('timestamp', axis=1)
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


# Combine user ratings with all MovieLens ratings to create a DataFrame ready for learning!

In [31]:
from _recommend_helper import conv_to_user_item_rating_order

In [32]:
df_for_surprise = pd.concat([ratings, pref]).reset_index(drop=True)
df_for_surprise = conv_to_user_item_rating_order(df_for_surprise)
df_for_surprise

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100941,ruvim,8874,4.0
100942,ruvim,1225,2.5
100943,ruvim,6016,4.0
100944,ruvim,6711,2.0


# Machine learning time

This notebook uses the `scikit-surprise` package to generate recommendations. The package implements several useful recommendation algorithms such as support vector decomposition (`SVD`).

In [33]:
from _recommend_helper import load_model, save_model

In [34]:
try:
    model = load_model(USER)
except FileNotFoundError:
    from surprise import Dataset, Reader, SVD
    from surprise.model_selection import GridSearchCV
    param_grid = {'n_factors': (10, 20), 'n_epochs': (20, 30)}
    grid = GridSearchCV(SVD, param_grid=param_grid, measures=['rmse'], refit=True)
    dataset_for_surprise = Dataset.load_from_df(df_for_surprise, Reader())
    grid.fit(dataset_for_surprise)
    model = grid.best_estimator['rmse']
    save_model(USER, model)

In [35]:
movieIds = set(ratings['movieId'])
not_rated_by_user = movieIds - set(pref['movieId'])
predictions = [(movieId, model.predict(USER, movieId).est) for movieId in not_rated_by_user]
# predictions is a list of tuples of the format (movieId, estimated_rating) 

In [36]:
from _recommend_helper import posters
def get_top_predictions(predictions, top_n=None):
    if top_n is None:
        top_n = 20
    return sorted(predictions, key=lambda tuple_: -tuple_[1])[:top_n]
def display_top_predictions(predictions, top_n=None):
    top_predictions = get_top_predictions(predictions, top_n)
    for movieId, est in top_predictions:
        print(f'Estimted rating {est:.1f}: "{posters.get_title(movieId)}"')
        try:
            posters.display_movie(movieId, show_title=False)
        except TypeError:  # no movie poster
            pass

In [156]:
display_top_predictions(predictions, 30)

Estimted rating 4.5: "Streetcar Named Desire, A (1951)"


Estimted rating 4.5: "There Will Be Blood (2007)"


Estimted rating 4.4: "Blazing Saddles (1974)"


Estimted rating 4.4: "Cool Hand Luke (1967)"


Estimted rating 4.4: "High Noon (1952)"


Estimted rating 4.4: "Top Secret! (1984)"


Estimted rating 4.4: "Paths of Glory (1957)"


Estimted rating 4.4: "Three Billboards Outside Ebbing, Missouri (2017)"


Estimted rating 4.4: "Laputa: Castle in the Sky (Tenkû no shiro Rapyuta) (1986)"


Estimted rating 4.4: "Philadelphia Story, The (1940)"


Estimted rating 4.3: "Hustler, The (1961)"


Estimted rating 4.3: "Dogville (2003)"


Estimted rating 4.3: "Glory (1989)"


Estimted rating 4.3: "My Fair Lady (1964)"


Estimted rating 4.3: "Lifeboat (1944)"


Estimted rating 4.3: "Dark Knight, The (2008)"


Estimted rating 4.3: "Grave of the Fireflies (Hotaru no haka) (1988)"


Estimted rating 4.3: "Monty Python's And Now for Something Completely Different (1971)"


Estimted rating 4.3: "Boot, Das (Boat, The) (1981)"


Estimted rating 4.3: "Toy Story 3 (2010)"


Estimted rating 4.3: "Mary and Max (2009)"


Estimted rating 4.3: "Seventh Seal, The (Sjunde inseglet, Det) (1957)"


Estimted rating 4.3: "On the Waterfront (1954)"


Estimted rating 4.3: "Five Easy Pieces (1970)"


Estimted rating 4.3: "Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)"


Estimted rating 4.3: "His Girl Friday (1940)"


Estimted rating 4.3: "Do the Right Thing (1989)"


Estimted rating 4.3: "Hot Fuzz (2007)"


Estimted rating 4.3: "Casino Royale (2006)"


Estimted rating 4.3: "Wallace & Gromit: The Best of Aardman Animation (1996)"


# Finally, let's see which movies differ greatly from the `avg` rating

If the model thinks you'll like a movie whose average rating is significantly low, that's interesting!

In [161]:
avg_ratings = ratings.drop('userId', axis=1).groupby('movieId').mean()
user_ratings = pd.DataFrame(predictions, columns=['movieId', 'rating']).set_index('movieId')
diff = user_ratings - avg_ratings
diff.head()  # positive means rating for USER is higher than average

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,
2,
3,0.053745
4,0.329136
5,0.010077


In [165]:
def get_most_different(diff, top_n=5, positive=False):
    '''If positive: return movies with greatest user - avg rating. 
    Else: return movies with least user - avg rating.
    '''
    diff = diff.dropna()
    diff.columns = ['user_minus_avg']
    return diff.sort_values('user_minus_avg').iloc[-top_n:] if positive else diff.sort_values(
        'user_minus_avg').iloc[:top_n]

In [166]:
most_pos_diff = get_most_different(diff, 10, positive=True)
most_pos_diff.head()

Unnamed: 0_level_0,user_minus_avg
movieId,Unnamed: 1_level_1
4775,2.800873
102735,2.807637
141668,2.813594
138798,2.823332
26696,2.827325


In [167]:
most_neg_diff = get_most_different(diff, 10, positive=False)
most_neg_diff.head()

Unnamed: 0_level_0,user_minus_avg
movieId,Unnamed: 1_level_1
120130,-1.635498
173963,-1.575724
163386,-1.55673
172793,-1.544369
136834,-1.536534


In [168]:
for movieId, (diff,) in pd.concat([most_neg_diff, most_pos_diff]).iterrows():
    print(f'Predicted {diff:.1f} {"more" if diff > 0 else "less"} than the average')
    posters.display_movie(movieId)

Predicted -1.6 less than the average


"Into the Forest of Fireflies' Light (2011)"

Predicted -1.6 less than the average


'Empties (2007)'

Predicted -1.6 less than the average


'Winnie the Pooh and the Day of Concern (1972)'

Predicted -1.5 less than the average


'Vovka in the Kingdom of Far Far Away (1965)'

Predicted -1.5 less than the average


'The Eye: Infinity (2005)'

Predicted -1.5 less than the average


'The Adventures of Sherlock Holmes and Dr. Watson: Bloody Signature (1979)'

Predicted -1.5 less than the average


'The Girls (1961)'

Predicted -1.5 less than the average


'George Carlin: You Are All Diseased (1999)'

Predicted -1.5 less than the average


'Sisters (Syostry) (2001)'

Predicted -1.5 less than the average


'Wings, Legs and Tails (1986)'

Predicted 2.8 more than the average


'Glitter (2001)'

Predicted 2.8 more than the average


'Captain America (1979)'

Predicted 2.8 more than the average


'War Room (2015)'

Predicted 2.8 more than the average


'Joe Dirt 2: Beautiful Loser (2015)'

Predicted 2.8 more than the average


'Lionheart (1990)'

Predicted 2.8 more than the average


'Tooth Fairy 2 (2012)'

Predicted 2.8 more than the average


'Are We There Yet? (2005)'

Predicted 2.9 more than the average


'Aloha (2015)'

Predicted 2.9 more than the average


'Fullmetal Alchemist 2018 (2017)'

Predicted 2.9 more than the average


'Brothers Solomon, The (2007)'