# Data
- [Movie Lens 20m]( https://grouplens.org/datasets/movielens/20m/)
- [Movie Lens 100k](https://grouplens.org/datasets/movielens/100k/)

## Other datasets
- [Goodreads Dataset](https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/home)
- [Download Dataset here](https://drive.google.com/a/redbubble.com/uc?id=196W2kDoZXRPjzbTjM6uvTidn6aTpsFnS)

In [1]:
!pip install -q scikit-surprise elasticsearch progressbar2

In [2]:
from zipfile import ZipFile
import requests
import os
import json
import pandas as pd
from elasticsearch import Elasticsearch

In [3]:
# ml_ds = "ml-20m"
ml_ds = "ml-100k"
if not os.path.exists(ml_ds):
    print(f"Downloading dataset {ml_ds} from movielens")
    url = f"http://files.grouplens.org/datasets/movielens/{ml_ds}.zip"
    r = requests.get(url)
    with open(f"{ml_ds}.zip", 'wb') as f:
        f.write(r.content)
    with ZipFile(f"{ml_ds}.zip", 'r') as zipObj:
        zipObj.extractall()

# Train the recommendation model

In [4]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate

In [5]:
!head "{ml_ds}/u.data"

196	242	3	881250949
186	302	3	891717742
22	377	1	878887116
244	51	2	880606923
166	346	1	886397596
298	474	4	884182806
115	265	2	881171488
253	465	5	891628467
305	451	3	886324817
6	86	3	883603013


In [6]:
# Load the movielens-100k dataset (download it if needed).
# Load the ratings
ratings = pd.read_csv(f"{ml_ds}/u.data", sep='\s+', header=None)
ratings.columns = ['user id', 'item id', 'rating', 'timestamp']
data = Dataset.load_from_df(ratings[['user id', 'item id', 'rating']], reader = Reader(rating_scale=(ratings.rating.max(), ratings.rating.max())))
trainset = data.build_full_trainset()
# Use the famous SVD algorithm.
algo = SVD()

algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f7d4b8fae20>

In [7]:
embeddings = algo.qi
print("Shape of the embedding matrix", embeddings.shape)

Shape of the embedding matrix (1682, 100)


# Compute the embedding clusters

In [8]:
from sklearn.cluster import KMeans, MiniBatchKMeans
from progressbar import progressbar
import numpy as np

def compute_clusters(embeddings, subvector_dimension, n_clusters, stride_step=None, mini_batch=True, batch_size=100):
    stride_step = subvector_dimension if stride_step is None else stride_step
    strides = [(start, start+subvector_dimension) for start in range(0, embeddings.shape[1]-subvector_dimension+1, stride_step)]
    n_subvectors = len(strides)
    
    clusters = np.zeros((embeddings.shape[0], n_subvectors))
    for subvector, (start, end) in progressbar(enumerate(strides), max_value=n_subvectors):
        X = embeddings[:,start:end]
        if mini_batch:
            kmeans = MiniBatchKMeans(n_clusters=n_clusters, batch_size=batch_size, random_state=0, verbose=0).fit(X)
        else:
            kmeans = KMeans(n_clusters=n_clusters, random_state=0, verbose=0).fit(X)
        clusters[:, subvector] = kmeans.labels_
    return clusters


In [9]:
clusters = compute_clusters(embeddings, 10, 16, stride_step=5, mini_batch=False)
print("Shape of the clusters matrix", clusters.shape)

100% (19 of 19) |########################| Elapsed Time: 0:00:34 Time:  0:00:34


Shape of the clusters matrix (1682, 19)


In [10]:
# Convert map each embedding back to a movie
item_ids = [trainset.to_raw_iid(i) for i in trainset.all_items()]
embeddings_df = pd.DataFrame(data=zip(item_ids, embeddings, clusters))
embeddings_df.columns = ['movie id', 'embeddings', 'clusters']
embeddings_df.set_index('movie id', inplace=True)
embeddings_df.head()

Unnamed: 0_level_0,embeddings,clusters
movie id,Unnamed: 1_level_1,Unnamed: 2_level_1
242,"[0.20782335757816314, 0.11365519436144282, -0....","[4.0, 4.0, 5.0, 0.0, 0.0, 12.0, 2.0, 2.0, 9.0,..."
302,"[-0.049309109275262566, -0.13062960131502072, ...","[10.0, 5.0, 2.0, 11.0, 0.0, 8.0, 12.0, 13.0, 7..."
377,"[-0.19234432910536006, 0.02594788200355742, -0...","[8.0, 13.0, 11.0, 4.0, 11.0, 14.0, 4.0, 12.0, ..."
51,"[0.06383749429437204, 0.14527807113499341, -0....","[3.0, 0.0, 12.0, 4.0, 13.0, 9.0, 13.0, 1.0, 10..."
346,"[0.12648971364219952, 0.018650557343776356, -0...","[6.0, 0.0, 12.0, 6.0, 2.0, 11.0, 8.0, 12.0, 9...."


# Index the documents in ES

## Load the movie dataset

In [11]:
# Taken from the README
columns = ['movie id', 'movie title', 'release date', 'video release date', 'IMDb URL', 'unknown', 
           'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 
           'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
features = ['movie title', 'IMDb URL', 'Genre']
items = pd.read_csv("ml-100k/u.item", sep='|', header=None, encoding='latin1')
items.columns = columns
items.set_index('movie id', inplace=True)

items['IMDb URL'].fillna("", inplace=True)

# Convert the genre columns into a single column
items['Genre'] = items.reset_index().melt('movie id', var_name='genre').query('value == 1')\
       .groupby('movie id')['genre']\
       .apply(list)

# Only keep the interesting features
items = items.loc[:,features]

# Add average ratings
avg_ratings = ratings.groupby('item id').agg({'rating': 'mean', 'user id': 'count'}).reset_index()
avg_ratings.columns = ['movie id', 'avg rating', 'ratings']
avg_ratings.set_index('movie id', inplace=True)

items = items.merge(avg_ratings, right_index=True, left_index=True)
items = items.merge(embeddings_df, right_index=True, left_index=True)
print("Shape of the items matrix", items.shape)
# items['clusters'] = clusters
items.head()

Shape of the items matrix (1682, 7)


Unnamed: 0_level_0,movie title,IMDb URL,Genre,avg rating,ratings,embeddings,clusters
movie id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Toy Story (1995),http://us.imdb.com/M/title-exact?Toy%20Story%2...,"[Animation, Children's, Comedy]",3.878319,452,"[0.12190266216256603, 0.2510029000553083, 0.07...","[14.0, 6.0, 8.0, 5.0, 12.0, 13.0, 2.0, 1.0, 6...."
2,GoldenEye (1995),http://us.imdb.com/M/title-exact?GoldenEye%20(...,"[Action, Adventure, Thriller]",3.206107,131,"[-0.03482466320014828, 0.13617556032065942, 0....","[4.0, 6.0, 9.0, 1.0, 3.0, 5.0, 5.0, 9.0, 14.0,..."
3,Four Rooms (1995),http://us.imdb.com/M/title-exact?Four%20Rooms%...,[Thriller],3.033333,90,"[-0.126453935516089, -0.17279759353164445, -0....","[11.0, 11.0, 3.0, 15.0, 9.0, 1.0, 11.0, 14.0, ..."
4,Get Shorty (1995),http://us.imdb.com/M/title-exact?Get%20Shorty%...,"[Action, Comedy, Drama]",3.550239,209,"[-0.0196448767405285, -0.22630784013334612, -0...","[10.0, 15.0, 2.0, 0.0, 9.0, 0.0, 1.0, 6.0, 1.0..."
5,Copycat (1995),http://us.imdb.com/M/title-exact?Copycat%20(1995),"[Crime, Drama, Thriller]",3.302326,86,"[0.06427454306598854, -0.04722566274684215, -0...","[6.0, 11.0, 10.0, 12.0, 12.0, 12.0, 8.0, 9.0, ..."


## ElasticSearch setup

In [12]:
# Create a client that will talk to the elastic search container running alongside this notebook
es_client = Elasticsearch(hosts=[{'host': 'elasticsearch', 'port': '9200'}])

In [13]:
# (re)Create an index for our movies
index_definition = {
    "settings" : {
        "similarity": {
          "simple_cluster_score": {
            "type": "scripted",
            "script": {
              "source": "return query.boost * (doc.freq / doc.length) ;"
            }
          }
        }
    },
    "mappings" : {
        "_source": {
          "enabled": True # we store the whole document for debugging purposes
        },
        "properties" : {
            "title" : { "type" : "text" },
            "genre" : { "type" : "text" },
            "avg_rating" : { "type" : "float" },
            "ratings" : { "type" : "integer" },
            "imdb_url" : { "type" : "keyword" },
            "clusters": {
              "type": "text",
              "analyzer": "keyword",
              "term_vector": "yes", # this is required for the more_like_this query to work
              "fields": {
                "simple": {
                  "type": "text",
                  "analyzer": "keyword",
                  "similarity": "simple_cluster_score",
                  "term_vector": "yes" # this is required for the more_like_this query to work
                }
              }
            }
        }
    }
}

es_client.indices.delete(index="movies", ignore_unavailable=True)
es_client.indices.create(index="movies", body=index_definition)



{'acknowledged': True, 'shards_acknowledged': True, 'index': 'movies'}

In [18]:
items.head().to_dict(orient='index')

def clusters_to_string(clusters):
    return ["x{i}k{k:.0f}".format(i=i, k=k) for i, k in enumerate(clusters)]

def index_movie(es_client, movie_id, title, avg_rating, ratings, genre, imdb_url, clusters):
    doc = {
        "title": title,
        "genre": genre,
        "avg_rating": avg_rating,
        "ratings": ratings,
        "imdb_url": imdb_url,
        "clusters": clusters
    }
    try:
        es_client.index(index="movies", body=doc, id=movie_id)
    except Exception as e:
        print("Error while indexing", doc)
        print(e)

In [19]:
# Index all movies in our ElasticSearch "cluster"
#  in real life this should be done in batches for performance
for movie_id, movie in progressbar(items.to_dict(orient='index').items()):
    clusters = clusters_to_string(movie['clusters'])
    index_movie(es_client, movie_id, title=movie['movie title'], avg_rating=movie['avg rating'], ratings=movie['ratings'],
                genre=movie['Genre'], imdb_url=movie['IMDb URL'], clusters=clusters)

100% (1682 of 1682) |####################| Elapsed Time: 0:00:03 Time:  0:00:03


In [20]:
def search(genre, liked_movies=[], pers_weight=100, mlt_field='clusters.simple', debug=False):
    should_clauses = [
        {
            "function_score": {
            "field_value_factor": {
                "field": "avg_rating",
                "factor": 1
            }
        }
        }
    ]
    if len(liked_movies) > 0:
        should_clauses += [{
          "more_like_this": {
            "fields": [ mlt_field ],
            "like": [ { "_index": 'movies', "_id": str(i) } for i in liked_movies ],
            "min_term_freq": 1,
            "max_query_terms": 32,
            "min_doc_freq": 1,
            "boost": pers_weight
          }
        }]
    query = {
        "explain": "true",
        "query" : {
            "bool": {
                "filter": [{
                    "term" : {
                        "genre": genre
                    }
                }],
                "should": should_clauses
            }
        }
    }
    results =  es_client.search(index="movies", body=query)
    if debug:
        print(json.dumps(query))
    for i, hit in enumerate(results['hits']['hits']):
        print(f"{i} {hit['_source']['title']} (score: {hit['_score']:.2f}")
    

In [21]:
search("action")

0 Star Wars (1977) (score: 4.36
1 Godfather, The (1972) (score: 4.28
2 Raiders of the Lost Ark (1981) (score: 4.25
3 Titanic (1997) (score: 4.25
4 Empire Strikes Back, The (1980) (score: 4.20
5 Boot, Das (1981) (score: 4.20
6 Godfather: Part II, The (1974) (score: 4.19
7 African Queen, The (1951) (score: 4.18
8 Princess Bride, The (1987) (score: 4.17
9 Braveheart (1995) (score: 4.15


In [22]:
search("action", [2], debug=True)

{"explain": "true", "query": {"bool": {"filter": [{"term": {"genre": "action"}}], "should": [{"function_score": {"field_value_factor": {"field": "avg_rating", "factor": 1}}}, {"more_like_this": {"fields": ["clusters.simple"], "like": [{"_index": "movies", "_id": "2"}], "min_term_freq": 1, "max_query_terms": 32, "min_doc_freq": 1, "boost": 100}}]}}}
0 Clear and Present Danger (1994) (score: 35.15
1 Executive Decision (1996) (score: 34.94
2 Independence Day (ID4) (1996) (score: 29.75
3 Tomorrow Never Dies (1997) (score: 29.74
4 The Deadly Cure (1996) (score: 29.32
5 Muppet Treasure Island (1996) (score: 29.08
6 Sudden Death (1995) (score: 29.04
7 Star Wars (1977) (score: 4.36
8 Godfather, The (1972) (score: 4.28
9 Raiders of the Lost Ark (1981) (score: 4.25


In [23]:
for user_id, row in ratings[ratings.rating >= 3].groupby('user id').agg({'item id': list}).sample(10).iterrows():
    selected_movies = row['item id'][:5] # pick 10 movies
    titles = items.loc[selected_movies]['movie title'].values
    genre = items.loc[selected_movies]['Genre'].values[0][0]
    print(f"{genre} movies for user {user_id}")
    print(" liked movies:", ", ".join(titles))
    search(genre.lower(), selected_movies)
    print()
    
    

Animation movies for user 490
 liked movies: Toy Story (1995), Game, The (1997), Dead Man Walking (1995), Tin Cup (1996), Men in Black (1997)
0 Close Shave, A (1995) (score: 4.49
1 Wrong Trousers, The (1993) (score: 4.47
2 Wallace & Gromit: The Best of Aardman Animation (1996) (score: 4.45
3 Faust (1994) (score: 4.20
4 Grand Day Out, A (1992) (score: 4.11
5 Toy Story (1995) (score: 3.88
6 Aladdin (1992) (score: 3.81
7 Winnie the Pooh and the Blustery Day (1968) (score: 3.80
8 Beauty and the Beast (1991) (score: 3.79
9 Lion King, The (1994) (score: 3.78

Horror movies for user 814
 liked movies: Village of the Damned (1995), M (1931), Fargo (1996), Psycho (1960), Shining, The (1980)
0 Blob, The (1958) (score: 55.35
1 Psycho (1960) (score: 4.10
2 Alien (1979) (score: 4.03
3 Nightwatch (1997) (score: 4.00
4 Young Frankenstein (1974) (score: 3.94
5 Braindead (1992) (score: 3.86
6 Shining, The (1980) (score: 3.83
7 Birds, The (1963) (score: 3.81
8 Jaws (1975) (score: 3.77
9 Night Flier (199