# [Spotlight ](https://github.com/maciejkula/spotlight)
Spotlight uses PyTorch to build both deep and shallow recommender models. By providing both a slew of building blocks for loss functions (various pointwise and pairwise ranking losses), representations (shallow factorization representations, deep sequence models), and utilities for fetching (or generating) recommendation datasets, it aims to be a tool for rapid exploration and prototyping of new recommender models.

## [Docs](https://maciejkula.github.io/spotlight/)

In [5]:
import vaex

df = vaex.open('data/imdb.parquet').as_numpy()

userid = 'userId'
itemid = 'movieId'
title = 'title'

counts = df[itemid].value_counts()
counts = counts[counts > 100]  # Remove rare movies
df = df[df[itemid].isin(counts.index)]
unique_movies = df.groupby(['movieId', 'title']).agg({'count': 'count'})
titles = {movie: name for movie, name in
          zip(unique_movies['movieId'].tolist(), unique_movies['title'].tolist())}

df = df.head(1000_000)  # for quick development
df.head(2)

#,userId,movieId,rating,timestamp,name,title,genres,year,url
0,1,2,3.5,20050402T235347,Fausto Orms,Jumanji,"[""Adventure"",""Children"",""Fantasy""]",1995,'http://image.tmdb.org/t/p/w500/vzmL6fP7aPKNKPRT...
1,5,2,3.0,19961225T152609,Antony Maguire,Jumanji,"[""Adventure"",""Children"",""Fantasy""]",1995,'http://image.tmdb.org/t/p/w500/vzmL6fP7aPKNKPRT...


In [6]:
from spotlight.interactions import Interactions
from spotlight.cross_validation import random_train_test_split
from spotlight.evaluation import rmse_score
from spotlight.factorization.explicit import ExplicitFactorizationModel

dataset = Interactions(user_ids=df['userId'].values,
                       item_ids=df['movieId'].values,
                       ratings=df['rating'].values,
                       timestamps=df['timestamp'].to_numpy())

""" 
Note.
In recommendations engines, many times you want to train-test-split while preserving 
high ratings on the test-set, beacuse you will only recommend those.
In real-life you are not evaluated by how much you can tell people didn't like a movie.

In this example we ignore this (:
"""

train, test = random_train_test_split(dataset)

model = ExplicitFactorizationModel(n_iter=1)
model.fit(train)

rmse = rmse_score(model, test)
print(f"rmse: {rmse}")

rmse: 0.8437027364504517


In [7]:
# groupby-concatenate currently not supported in vaex
topk = 5
users = df[['userId', 'movieId']].to_pandas_df()
users_history = users.groupby(['userId'])['movieId'].apply(list).to_dict()
qustions = set(df['movieId'].unique())
users_options = {user: qustions.difference(history) for user, history in users_history.items()}
most_popular = list(df['movieId'].value_counts()[:topk].index)
most_popular_titles = [titles.get(i) for i in most_popular]
print(f"Most popular movies: {most_popular_titles}")

Most popular movies: ['Pulp Fiction', 'Shawshank Redemption, The', 'Silence of the Lambs, The', 'Star Wars: Episode IV - A New Hope', 'Terminator 2: Judgment Day']


In [8]:
import pyarrow as pa
import pandas as pd
import numpy as np


@vaex.register_function()
def recommend(ar, topk=5):
    ret = []
    for user in ar.tolist():
        user_options = list(users_options.get(user, most_popular))
        if not user_options or pd.isna(user) or user < 0:
            ret.append(most_popular_titles)
        else:
            # cool way to sort topk
            recommendations = model.predict(user,
                                            np.array(user_options)).argsort()[-topk:][
                              ::-1]
            recommendations = [titles.get(i) for i in recommendations]
            ret.append(recommendations)
    return pa.array(ret)


df.add_function('recommend', recommend)
df['recommendations'] = df.userId.recommend()
df.head(2)

#,userId,movieId,rating,timestamp,name,title,genres,year,url,recommendations
0,1,2,3.5,20050402T235347,Fausto Orms,Jumanji,"[""Adventure"",""Children"",""Fantasy""]",1995,'http://image.tmdb.org/t/p/w500/vzmL6fP7aPKNKPRT...,"""['Pulp Fiction', 'Shawshank Redemption, The', '..."
1,5,2,3.0,19961225T152609,Antony Maguire,Jumanji,"[""Adventure"",""Children"",""Fantasy""]",1995,'http://image.tmdb.org/t/p/w500/vzmL6fP7aPKNKPRT...,"""['Toy Story', 'Sudden Death', 'American Preside..."


In [None]:
from goldilox import Pipeline

pipeline = Pipeline.from_vaex(df)
pipeline.raw = {"userId": 5}
pipeline.inference(pipeline.raw, columns=['userId', 'recommendations'])

Note that you must provide the columns at inference time or you will get an error as the other columns are not Nones but missing.