# Netflix recommendations

<div class="alert alert-block alert-warning">
Replace <code>YOUR_GITHUB_TOKEN</code> in the install command below. To get your token, follow the instructions in the <a href="../README.md">README.md</a>.
</div>

## Boilerplate

In [None]:
%pip install gcsfs pandas
%pip install  'https://us-central1-data-359211.cloudfunctions.net/github-proxy/superlinked-0.9.0.post1+git.5684bae2-py3-none-any.whl?token=YOUR_GITHUB_TOKEN'

## Imports and constants

In [1]:
import pandas as pd

from datetime import timedelta

from superlinked.framework.common.dag.recency_node import PeriodTimeParam
from superlinked.framework.common.schema.schema import schema
from superlinked.framework.common.schema.schema_object import String, Timestamp
from superlinked.framework.common.schema.id_schema_object import IdField
from superlinked.framework.common.parser.dataframe_parser import DataFrameParser
from superlinked.framework.dsl.executor.in_memory.in_memory_executor import InMemoryExecutor, InMemoryApp
from superlinked.framework.dsl.index.index import Index
from superlinked.framework.dsl.query.param import Param
from superlinked.framework.dsl.query.query import Query
from superlinked.framework.dsl.query.result import Result
from superlinked.framework.dsl.source.in_memory_source import InMemorySource
from superlinked.framework.dsl.space.text_similarity_space import TextSimilaritySpace
from superlinked.framework.dsl.space.recency_space import RecencySpace

In [2]:
# Model identifier handed to every TextSimilaritySpace in this notebook.
MODEL = "sentence-transformers/paraphrase-MiniLM-L3-v2"
# Number of days used for one year when building the recency periods (leap days are ignored).
YEAR_IN_DAYS = 365
# Default number of results kept by parse_results.
TOP_N = 10
# Location of the titles CSV that is read with pandas in the "Explore dataset" section.
DATASET_URL = "https://storage.googleapis.com/superlinked-notebook-netflix-shows-dataset/titles.csv"

## Explore dataset

In [3]:
import ast

# Load the raw titles table from DATASET_URL.
movie_df = pd.read_csv(DATASET_URL)
# Keep only the columns used below, drop rows that repeat a description, then
# drop rows with a missing value in any of the kept columns.
movie_df = (
    movie_df[["description", "genres", "title", "release_year"]]
    .drop_duplicates(subset=["description"])
    .dropna(how="any")
)
# NOTE(review): the built-in hash() of a str is salted per interpreter process
# unless PYTHONHASHSEED is fixed, so these ids are only stable within one
# session. That is enough for this notebook, but the ids must not be persisted.
movie_df["id"] = movie_df["description"].map(hash)
# `genres` presumably holds a list literal as text (e.g. "['drama', 'crime']").
# Parse it with ast.literal_eval rather than eval so that text coming from the
# downloaded CSV can never execute code; then flatten it to a space-separated string.
movie_df["genres"] = movie_df["genres"].apply(lambda x: " ".join(ast.literal_eval(x)))
# Epoch seconds of January 1st of the release year.
movie_df["timestamp"] = [
    pd.Timestamp(year=year, month=1, day=1).timestamp()
    for year in movie_df["release_year"].tolist()
]
movie_df.head()

Unnamed: 0,description,genres,title,release_year,id,timestamp
0,This collection includes 12 World War II-era p...,documentation,Five Came Back: The Reference Films,1945,8278881001097473458,-788918400.0
1,A mentally unstable Vietnam War veteran works ...,drama crime,Taxi Driver,1976,6189909913043756299,189302400.0
2,Intent on seeing the Cahulawassee River before...,drama action thriller european,Deliverance,1972,-3438940832439607707,63072000.0
3,"King Arthur, accompanied by his squire, recrui...",fantasy action comedy,Monty Python and the Holy Grail,1975,2384525310414784514,157766400.0
4,12 American military prisoners in World War II...,war action,The Dirty Dozen,1967,-4641624687685937899,-94694400.0


## Set up Superlinked

In [4]:
@schema
class MovieSchema:
    """Schema of one movie record as it is ingested and indexed in this notebook."""
    # Text of the movie description; embedded by `description_space`.
    description: String
    # Movie title; embedded by `title_space`.
    title: String
    # Release time; filled from the DataFrame's "timestamp" column via the parser mapping
    # and used by `recency_space`.
    release_timestamp: Timestamp
    # Genres as one string; embedded by `genre_space`.
    genres: String
    # Record identifier; the helpers below convert it back to int to look rows up in the DataFrame.
    id: IdField

In [5]:
# Schema instance whose fields are referenced when building the spaces, the parser mapping and the queries.
movie = MovieSchema()

In [6]:
# One text-similarity space per free-text attribute, all built with the same MODEL.
description_space, title_space, genre_space = (
    TextSimilaritySpace(text=text_field, model=MODEL)
    for text_field in (movie.description, movie.title, movie.genres)
)

# Recency over the release timestamp, with periods of 4, 10 and 40 years
# (a year being YEAR_IN_DAYS days).
recency_space = RecencySpace(
    timestamp=movie.release_timestamp,
    period_time_param_list=[
        PeriodTimeParam(timedelta(days=years * YEAR_IN_DAYS))
        for years in (4, 10, 40)
    ],
    negative_filter=0,
)

In [7]:
# Single index built from the three text spaces and the recency space.
movie_index = Index(spaces=[description_space, title_space, genre_space, recency_space])

In [8]:
# One shared parameter: the same query text is compared against all three text spaces.
query_text_param = Param("query_text")

# Simple query: per-space weights are supplied at query time through the
# "*_weight" parameters; a single "query_text" drives every text space.
simple_query = (
    Query(movie_index, weights={
        description_space: Param("description_weight"),
        title_space: Param("title_weight"),
        genre_space: Param("genre_weight"),
        recency_space: Param("recency_weight")
    })
    .find(movie)
    .similar(description_space.text, query_text_param)
    .similar(title_space.text, query_text_param)
    .similar(genre_space.text, query_text_param)
)

# Advanced query: same weight parameters, but each text space gets its own
# query-text parameter ("description_query_text", "title_query_text",
# "genre_query_text").
advanced_query = (
    Query(movie_index, weights={
        description_space: Param("description_weight"),
        title_space: Param("title_weight"),
        genre_space: Param("genre_weight"),
        recency_space: Param("recency_weight")
    })
    .find(movie)
    .similar(description_space.text, Param("description_query_text"))
    .similar(title_space.text, Param("title_query_text"))
    .similar(genre_space.text, Param("genre_query_text"))
)

In [9]:
# Parser for DataFrame input: the schema's `release_timestamp` field is read from the
# "timestamp" column. Other fields presumably map to columns of the same name — TODO confirm.
df_parser = DataFrameParser(schema=movie, mapping={movie.release_timestamp: "timestamp"})

In [10]:
# Source for MovieSchema data that parses its input with the DataFrame parser.
source: InMemorySource = InMemorySource(movie, parser=df_parser)
# Executor tying the source to the movie index.
executor: InMemoryExecutor = InMemoryExecutor(sources=[source], indices=[movie_index])
# Start the executor; the returned app is the object the query cells call `.query` on.
app: InMemoryApp = executor.run()

This next one might take several minutes to run. Getting a coffee or water, or doing a quick planking workout is advised.

In [11]:
# Feed the whole DataFrame (wrapped in a one-element list) into the source.
source.put([movie_df])

## Run queries

### Helpers

In [12]:
# Default columns selected by get_movies_by_id_list; "order" ends up as the index of the returned frame.
KEEPCOLS = ["description", "genres", "title", "release_year", "order"]

def get_ordered_result_tuples(result: Result, top_n: int) -> list[tuple[int, int]]:
    """Return ``(rank, id)`` pairs for the first ``top_n`` entities of a result.

    Args:
        result: Query result; its ``entities`` are taken in the order given.
        top_n: Maximum number of entities to keep.

    Returns:
        One ``(rank, id)`` tuple per kept entity. Rank starts at 1 and the id is
        ``entity.id_.object_id`` converted to ``int``.
    """
    top_entities = result.entities[:top_n]
    return [
        (rank, int(entity.id_.object_id))
        for rank, entity in enumerate(top_entities, start=1)
    ]

def get_movies_by_id_list(id_list_tuple: list[tuple[int]], df: pd.DataFrame, keepcols: list[str] | None = None) -> pd.DataFrame:
    if keepcols is None:
        keepcols = list(KEEPCOLS)
    if df.index.name != "id":
        df = df.set_index("id")
    result_df = df.loc[[id_tuple[1] for id_tuple in id_list_tuple]]
    result_df["order"] = [id_tuple[0] for id_tuple in id_list_tuple]
    return result_df[keepcols].reset_index(drop=True).set_index("order")

def parse_results(result: Result, df: pd.DataFrame, top_n: int = TOP_N) -> pd.DataFrame:
    """Turn a query result into a table of the top ``top_n`` movies.

    The ranked ids are extracted from ``result`` and then looked up in ``df``
    using the default column selection of get_movies_by_id_list.
    """
    ranked_ids = get_ordered_result_tuples(result, top_n)
    return get_movies_by_id_list(ranked_ids, df)

### Queries

With the simple query, I can search with the same text across all of the fields.

In [13]:
# Simple query: one text for all three text spaces, each with weight 1; recency weight is 0.
result: Result = app.query(
    simple_query,
    query_text="Heartfelt romantic comedy",
    description_weight=1,
    title_weight=1,
    genre_weight=1,
    recency_weight=0
)

In [14]:
# Show the top hits as a table indexed by rank; TOP_N replaces the hard-coded 10.
parse_results(result, movie_df, TOP_N)

Unnamed: 0_level_0,description,genres,title,release_year
order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,When a group of people meets at the same party...,comedy romance,"Love, Surreal and Odd",2017
2,"An LA girl, unlucky in love, falls for an East...",romance comedy,Love Hard,2021
3,Romantic anthology web series revolving around...,drama romance,Love Daily,2018
4,"In this romantic comedy, several friends, each...",comedy romance,F*ck Love Too,2022
5,Aspiring pop star Erica ends up as the enterta...,romance comedy,Resort to Love,2021
6,'Love Actually' follows the lives of eight ver...,drama comedy romance,Love Actually,2003
7,Exploration into the tense relationship of suc...,romance,Love or Money,2021
8,It tells the love story of two childhood sweet...,comedy romance,A Love So Beautiful,2017
9,Two young kids fall in love with each other. B...,romance drama,Endless Love,1981
10,Rebellious Mickey and good-natured Gus navigat...,comedy drama romance,Love,2016


After looking at the results, I see some titles I have already seen. I can bias towards recent titles by upweighting recency. Weights are normalised to have unit sum, so you don't have to worry about how you set them.

In [15]:
# Same text and text-space weights as before, but recency now has weight 3.
result: Result = app.query(
    simple_query,
    query_text="Heartfelt romantic comedy",
    description_weight=1,
    title_weight=1,
    genre_weight=1,
    recency_weight=3
)

In [16]:
# Show the top hits as a table indexed by rank; TOP_N replaces the hard-coded 10.
parse_results(result, movie_df, TOP_N)

Unnamed: 0_level_0,description,genres,title,release_year
order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,"An LA girl, unlucky in love, falls for an East...",romance comedy,Love Hard,2021
2,Aspiring pop star Erica ends up as the enterta...,romance comedy,Resort to Love,2021
3,Exploration into the tense relationship of suc...,romance,Love or Money,2021
4,"To save her small law firm, earnest lawyer Sus...",comedy romance,"Love, Guaranteed",2020
5,While trying to make his sister's wedding day ...,comedy romance,Love Wedding Repeat,2020
6,Love is as tough as it is sweet for a lovestru...,comedy drama,A Love So Beautiful,2020
7,A rising black painter tries to break into a c...,romance drama,Really Love,2020
8,Fashion assistant Maca has just about got her ...,comedy music romance drama,Sounds Like Love,2021
9,A 33-year-old fashion marketing director at a ...,drama romance,I Need Romance,2021
10,An amalgamation of four different love stories...,romance drama,World Famous Lover,2020


Still using the simple query, I can give more weight to a space if I think my query is more related to it, so that matches there count more. Here I give additional weight to the genre, leave the description as is, and downweight the title, as my query text is mostly a genre with some additional context. I also keep recency at unit weight, as I would like my results to be slightly biased towards recent movies.

In [17]:
# Genre weighted highest (2), description at 1, title down to 0.1, recency at 1.
result = app.query(
    simple_query,
    query_text="Heartfelt romantic comedy",
    description_weight=1,
    title_weight=0.1,
    genre_weight=2,
    recency_weight=1
)

In [18]:
# Show the top hits as a table indexed by rank; TOP_N replaces the hard-coded 10.
parse_results(result, movie_df, TOP_N)

Unnamed: 0_level_0,description,genres,title,release_year
order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,"In this romantic comedy, several friends, each...",comedy romance,F*ck Love Too,2022
2,"An LA girl, unlucky in love, falls for an East...",romance comedy,Love Hard,2021
3,Short films follow young adults as they naviga...,drama romance comedy,Feels Like Ishq,2021
4,Incurable romantic Lotte's life is upended whe...,comedy romance,Just Say Yes,2021
5,"In pursuit of both success and validation, a g...",romance comedy,Slay,2021
6,A blue-collar baker strikes up a relationship ...,comedy drama romance,The Baker and the Beauty,2020
7,Lara Jean and Peter have just taken their roma...,romance comedy drama,To All the Boys: P.S. I Still Love You,2020
8,When professional ambitions clash with persona...,romance comedy drama,Love Aaj Kal,2020
9,Aspiring pop star Erica ends up as the enterta...,romance comedy,Resort to Love,2021
10,"In and around Lucknow University in 1989, coup...",drama comedy romance,Taj Mahal 1989,2020


With the advanced query, I can even supply different search terms for each attribute of the movie.

In [19]:
# Advanced query: a separate search text per text space, equal weights, recency weight 0.
result = app.query(
    advanced_query,
    description_query_text="Heartfelt lovely romantic comedy for a cold autumn evening.",
    title_query_text="love",
    genre_query_text="drama comedy romantic",
    description_weight=1,
    title_weight=1,
    genre_weight=1,
    recency_weight=0
)

In [20]:
# Show the top hits as a table indexed by rank; TOP_N replaces the hard-coded 10.
parse_results(result, movie_df, TOP_N)

Unnamed: 0_level_0,description,genres,title,release_year
order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,'Love Actually' follows the lives of eight ver...,drama comedy romance,Love Actually,2003
2,Rebellious Mickey and good-natured Gus navigat...,comedy drama romance,Love,2016
3,"An LA girl, unlucky in love, falls for an East...",romance comedy,Love Hard,2021
4,Two young kids fall in love with each other. B...,romance drama,Endless Love,1981
5,A rising black painter tries to break into a c...,romance drama,Really Love,2020
6,Romantic anthology web series revolving around...,drama romance,Love Daily,2018
7,Adam and Marklin’s 5-year relationship has gon...,comedy drama romance,Almost Love,2019
8,Love is as tough as it is sweet for a lovestru...,comedy drama,A Love So Beautiful,2020
9,"Near by Christmas, in an old and charming town...",romance european comedy,Love Is a Story,2015
10,This black humor pan-Arabic anthology series i...,comedy drama romance,"Love, Life & Everything in Between",2022


I can even give different weights to each sub-search, for example if I really care that the title is related to love but am not particularly attached to my description of the movie I would like to see.

In [21]:
# Same search texts; title weighted highest (3), genre at 1, description down to 0.2, recency weight 0.
result = app.query(
    advanced_query,
    description_query_text="Heartfelt lovely romantic comedy for a cold autumn evening.",
    title_query_text="love",
    genre_query_text="drama comedy romantic",
    description_weight=0.2,
    title_weight=3,
    genre_weight=1,
    recency_weight=0
)

In [22]:
# Show the top hits as a table indexed by rank; TOP_N replaces the hard-coded 10.
parse_results(result, movie_df, TOP_N)

Unnamed: 0_level_0,description,genres,title,release_year
order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Rebellious Mickey and good-natured Gus navigat...,comedy drama romance,Love,2016
2,The story of a family and the various situatio...,thriller drama,Love,2020
3,'Love Actually' follows the lives of eight ver...,drama comedy romance,Love Actually,2003
4,Two young kids fall in love with each other. B...,romance drama,Endless Love,1981
5,A rising black painter tries to break into a c...,romance drama,Really Love,2020
6,Adam and Marklin’s 5-year relationship has gon...,comedy drama romance,Almost Love,2019
7,"The story of Richard and Mildred Loving, an in...",drama romance,Loving,2016
8,"An LA girl, unlucky in love, falls for an East...",romance comedy,Love Hard,2021
9,In order to receive a bone marrow transplant q...,drama romance,Well-Intended Love,2019
10,It tells the love story of two childhood sweet...,comedy romance,A Love So Beautiful,2017


Then I can bias the results towards recent movies again.

In [23]:
# Same texts and text-space weights as the previous query, with recency weight raised to 5.
result = app.query(
    advanced_query,
    description_query_text="Heartfelt lovely romantic comedy for a cold autumn evening.",
    title_query_text="love",
    genre_query_text="drama comedy romantic",
    description_weight=0.2,
    title_weight=3,
    genre_weight=1,
    recency_weight=5
)

In [24]:
# Show the top hits as a table indexed by rank; TOP_N replaces the hard-coded 10.
parse_results(result, movie_df, TOP_N)

Unnamed: 0_level_0,description,genres,title,release_year
order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,The story of a family and the various situatio...,thriller drama,Love,2020
2,A rising black painter tries to break into a c...,romance drama,Really Love,2020
3,"An LA girl, unlucky in love, falls for an East...",romance comedy,Love Hard,2021
4,Love is as tough as it is sweet for a lovestru...,comedy drama,A Love So Beautiful,2020
5,Rebellious Mickey and good-natured Gus navigat...,comedy drama romance,Love,2016
6,"To save her small law firm, earnest lawyer Sus...",comedy romance,"Love, Guaranteed",2020
7,"In 1990s Turkey, a group of teenage outcasts b...",drama romance comedy,Love 101,2020
8,Fashion assistant Maca has just about got her ...,comedy music romance drama,Sounds Like Love,2021
9,A-Cheng collects debts for a gangster and does...,romance drama,Man in Love,2021
10,"After 14 years devoid of romance, a struggling...",drama comedy romance,Was It Love?,2020


Or maybe towards older ones, by giving recency a negative weight.

In [25]:
# Same texts and text-space weights as before, but recency gets a negative weight (-10).
result = app.query(
    advanced_query,
    description_query_text="Heartfelt lovely romantic comedy for a cold autumn evening.",
    title_query_text="love",
    genre_query_text="drama comedy romantic",
    description_weight=0.2,
    title_weight=3,
    genre_weight=1,
    recency_weight=-10
)

# Show the top hits as a table indexed by rank; TOP_N replaces the hard-coded 10.
parse_results(result, movie_df, TOP_N)

Unnamed: 0_level_0,description,genres,title,release_year
order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Two young kids fall in love with each other. B...,romance drama,Endless Love,1981
2,"Anil, a street singer, is humiliated and drive...",drama romance,Disco Dancer,1982
3,An honest man dreams of a better life for his ...,romance crime drama,Ujala,1959
4,Two talented song-and-dance men team up after ...,romance comedy,White Christmas,1954
5,"Brian Cohen is an average young Jewish man, bu...",comedy,Life of Brian,1979
6,Geeky student Arnie Cunningham falls for Chris...,horror thriller european,Christine,1983
7,Maharaj Brajbhan lives a wealthy lifestyle in ...,drama action romance,Bandie,1978
8,Two small children and a ship's cook survive a...,romance action drama,The Blue Lagoon,1980
9,"In the 1930s, bored waitress Bonnie Parker fal...",crime drama action,Bonnie and Clyde,1967
10,"Ragab, a poor sailor, returns home to Alexandr...",action drama romance thriller,Dark Waters,1956
