## Content-based filtering

A content-based recommendation system that suggests movies with similar genres, built on the MovieLens 100K dataset.

In [25]:
from surprise import Dataset
import pandas as pd
import os

In [3]:
# Fetch MovieLens 100K through Surprise. `prompt=False` downloads the archive
# without the interactive [Y/n] question when it is not cached yet, so the
# notebook can be re-run top to bottom unattended.
data = Dataset.load_builtin('ml-100k', prompt=False)

Dataset ml-100k could not be found. Do you want to download it? [Y/n] 

 Y


Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /Users/kacperraczy/.surprise_data/ml-100k


In [50]:
# Location of the unpacked MovieLens 100K files inside Surprise's data folder.
# "~" is expanded up front so the paths are usable by any consumer (open(),
# os.path.exists, ...), not only by pandas, which expands it on its own.
data_path = os.path.expanduser("~/.surprise_data/ml-100k/ml-100k")
items_path = os.path.join(data_path, "u.item")
genres_path = os.path.join(data_path, "u.genre")

In [51]:
# Column layout of u.item: five descriptive fields followed by one flag
# column per genre. The genre names are listed first so the full column list
# can be assembled from them.
genres = [
    "unknown",
    "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
    "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]
item_columns = [
    "id",
    "title",
    "release date",
    "video release date",
    "IMDb URL",
] + genres

In [61]:
# The item file is pipe-separated and has no header row, so the column names
# are supplied explicitly; it is decoded as ISO-8859-1.
items_df = pd.read_csv(
    items_path,
    sep="|",
    header=None,
    names=item_columns,
    encoding="iso-8859-1",
)

In [64]:
# Preview the loaded item table; pandas abbreviates long and wide frames in the rendered output.
items_df

Unnamed: 0,id,title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### TF-IDF

In [85]:
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np

# Re-weight the per-movie genre flags with TF-IDF. fit_transform returns a
# sparse matrix, which is converted to a dense array for the steps below.
tfidf = TfidfTransformer()
tfidf_matrix = tfidf.fit_transform(
    items_df[genres].to_numpy()
).toarray()
tfidf_matrix.shape

(1682, 19)

### Cosine similarity

In [87]:
# TfidfTransformer L2-normalises every row by default, so the plain dot product of two rows already equals their cosine similarity
from sklearn.metrics.pairwise import linear_kernel

# With the second argument omitted, linear_kernel compares the matrix with
# itself, giving one row and one column per movie.
cosine_sim = linear_kernel(tfidf_matrix)
cosine_sim.shape

(1682, 1682)

### Genre recommendation based on a movie

In [104]:
# Title column, plus a reverse lookup that maps each title to its row label.
titles = items_df["title"]
indices = pd.Series(items_df.index, index=titles)

def genre_recomendation(title, k):
    """Return the ``k`` titles whose genre profile is most similar to ``title``.

    Similarities are read from the precomputed ``cosine_sim`` matrix. Equal
    scores keep their original row order. The misspelt function name is kept
    so existing calls continue to work.

    Parameters
    ----------
    title : str
        Exact title as it appears in ``titles``.
    k : int
        Number of recommendations to return (expected to be non-negative).

    Returns
    -------
    pandas.Series
        Recommended titles, best match first, indexed by their row position.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    match = indices[title]

    # A title that occurs on several rows makes the lookup return a Series
    # instead of a scalar, which used to crash at the `del` statement. Rank
    # against the first such row and keep every same-titled row out of the
    # results, so a movie is never recommended for itself.
    if isinstance(match, pd.Series):
        same_title_rows = [int(row) for row in match]
    else:
        same_title_rows = [int(match)]
    index = same_title_rows[0]
    excluded = set(same_title_rows)

    scores = cosine_sim[index]

    # sorted() is stable, also with reverse=True, so ties stay in row order.
    ranked = sorted(
        (row for row in range(len(scores)) if row not in excluded),
        key=lambda row: scores[row],
        reverse=True,
    )

    return titles.iloc[ranked[:k]]

In [105]:
# Twenty closest titles by genre profile.
genre_recomendation(title='Lion King, The (1994)', k=20)

98      Snow White and the Seven Dwarfs (1937)
102             All Dogs Go to Heaven 2 (1996)
417                          Cinderella (1950)
419                 Alice in Wonderland (1951)
431                            Fantasia (1940)
472           James and the Giant Peach (1996)
500                               Dumbo (1941)
537                           Anastasia (1997)
587                Beauty and the Beast (1991)
595        Hunchback of Notre Dame, The (1996)
623               Three Caballeros, The (1945)
988                    Cats Don't Dance (1997)
94                              Aladdin (1992)
541                          Pocahontas (1995)
1090                      Pete's Dragon (1977)
992                            Hercules (1997)
101                     Aristocats, The (1970)
403                           Pinocchio (1940)
624             Sword in the Stone, The (1963)
945              Fox and the Hound, The (1981)
Name: title, dtype: object

In [106]:
# Twenty closest titles by genre profile.
genre_recomendation(title='Pulp Fiction (1994)', k=20)

75                    Carlito's Way (1993)
181                      GoodFellas (1990)
292                   Donnie Brasco (1997)
345                    Jackie Brown (1997)
503                Bonnie and Clyde (1967)
627                        Sleepers (1996)
910                        Twilight (1998)
1105               Newton Boys, The (1998)
1121        They Made Me a Criminal (1939)
1155                          Cyclo (1995)
1190       Letter From Death Row, A (1998)
1193             Once Were Warriors (1994)
1225       Night Falls on Manhattan (1997)
1438                  Jason's Lyric (1994)
1452           Angel on My Shoulder (1946)
1504    Killer: A Journal of Murder (1995)
1518               New Jersey Drive (1995)
1637                    Normal Life (1996)
129                     Kansas City (1996)
308                        Deceiver (1997)
Name: title, dtype: object

In [107]:
# Twenty closest titles by genre profile.
genre_recomendation(title='Killer: A Journal of Murder (1995)', k=20)

55                  Pulp Fiction (1994)
75                 Carlito's Way (1993)
181                   GoodFellas (1990)
292                Donnie Brasco (1997)
345                 Jackie Brown (1997)
503             Bonnie and Clyde (1967)
627                     Sleepers (1996)
910                     Twilight (1998)
1105            Newton Boys, The (1998)
1121     They Made Me a Criminal (1939)
1155                       Cyclo (1995)
1190    Letter From Death Row, A (1998)
1193          Once Were Warriors (1994)
1225    Night Falls on Manhattan (1997)
1438               Jason's Lyric (1994)
1452        Angel on My Shoulder (1946)
1518            New Jersey Drive (1995)
1637                 Normal Life (1996)
129                  Kansas City (1996)
308                     Deceiver (1997)
Name: title, dtype: object