# Recommender systems

In [1]:
# Data manipulation:
import pandas as pd
import numpy as np

# Data exploration:
!pip install pandas_profiling
import pandas_profiling as pp

# sklearn
from sklearn.impute import SimpleImputer
from sklearn.neighbors import NearestNeighbors

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.




# Exploring the data

Download the [data](https://canvas.supinfo.com/courses/85/files/11364) or find it on [kaggle](https://www.kaggle.com/orgesleka/imdbmovies)'s website and save it under ./data of your current directory on your host. (/home/jovyan/data in the docker container)

In [2]:
directory = '../data'

imdb_df = pd.read_csv('{}/imdb.csv'.format(directory), escapechar='\\')

In [3]:
data_report = pp.ProfileReport(imdb_df)
data_report.to_file(output_file='{}/imdb_report.html'.format(directory))

HBox(children=(FloatProgress(value=0.0, description='variables', max=44.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='correlations', max=6.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='interactions [continuous]', max=81.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=0.0, description='table', max=1.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='missing', max=4.0, style=ProgressStyle(description_width=…









HBox(children=(FloatProgress(value=0.0, description='package', max=1.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='build report structure', max=1.0, style=ProgressStyle(des…




In [4]:
data_report.to_widgets()

Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(value='Number of va…

# Simple content based recommender system using [KNN](https://scikit-learn.org/stable/modules/neighbors.html)

In [5]:
# embeddings = get_embeddings(words)
X_df = pd.get_dummies(imdb_df[[
    'imdbRating',
    'ratingCount',
    'duration',
    'year',
    'type',
    'nrOfWins',
    'nrOfNominations',
    'nrOfPhotos',
    'nrOfNewsArticles',
    'nrOfUserReviews',
    'nrOfGenre',
    'Action',
    'Adult',
    'Adventure',
    'Animation',
    'Biography',
    'Comedy',
    'Crime',
    'Documentary',
    'Drama',
    'Family',
    'Fantasy',
    'FilmNoir',
    'GameShow',
    'History',
    'Horror',
    'Music',
    'Musical',
    'Mystery',
    'News',
    'RealityTV',
    'Romance',
    'SciFi',
    'Short',
    'Sport',
    'TalkShow',
    'Thriller',
    'War',
    'Western'
]])
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

X = imp_mean.fit_transform(X_df)

In [6]:
tree = NearestNeighbors(
    n_neighbors=30, 
    algorithm='brute',
    metric= 'cosine'
)

In [7]:
tree.fit(X)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=30, p=2,
                 radius=1.0)

In [8]:
imdb_df[imdb_df['title'].str.contains('Matrix')]

Unnamed: 0,fn,tid,title,wordsInTitle,url,imdbRating,ratingCount,duration,year,type,...,News,RealityTV,Romance,SciFi,Short,Sport,TalkShow,Thriller,War,Western
166,titles01/tt0133093,tt0133093,Matrix (1999),matrix,http://www.imdb.com/title/tt0133093/,8.7,841150.0,8160.0,1999.0,video.movie,...,0,0,0,1,0,0,0,0,0,0
837,titles02/index.html.2766,tt0234215,Matrix Reloaded (2003),matrix reloaded,http://www.imdb.com/title/tt0234215/,7.2,333743.0,8280.0,2003.0,video.movie,...,0,0,0,1,0,0,0,0,0,0
841,titles02/index.html.2770,tt0242653,Matrix Revolutions (2003),matrix revolutions,http://www.imdb.com/title/tt0242653/,6.7,288612.0,7740.0,2003.0,video.movie,...,0,0,0,1,0,0,0,0,0,0
4604,titles03/index.html.4587,tt0594932,HBO First Look The Matrix Revolutions (TV Epis...,hbo first look the matrix revolutions tv episode,http://www.imdb.com/title/tt0594932/,7.3,10.0,780.0,2003.0,video.episode,...,0,0,0,0,0,0,0,0,0,0
13240,titles04/index.html.7348,tt0274085,Sex and the Matrix (TV Short 2000),sex and the matrix tv short,http://www.imdb.com/title/tt0274085/,7.2,714.0,360.0,2000.0,video.tv,...,0,0,0,0,1,0,0,0,0,0
13803,titles04/index.html.8294,tt0365467,Making 'The Matrix' (TV Movie 1999),making the matrix tv movie,http://www.imdb.com/title/tt0365467/,6.8,169.0,1560.0,1999.0,video.movie,...,0,0,0,1,1,0,0,0,0,0


In [9]:
movie_of_interest = X[166]


distances, neighbors = tree.kneighbors([movie_of_interest], 5)

In [10]:
display(neighbors[0])

array([166, 167, 178, 136, 139])

In [11]:
display(imdb_df.iloc[neighbors[0]])

Unnamed: 0,fn,tid,title,wordsInTitle,url,imdbRating,ratingCount,duration,year,type,...,News,RealityTV,Romance,SciFi,Short,Sport,TalkShow,Thriller,War,Western
166,titles01/tt0133093,tt0133093,Matrix (1999),matrix,http://www.imdb.com/title/tt0133093/,8.7,841150.0,8160.0,1999.0,video.movie,...,0,0,0,1,0,0,0,0,0,0
167,titles01/tt0137523,tt0137523,Fight Club (1999),fight club,http://www.imdb.com/title/tt0137523/,8.9,900388.0,8340.0,1999.0,video.movie,...,0,0,0,0,0,0,0,0,0,0
178,titles01/tt0209144,tt0209144,Memento (2000),memento,http://www.imdb.com/title/tt0209144/,8.6,605480.0,6780.0,2000.0,video.movie,...,0,0,0,0,0,0,0,1,0,0
136,titles01/tt0109830,tt0109830,Forrest Gump (1994),forrest gump,http://www.imdb.com/title/tt0109830/,8.8,799314.0,8520.0,1994.0,video.movie,...,0,0,1,0,0,0,0,0,0,0
139,titles01/tt0110912,tt0110912,Pulp Fiction (1994),pulp fiction,http://www.imdb.com/title/tt0110912/,9.0,911042.0,9240.0,1994.0,video.movie,...,0,0,0,0,0,0,0,1,0,0
