# Content Filtering

## Importing Libraries and loading data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the movie catalogue; the relative path is resolved against the kernel's working directory.
movie_df = pd.read_csv(filepath_or_buffer="movies.csv")

In [3]:
# Preview the first ten rows to check the columns that were loaded.
movie_df.head(n=10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji,Adventure|Children|Fantasy
2,3,Grumpier Old Men,Comedy|Romance
3,4,Waiting to Exhale,Comedy|Drama|Romance
4,5,Father of the Bride Part II,Comedy
5,6,Heat,Action|Crime|Thriller
6,7,Sabrina,Comedy|Romance
7,8,Tom and Huck,Adventure|Children
8,9,Sudden Death,Action
9,10,GoldenEye,Action|Adventure|Thriller


In [4]:
# Split each pipe-delimited genre string into a list of genre names.
genre_lists = movie_df['genres'].str.split('|')

# Replace missing values with an empty string, then store every entry as text.
# A list becomes its printed form (e.g. "['Comedy', 'Romance']"), so the column
# ends up holding plain strings rather than list objects.
# NOTE: this overwrites the raw 'genres' column, so re-running the cell
# re-processes values that were already converted.
movie_df['genres'] = genre_lists.fillna("").astype('str')

In [5]:
# Inspect the first five rows to see the converted 'genres' column.
movie_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy..."
1,2,Jumanji,"['Adventure', 'Children', 'Fantasy']"
2,3,Grumpier Old Men,"['Comedy', 'Romance']"
3,4,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']"
4,5,Father of the Bride Part II,['Comedy']


## Recommendation based on genre

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features (unigrams and bigrams, English stop words removed) built from
# the genre strings; one row per movie.
# min_df=1 (the library default) keeps every term that occurs in at least one
# document, i.e. the whole vocabulary. An integer min_df of 0 selects the same
# terms but is rejected by scikit-learn versions that validate constructor
# parameters, so 1 is used here.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(movie_df['genres'])
tfidf_matrix.shape


(9742, 177)

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between every pair of rows of the genre TF-IDF matrix.
# With a single argument the rows are compared against themselves, which is the
# same computation as passing the matrix twice.
cosine_sim = cosine_similarity(tfidf_matrix)

# Show the top-left 4x4 corner as a sanity check.
cosine_sim[:4, :4]

array([[1.        , 0.31379419, 0.0611029 , 0.05271111],
       [0.31379419, 1.        , 0.        , 0.        ],
       [0.0611029 , 0.        , 1.        , 0.35172407],
       [0.05271111, 0.        , 0.35172407, 1.        ]])

In [16]:
# Title column: used to turn row positions back into movie names.
titles = movie_df['title']

# Reverse lookup: movie title -> index label of that row in movie_df.
indices = pd.Series(data=movie_df.index, index=titles)
indices

title
Toy Story                                 0
Jumanji                                   1
Grumpier Old Men                          2
Waiting to Exhale                         3
Father of the Bride Part II               4
                                       ... 
Black Butler: Book of the Atlantic     9737
No Game No Life: Zero                  9738
Flint                                  9739
Bungo Stray Dogs: Dead Apple           9740
Andrew Dice Clay: Dice Rules           9741
Length: 9742, dtype: int64

In [22]:
# Recommend movies using the cosine-similarity scores of their genre features.

def genre_recommendation(title, top_n=20):
    """Return the titles of the movies most similar to ``title``.

    The notebook-level objects ``indices`` (title -> row position),
    ``cosine_sim`` (pairwise similarity matrix) and ``titles`` are read when the
    function is called, so the result reflects whichever similarity matrix is
    bound to ``cosine_sim`` at that moment.

    Parameters
    ----------
    title : str
        Movie title exactly as it appears in ``indices``.
    top_n : int, default 20
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` entries of ``titles``, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series of
    # positions instead of a single one; use the first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Exclude the queried movie by its position rather than assuming it sorts
    # first: another movie with an identical score may precede it in the ranking.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    # list.sort is stable, so equally scored movies keep ascending row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    movie_indices = [i for i, _ in sim_scores[:top_n]]
    return titles.iloc[movie_indices]

In [23]:
# Ten closest movies by genre; the lookup key must match the stored title
# exactly, including its trailing space.
dark_knight_recs = genre_recommendation('Dark Knight ')
dark_knight_recs.head(10)

8387                        Need for Speed 
8149    Grandmaster, The (Yi dai zong shi) 
123                              Apollo 13 
8026                            Life of Pi 
8396                                  Noah 
38                         Dead Presidents 
341                            Bad Company 
347           Faster Pussycat! Kill! Kill! 
430                      Menace II Society 
568                        Substitute, The 
Name: title, dtype: object

## Recommendation based on title

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features (unigrams and bigrams, English stop words removed) built from
# the movie titles; one row per movie.
# NOTE: this rebinds `tf` and `tfidf_matrix`, replacing the genre-based objects
# created earlier in the notebook.
# min_df=1 (the library default) keeps every term that occurs in at least one
# document, i.e. the whole vocabulary. An integer min_df of 0 selects the same
# terms but is rejected by scikit-learn versions that validate constructor
# parameters, so 1 is used here.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(movie_df['title'])
tfidf_matrix.shape


(9742, 20413)

In [30]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between every pair of rows of the title TF-IDF matrix.
# With a single argument the rows are compared against themselves, which is the
# same computation as passing the matrix twice.
# NOTE: this rebinds `cosine_sim`, replacing the genre-based matrix computed
# earlier in the notebook.
cosine_sim = cosine_similarity(tfidf_matrix)

# Show the top-left 4x4 corner as a sanity check.
cosine_sim[:4, :4]

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [32]:
# Title column: used to turn row positions back into movie names.
titles = movie_df['title']

# Reverse lookup: movie title -> index label of that row in movie_df.
indices = pd.Series(data=movie_df.index, index=titles)
indices

title
Toy Story                                 0
Jumanji                                   1
Grumpier Old Men                          2
Waiting to Exhale                         3
Father of the Bride Part II               4
                                       ... 
Black Butler: Book of the Atlantic     9737
No Game No Life: Zero                  9738
Flint                                  9739
Bungo Stray Dogs: Dead Apple           9740
Andrew Dice Clay: Dice Rules           9741
Length: 9742, dtype: int64

In [34]:
# Recommend movies using the cosine-similarity scores of their title features.
# NOTE: this definition reuses the name `genre_recommendation` and therefore
# replaces the genre-based function defined earlier in the notebook. The name is
# kept so existing calls keep working; `title_recommendation` is provided below
# as an accurately named alias.

def genre_recommendation(title, top_n=20):
    """Return the titles of the movies most similar to ``title``.

    The notebook-level objects ``indices`` (title -> row position),
    ``cosine_sim`` (pairwise similarity matrix) and ``titles`` are read when the
    function is called, so the result reflects whichever similarity matrix is
    bound to ``cosine_sim`` at that moment.

    Parameters
    ----------
    title : str
        Movie title exactly as it appears in ``indices``.
    top_n : int, default 20
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` entries of ``titles``, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series of
    # positions instead of a single one; use the first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Exclude the queried movie by its position rather than assuming it sorts
    # first: a movie with an identical title scores the same and may precede it.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    # list.sort is stable, so equally scored movies keep ascending row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    movie_indices = [i for i, _ in sim_scores[:top_n]]
    return titles.iloc[movie_indices]


# Accurately named alias for the title-based recommender.
title_recommendation = genre_recommendation

In [35]:
# Ten closest movies by title. Despite its name, `genre_recommendation` here is
# the title-based version defined in the cell above. The lookup key must match
# the stored title exactly, including its trailing space.
dark_knight_recs = genre_recommendation('Dark Knight ')
dark_knight_recs.head(10)

7768                     Dark Knight Rises, The 
8032    Batman: The Dark Knight Returns, Part 1 
8080    Batman: The Dark Knight Returns, Part 2 
140                                First Knight 
2417                         Cry in the Dark, A 
5778                          Alone in the Dark 
7375                             Knight and Day 
3576                               Black Knight 
3190                           Knight's Tale, A 
6858                       Alone in the Dark II 
Name: title, dtype: object

# Collaborative Filtering