# Content Filtering

## Importing Libraries and loading data

In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [67]:
#reading movie file
movie_df = pd.read_csv("movies.csv")

In [68]:
movie_df.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji,Adventure|Children|Fantasy
2,3,Grumpier Old Men,Comedy|Romance
3,4,Waiting to Exhale,Comedy|Drama|Romance
4,5,Father of the Bride Part II,Comedy
5,6,Heat,Action|Crime|Thriller
6,7,Sabrina,Comedy|Romance
7,8,Tom and Huck,Adventure|Children
8,9,Sudden Death,Action
9,10,GoldenEye,Action|Adventure|Thriller


In [104]:
# Break up the big genre string into a string array
movie_df['genres'] = movie_df['genres'].str.split('|')

# convert genre to string value
movie_df['genres'] = movie_df['genres'].fillna("").astype('str')

In [70]:
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy..."
1,2,Jumanji,"['Adventure', 'Children', 'Fantasy']"
2,3,Grumpier Old Men,"['Comedy', 'Romance']"
3,4,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']"
4,5,Father of the Bride Part II,['Comedy']


## Example with TF-IDF Implementation with python (Mannually)

In [173]:
# Two simple documents containing one sentence each.

documentA = 'the man went out for a walk'
documentB = 'the children sat around the fire'

# splite sentence based on space
bowA = documentA.split(' ')
bowB = documentB.split(' ')

# find unique words in both the sentence
unique_words = set(bowA).union(set(bowB))

# create dictionary of words and their occurence for each document.
numOfWordsA = dict.fromkeys(unique_words, 0)
for word in bowA:
    numOfWordsA[word] += 1; 
    
# create dictionary of words and their occurence for each document.    
numOfWordsB = dict.fromkeys(unique_words, 0)    
for word in bowB:
    numOfWordsB[word] += 1;        

print(numOfWordsA)
print(numOfWordsB)


{'around': 0, 'went': 1, 'the': 1, 'sat': 0, 'fire': 0, 'out': 1, 'for': 1, 'children': 0, 'a': 1, 'walk': 1, 'man': 1}
{'around': 1, 'went': 0, 'the': 2, 'sat': 1, 'fire': 1, 'out': 0, 'for': 0, 'children': 1, 'a': 0, 'walk': 0, 'man': 0}


### Term Frequency (TF)

In [185]:
# compute TF
def computeTF(wordDict, bag_of_words):
    print(wordDict, bag_of_words)
    tfDict = {}
    bag_of_words_count = len(bag_of_words)
    print(bag_of_words_count)
    for word, count in wordDict.items():
        #print(word,count)
        tfDict[word] = count / float(bag_of_words_count)
    return tfDict        

In [186]:
tf_A = computeTF(numOfWordsA,bowA)
tf_B = computeTF(numOfWordsB,bowB)

print(tf_A)
print(tf_B)


{'around': 0, 'went': 1, 'the': 1, 'sat': 0, 'fire': 0, 'out': 1, 'for': 1, 'children': 0, 'a': 1, 'walk': 1, 'man': 1} ['the', 'man', 'went', 'out', 'for', 'a', 'walk']
7
{'around': 1, 'went': 0, 'the': 2, 'sat': 1, 'fire': 1, 'out': 0, 'for': 0, 'children': 1, 'a': 0, 'walk': 0, 'man': 0} ['the', 'children', 'sat', 'around', 'the', 'fire']
6
{'around': 0.0, 'went': 0.14285714285714285, 'the': 0.14285714285714285, 'sat': 0.0, 'fire': 0.0, 'out': 0.14285714285714285, 'for': 0.14285714285714285, 'children': 0.0, 'a': 0.14285714285714285, 'walk': 0.14285714285714285, 'man': 0.14285714285714285}
{'around': 0.16666666666666666, 'went': 0.0, 'the': 0.3333333333333333, 'sat': 0.16666666666666666, 'fire': 0.16666666666666666, 'out': 0.0, 'for': 0.0, 'children': 0.16666666666666666, 'a': 0.0, 'walk': 0.0, 'man': 0.0}


### Inverse Data Frequency (IDF)

In [155]:
def computeIDF(documents):
    import math
    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
                
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))           
    
    return idfDict

In [158]:
idfs = computeIDF([numOfWordsA,numOfWordsB])
idfs

{'around': 0.6931471805599453,
 'went': 0.6931471805599453,
 'the': 0.0,
 'sat': 0.6931471805599453,
 'fire': 0.6931471805599453,
 'out': 0.6931471805599453,
 'for': 0.6931471805599453,
 'children': 0.6931471805599453,
 'a': 0.6931471805599453,
 'walk': 0.6931471805599453,
 'man': 0.6931471805599453}

### TF-IDF

In [162]:
def computeTFIDF(tf_bag_of_words, idfs):
    tfidf = {}
    for word, val in tf_bag_of_words.items():
        tfidf[word] = val * idfs[word]
    return tfidf    

In [164]:
# Compute the TF-IDF scores for all the words
tfidf_A = computeTFIDF(tf_A,idfs)
tfidf_B = computeTFIDF(tf_B,idfs)

df = pd.DataFrame([tfidf_A,tfidf_B])

df


Unnamed: 0,around,went,the,sat,fire,out,for,children,a,walk,man
0,0.0,0.063013,0.0,0.0,0.0,0.063013,0.063013,0.0,0.063013,0.063013,0.063013
1,0.063013,0.0,0.0,0.063013,0.063013,0.0,0.0,0.063013,0.0,0.0,0.0


#### Rather than manually implementing TF-IDF ourselves, we could use the class provided by sklearn. The values differ slightly because sklearn uses a smoothed version idf and various other little optimizations.

In [168]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [167]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([documentA, documentB])
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)
df

Unnamed: 0,around,children,fire,for,man,out,sat,the,walk,went
0,0.0,0.0,0.0,0.42616,0.42616,0.42616,0.0,0.303216,0.42616,0.42616
1,0.407401,0.407401,0.407401,0.0,0.0,0.0,0.407401,0.579739,0.0,0.0


## Recommandation based on genre

In [108]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1,2),min_df=0,stop_words='english')
tfidf_matrix = tf.fit_transform(movie_df['genres'])
feature_names = tf.get_feature_names()
dense = tfidf_matrix.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)
#tfidf_matrix.shape
df


Unnamed: 0,action,action adventure,action animation,action children,action comedy,action crime,action documentary,action drama,action fantasy,action horror,...,sci fi,thriller,thriller imax,thriller war,thriller western,war,war imax,war western,western,western imax
0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,0.238499,0.0,0.533576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9738,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9739,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9740,0.353726,0.0,0.791366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [73]:
print(tfidf_matrix)

  (0, 63)	0.4051430286389587
  (0, 47)	0.3681884973089335
  (0, 34)	0.38369482677526473
  (0, 18)	0.4008862821540716
  (0, 108)	0.30254034715329503
  (0, 59)	0.16761357728391116
  (0, 46)	0.3162303113127544
  (0, 33)	0.32335863498874723
  (0, 17)	0.26110809240797916
  (1, 51)	0.5795995638728872
  (1, 19)	0.5337814180965866
  (1, 108)	0.36554429536140276
  (1, 46)	0.382085190978399
  (1, 17)	0.31548378439611124
  (2, 68)	0.7695974416123483
  (2, 160)	0.5242383036039113
  (2, 59)	0.36454626441402677
  (3, 103)	0.5645649298589199
  (3, 62)	0.5417511322516687
  (3, 96)	0.2904365851652309
  (3, 160)	0.4522400920963429
  (3, 59)	0.31447995130958456
  (4, 59)	1.0
  (5, 84)	0.604518892749723
  (5, 5)	0.5454388121871825
  :	:
  (9733, 38)	0.835677806885533
  (9733, 96)	0.23714974930952545
  (9733, 33)	0.495381266784903
  (9734, 62)	0.7846149876753742
  (9734, 96)	0.42063760299449465
  (9734, 59)	0.4554594691761476
  (9735, 33)	1.0
  (9736, 86)	1.0
  (9737, 2)	0.5335755137706529
  (9737, 35)	0.4

In [74]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(tfidf_matrix,tfidf_matrix)
cosine_sim[:4,:4]

array([[1.        , 0.31379419, 0.0611029 , 0.05271111],
       [0.31379419, 1.        , 0.        , 0.        ],
       [0.0611029 , 0.        , 1.        , 0.35172407],
       [0.05271111, 0.        , 0.35172407, 1.        ]])

In [75]:
print(cosine_sim)

[[1.         0.31379419 0.0611029  ... 0.         0.16123168 0.16761358]
 [0.31379419 1.         0.         ... 0.         0.         0.        ]
 [0.0611029  0.         1.         ... 0.         0.         0.36454626]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.16123168 0.         0.         ... 0.         1.         0.        ]
 [0.16761358 0.         0.36454626 ... 0.         0.         1.        ]]


In [87]:
# build a 1-dimesional array with movie title
titles= movie_df['title']
indices = pd.Series(movie_df.index, index=movie_df['title'])
indices

title
Toy Story                                 0
Jumanji                                   1
Grumpier Old Men                          2
Waiting to Exhale                         3
Father of the Bride Part II               4
                                       ... 
Black Butler: Book of the Atlantic     9737
No Game No Life: Zero                  9738
Flint                                  9739
Bungo Stray Dogs: Dead Apple           9740
Andrew Dice Clay: Dice Rules           9741
Length: 9742, dtype: int64

In [102]:
# Funtion that get movie recommendation based on the consine simillarity score of movie ganre

def genre_recommendation(title):
    idx = indices[title]
    print(idx);
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores,key=lambda x:x[1], reverse=True)
    #print(sim_scores)
    sim_scores = sim_scores[1:21]
    movie_indices = [i[0] for i in sim_scores]
    print(movie_indices)
    return titles.iloc[movie_indices]

In [103]:
genre_recommendation('Dark Knight ').head(10)

6710
[7768, 8032, 8080, 140, 2417, 5778, 7375, 3576, 3190, 6858, 4242, 5060, 1305, 5483, 6815, 5934, 4749, 7877, 8766, 6690]


7768                     Dark Knight Rises, The 
8032    Batman: The Dark Knight Returns, Part 1 
8080    Batman: The Dark Knight Returns, Part 2 
140                                First Knight 
2417                         Cry in the Dark, A 
5778                          Alone in the Dark 
7375                             Knight and Day 
3576                               Black Knight 
3190                           Knight's Tale, A 
6858                       Alone in the Dark II 
Name: title, dtype: object

## Recommandation based on title

In [79]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1,2),min_df=0,stop_words='english')
tfidf_matrix = tf.fit_transform(movie_df['title'])
tfidf_matrix.shape


(9742, 20413)

In [80]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(tfidf_matrix,tfidf_matrix)
cosine_sim[:4,:4]

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [81]:
# build a 1-dimesional array with movie title
titles= movie_df['title']
indices = pd.Series(movie_df.index, index=movie_df['title'])
indices

title
Toy Story                                 0
Jumanji                                   1
Grumpier Old Men                          2
Waiting to Exhale                         3
Father of the Bride Part II               4
                                       ... 
Black Butler: Book of the Atlantic     9737
No Game No Life: Zero                  9738
Flint                                  9739
Bungo Stray Dogs: Dead Apple           9740
Andrew Dice Clay: Dice Rules           9741
Length: 9742, dtype: int64

In [82]:
# Funtion that get movie recommendation based on the consine simillarity score of movie titles

def genre_recommendation(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores,key=lambda x:x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [83]:
genre_recommendation('Dark Knight ').head(10)

7768                     Dark Knight Rises, The 
8032    Batman: The Dark Knight Returns, Part 1 
8080    Batman: The Dark Knight Returns, Part 2 
140                                First Knight 
2417                         Cry in the Dark, A 
5778                          Alone in the Dark 
7375                             Knight and Day 
3576                               Black Knight 
3190                           Knight's Tale, A 
6858                       Alone in the Dark II 
Name: title, dtype: object

# Collaborative Filtering