we have two MovieLens datasets.

* The Full Dataset: Consists of 26,000,000 ratings and 750,000 tag applications applied to 45,000 movies by 270,000 users. Includes tag genome data with 12 million relevance scores across 1,100 tags.
* The Small Dataset: Comprises of 100,000 ratings and 1,300 tag applications applied to 9,000 movies by 700 users.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval

import warnings; 
warnings.simplefilter('ignore')



In [2]:
mmd = pd.read_csv('MovieDataset/movies_metadata.csv')
mmd.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [3]:
mmd.isnull().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

In [4]:
mmd = mmd[mmd['genres'] != '[]']

In [5]:
mmd.shape

(43024, 24)

In [6]:
mmd = mmd.drop(['tagline', 'homepage', 'belongs_to_collection'], axis = 1)

In [7]:
mmd.dropna(inplace=True)

In [8]:
mmd.drop_duplicates(inplace=True)

In [9]:
mmd.iloc[0].genres

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [10]:
# convert in just genres to just name of list

def convertGernes(obj):
    l = []
    for i in literal_eval(obj):
        l.append(i['name'])
    return l
convertGernes(mmd.iloc[0].genres)

['Animation', 'Comedy', 'Family']

In [11]:
mmd['genres'] = mmd['genres'].apply(convertGernes)
mmd['genres'].head()

0     [Animation, Comedy, Family]
1    [Adventure, Fantasy, Family]
2               [Romance, Comedy]
3        [Comedy, Drama, Romance]
4                        [Comedy]
Name: genres, dtype: object

In [12]:
mmd['year'] = mmd['release_date'].apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)


Metadata Based Recommender

To build our standard metadata based content recommender, we will need to merge our current dataset with the crew and the keyword datasets. Let us prepare this data as our first step.


In [13]:
credits = pd.read_csv('MovieDataset/credits.csv')
keywords = pd.read_csv('MovieDataset/keywords.csv')
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [14]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
mmd['id'] = mmd['id'].astype('int')

In [15]:
mmd.shape

(42069, 22)

In [16]:
mmd = mmd.merge(credits, on='id')
mmd = mmd.merge(keywords, on='id')

In [17]:
links_small = pd.read_csv('MovieDataset/links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]
links_small = links_small['tmdbId'].astype('int')
links_small

0          862
1         8844
2        15602
3        31357
4        11862
         ...  
9120    402672
9121    315011
9122    391698
9123    137608
9124    410803
Name: tmdbId, Length: 9112, dtype: int32

In [46]:
# small metadata
smd = mmd[mmd['id'].isin(links_small)]
smd.shape

(9143, 25)


    Crew: From the crew, we will only pick the director as our feature since the others don't contribute that much to the feel of the movie.
    Cast: Choosing Cast is a little more tricky. Lesser known actors and minor roles do not really affect people's opinion of a movie. Therefore, we must only select the major characters and their respective actors. Arbitrarily we will choose the top 3 actors that appear in the credits list.


In [47]:
smd[['cast', 'crew', 'keywords']].head()

Unnamed: 0,cast,crew,keywords
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [48]:
# converting each entry as dictionary
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)

In [49]:
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

In [50]:
smd

Unnamed: 0,adult,budget,genres,id,imdb_id,original_language,original_title,overview,popularity,poster_path,...,title,video,vote_average,vote_count,year,cast,crew,keywords,cast_size,crew_size
0,False,30000000,"[Animation, Comedy, Family]",862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,...,Toy Story,False,7.7,5415.0,1995,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...",13,106
1,False,65000000,"[Adventure, Fantasy, Family]",8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,...,Jumanji,False,6.9,2413.0,1995,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1...",26,16
2,False,0,"[Romance, Comedy]",15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,...,Grumpier Old Men,False,6.5,92.0,1995,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392...",7,4
3,False,16000000,"[Comedy, Drama, Romance]",31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,...,Waiting to Exhale,False,6.1,34.0,1995,"[{'cast_id': 1, 'character': 'Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':...",10,10
4,False,0,[Comedy],11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,...,Father of the Bride Part II,False,5.7,173.0,1995,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...",12,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38088,False,8000000,[Drama],159550,tt0255313,en,The Last Brickmaker in America,A man must cope with the loss of his wife and ...,0.038998,/yWp7PgydSlxlhl7benKhTnCvRjN.jpg,...,The Last Brickmaker in America,False,7.0,1.0,2001,"[{'cast_id': 1, 'character': 'Henry Cobb', 'cr...","[{'credit_id': '544475aac3a36819fb000578', 'de...","[{'id': 6054, 'name': 'friendship'}, {'id': 20...",7,2
38288,False,1000000,"[Thriller, Romance]",392572,tt5165344,hi,रुस्तम,"Rustom Pavri, an honourable officer of the Ind...",7.333139,/q1lrN6ZrIsOs077lQB86aPGKZRF.jpg,...,Rustom,False,7.3,25.0,2016,"[{'cast_id': 0, 'character': 'Rustom Pavri', '...","[{'credit_id': '5951baf692514129c4016600', 'de...","[{'id': 10540, 'name': 'bollywood'}]",14,16
38337,False,15050000,"[Adventure, Drama, History, Romance]",402672,tt3859980,hi,Mohenjo Daro,"Village lad Sarman is drawn to big, bad Mohenj...",1.423358,/q2XVemXiWSa18mbaVpI3rbLXG2u.jpg,...,Mohenjo Daro,False,6.7,26.0,2016,"[{'cast_id': 0, 'character': 'Sarman', 'credit...","[{'credit_id': '57cd5d3592514179d50018e8', 'de...","[{'id': 10540, 'name': 'bollywood'}]",12,16
38471,False,15000000,"[Action, Adventure, Drama, Horror, Science Fic...",315011,tt4262980,ja,シン・ゴジラ,From the mind behind Evangelion comes a hit la...,9.285519,/8YWirGQidtZeSEmhqvQM5FrI6N1.jpg,...,Shin Godzilla,False,6.6,152.0,2016,"[{'cast_id': 4, 'character': 'Rando Yaguchi : ...","[{'credit_id': '560892fa92514177550018b2', 'de...","[{'id': 1299, 'name': 'monster'}, {'id': 7671,...",49,27


In [51]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [52]:
smd['director'] = smd['crew'].apply(get_director)

In [53]:
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x])

In [54]:
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

In [55]:
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x])

 creating a metadata dump for every movie which consists of genres, director, main actors and keywords. I then use a Count Vectorizer to create our count matrix as we did in the Description Recommender.

 preparation of my genres and credits data:

  * Strip Spaces and Convert to Lowercase from all our features. This way, our engine will not confuse between Johnny Depp and Johnny Galecki.
  * Mention Director 3 times to give it more weight relative to the entire cast.


In [56]:
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [57]:
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))

In [58]:
smd['director'] = smd['director'].apply(lambda x: [x,x, x])


Keywords

We will do a small amount of pre-processing of our keywords before putting them to any use. As a first step, we calculate the frequenct counts of every keyword that appears in the dataset.


In [59]:
s = smd.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'


In [60]:
s = s.value_counts()
s[:5]

independent film        606
woman director          543
murder                  399
duringcreditsstinger    327
based on novel          318
Name: keyword, dtype: int64

Keywords occur in frequencies ranging from 1 to 610. We do not have any use for keywords that occur only once. Therefore, these can be safely removed. Finally, we will convert every word to its stem so that words such as Dogs and Dog are considered the same.

In [61]:
s = s[s > 1]

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

In [63]:
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

'dog'

In [64]:
def concate_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [65]:
smd['keywords'] = smd['keywords'].apply(concate_keywords)

In [66]:
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [67]:
smd['combined'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['combined'] = smd['combined'].apply(lambda x: ' '.join(x))

In [68]:
smd['combined']

0        jealousi toy boy friendship friend rivalri boy...
1        boardgam disappear basedonchildren'sbook newho...
2        fish bestfriend duringcreditssting waltermatth...
3        basedonnovel interracialrelationship singlemot...
4        babi midlifecrisi confid age daughter motherda...
                               ...                        
38088    friendship sidneypoitier wendycrewson jayo.san...
38288    bollywood akshaykumar ileanad'cruz eshagupta t...
38337    bollywood hrithikroshan poojahegde kabirbedi a...
38471    monster godzilla giantmonst destruct kaiju hir...
38718    music documentari paulmccartney ringostarr joh...
Name: combined, Length: 9143, dtype: object

In [41]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['combined'])

In [42]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [70]:
smd.reset_index(inplace = True)

In [100]:
titles = smd['title']
tmdbId = smd['id']
indices = pd.Series(smd.index, index=smd['title'])

In [105]:
indices

title
Toy Story                                                0
Jumanji                                                  1
Grumpier Old Men                                         2
Waiting to Exhale                                        3
Father of the Bride Part II                              4
                                                      ... 
The Last Brickmaker in America                        9138
Rustom                                                9139
Mohenjo Daro                                          9140
Shin Godzilla                                         9141
The Beatles: Eight Days a Week - The Touring Years    9142
Length: 9143, dtype: int64

In [102]:
new_smd = smd[['id', 'title', 'combined']]
new_smd

Unnamed: 0,id,title,combined
0,862,Toy Story,jealousi toy boy friendship friend rivalri boy...
1,8844,Jumanji,boardgam disappear basedonchildren'sbook newho...
2,15602,Grumpier Old Men,fish bestfriend duringcreditssting waltermatth...
3,31357,Waiting to Exhale,basedonnovel interracialrelationship singlemot...
4,11862,Father of the Bride Part II,babi midlifecrisi confid age daughter motherda...
...,...,...,...
9138,159550,The Last Brickmaker in America,friendship sidneypoitier wendycrewson jayo.san...
9139,392572,Rustom,bollywood akshaykumar ileanad'cruz eshagupta t...
9140,402672,Mohenjo Daro,bollywood hrithikroshan poojahegde kabirbedi a...
9141,315011,Shin Godzilla,monster godzilla giantmonst destruct kaiju hir...


In [108]:
def content_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:6]
    movie_indices = [i[0] for i in sim_scores]
    return list(tmdbId.iloc[movie_indices].values), list(titles.iloc[movie_indices].values)

title = 'Batman Begins'
content_recommendations(title)

([155, 49026, 1124, 11660, 27205],
 ['The Dark Knight',
  'The Dark Knight Rises',
  'The Prestige',
  'Following',
  'Inception'])

Collaborative Filtering

In [76]:
ratings = pd.read_csv('MovieDataset/ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [77]:
ratings.groupby('movieId').count()['rating']

movieId
1         247
2         107
3          59
4          13
5          56
         ... 
161944      1
162376      1
162542      1
162672      1
163949      1
Name: rating, Length: 9066, dtype: int64

In [78]:
# smd.id = links_small_df.tmdbId
# links_small.movieId = ratings.movieId
links_small_df = pd.read_csv('MovieDataset/links_small.csv')
links_small_df.dropna(inplace=True)
links_small_df['tmdbId'] = links_small_df['tmdbId'].astype(int)
links_small_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844
2,3,113228,15602
3,4,114885,31357
4,5,113041,11862


In [79]:
temp1 = smd[['id', 'title']]
temp1 = temp1.rename(columns={'id':'tmdbId'})
temp2 = pd.merge(links_small_df, temp1, on='tmdbId')
temp2 = temp2[['movieId', 'title']]
ratings = pd.merge(ratings, temp2, on = 'movieId')

In [80]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,31,2.5,1260759144,Dangerous Minds
1,7,31,3.0,851868750,Dangerous Minds
2,31,31,4.0,1273541953,Dangerous Minds
3,32,31,4.0,834828440,Dangerous Minds
4,36,31,3.0,847057202,Dangerous Minds


In [81]:
pt = ratings.pivot_table(index = 'title', columns = 'userId', values = 'rating')
pt.fillna(0, inplace = True)
pt

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
$9.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Neath the Arizona Skies,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"'night, Mother",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(500) Days of Summer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...And God Created Woman,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
¡Three Amigos!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
À Nous la Liberté,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Æon Flux,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Želary,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [82]:
cosine_sim_pt = cosine_similarity(pt)
cosine_sim_pt

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.12403473],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.12403473, 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [83]:
pd.Series([i for i in range(pt.shape[0])], index=pt.index)

title
$9.99                          0
'Neath the Arizona Skies       1
'night, Mother                 2
(500) Days of Summer           3
...And God Created Woman       4
                            ... 
¡Three Amigos!              8704
À Nous la Liberté           8705
Æon Flux                    8706
Želary                      8707
’Round Midnight             8708
Length: 8709, dtype: int64

In [84]:
pt_titles = pd.Series(pt.index)
pt_indices = pd.Series([i for i in range(pt.shape[0])], index=pt.index)

In [110]:
def collaborative_recommendations(title):
    pt_idx = pt_indices[title]
    pt_sim_scores = list(enumerate(cosine_sim_pt[pt_idx]))
    pt_sim_scores = sorted(pt_sim_scores, key=lambda x: x[1], reverse=True)
    pt_sim_scores = pt_sim_scores[1:6]
    pt_movies_inices = [i[0] for i in pt_sim_scores]
    return list(pt_titles.iloc[pt_movies_inices].values)

title = 'Batman'
collaborative_recommendations(title)

['True Lies',
 'Batman Forever',
 'The Fugitive',
 'Dances with Wolves',
 'Jurassic Park']

Hybrid Recommendation System

In [86]:
title = 'Batman Begins'
def hybrid_recommendation(title):
    cnt = content_recommendations(title)
    clb = collaborative_recommendations(title)
    return list(set(cnt+clb))

hybrid_recommendation(title)

['The Matrix Reloaded',
 'The Dark Knight',
 'Ninja',
 'Iron Man',
 'The Prestige',
 'Batman Returns',
 'The Bourne Identity',
 'The Dark Knight Rises',
 'V for Vendetta',
 'Insomnia',
 'The Incredibles',
 'Pirates of the Caribbean: The Curse of the Black Pearl',
 'Interstellar',
 'The Lord of the Rings: The Fellowship of the Ring',
 'Inception',
 'Memento',
 'The Lord of the Rings: The Return of the King',
 'Following',
 'The Lord of the Rings: The Two Towers']

In [87]:
import pickle

In [88]:
#pickle.dump(new_smd.to_dict(), open('movie_dict.pkl', 'wb'))

In [89]:
#pickle.dump(cosine_sim, open('cosine_sim.pkl', 'wb'))