# Movie Recommendation Engine based on the IMDB Movie Dataset

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

In [28]:
dt=pd.read_csv('movie_dataset.csv')

In [5]:
dt.describe()

Unnamed: 0,index,budget,id,popularity,revenue,runtime,vote_average,vote_count
count,4803.0,4803.0,4803.0,4803.0,4803.0,4801.0,4803.0,4803.0
mean,2401.0,29045040.0,57165.484281,21.492301,82260640.0,106.875859,6.092172,690.217989
std,1386.651002,40722390.0,88694.614033,31.81665,162857100.0,22.611935,1.194612,1234.585891
min,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
25%,1200.5,790000.0,9014.5,4.66807,0.0,94.0,5.6,54.0
50%,2401.0,15000000.0,14629.0,12.921594,19170000.0,103.0,6.2,235.0
75%,3601.5,40000000.0,58610.5,28.313505,92917190.0,118.0,6.8,737.0
max,4802.0,380000000.0,459488.0,875.581305,2787965000.0,338.0,10.0,13752.0


In [35]:
dt.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [5]:
dt.columns

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')

In [6]:
features=dt[['keywords','cast','genres','director']]

In [7]:
for i in features:
    features[i]=features[i].fillna("")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [8]:
def combine_features(row):
    try:
        return row['keywords']+" "+row['cast']+" "+row['genres']+row["director"]
    except:
        print("Error:",row)

In [9]:
features['combined_features']=features.apply(combine_features,axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [10]:
features['combined_features']

0       culture clash future space war space colony so...
1       ocean drug abuse exotic island east india trad...
2       spy based on novel secret agent sequel mi6 Dan...
3       dc comics crime fighter terrorist secret ident...
4       based on novel mars medallion space travel pri...
                              ...                        
4798    united states\u2013mexico barrier legs arms pa...
4799     Edward Burns Kerry Bish\u00e9 Marsha Dietlein...
4800    date love at first sight narration investigati...
4801     Daniel Henney Eliza Coupe Bill Paxton Alan Ru...
4802    obsession camcorder crush dream girl Drew Barr...
Name: combined_features, Length: 4803, dtype: object

In [11]:
vectorizer=CountVectorizer()

***Here that object is fitting with the data of the column "combined_features"***

In [12]:
matrix=vectorizer.fit_transform(features['combined_features'])

In [13]:
print(matrix.shape)

(4803, 17055)


***To see the data in the recieved Sparse type of matrix , we use toarray() function, which convert its data into array format***

In [14]:
type(matrix)

scipy.sparse.csr.csr_matrix

In [15]:
print(matrix.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [16]:
from sklearn.metrics.pairwise import cosine_similarity

***Cosine Similarity Method Used***

In [17]:
cos_similarity=cosine_similarity(matrix)

In [18]:
cos_similarity.shape

(4803, 4803)

In [19]:
dt.index[dt['title']=="Avatar"]

Int64Index([0], dtype='int64')

In [20]:
dt.title

0                                         Avatar
1       Pirates of the Caribbean: At World's End
2                                        Spectre
3                          The Dark Knight Rises
4                                    John Carter
                          ...                   
4798                                 El Mariachi
4799                                   Newlyweds
4800                   Signed, Sealed, Delivered
4801                            Shanghai Calling
4802                           My Date with Drew
Name: title, Length: 4803, dtype: object

In [21]:
avatar_sim=cos_similarity[0]

In [22]:
avatar_sim

array([1.       , 0.0728357, 0.0836242, ..., 0.       , 0.       ,
       0.       ])

***Now we got the similarty score list of all the movies with avatar movie, but here in the 'cos_similarity' we have index values which holds the similarity of each movie, since we want to know the names of top similar movies with avatar movie, therefore we will have to get the index of all the top similar movies***

***We will get the indexes of all the top movies mapped with there actual index in "cos_similarity" array into the form of tuples in a list***

In [23]:
avatar_list_with_similarity_index=list(enumerate(cos_similarity[1020]))

In [43]:
avatar_list_with_similarity_index[0:50]

[(0, 0.0),
 (1, 0.0),
 (2, 0.04767312946227961),
 (3, 0.0),
 (4, 0.045643546458763846),
 (5, 0.045643546458763846),
 (6, 0.0),
 (7, 0.04152273992686998),
 (8, 0.0),
 (9, 0.09325048082403138),
 (10, 0.045643546458763846),
 (11, 0.045643546458763846),
 (12, 0.0),
 (13, 0.0),
 (14, 0.08606629658238704),
 (15, 0.044721359549995794),
 (16, 0.0408248290463863),
 (17, 0.0),
 (18, 0.04152273992686998),
 (19, 0.0),
 (20, 0.04662524041201569),
 (21, 0.09325048082403138),
 (22, 0.0),
 (23, 0.0),
 (24, 0.0),
 (25, 0.0),
 (26, 0.0),
 (27, 0.04662524041201569),
 (28, 0.0),
 (29, 0.0),
 (30, 0.0),
 (31, 0.045643546458763846),
 (32, 0.04303314829119352),
 (33, 0.12909944487358055),
 (34, 0.0),
 (35, 0.0),
 (36, 0.0),
 (37, 0.0),
 (38, 0.08944271909999159),
 (39, 0.0),
 (40, 0.045643546458763846),
 (41, 0.044721359549995794),
 (42, 0.0),
 (43, 0.045643546458763846),
 (44, 0.049999999999999996),
 (45, 0.0),
 (46, 0.042257712736425826),
 (47, 0.0),
 (48, 0.0512989176042577),
 (49, 0.049999999999999996)]

***Now we have got list of tuples having each score mapped with its index, now we want to sort all the tuples in the list based on there score values , after sorting we will get the tuples sorted by score values***

In [25]:
sorted_avatar_list_with_similarity_index=sorted(avatar_list_with_similarity_index,key=lambda x :x[1],reverse=True)

In [44]:
sorted_avatar_list_with_similarity_index[0:50]

[(1020, 1.0000000000000002),
 (1041, 0.33371190623595726),
 (982, 0.31950482521134693),
 (1815, 0.3077935056255462),
 (3507, 0.23312620206007845),
 (2399, 0.20080483222562473),
 (4408, 0.19999999999999998),
 (451, 0.17888543819998318),
 (63, 0.17541160386140586),
 (1781, 0.1690308509457033),
 (4767, 0.16770509831248423),
 (444, 0.1632993161855452),
 (2924, 0.16269784336399212),
 (3858, 0.16269784336399212),
 (1844, 0.1538967528127731),
 (2557, 0.1538967528127731),
 (2780, 0.1538967528127731),
 (4098, 0.15),
 (779, 0.14638501094227996),
 (1178, 0.14638501094227996),
 (1537, 0.14638501094227996),
 (2688, 0.14638501094227996),
 (3682, 0.14638501094227996),
 (984, 0.14301938838683884),
 (1127, 0.14301938838683884),
 (2553, 0.14301938838683884),
 (1245, 0.13987572123604708),
 (1517, 0.13987572123604708),
 (2198, 0.13987572123604708),
 (2773, 0.13987572123604708),
 (480, 0.13693063937629155),
 (930, 0.13693063937629155),
 (1432, 0.13693063937629155),
 (3896, 0.13693063937629155),
 (511, 0.13

In [27]:
def get_title_index(ind):
    return dt['title'][ind]

# Here is the output for the Top 10 similar movies for the movie title 'Avatar' having index value '0' in the dataset

In [28]:
print('**Top 10 Correlated movies are : \n ')
for i in sorted_avatar_list_with_similarity_index[0:10]:
    print(get_title_index(i[0]))
    

**Top 10 Correlated movies are : 
 
Taken 3
Taken 2
Run All Night
Taken
Deadfall
Black Nativity
Jimmy and Judy
The Haunting
The Chronicles of Narnia: The Lion, the Witch and the Wardrobe
A Walk Among the Tombstones


## Dumping the cos_similarity data into a Pickle file

***In the below code we are saving the 'cos_similarity' data into a file using pickle module***

In [1]:
import pickle

In [30]:
array_obj=open('cos_similarity.pkl','wb')
pickle.dump(cos_similarity,array_obj)
array_obj.close()

## Loading the pickle file which was dumped by above code and it sized about 176mb

In [31]:
cosine_similarity_list=pickle.load(open('cos_similarity.pkl','rb'))

In [32]:
cosine_similarity_list.shape

(4803, 4803)

# Pickle File Compression and uncompressing techniques

### *Compressing the pickle file with file  name "compressed_cos_similarity.pkl" :*

In [33]:
import bz2
import _pickle as cPickle

In [34]:
sfile=bz2.BZ2File('compressed_cos_similarity.pkl','wb')
pickle.dump(cosine_similarity_list,sfile)

In [35]:
sfile.close()

### *Uncompressing the pickle file with file  name "compressed_cos_similarity.pkl" which was compressed by above code and compression reduced its size from 176mb to 12mb...Great :*

In [36]:
f=bz2.BZ2File('compressed_cos_similarity.pkl','rb')

In [37]:
myobj=cPickle.load(f)

In [38]:
myobj.shape

(4803, 4803)

### The loaded data from both Compressed and uncompressed pickle files is same..Great

In [40]:
myobj[1020]

array([0.        , 0.        , 0.04767313, ..., 0.        , 0.        ,
       0.        ])

In [41]:
cosine_similarity_list[1020]

array([0.        , 0.        , 0.04767313, ..., 0.        , 0.        ,
       0.        ])

***In the below code we will save movie 'titles' with their corresponding indexes of all the movies into a file using pickle module***

In [33]:
title_file=open('movie_index.pkl','wb')

In [35]:
pickle.dump(dt['title'],title_file)

In [None]:
title_file.close()

***In the below code we will save movie 'id' and 'titles' with their corresponding indexes of all the movies into a file using pickle module***

In [None]:
title_id=open('id_title.pkl','wb')

In [None]:
pickle.dump(dt[['id','title']],title_id)

In [None]:
title_id.close()

### Reading the pickle files

In [2]:
movie_titles=pickle.load(open('movie_index.pkl','rb'))

In [None]:
movie_titles.head()

In [None]:
movie_id_titles=pickle.load(open('id_title.pkl','rb'))

In [None]:
movie_id_titles.head()

In [53]:
type(movie_titles)

pandas.core.series.Series

In [32]:
movie_titles.shape

(4803,)

# Rough code

### *In some records of the data , there are some duplicate values or incorrect values , we can check this by below code*

In [23]:
dt.index[dt['title']=='The Matrix Revolutions']

Int64Index([123], dtype='int64')

In [32]:
dt.keywords[123]

'saving the world artificial intelligence man vs machine flying philosophy'

In [25]:
dt.index[dt['title']=='Interstellar']

Int64Index([95], dtype='int64')

In [33]:
dt.keywords[95]

'saving the world artificial intelligence father son relationship single parent nasa'

In [37]:
dt['id'].loc[dt['title']=='Aliens']

2403    679
Name: id, dtype: int64

In [38]:
dt['id']

0        19995
1          285
2       206647
3        49026
4        49529
         ...  
4798      9367
4799     72766
4800    231617
4801    126186
4802     25975
Name: id, Length: 4803, dtype: int64

'Adventure Action Thriller Science Fiction'----'Adventure Drama Science Fiction'

'saving the world artificial intelligence man vs machine flying philosophy'-----'saving the world artificial intelligence father son relationship single parent nasa'

'Keanu Reeves Laurence Fishburne Carrie-Anne Moss Hugo Weaving Mary Alice'-----'Matthew McConaughey Jessica Chastain Anne Hathaway Michael Caine Casey Affleck'

'Lilly Wachowski'----'Christopher Nolan'







