In [169]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [170]:
def title_from_index(index):
    return df[df.index == index]["title"].values[0]

In [171]:
def index_from_title(title):
    return df[df.title == title]["index"].values[0]

In [172]:
df = pd.read_csv('movie_dataset_bollywood.csv')
print(df.columns)

Index(['index', 'imdbId', 'title', 'releaseYear', 'releaseDate', 'genre',
       'writers', 'actors', 'directors', 'sequel', 'hitFlop'],
      dtype='object')


In [182]:
# Getting features from our dataset and if those features contain 'NaN' values then replace it with a blank

In [173]:
features = ['genre', 'actors', 'directors','writers']
for feature in features:
    df[feature] = df[feature].fillna('')

In [184]:
# Combining all the extracted features

In [183]:
def combine_features(row):
    try:
        return row['genre'] + " " + row['actors'] + " "+ row['directors']+ " "+ row['writers']
    except:
        print ('Error: ',row)
df["combined_features"] = df.apply(combine_features, axis=1)
print("Combined Features : ",df["combined_features"].head())

Combined Features :  0    Romance Govinda | Aishwarya Rai Bachchan | Jac...
1    Adventure | Drama | Musical Aamir Khan | Gracy...
2    Action | Comedy Akshay Kumar | Sridevi | Gulsh...
3    Drama | Romance Shah Rukh Khan | Madhuri Dixit...
4    Action | Comedy | Drama Shah Rukh Khan | Juhi ...
Name: combined_features, dtype: object


In [186]:
# CountVectorizer is used for converting into matrix of token count

In [185]:
cv = CountVectorizer()
count_matrix = cv.fit_transform(df["combined_features"])
print(count_matrix)

  (0, 910)	1
  (0, 3000)	1
  (0, 1380)	1
  (0, 3095)	1
  (0, 2819)	1
  (0, 1304)	2
  (0, 1260)	2
  (0, 2777)	1
  (0, 833)	1
  (0, 2944)	1
  (0, 2065)	1
  (0, 2979)	1
  (0, 1327)	1
  (0, 372)	1
  (0, 2445)	1
  (0, 98)	1
  (0, 1133)	1
  (0, 2624)	1
  (1, 2807)	1
  (1, 822)	1
  (1, 2754)	1
  (1, 818)	1
  (1, 1671)	1
  (1, 1135)	3
  (1, 307)	3
  :	:
  (1282, 2745)	1
  (1282, 1184)	1
  (1282, 885)	1
  (1282, 3425)	1
  (1282, 2689)	1
  (1282, 920)	1
  (1282, 1455)	1
  (1282, 813)	1
  (1282, 2444)	1
  (1282, 1507)	1
  (1282, 769)	1
  (1283, 1840)	1
  (1283, 1513)	1
  (1283, 2965)	1
  (1283, 2622)	2
  (1283, 3443)	1
  (1283, 1799)	2
  (1283, 227)	1
  (1283, 3019)	1
  (1283, 813)	1
  (1283, 2889)	1
  (1283, 3254)	1
  (1283, 1961)	1
  (1283, 1509)	1
  (1283, 80)	1


In [176]:
cosine_similar = cosine_similarity(count_matrix)
print(cosine_similar)

[[1.         0.         0.         ... 0.         0.         0.        ]
 [0.         1.         0.         ... 0.         0.         0.        ]
 [0.         0.         1.         ... 0.         0.08703883 0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.08703883 ... 0.         1.         0.08703883]
 [0.         0.         0.         ... 0.         0.08703883 1.        ]]


In [194]:
user_movie = "Devdas"

In [195]:
# Try to fetch the user_movie from our dataset if user_movie not in dataset throw an error

In [196]:
try:
    movie_index = index_from_title(user_movie)
except:
    print("Error: Please Check the Name")

In [197]:
# Converting matrix value into list

In [198]:
similar_movies =  list(enumerate(cosine_similar[movie_index]))
print(similar_movies)

[(0, 0.4670993664969138), (1, 0.14322297480788657), (2, 0.0), (3, 0.483045891539648), (4, 0.3450327796711771), (5, 0.9999999999999998), (6, 0.07161148740394328), (7, 0.4296689244236597), (8, 0.13801311186847084), (9, 0.28644594961577313), (10, 0.0), (11, 0.1690308509457033), (12, 0.21483446221182984), (13, 0.0), (14, 0.0), (15, 0.0), (16, 0.1333333333333333), (17, 0.1556997888323046), (18, 0.14907119849998599), (19, 0.14322297480788657), (20, 0.07453559924999299), (21, 0.0), (22, 0.14907119849998599), (23, 0.0816496580927726), (24, 0.223606797749979), (25, 0.0), (26, 0.0), (27, 0.0816496580927726), (28, 0.07161148740394328), (29, 0.0), (30, 0.0778498944161523), (31, 0.3113995776646092), (32, 0.07161148740394328), (33, 0.07161148740394328), (34, 0.0778498944161523), (35, 0.19364916731037082), (36, 0.0), (37, 0.07161148740394328), (38, 0.06900655593423542), (39, 0.0), (40, 0.1333333333333333), (41, 0.14322297480788657), (42, 0.223606797749979), (43, 0.37267799624996495), (44, 0.199999999

In [199]:
# Sorting the movies in dataset in the decreasing order so that the most relevant one shows up first

In [200]:
sorted_similar_movies = sorted(similar_movies,key=lambda x:x[1],reverse=True)


In [201]:
# SHow top 10 related movies

In [202]:
i=0
for movie in sorted_similar_movies:
    print(title_from_index(movie[0]))
    i=i+1
    if i>10:
        break

Devdas
Hum Tumhare Hain Sanam
Albela
Guzaarish
Kabhi Khushi Kabhie Gham...
Kuch Naa Kaho
Saawariya
Lajja
Shabd
Paheli
Main Hoon Na
