In [1]:
import os
import string

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Load Data 

In [2]:
# Read the raw movie metadata; the path is relative to the working directory.
DATASET_PATH = 'Dataset/movie_dataset.csv'
df = pd.read_csv(DATASET_PATH)

In [3]:
# List the columns available in the raw dataset.
df.columns

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')

## Prepare Data 

In [4]:
# Text columns that describe a movie; they are later joined into one string
# per movie and vectorized.
features = ['genres', 'keywords', 'cast', 'director']

# Select the title plus all feature columns in one step instead of
# concatenating one column at a time in a loop. .copy() makes `data`
# independent of `df`, so later edits to `data` never touch the raw frame.
data = df[['title'] + features].copy()

In [5]:
# Preview the first rows of the working frame.
data.head()

Unnamed: 0,title,genres,keywords,cast,director
0,Avatar,Action Adventure Fantasy Science Fiction,culture clash future space war space colony so...,Sam Worthington Zoe Saldana Sigourney Weaver S...,James Cameron
1,Pirates of the Caribbean: At World's End,Adventure Fantasy Action,ocean drug abuse exotic island east india trad...,Johnny Depp Orlando Bloom Keira Knightley Stel...,Gore Verbinski
2,Spectre,Action Adventure Crime,spy based on novel secret agent sequel mi6,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,Sam Mendes
3,The Dark Knight Rises,Action Crime Drama Thriller,dc comics crime fighter terrorist secret ident...,Christian Bale Michael Caine Gary Oldman Anne ...,Christopher Nolan
4,John Carter,Action Adventure Science Fiction,based on novel mars medallion space travel pri...,Taylor Kitsch Lynn Collins Samantha Morton Wil...,Andrew Stanton


In [6]:
# Percentage of rows that have a missing value in at least one column.
rows_with_nan = data.isna().any(axis=1)
int(rows_with_nan.sum()) / len(data) * 100

8.869456589631481

In [7]:
# Dropping every row with a missing value would discard a noticeable share of
# the movies, so missing text is replaced with an empty string instead.
# Rebinding (rather than inplace=True) keeps the step explicit and chainable.
data = data.fillna('')

In [8]:
# Sanity check: number of rows that still contain a missing value.
int(data.isna().any(axis=1).sum())

0

In [9]:
# (number of rows, number of columns) of the working frame.
data.shape

(4803, 5)

In [10]:
# Turn titles into lookup keys: lower-case them, then strip punctuation and
# spaces (e.g. "The Dark Knight Rises" -> "thedarkknightrises").
replace_punc_dict = dict.fromkeys(string.punctuation)  # punctuation char -> None (deleted)
replace_punc_dict[' '] = ''                              # spaces are removed as well
table = str.maketrans(replace_punc_dict)

# Assign the whole column at once. The previous element-wise
# `data['title'][i] = ...` was chained assignment: it triggers
# SettingWithCopyWarning and does not update `data` when pandas
# Copy-on-Write is active. The vectorized form also avoids a Python loop.
data['title'] = data['title'].str.lower().str.translate(table)

In [11]:
# One space-separated text blob per movie, in the order:
# genres, keywords, cast, director.
data["combined_features"] = data['genres'].str.cat(
    [data['keywords'], data['cast'], data['director']],
    sep=" ",
)

In [12]:
# Preview the combined text that will be vectorized.
data['combined_features'].head()

0    Action Adventure Fantasy Science Fiction cultu...
1    Adventure Fantasy Action ocean drug abuse exot...
2    Action Adventure Crime spy based on novel secr...
3    Action Crime Drama Thriller dc comics crime fi...
4    Action Adventure Science Fiction based on nove...
Name: combined_features, dtype: object

## Vectorizing and creating cosine_similarity score matrix

In [13]:
# Token-count vectorizer created with its default settings.
cv = CountVectorizer()

In [14]:
# Fit the vectorizer on the combined text and count tokens per movie,
# then expand the result into a dense array.
token_counts = cv.fit_transform(data['combined_features'])
cv_mat = token_counts.toarray()

In [15]:
# Show the dense count matrix (one row per movie).
cv_mat

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [16]:
# Cosine similarity between the rows of the count matrix; similarity[a][b]
# is the score between movie a and movie b.
similarity = cosine_similarity(cv_mat)

## Preparing and Saving the Lookup Tables

In [17]:
# For every movie keep its 25 most similar *other* movies as
# (movie index, similarity score) pairs, best match first.
TOP_N = 25
new_sim = []
for movie_idx, scores in enumerate(similarity):
    # Exclude the movie itself by index rather than by dropping whatever
    # sorts first: when several rows tie for the top score, the movie is not
    # guaranteed to be in position 0, and slicing [1:] would then drop a
    # genuine neighbour and keep the movie as its own recommendation.
    candidates = [
        (other_idx, score)
        for other_idx, score in enumerate(scores)
        if other_idx != movie_idx
    ]
    # list.sort is stable, so tied scores keep ascending index order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    new_sim.append(candidates[:TOP_N])

In [18]:
# Inspect the (index, score) neighbours stored for the first movie.
print(new_sim[0])

[(94, 0.42339019740572564), (2403, 0.3774256780481986), (3208, 0.3464101615137755), (47, 0.34426518632954817), (56, 0.33596842045264647), (3158, 0.33333333333333337), (2198, 0.31426968052735443), (2696, 0.30792014356780045), (4401, 0.28867513459481287), (1531, 0.2858966759567453), (278, 0.2810913475705226), (1053, 0.2809003238667948), (239, 0.2765204519281134), (838, 0.2749859704614352), (61, 0.27498597046143514), (232, 0.2694301256218254), (4332, 0.2694301256218254), (661, 0.264197974633739), (4593, 0.264197974633739), (3730, 0.2592592592592593), (1650, 0.2501595914621521), (158, 0.24618298195866545), (461, 0.24618298195866545), (1083, 0.24618298195866542), (322, 0.24077170617153845)]


In [19]:
# Flatten each (index, score) pair into one "index|score" string so every
# movie's neighbours fit into a single row of plain cells.
save_sim = [
    ['|'.join((str(neighbour), str(score))) for neighbour, score in neighbours]
    for neighbours in new_sim
]

In [20]:
# Inspect the "index|score" strings stored for the first movie.
print(save_sim[0])

['94|0.42339019740572564', '2403|0.3774256780481986', '3208|0.3464101615137755', '47|0.34426518632954817', '56|0.33596842045264647', '3158|0.33333333333333337', '2198|0.31426968052735443', '2696|0.30792014356780045', '4401|0.28867513459481287', '1531|0.2858966759567453', '278|0.2810913475705226', '1053|0.2809003238667948', '239|0.2765204519281134', '838|0.2749859704614352', '61|0.27498597046143514', '232|0.2694301256218254', '4332|0.2694301256218254', '661|0.264197974633739', '4593|0.264197974633739', '3730|0.2592592592592593', '1650|0.2501595914621521', '158|0.24618298195866545', '461|0.24618298195866545', '1083|0.24618298195866542', '322|0.24077170617153845']


In [21]:
# Convert the nested lists into a NumPy array and print its shape.
save_sim = np.array(save_sim)
print(save_sim.shape)

(4803, 25)


In [22]:
# Titles as they appear in the raw dataset.
df['title'].head()

0                                      Avatar
1    Pirates of the Caribbean: At World's End
2                                     Spectre
3                       The Dark Knight Rises
4                                 John Carter
Name: title, dtype: object

In [23]:
# Titles in `data` after lower-casing and removing punctuation and spaces.
data['title'].head()

0                              avatar
1    piratesofthecaribbeanatworldsend
2                             spectre
3                  thedarkknightrises
4                          johncarter
Name: title, dtype: object

In [24]:
# Wrap the three tables as DataFrames:
#   save_sim_pd - "index|score" neighbour strings, one row per movie
#   lookup_pd   - normalized titles (from `data`)
#   show_pd     - original titles (from `df`)
save_sim_pd = pd.DataFrame(data=save_sim)
lookup_pd = data['title'].to_frame()
show_pd = df['title'].to_frame()

In [25]:
# Preview the neighbour table.
save_sim_pd.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,94|0.42339019740572564,2403|0.3774256780481986,3208|0.3464101615137755,47|0.34426518632954817,56|0.33596842045264647,3158|0.33333333333333337,2198|0.31426968052735443,2696|0.30792014356780045,4401|0.28867513459481287,1531|0.2858966759567453,...,232|0.2694301256218254,4332|0.2694301256218254,661|0.264197974633739,4593|0.264197974633739,3730|0.2592592592592593,1650|0.2501595914621521,158|0.24618298195866545,461|0.24618298195866545,1083|0.24618298195866542,322|0.24077170617153845
1,199|0.6207522318391884,12|0.6091095901015049,5|0.2556038601690775,115|0.2556038601690775,30|0.2506402059138015,536|0.23732222626728364,1017|0.23333333333333328,2029|0.2284160962880643,129|0.22841609628806428,1652|0.223606797749979,...,17|0.19462473604038072,178|0.19462473604038072,262|0.19462473604038072,894|0.19034674690672024,1095|0.19034674690672024,448|0.1884336657203579,3374|0.1863389981249825,329|0.18633899812498247,330|0.18633899812498247,840|0.18633899812498247
2,29|0.48900964692182586,11|0.3336230624913197,354|0.2553769592276246,183|0.25021729686848976,1100|0.25021729686848976,1137|0.25021729686848976,1999|0.25021729686848976,2156|0.25021729686848976,3336|0.25021729686848976,3373|0.24573659359149527,...,147|0.2323209277987099,219|0.22750787759664506,1958|0.22750787759664506,444|0.22470176588194152,1192|0.22470176588194152,2167|0.2222771122371935,1024|0.22116293423234573,206|0.2173913043478261,1013|0.2173913043478261,3986|0.2173913043478261
3,119|0.7307692307692306,65|0.6923076923076922,4638|0.45291081365783836,2793|0.39467610868816316,1196|0.37630890450319093,428|0.3602883460614461,1359|0.3530090432487313,210|0.3396831102433788,1720|0.3335621924974955,2371|0.3335621924974955,...,4099|0.29268470350248177,2790|0.29134281629169184,1013|0.28625128703833574,1792|0.28625128703833574,2398|0.28625128703833574,1503|0.2853908964926965,3293|0.28022426915890253,1986|0.27735009811261463,4664|0.27735009811261463,1234|0.27456258919345766
4,972|0.30792014356780045,2904|0.30792014356780045,3494|0.30792014356780045,4401|0.30000000000000004,111|0.2919201796799047,270|0.28577380332470415,183|0.28,260|0.28,400|0.28,2444|0.27456258919345766,...,122|0.25021729686848976,2630|0.25021729686848976,27|0.24494897427831788,249|0.24494897427831788,256|0.24494897427831788,1068|0.24494897427831788,2121|0.24494897427831788,94|0.24000000000000005,266|0.24000000000000005,419|0.24000000000000005


In [26]:
# Preview the normalized-title table.
lookup_pd.head()

Unnamed: 0,title
0,avatar
1,piratesofthecaribbeanatworldsend
2,spectre
3,thedarkknightrises
4,johncarter


In [27]:
# Preview the original-title table.
show_pd.head()

Unnamed: 0,title
0,Avatar
1,Pirates of the Caribbean: At World's End
2,Spectre
3,The Dark Knight Rises
4,John Carter


In [28]:
# Make sure the output directory exists so the export also works when
# 'model/' has not been created yet; exist_ok=True leaves an existing
# directory untouched.
os.makedirs('model', exist_ok=True)

# Write each table without the index column and without a header row.
save_sim_pd.to_csv('model/res.csv', index=False, header=False)
lookup_pd.to_csv('model/lookup.csv', index=False, header=False)
show_pd.to_csv('model/show.csv', index=False, header=False)

## Execution and Results

In [29]:
# Movie to look up (hard-coded here; replace with user input as needed).
# Titles in `data` were lower-cased and stripped of punctuation and spaces,
# so the query is normalized the same way before matching.
movie = "avatar"
movie_key = movie.lower().translate(table)

matches = data.index[data['title'] == movie_key]
if len(matches) == 0:
    # Same exception type the bare `[0]` lookup raised, but with a message
    # that says which title was not found.
    raise IndexError("no movie with normalized title {!r}".format(movie_key))
# If several movies share a normalized title, the first one is used.
i = matches[0]

In [30]:
# Pair every score in the selected movie's similarity row with its position.
indexed_sim = [(idx, score) for idx, score in enumerate(similarity[i])]

In [31]:
# Order the (index, score) pairs from most to least similar.
sorted_sim = list(indexed_sim)
sorted_sim.sort(key=lambda pair: pair[1], reverse=True)

In [32]:
# Keep the 20 best matches, excluding the selected movie by its index instead
# of assuming it is the first entry of the sorted list (a tie for the top
# score can put a different movie there).
req_sim = [pair for pair in sorted_sim if pair[0] != i][:20]

In [33]:
# Disabled alternative: print only the matches whose vote_average is above
# 6.5 and whose vote_count is above 3000.
# for i,_ in req_sim:
#     if df['vote_average'][i]>6.5 and df['vote_count'][i]>3000:
#         print(data['title'][i])

In [35]:
# Print the original (display) title of every recommended movie.
# The loop variable has its own name so the selected movie's index `i` is not
# overwritten, and .iloc is used because the stored indices are row positions
# produced by enumerate().
for match_idx, _ in req_sim:
    print(df['title'].iloc[match_idx])

Guardians of the Galaxy
Aliens
Star Wars: Clone Wars: Volume 1
Star Trek Into Darkness
Star Trek Beyond
Alien
Lockout
Jason X
The Helix... Loaded
Moonraker
Planet of the Apes
Galaxy Quest
Gravity
Alien³
Jupiter Ascending
The Wolverine
Silent Running
Zathura: A Space Adventure
Trekkies
Cargo
