In [20]:
import pandas as pd
from ast import literal_eval
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# import boto3
# from botocore.config import Config


In [21]:
# Number of leading rows to load from each CSV. Working on a subset keeps the
# vectorisation and similarity steps small.
N_ROWS = 10000

# Ask the parser for only the first N_ROWS rows instead of reading each file
# in full and slicing afterwards. Besides skipping the unused rows, this makes
# each frame an independent DataFrame rather than an iloc slice of a larger
# one, so later column assignments do not operate on a slice
# (NOTE(review): dtype inference now only sees the loaded rows).
metadata = pd.read_csv('movies_metadata.csv', nrows=N_ROWS)
credits_ = pd.read_csv('credits.csv', nrows=N_ROWS)
keywords = pd.read_csv('keywords.csv', nrows=N_ROWS)

  interactivity=interactivity, compiler=compiler, result=result)


In [22]:
# Cast the `id` column of every frame to integer, in the same order as before
# (keywords, credits_, metadata), so all three share one key dtype.
for frame in (keywords, credits_, metadata):
    frame['id'] = frame['id'].astype('int')

# keywords.loc[0]['keywords']
# credits_.loc[0]['crew']
# metadata.loc[0]

In [23]:
# Attach the credits and keyword columns to the movie metadata, matching rows
# on `id` (pandas' default inner join).
metadata = metadata.merge(credits_, on='id').merge(keywords, on='id')

# Run every value of these columns through ast.literal_eval so the cells hold
# Python objects instead of their textual form.
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    parsed_column = metadata[feature].apply(literal_eval)
    metadata[feature] = parsed_column

In [29]:


# metadata.loc[0]['keywords']
def get_director(x):
    """Return the name of the first crew entry whose job is 'Director'.

    Parameters
    ----------
    x : list of dict
        Crew entries; each is expected to carry 'job' and 'name' keys.

    Returns
    -------
    str or float
        The director's name, or ``np.nan`` when ``x`` is not a list/tuple
        (missing or malformed data) or holds no usable director entry.
    """
    # Missing crew data (NaN, None, ...) is not iterable; report "no director"
    # instead of raising so one bad row cannot abort a whole column apply.
    if not isinstance(x, (list, tuple)):
        return np.nan

    for member in x:
        # Ignore entries that are not dicts or that lack a 'job' key.
        if isinstance(member, dict) and member.get('job') == 'Director':
            return member.get('name', np.nan)

    return np.nan

# Derive the 'director' column by running get_director over each crew value.
crew_column = metadata['crew']
metadata['director'] = crew_column.map(get_director)

# Show the crew column.
metadata['crew']

0        [{'credit_id': '52fe4284c3a36847f8024f49', 'de...
1        [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...
2        [{'credit_id': '52fe466a9251416c75077a89', 'de...
3        [{'credit_id': '52fe44779251416c91011acb', 'de...
4        [{'credit_id': '52fe44959251416c75039ed7', 'de...
                               ...                        
10013    [{'credit_id': '5712c499c3a368673d004567', 'de...
10014    [{'credit_id': '52fe44f2c3a36847f80b356d', 'de...
10015    [{'credit_id': '57cb6aee92514163dd003b6e', 'de...
10016    [{'credit_id': '532116ff9251411f890020a3', 'de...
10017    [{'credit_id': '52fe45659251416c75055007', 'de...
Name: crew, Length: 10018, dtype: object

In [30]:
def get_list(x, max_items=3):
    """Extract the 'name' of the leading entries of a list of dicts.

    Parameters
    ----------
    x : list of dict, or any other value
        Entries that each carry a 'name' key.
    max_items : int or None, optional
        Maximum number of names to return; must be non-negative. Defaults to
        3. ``None`` returns every name.

    Returns
    -------
    list
        Up to ``max_items`` names in their original order, or an empty list
        when ``x`` is not a list (missing/malformed data).
    """
    if not isinstance(x, list):
        # Return empty list in case of missing/malformed data
        return []

    # Truncate first so names are only looked up for the entries we keep.
    return [entry['name'] for entry in x[:max_items]]

# Replace each of these columns with the list that get_list returns for
# every value in it.
features = ['cast', 'keywords', 'genres']
for feature in features:
    shortened = metadata[feature].map(get_list)
    metadata[feature] = shortened

# Show the resulting cast column.
metadata['cast']

0                      [Tom Hanks, Tim Allen, Don Rickles]
1           [Robin Williams, Jonathan Hyde, Kirsten Dunst]
2               [Walter Matthau, Jack Lemmon, Ann-Margret]
3        [Whitney Houston, Angela Bassett, Loretta Devine]
4               [Steve Martin, Diane Keaton, Martin Short]
                               ...                        
10013    [Emma Gramatica, Francesco Golisano, Paolo Sto...
10014        [Max Riemelt, Tom Schilling, Jonas Jägermeyr]
10015            [Gene Wilder, Harrison Ford, Ramon Bieri]
10016         [Sachiko Kokubu, Mansai Nomura, Hideaki Ito]
10017                  [Beanie Sigel, Noreaga, Damon Dash]
Name: cast, Length: 10018, dtype: object

In [32]:
def clean_data(x):
    """Strip spaces from text and lower-case it.

    A list is processed element by element and a new list is returned; a
    string is processed directly. Any other value (for instance a missing
    director) yields an empty string.
    """
    def _squash(text):
        # Drop the spaces, then lower-case: "Tom Hanks" -> "tomhanks".
        return str.lower(text.replace(" ", ""))

    if isinstance(x, list):
        return list(map(_squash, x))
    if isinstance(x, str):
        return _squash(x)
    # Neither a list nor a string: nothing to clean.
    return ''

# Normalise the text of every feature column with clean_data.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    cleaned = metadata[feature].map(clean_data)
    metadata[feature] = cleaned

# Show the cleaned keywords column.
metadata['keywords']

0                                     [jealousy, toy, boy]
1        [boardgame, disappearance, basedonchildren'sbook]
2              [fishing, bestfriend, duringcreditsstinger]
3        [basedonnovel, interracialrelationship, single...
4                        [baby, midlifecrisis, confidence]
                               ...                        
10013                             [magic, poverty, orphan]
10014                 [transporter, nazis, boardingschool]
10015                                                   []
10016                                                   []
10017                              [drugsmuggle, criminal]
Name: keywords, Length: 10018, dtype: object

In [33]:
def create_soup(x):
    """Build the space-separated text "soup" for one movie row.

    The row's keywords, cast, director and genres are concatenated in that
    order, with a single space between the four parts.
    """
    parts = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(parts)

# Build the 'soup' text for every row (axis='columns' passes one row at a
# time to create_soup).
soup_text = metadata.apply(create_soup, axis='columns')
metadata['soup'] = soup_text

# Show the soup column.
metadata['soup']

0        jealousy toy boy tomhanks timallen donrickles ...
1        boardgame disappearance basedonchildren'sbook ...
2        fishing bestfriend duringcreditsstinger walter...
3        basedonnovel interracialrelationship singlemot...
4        baby midlifecrisis confidence stevemartin dian...
                               ...                        
10013    magic poverty orphan emmagramatica francescogo...
10014    transporter nazis boardingschool maxriemelt to...
10015     genewilder harrisonford ramonbieri robertaldr...
10016     sachikokokubu mansainomura hideakiito yojirot...
10017    drugsmuggle criminal beaniesigel noreaga damon...
Name: soup, Length: 10018, dtype: object

In [39]:
def make_recommendation(query, metadata=metadata, top_n=10):
    """Rank movies by how similar their 'soup' text is to a free-text query.

    The query is vectorised together with every movie's soup (bag-of-words
    counts with English stop words removed) and the movies are ordered by
    cosine similarity to the query, best match first.

    Parameters
    ----------
    query : str
        Space-separated tokens to match against the soup text,
        e.g. "horror jameswan dreadful".
    metadata : pandas.DataFrame, optional
        Must provide the columns 'soup', 'title', 'imdb_id', 'runtime',
        'release_date' and 'vote_average'. The default is the module-level
        ``metadata`` object as it existed when this function was defined.
    top_n : int, optional
        Maximum number of recommendations to return (default 10). Fewer are
        returned when the frame has fewer rows.

    Returns
    -------
    list of list
        One ``[title, imdb_id, runtime, release_date, vote_average]`` entry
        per recommended movie, most similar first.
    """
    # The query is vectorised as one extra document after all the movies, so
    # its tokens are part of the fitted vocabulary. Building the corpus as a
    # plain list avoids DataFrame.append (removed in pandas 2.0) and does not
    # rely on 'soup' happening to be the last column of the frame.
    corpus = list(metadata['soup']) + [query]

    count = CountVectorizer(stop_words='english')
    count_matrix = count.fit_transform(corpus)

    # Only the query's similarities are needed: compare the last row with the
    # movie rows instead of materialising the full n x n similarity matrix.
    query_scores = cosine_similarity(count_matrix[-1:], count_matrix[:-1]).ravel()

    # Stable descending sort, so tied movies keep their original row order.
    # The query row is not among the candidates, so it can never be returned
    # and no real movie is skipped when scores tie.
    best_rows = np.argsort(-query_scores, kind='stable')[:top_n]

    result_columns = ['title', 'imdb_id', 'runtime', 'release_date', 'vote_average']
    return [
        [metadata[column].iloc[row] for column in result_columns]
        for row in best_rows
    ]


# Example query: free-text tokens to match against the movies' soup text.
query = "horror jameswan dreadful"

# Pass the frame explicitly by keyword; the last expression is displayed.
make_recommendation(query, metadata=metadata)

drugsmuggle criminal beaniesigel noreaga damondash damondash action adventure crime


[['Two Thousand Maniacs!', 'tt0058694', 87.0, '1964-03-20', 6.2],
 ['Saw', 'tt0387564', 103.0, '2004-10-01', 7.2],
 ['Ju-on: The Curse', 'tt0330500', 70.0, '2000-02-11', 7.0],
 ['House on the Edge of the Park', 'tt0080503', 91.0, '1980-11-06', 5.8],
 ['Theatre of Blood', 'tt0070791', 104.0, '1973-03-15', 6.7],
 ['Long Time Dead', 'tt0251806', 94.0, '2002-01-18', 4.5],
 ['Faces of Death: Fact or Fiction?', 'tt0223251', 54.0, '1999-01-01', 0.0],
 ['The Mesmerist', 'tt0272730', 95.0, '2002-03-11', 0.0],
 ['Faces of Death II', 'tt0085518', 85.0, '1981-11-10', 3.3],
 ['Faces of Death III', 'tt0121261', 86.0, '1985-03-10', 3.4]]