In [2]:
import pandas as pd
import ast
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer #porterstemmer?
import numpy as np

In [2]:
def getdirector( crew ):
    """Return the name of the first crew entry whose job is 'Director'.

    crew: iterable of mappings; each entry is read through its 'job' key and,
          for the matching entry, its 'name' key.
    Returns np.nan when no entry has job == 'Director' (including empty input).
    """
    director_names = ( member['name'] for member in crew if member['job'] == 'Director' )
    # next() with a default stops at the first match and falls back to NaN.
    return next( director_names, np.nan )

In [3]:
def getrecommendation( movie_title, sim_matrix, df, perc = 0.80, n_candidates = 40, vote_quantile = 0.5 ):
    """
    Recommend movies similar to `movie_title`, re-ranked by a blend of
    similarity and an IMDB-style weighted rating.

    Rows of `df` and rows of `sim_matrix` are matched by POSITION: the i-th row
    of df must describe the same movie as sim_matrix[i]. (With a freshly
    reset index, position and index label coincide.)

    Parameters
    ----------
    movie_title   : value looked up in df['title']; the first match is used.
    sim_matrix    : 2-D array, sim_matrix[i][j] = similarity of movies i and j.
    df            : DataFrame with the columns title, director, cast, genres,
                    keywords, vote_count and vote_average.
    perc          : weight of the similarity in the final score; the weighted
                    rating (scaled by 1/10) gets the remaining 1 - perc.
    n_candidates  : how many most-similar movies are considered (default 40).
    vote_quantile : quantile of the candidates' vote_count used as the
                    minimum-votes threshold m (default 0.5, the median).

    Returns
    -------
    DataFrame of the candidates that pass the vote threshold, with the extra
    columns 'similarity' and 'score', sorted by 'score' in descending order.

    Raises
    ------
    IndexError if `movie_title` does not occur in df['title'].

    imdb weighted rating:  WR = v / (v + m) * R  +  m / (v + m) * C
    v is the number of votes for the movie
    m is the minimum votes required to be listed in the chart
    R is the average rating of the movie
    C is the mean vote across the whole report
    """
    # Positional lookup, so the result does not depend on df's index labels.
    title_hits = np.flatnonzero( ( df['title'] == movie_title ).values )
    if title_hits.size == 0:
        raise IndexError( "movie title not found in df['title']: {!r}".format( movie_title ) )
    idx = int( title_hits[0] )

    sim_row = np.asarray( sim_matrix[ idx ], dtype = float ).ravel()
    ranked = sim_row.argsort()[::-1]
    # Remove the query movie explicitly: when another movie has the same
    # similarity (e.g. identical content) the query is not guaranteed to be
    # the first entry of the ranking.
    sim_idx = ranked[ ranked != idx ][ :n_candidates ]

    cols = ['title','director','cast','genres','keywords','vote_count', 'vote_average']
    # .copy() so that the new columns are written to an independent frame.
    df_top = df[ cols ].iloc[ sim_idx ].copy()
    df_top['similarity'] = sim_row[ sim_idx ]

    # quantile() and mean() skip missing values.
    m = df_top['vote_count'].quantile( vote_quantile )
    C = df_top['vote_average'].mean()

    # A missing vote_count compares False against m, so it is filtered out here too.
    keep = ( df_top['vote_count'] >= m ) & df_top['vote_average'].notnull()
    df_top = df_top.loc[ keep ].copy()

    v = df_top['vote_count'].values.astype( float )
    R = df_top['vote_average'].values.astype( float )
    denom = v + m
    has_votes = denom > 0
    # v + m == 0 (no votes and a zero threshold) would be 0/0; fall back to the
    # candidates' mean rating C for those rows.
    weighted = np.where( has_votes, ( v * R + m * C ) / np.where( has_votes, denom, 1.0 ), C )

    # vote_average is divided by 10 to bring it to the scale of the similarity.
    r = weighted / 10. * ( 1 - perc )
    sim_ = df_top['similarity'].values * perc
    df_top['score'] = sim_ + r
    return df_top.sort_values( by='score', ascending=False )

In [3]:
# Folder holding the four csv files; change it here if the data lives elsewhere.
DATA_DIR = '../movielens_small'

# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning
# (the leftover output under this cell looks like the tail of that warning).
df_md = pd.read_csv( DATA_DIR + '/movies_metadata.csv', low_memory = False )
df_link = pd.read_csv( DATA_DIR + '/links_small.csv', low_memory = False )
df_credits = pd.read_csv( DATA_DIR + '/credits.csv', low_memory = False )
df_keywords = pd.read_csv( DATA_DIR + '/keywords.csv', low_memory = False )

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Sanity check: (rows, columns) of the raw metadata frame right after loading.
df_md.shape

(45466, 24)

In [5]:
# Reduce the links table to an integer Series of its non-missing 'tmdbId' values.
# NOTE: df_link is rebound from the loaded table to that Series, so this cell
# expects the freshly loaded table as input.
df_link = df_link['tmdbId'].dropna().astype('int')

In [6]:
# `genres` is stored as the text of a Python list of dicts. Parse it and keep
# only each entry's 'name'. Missing cells are treated as '[]'; anything that
# does not parse to a list becomes an empty list.
df_md['genres'] = (
    df_md['genres']
    .fillna('[]')
    .map(ast.literal_eval)
    .map(lambda parsed: [entry['name'] for entry in parsed] if isinstance(parsed, list) else [])
)

In [7]:
# Rows with invalid input in cols id & imdb_id (row labels of the raw csv).
# errors='ignore' keeps the cell re-runnable: once the rows are gone, running
# it again is a no-op instead of raising KeyError.
df_md = df_md.drop( [19730, 29503, 35587], errors = 'ignore' )

In [8]:
# Bring the join key `id` to an integer dtype in all three tables so the
# merges on 'id' compare like with like.
for _frame in ( df_md, df_keywords, df_credits ):
    _frame['id'] = _frame['id'].astype('int')

In [9]:
# Keep only the movies whose id appears in df_link, then attach the credits
# and keywords columns through inner joins on 'id'.
in_small_set = df_md['id'].isin( df_link )
df_md_small = df_md.loc[ in_small_set ]
df_md_small = pd.merge( df_md_small, df_credits, on = 'id' )
df_md_small = pd.merge( df_md_small, df_keywords, on = 'id' )
# reset_index() without drop=True keeps the previous index as an 'index' column.
df_md_small = df_md_small.reset_index()

In [10]:
# crew -> director name. getdirector returns NaN when there is no director,
# and astype('str') turns that NaN into the text 'nan'.
df_md_small['director'] = (
    df_md_small['crew']
    .map(ast.literal_eval)
    .map(getdirector)
    .astype('str')
)

# cast -> names of at most the first three listed cast entries.
df_md_small['cast'] = (
    df_md_small['cast']
    .map(ast.literal_eval)
    .map(lambda people: [person['name'] for person in people][:3] if isinstance(people, list) else [])
)

# keywords -> list of keyword names.
df_md_small['keywords'] = (
    df_md_small['keywords']
    .map(ast.literal_eval)
    .map(lambda tags: [tag['name'] for tag in tags] if isinstance(tags, list) else [])
)

In [19]:
# Preview of the merged frame. The captured output below already lists the
# director_, cast_, keywords_ and content columns, i.e. it was produced after
# the feature cells further down had been run (execution count 19).
df_md_small.head()

Unnamed: 0,index,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,vote_average,vote_count,cast,crew,keywords,director,director_,cast_,keywords_,content
0,0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,...,7.7,5415.0,"[Tom Hanks, Tim Allen, Don Rickles]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[jealousy, toy, boy, friendship, friends, riva...",John Lasseter,[johnlasseter],"[tomhanks, timallen, donrickles]","[jealousi, toy, boy, friendship, friend, rival...",tomhanks timallen donrickles johnlasseter jeal...
1,1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,...,6.9,2413.0,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[board game, disappearance, based on children'...",Joe Johnston,[joejohnston],"[robinwilliams, jonathanhyde, kirstendunst]","[boardgam, disappear, basedonchildren'sbook, n...",robinwilliams jonathanhyde kirstendunst joejoh...
2,2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,...,6.5,92.0,"[Walter Matthau, Jack Lemmon, Ann-Margret]","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[fishing, best friend, duringcreditsstinger, o...",Howard Deutch,[howarddeutch],"[waltermatthau, jacklemmon, ann-margret]","[fish, bestfriend, duringcreditssting, oldmen]",waltermatthau jacklemmon ann-margret howarddeu...
3,3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,...,6.1,34.0,"[Whitney Houston, Angela Bassett, Loretta Devine]","[{'credit_id': '52fe44779251416c91011acb', 'de...","[based on novel, interracial relationship, sin...",Forest Whitaker,[forestwhitaker],"[whitneyhouston, angelabassett, lorettadevine]","[basedonnovel, interracialrelationship, single...",whitneyhouston angelabassett lorettadevine for...
4,4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,...,5.7,173.0,"[Steve Martin, Diane Keaton, Martin Short]","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[baby, midlife crisis, confidence, aging, daug...",Charles Shyer,[charlesshyer],"[stevemartin, dianekeaton, martinshort]","[babi, midlifecrisi, confid, age, daughter, mo...",stevemartin dianekeaton martinshort charlesshy...


In [11]:
# One token per movie: the director's name, lower-cased with spaces removed,
# wrapped in a list so it can be concatenated with the other token lists.
# The `director` column was cast with astype('str'), so a missing director
# shows up as the literal text 'nan'. Emitting that as a token would make all
# director-less movies look like they share a director, so those rows get an
# empty token list instead.
df_md_small['director_'] = ( df_md_small['director'].
                           apply( lambda x: [] if ( not isinstance( x, str ) or x == 'nan' )
                                  else [ x.lower().replace( " ", "" ) ] ) ) # more weights?

In [12]:
# Cast names as single tokens: lower-cased, spaces removed ('Tom Hanks' -> 'tomhanks').
df_md_small['cast_'] = df_md_small['cast'].map(
    lambda names: [ name.lower().replace( " ", "" ) for name in names ] )

In [13]:
# Build the stemmer once instead of once per keyword; the stemming result is the same.
keyword_stemmer = SnowballStemmer( 'english' )

# Stem each keyword, then lower-case it and remove spaces so that a
# multi-word keyword becomes a single token.
df_md_small['keywords_'] = ( df_md_small['keywords'].
                           apply( lambda x: [ keyword_stemmer.stem( i ).lower().replace( " ", "" ) for i in x ] ) )

In [14]:
# Combine all content features: concatenate the per-movie token lists
# (cast, director, keywords, genres) row by row ...
content_tokens = ( df_md_small['cast_']
                   + df_md_small['director_']
                   + df_md_small['keywords_']
                   + df_md_small['genres'] )
# ... and join each list into one space-separated string for the vectorizer.
df_md_small['content'] = content_tokens.map( ' '.join )

In [15]:
# Bag-of-words counts over the combined `content` strings, using unigrams and
# bigrams and dropping English stop words.
# min_df = 1 keeps every term (any vocabulary term occurs in at least one
# document), i.e. the same vocabulary as an integer min_df of 0, which recent
# scikit-learn releases reject during parameter validation.
content_matrix = CountVectorizer(
    analyzer = 'word',
    ngram_range = (1, 2),
    min_df = 1,
    stop_words = 'english').fit_transform(df_md_small['content'])

# Pairwise cosine similarity between all movies; with a single argument the
# matrix is compared against itself.
cosine_sim = cosine_similarity(content_matrix)

In [16]:
# Example: the 15 best-scoring recommendations for 'Pulp Fiction'.
getrecommendation( movie_title = 'Pulp Fiction',
                   sim_matrix = cosine_sim,
                   df = df_md_small ).head( 15 )

Unnamed: 0,title,director,cast,genres,keywords,vote_count,vote_average,similarity,score
6939,Cleaner,Renny Harlin,"[Samuel L. Jackson, Ed Harris, Eva Mendes]","[Crime, Thriller, Mystery]",[cover-up],201.0,5.9,0.17324,0.258104
4595,Basic,John McTiernan,"[John Travolta, Connie Nielsen, Samuel L. Jack...","[Action, Drama, Mystery, Thriller, Crime]","[drug addiction, military court, panama, milit...",290.0,6.2,0.164399,0.254774
1381,Jackie Brown,Quentin Tarantino,"[Pam Grier, Samuel L. Jackson, Robert Forster]","[Comedy, Crime, Romance]","[airport, underworld, arms deal, weapon, polic...",1580.0,7.3,0.137464,0.254007
4764,S.W.A.T.,Clark Johnson,"[Samuel L. Jackson, Colin Farrell, Michelle Ro...","[Action, Thriller, Crime]","[liberation, transport of prisoners, special u...",780.0,5.8,0.171429,0.254005
8905,The Hateful Eight,Quentin Tarantino,"[Samuel L. Jackson, Kurt Russell, Jennifer Jas...","[Crime, Drama, Mystery, Western]","[bounty hunter, wyoming, mountain, narration, ...",4405.0,7.6,0.124341,0.250546
4306,The 51st State,Ronny Yu,"[Samuel L. Jackson, Robert Carlyle, Emily Mort...","[Thriller, Action, Comedy, Crime]","[chemical, laxative, skinheads]",173.0,5.9,0.148939,0.238799
4084,Changing Lanes,Roger Michell,"[Ben Affleck, Samuel L. Jackson, Kim Staunton]","[Action, Adventure, Crime, Thriller]","[new york, custody battle, suspense, lawyer]",306.0,5.9,0.142857,0.233441
7673,Animal Kingdom,David Michôd,"[James Frecheville, Ben Mendelsohn, Joel Edger...","[Drama, Thriller, Crime]",[],240.0,6.7,0.118864,0.22458
7104,The Spirit,Frank Miller,"[Gabriel Macht, Scarlett Johansson, Samuel L. ...","[Action, Comedy, Thriller, Crime, Science Fict...","[secret identity, robber, mask, frog, based on...",323.0,4.7,0.149209,0.221702
8408,Fast & Furious 6,Justin Lin,"[Vin Diesel, Paul Walker, Dwayne Johnson]","[Action, Thriller, Crime]","[car race, sequel, crime, car, automobile raci...",5282.0,6.7,0.109971,0.221662


In [17]:
#getrecommendation( 'The Godfather', cosine_sim, df_md_small ).head(15)

In [18]:
#getrecommendation( 'Batman', cosine_sim, df_md_small ).head(15)