In [15]:
import pandas as pd
import ast
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer #porterstemmer?
import numpy as np

In [16]:
def getdirector( crew ):
    """
    Return the name of the first crew entry whose 'job' is 'Director'.

    crew: iterable of dict-like entries, each read via its 'job' key and,
          for the matching entry, its 'name' key.
    Returns np.nan when no entry has the job 'Director'.
    """
    director_names = ( member['name'] for member in crew if member['job'] == 'Director' )
    # next() stops at the first match, exactly like an early return inside a loop.
    return next( director_names, np.nan )

In [17]:
def getrecommendation( movie_title, sim_matrix, df, perc = 0.80, n_candidates = 40 ):
    """
    Recommend movies similar to `movie_title`, ranked by a blend of content
    similarity and an IMDB-style weighted rating.

    Parameters
    ----------
    movie_title  : title to look up in df['title']; the first match is used.
    sim_matrix   : 2-D array of pairwise similarities. Row/column i must describe
                   the movie stored at *position* i of `df` (positions, not index
                   labels, are used, so `df` does not need a freshly reset index).
    df           : data frame with the columns 'title', 'director', 'cast', 'genres',
                   'keywords', 'vote_count' and 'vote_average'.
    perc         : weight of the similarity in the final score; the weighted rating
                   gets the remaining ( 1 - perc ).
    n_candidates : how many of the most similar movies are considered (default 40).

    Scoring
    -------
    imdb weighted rating, computed over the candidates only:
        v is the number of votes for the movie
        m is the minimum votes required to be kept (median vote_count of the candidates)
        R is the average rating of the movie
        C is the mean vote across the candidates
        WR = v / ( v + m ) * R + m / ( v + m ) * C
    score = perc * similarity + ( 1 - perc ) * WR / 10
    (the division by 10 presumes vote_average is on a 0-10 scale - confirm for other data)

    Returns
    -------
    The candidates with vote_count >= m and non-null votes, with added
    'similarity' and 'score' columns, sorted by 'score' in descending order.

    Raises
    ------
    IndexError if `movie_title` is not present in df['title'].
    """
    # Positional lookup; np.asscalar was removed from NumPy, so convert with int().
    matches = np.flatnonzero( ( df['title'] == movie_title ).values )
    if matches.size == 0:
        raise IndexError( "movie title not found in df['title']: {!r}".format( movie_title ) )
    pos = int( matches[0] )

    sim_row = np.asarray( sim_matrix[ pos ] ).ravel()
    ranked = sim_row.argsort()[::-1]
    # Drop the query movie explicitly instead of assuming it is ranked first:
    # with ties (duplicate content, all-zero rows) it can land anywhere in the order.
    sim_idx = ranked[ ranked != pos ][:n_candidates]

    # .copy() so the column assignment below never writes into a view of `df`.
    df_top = df[ ['title','director','cast','genres','keywords','vote_count', 'vote_average'] ].iloc[ sim_idx ].copy()
    # Read the similarities through the same indices so rows and values cannot drift apart.
    df_top['similarity'] = sim_row[ sim_idx ]

    m = df_top.loc[ df_top['vote_count'].notnull(), 'vote_count' ].quantile(0.5)
    C = df_top.loc[ df_top['vote_average'].notnull(), 'vote_average' ].mean()

    keep = ( ( df_top['vote_count'] >= m ) &
             ( df_top['vote_count'].notnull() ) &
             ( df_top['vote_average'].notnull() ) )
    df_top = df_top.loc[ keep ]

    v = df_top['vote_count'].values
    R = df_top['vote_average'].values
    r = ( v * R / ( v + m ) + m * C / ( v + m ) ) / 10. * ( 1 - perc )
    sim_ = df_top['similarity'].values * perc
    score = sim_ + r

    # assign() returns a new frame, avoiding a write into the filtered slice.
    return( df_top.assign( score = score ).sort_values( by='score', ascending=False ) )

In [18]:
# Load the movie metadata table; the path is relative to the notebook's working directory.
# Later cells parse its 'genres', 'cast' and 'keywords' columns and read 'director'.
df_md_small = pd.read_csv('../movielens_small/metadata_clean.csv')

In [19]:
def _parse_list_literal( value ):
    """
    Turn the string form of a Python literal (e.g. "['a', 'b']") back into the object.
    Values that are not strings are returned unchanged, so re-running this cell on
    already-parsed columns is a no-op instead of an error.
    """
    return ast.literal_eval( value ) if isinstance( value, str ) else value

# These columns hold Python literals serialized as text in the CSV.
for list_column in ( 'genres', 'cast', 'keywords' ):
    df_md_small[list_column] = df_md_small[list_column].apply( _parse_list_literal )

In [20]:
def _director_tokens( name, weight = 2 ):
    """
    Build the director feature for one movie: the lower-cased name with spaces
    removed, repeated `weight` times so the director counts more than a single
    cast member or keyword. A missing director yields no tokens; casting NaN to
    str would otherwise create a shared 'nan' token and make all director-less
    movies look alike.
    """
    if pd.isnull( name ):
        return []
    return [ str( name ).lower().replace( " ", "") ] * weight

df_md_small['director_'] = df_md_small['director'].apply( _director_tokens ) # more weights?

In [21]:
# Normalize each cast member to a single lower-case token with the spaces removed,
# so a full name is treated as one word further down.
df_md_small['cast_'] = df_md_small['cast'].map(
    lambda members: [ member.lower().replace( " ", "") for member in members ] )

In [22]:
# Build the stemmer once and reuse it; constructing a SnowballStemmer per keyword
# is loop-invariant work.
keyword_stemmer = SnowballStemmer( 'english' )

# Stem each keyword, then lower-case it and remove spaces so it becomes one token.
df_md_small['keywords_'] = ( df_md_small['keywords'].
                            apply( lambda x: [keyword_stemmer.stem( i ).lower().replace( " ", "") for i in x] ) )

In [23]:
# Combine all content features: concatenate the per-movie token lists
# (cast, director, keywords, genres, in that order) ...
content_tokens = ( df_md_small['cast_'] +
                   df_md_small['director_'] +
                   df_md_small['keywords_'] +
                   df_md_small['genres'] )
# ... and flatten each list into one space-separated string.
df_md_small['content'] = content_tokens.apply( ' '.join )

In [24]:
# Bag-of-words counts over unigrams and bigrams of the combined content string.
# min_df = 1 keeps every term, exactly as the former min_df = 0 did (a vocabulary term
# always occurs in at least one document), but recent scikit-learn versions reject an
# integer min_df below 1 during parameter validation.
content_matrix = CountVectorizer(
    analyzer = 'word',
    ngram_range = (1, 2),
    min_df = 1,
    stop_words = 'english').fit_transform(df_md_small['content'])
# Dense movie-by-movie cosine similarity; row i corresponds to row i of df_md_small.
cosine_sim = cosine_similarity(content_matrix, content_matrix)

In [25]:
#getrecommendation( 'The Godfather', cosine_sim, df_md_small ).head(15)

In [26]:
#getrecommendation( 'Batman', cosine_sim, df_md_small ).head(15)

In [27]:
# perc = 1 puts all the weight on similarity: the rating term is multiplied by ( 1 - perc ) = 0.
getrecommendation( 'Pulp Fiction', cosine_sim, df_md_small, perc = 1 ).head(15)

Unnamed: 0,title,director,cast,genres,keywords,vote_count,vote_average,similarity,score
1360,Jackie Brown,Quentin Tarantino,"[Pam Grier, Samuel L. Jackson, Robert Forster,...","[Comedy, Crime, Romance]","[airport, underworld, arms deal, weapon, polic...",1580,7.3,0.201498,0.201498
8786,The Hateful Eight,Quentin Tarantino,"[Samuel L. Jackson, Kurt Russell, Jennifer Jas...","[Crime, Drama, Mystery, Western]","[bounty hunter, wyoming, mountain, narration, ...",4405,7.6,0.186171,0.186171
5144,Kill Bill: Vol. 2,Quentin Tarantino,"[Uma Thurman, David Carradine, Daryl Hannah, M...","[Action, Crime, Thriller]","[brother brother relationship, swordplay, kata...",4061,7.7,0.161591,0.161591
4708,S.W.A.T.,Clark Johnson,"[Samuel L. Jackson, Colin Farrell, Michelle Ro...","[Action, Thriller, Crime]","[liberation, transport of prisoners, special u...",780,5.8,0.138343,0.138343
8198,Django Unchained,Quentin Tarantino,"[Jamie Foxx, Christoph Waltz, Leonardo DiCapri...","[Drama, Western]","[bounty hunter, hero, plantation, society, fri...",10297,7.8,0.1335,0.1335
4847,Kill Bill: Vol. 1,Quentin Tarantino,"[Uma Thurman, Lucy Liu, Vivica A. Fox, Daryl H...","[Action, Crime]","[japan, coma, martial arts, kung fu, underworl...",5091,7.7,0.12983,0.12983
7020,The Spirit,Frank Miller,"[Gabriel Macht, Scarlett Johansson, Samuel L. ...","[Action, Comedy, Thriller, Crime, Science Fict...","[secret identity, robber, mask, frog, based on...",323,4.7,0.124114,0.124114
877,Reservoir Dogs,Quentin Tarantino,"[Harvey Keitel, Tim Roth, Michael Madsen, Chri...","[Crime, Thriller]","[traitor, jewelry, psychopath, thief, heist, b...",3821,8.1,0.118712,0.118712
3193,Unbreakable,M. Night Shyamalan,"[Bruce Willis, Samuel L. Jackson, Robin Wright...","[Science Fiction, Thriller, Drama]","[father son relationship, train accident, comi...",1994,6.9,0.103429,0.103429
8446,Oldboy,Spike Lee,"[Josh Brolin, Elizabeth Olsen, Samuel L. Jacks...","[Drama, Thriller, Mystery, Action]","[imprisonment, remake of korean film]",632,5.9,0.098384,0.098384
