In [32]:
import pandas as pd
import ast
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer #porterstemmer?
import numpy as np

In [33]:
def getdirector( crew ):
    """
    Return the name of the first crew member whose job is 'Director'.

    crew: iterable of mappings, each providing a 'job' and a 'name' entry.
    Returns np.nan when no entry has the job 'Director'.
    """
    # The generator is lazy, so entries after the first director are never inspected.
    director_names = ( member['name'] for member in crew if member['job'] == 'Director' )
    return next( director_names, np.nan )

In [34]:
def getrecommendation( movie_title, sim_matrix, df, perc = 0.80, top_n = 40 ):
    """
    Recommend movies similar to `movie_title`, ranked by a blend of content
    similarity and an IMDB-style weighted rating.

    Row i of `sim_matrix` must describe row i of `df` (same ordering). The title
    is looked up by row position, so the index labels of `df` do not matter.

    Parameters
    ----------
    movie_title : value compared against df['title_tmdb']; the first match is used.
    sim_matrix  : 2-D array-like, sim_matrix[i][j] = similarity between rows i and j of df.
    df          : DataFrame with the columns title_tmdb, director, cast, genres_tmdb,
                  keywords, vote_count and vote_average.
    perc        : weight of the similarity in the final score; the rating term
                  is weighted by ( 1 - perc ).
    top_n       : number of most similar movies kept as candidates before the
                  vote-count filter is applied.

    Returns
    -------
    DataFrame of the surviving candidates with the extra columns 'similarity'
    and 'score', sorted by 'score' in descending order.

    Raises
    ------
    IndexError if no row of df has the requested title.

    imdb weighted rating, WR = v / ( v + m ) * R + m / ( v + m ) * C:
    v is the number of votes for the movie
    m is the minimum votes required to be listed (here: median vote_count of the candidates)
    R is the average rating of the movie
    C is the mean vote across the candidates
    """
    # Positional lookup: works for any index on df, not only a fresh RangeIndex.
    matches = np.flatnonzero( ( df['title_tmdb'] == movie_title ).values )
    if matches.size == 0:
        raise IndexError( "no movie titled {!r} in df['title_tmdb']".format( movie_title ) )
    idx = int( matches[0] )

    sim_row = np.asarray( sim_matrix[ idx ] ).ravel()
    ranked = sim_row.argsort()[::-1]
    # Drop the query movie explicitly; when another movie ties with it at the top
    # similarity it is not guaranteed to be ranked first.
    sim_idx = ranked[ ranked != idx ][ :top_n ]

    columns = ['title_tmdb','director','cast','genres_tmdb','keywords','vote_count', 'vote_average']
    # .copy() so that adding columns never writes into a view of the caller's frame.
    df_top = df[ columns ].iloc[ sim_idx ].copy()
    df_top['similarity'] = sim_row[ sim_idx ]

    m = df_top.loc[ df_top['vote_count'].notnull(), 'vote_count' ].quantile(0.5)
    C = df_top.loc[ df_top['vote_average'].notnull(), 'vote_average' ].mean()

    # Keep candidates with at least m votes and a known rating
    # (a missing vote_count compares False against m and is dropped as well).
    enough_votes = ( df_top['vote_count'] >= m ) & ( df_top['vote_average'].notnull() )
    df_top = df_top.loc[ enough_votes ].copy()

    v = df_top['vote_count'].values
    R = df_top['vote_average'].values
    # Dividing by 10 rescales the rating term next to the similarity
    # (assumes vote_average is on a 0-10 scale -- TODO confirm).
    r = ( v * R / ( v + m ) + m * C / ( v + m ) ) / 10. * ( 1 - perc )
    df_top['score'] = df_top['similarity'].values * perc + r
    return df_top.sort_values( by='score', ascending=False )

In [35]:
# Load the cleaned movie metadata; the path is relative to the notebook's working directory.
df_md_small = pd.read_csv('../movielens_small/metadata_clean.csv')

In [36]:
# One-element token list per movie: the director's name lower-cased with spaces
# removed, so the full name acts as a single token.
# A missing director maps to an empty list. Casting NaN to str would produce the
# literal token 'nan', shared by every movie without a director.
df_md_small['director_'] = ( df_md_small['director'].
                            apply( lambda x: [ str( x ).lower().replace( " ", "") ]
                                             if pd.notnull( x ) else [] ) ) # more weights?

In [37]:
# Parse each stringified cast list and normalise the names in a single pass:
# lower-case, spaces removed, so every actor's full name becomes one token.
df_md_small['cast_'] = df_md_small['cast'].apply(
    lambda raw: [ actor.lower().replace( " ", "") for actor in ast.literal_eval( raw ) ] )

In [38]:
# Build the stemmer once instead of constructing a new one for every keyword.
_keyword_stemmer = SnowballStemmer( 'english' )

# Parse the stringified keyword lists, stem each keyword, then lower-case it and
# remove spaces so a multi-word keyword stays a single token.
df_md_small['keywords_'] = ( df_md_small['keywords'].apply(ast.literal_eval).
                           apply( lambda x: [_keyword_stemmer.stem( i ) for i in x]).
                           apply( lambda x: [i.lower().replace( " ", "") for i in x ]) )

In [39]:
# Replace the stringified lists with real Python lists, in place.
# Only str values are parsed, so re-running this cell is harmless
# (ast.literal_eval raises when handed an already-parsed list).
for column in ( 'genres_tmdb', 'cast', 'keywords' ):
    df_md_small[column] = df_md_small[column].apply(
        lambda value: ast.literal_eval( value ) if isinstance( value, str ) else value )

In [40]:
# Combine all content features: `+` concatenates the per-movie token lists
# row by row, and the result is joined into one space-separated string.
df_md_small['content'] = (
    df_md_small['cast_']
    + df_md_small['director_']
    + df_md_small['keywords_']
    + df_md_small['genres_tmdb']
).apply( ' '.join )

In [41]:
# Bag-of-words counts over unigrams and bigrams of the combined content string.
# min_df = 1 keeps every term, exactly as min_df = 0 did: a term in the vocabulary
# always occurs in at least one document. Recent scikit-learn releases reject an
# integer min_df below 1.
content_matrix = CountVectorizer(
    analyzer = 'word',
    ngram_range = (1, 2),
    min_df = 1,
    stop_words = 'english').fit_transform(df_md_small['content'])
# Pairwise cosine similarity between all movies; row i corresponds to row i of df_md_small.
cosine_sim = cosine_similarity(content_matrix, content_matrix)

In [42]:
# Show the 15 highest-scoring recommendations for 'Pulp Fiction' (default perc = 0.80).
getrecommendation( 'Pulp Fiction', cosine_sim, df_md_small ).head(15)

Unnamed: 0,title_tmdb,director,cast,genres_tmdb,keywords,vote_count,vote_average,similarity,score
1360,Jackie Brown,Quentin Tarantino,"[Pam Grier, Samuel L. Jackson, Robert Forster,...","[Comedy, Crime, Romance]","[airport, underworld, arms deal, weapon, polic...",1580,7.3,0.123353,0.240925
8786,The Hateful Eight,Quentin Tarantino,"[Samuel L. Jackson, Kurt Russell, Jennifer Jas...","[Crime, Drama, Mystery, Western]","[bounty hunter, wyoming, mountain, narration, ...",4405,7.6,0.11291,0.240444
4708,S.W.A.T.,Clark Johnson,"[Samuel L. Jackson, Colin Farrell, Michelle Ro...","[Action, Thriller, Crime]","[liberation, transport of prisoners, special u...",780,5.8,0.153043,0.240226
3193,Unbreakable,M. Night Shyamalan,"[Bruce Willis, Samuel L. Jackson, Robin Wright...","[Science Fiction, Thriller, Drama]","[father son relationship, train accident, comi...",1994,6.9,0.11291,0.226294
4042,Changing Lanes,Roger Michell,"[Ben Affleck, Samuel L. Jackson, Kim Staunton,...","[Action, Adventure, Crime, Thriller]","[new york, custody battle, suspense, lawyer]",306,5.9,0.127536,0.222232
8005,The Raid,Gareth Evans,"[Iko Uwais, Joe Taslim, Donny Alamsyah, Yayan ...","[Action, Thriller, Crime]","[crime boss, tenement, high rise, monitor, tow...",1076,7.3,0.095646,0.217382
7020,The Spirit,Frank Miller,"[Gabriel Macht, Scarlett Johansson, Samuel L. ...","[Action, Comedy, Thriller, Crime, Science Fict...","[secret identity, robber, mask, frog, based on...",323,4.7,0.135492,0.216093
2991,Shaft,John Singleton,"[Samuel L. Jackson, Jeffrey Wright, Christian ...","[Action, Adventure, Crime, Thriller]","[corruption, black people, italo-american, bro...",316,5.5,0.122859,0.214351
8296,Fast & Furious 6,Justin Lin,"[Vin Diesel, Paul Walker, Dwayne Johnson, Jord...","[Action, Thriller, Crime]","[car race, sequel, crime, car, automobile raci...",5282,6.7,0.098683,0.212325
1615,The Negotiator,F. Gary Gray,"[Samuel L. Jackson, Kevin Spacey, David Morse,...","[Action, Adventure, Crime, Drama, Mystery, Thr...","[corruption, hostage, pension, innocence, poli...",593,6.8,0.098115,0.20994


In [43]:
#getrecommendation( 'The Godfather', cosine_sim, df_md_small ).head(15)

In [44]:
#getrecommendation( 'Batman', cosine_sim, df_md_small ).head(15)