In [116]:
import pandas as pd
import ast
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer #porterstemmer?
import numpy as np

In [117]:
def getdirector( crew ):
    """
    Return the 'name' of the first entry in `crew` whose 'job' is 'Director'.

    `crew` is iterated as a sequence of dict-like records that are indexed
    with the keys 'job' and 'name'. When no entry has the job 'Director'
    (including an empty `crew`), np.nan is returned instead.
    """
    director_names = ( member['name'] for member in crew if member['job'] == 'Director' )
    # next() with a default yields the first match or falls back to NaN.
    return next( director_names, np.nan )

In [166]:
def getrecommendation( movie_title, sim_matrix, df, perc = 0.80, n_candidates = 40 ):
    """
    Recommend movies similar to `movie_title`, ranked by a blend of content
    similarity and a vote-based weighted rating.

    Parameters
    ----------
    movie_title : str
        Title looked up in df['title']; the first matching row is used.
    sim_matrix : 2-D array-like
        Pairwise similarity matrix. Row/column i must describe the movie in
        the i-th row (by position) of `df`.
    df : pandas.DataFrame
        Must contain the columns 'title', 'director', 'cast', 'genres',
        'keywords', 'vote_count' and 'vote_average'.
    perc : float, default 0.80
        Weight of the similarity in the final score; the weighted rating
        (scaled by 1/10) gets the remaining ( 1 - perc ).
    n_candidates : int, default 40
        Number of most similar movies considered before vote filtering.

    Returns
    -------
    pandas.DataFrame
        The candidates whose vote_count is at least the candidates' median,
        with added 'similarity' and 'score' columns, sorted by 'score'
        in descending order.

    Raises
    ------
    IndexError
        If `movie_title` does not occur in df['title'].

    Weighted rating:  v / (v + m) * R  +  m / (v + m) * C
    v is the number of votes for the movie
    m is the minimum votes required to be listed (median of the candidates)
    R is the average rating of the movie
    C is the mean vote across the candidates
    """
    # Locate the movie by POSITION so the lookup stays aligned with sim_matrix
    # even when df does not carry a fresh 0..n-1 index.
    matches = np.flatnonzero( ( df['title'] == movie_title ).values )
    if matches.size == 0:
        raise IndexError( "title {!r} not found in df['title']".format( movie_title ) )
    idx = int( matches[0] )

    sim_row = np.asarray( sim_matrix[ idx ] ).ravel()
    order = sim_row.argsort()[::-1]
    # Drop the query movie explicitly: with ties (e.g. a duplicate entry with
    # similarity 1.0) it is not guaranteed to be the first element.
    sim_idx = order[ order != idx ][ :n_candidates ]

    columns = ['title','director','cast','genres','keywords','vote_count', 'vote_average']
    # .copy() so that the new columns are written to an independent frame.
    df_top = df[ columns ].iloc[ sim_idx ].copy()
    df_top['similarity'] = sim_row[ sim_idx ]

    # quantile() and mean() skip missing values.
    m = df_top['vote_count'].quantile(0.5)
    C = df_top['vote_average'].mean()

    # A missing vote_count never satisfies >= m, so only vote_average needs
    # an explicit null check.
    keep = ( df_top['vote_count'] >= m ) & df_top['vote_average'].notnull()
    df_top = df_top.loc[ keep ].copy()

    v = df_top['vote_count'].values
    R = df_top['vote_average'].values
    # vote_average is divided by 10 to bring it to the similarity's scale.
    r = ( v * R / ( v + m ) + m * C / ( v + m ) ) / 10. * ( 1 - perc )
    sim_ = df_top['similarity'].values * perc
    df_top['score'] = sim_ + r
    return df_top.sort_values( by='score', ascending=False )

In [119]:
# Read the four CSV inputs (movie metadata, link table, credits, keywords)
# in a fixed order and bind each one to its own frame.
df_md, df_link, df_credits, df_keywords = (
    pd.read_csv( csv_path )
    for csv_path in (
        '../movielens_small/movies_metadata.csv',
        '../movielens_small/links_small.csv',
        '../movielens_small/credits.csv',
        '../movielens_small/keywords.csv',
    )
)

  interactivity=interactivity, compiler=compiler, result=result)


In [120]:
# Reduce the link table to its non-missing 'tmdbId' values as an integer Series.
# NOTE: this rebinds df_link from a DataFrame to a Series, so the cell cannot be re-run as is.
df_link = df_link['tmdbId'].dropna().astype('int')

In [121]:
# Turn the stringified genre records into plain lists of genre names.
# Missing entries are treated as '[]'; anything that does not parse to a list
# becomes an empty list.
df_md['genres'] = (
    df_md['genres']
    .fillna('[]')
    .apply(ast.literal_eval)
    .apply( lambda parsed: [genre['name'] for genre in parsed] if isinstance(parsed, list) else [] )
)

In [122]:
# Remove the three rows (by index label) that hold invalid input in the
# 'id' and 'imdb_id' columns.
df_md = df_md.drop( labels=[19730, 29503, 35587], axis=0 )

In [123]:
# Give the 'id' key the same integer dtype in all three frames.
df_md['id'] = df_md['id'].astype(int)
df_keywords['id'] = df_keywords['id'].astype(int)
df_credits['id'] = df_credits['id'].astype(int)

In [124]:
# Keep only the movies whose id appears in df_link, attach credits and
# keywords via inner joins on 'id', and renumber the rows (the previous index
# is kept as an 'index' column).
df_md_small = (
    df_md.loc[ df_md['id'].isin(df_link) ]
    .merge(df_credits, how='inner', on='id')
    .merge(df_keywords, how='inner', on='id')
    .reset_index()
)

In [125]:
# Director: parse the stringified crew records and take the first 'Director'.
# Movies without a director get an empty string. Casting the NaN to str would
# produce the literal token 'nan', which would later act as a director shared
# by every director-less movie in the bag-of-words content.
df_md_small['director'] = ( df_md_small['crew'].
                           apply(ast.literal_eval).
                           apply(getdirector).
                           fillna('').
                           astype('str') )
# Cast: names of the first three listed cast members (fewer if the list is
# shorter; slicing handles that without a length check).
df_md_small['cast'] = ( df_md_small['cast'].
                       apply(ast.literal_eval).
                       apply( lambda x: [i['name'] for i in x] if isinstance(x, list) else []).
                       apply( lambda x: x[:3] ) )
# Keywords: plain list of keyword names.
df_md_small['keywords'] = ( df_md_small['keywords'].
                           apply(ast.literal_eval).
                           apply( lambda x: [i['name'] for i in x] if isinstance(x, list) else []) )

In [126]:
# Collapse each director name into one lower-case token without spaces and
# wrap it in a list so it can be concatenated with the other token lists.
# TODO: give the director more weight?
df_md_small['director_'] = df_md_small['director'].apply(
    lambda name: [ name.lower().replace( " ", "" ) ] )

In [127]:
# One lower-case, space-free token per cast member.
df_md_small['cast_'] = df_md_small['cast'].apply(
    lambda names: [ actor.lower().replace( " ", "" ) for actor in names ] )

In [128]:
# Stem every keyword, then lower-case it and strip spaces so that each keyword
# becomes a single token.
# The stemmer is built once and bound as a lambda default instead of being
# re-created for every single keyword; the resulting tokens are unchanged.
df_md_small['keywords_'] = ( df_md_small['keywords'].
                           apply( lambda x, stem=SnowballStemmer( 'english' ).stem:
                                  [stem( i ).lower().replace( " ", "") for i in x ]) )

In [129]:
# Combine all content features: concatenate the per-movie token lists
# (cast, director, keywords, genres) and join them into one
# space-separated string per movie.
df_md_small['content'] = (
    df_md_small['cast_']
    + df_md_small['director_']
    + df_md_small['keywords_']
    + df_md_small['genres']
).apply( ' '.join )

In [130]:
# Bag-of-words counts over unigrams and bigrams of the content strings,
# followed by the pairwise cosine similarity between all movies.
# min_df = 1 (keep every term that occurs in at least one document) selects
# the same vocabulary as the former integer 0, which recent scikit-learn
# versions reject during parameter validation.
content_matrix = CountVectorizer(
    analyzer = 'word',
    ngram_range = (1, 2),
    min_df = 1,
    stop_words = 'english').fit_transform(df_md_small['content'])
cosine_sim = cosine_similarity(content_matrix, content_matrix)

In [167]:
# Show up to 15 of the best-scoring recommendations for 'The Godfather'.
getrecommendation(
    movie_title = 'The Godfather',
    sim_matrix = cosine_sim,
    df = df_md_small,
).head(15)

Unnamed: 0,title,director,cast,genres,keywords,vote_count,vote_average,similarity,score
994,The Godfather: Part II,Francis Ford Coppola,"[Al Pacino, Robert Duvall, Diane Keaton]","[Drama, Crime]","[italo-american, cuba, vororte, melancholy, pr...",3418.0,8.3,0.222308,0.342607
1602,The Godfather: Part III,Francis Ford Coppola,"[Al Pacino, Diane Keaton, Andy García]","[Crime, Drama, Thriller]","[italy, christianity, new york, assassination,...",1589.0,7.1,0.19452,0.296687
284,The Shawshank Redemption,Frank Darabont,"[Tim Robbins, Morgan Freeman, Bob Gunton]","[Drama, Crime]","[prison, corruption, police brutality, prison ...",8358.0,8.5,0.151794,0.290863
1346,The Rainmaker,Francis Ford Coppola,"[Matt Damon, Danny DeVito, Jon Voight]","[Drama, Crime, Thriller]","[jurors, proof, court case, leukemia, lawyer, ...",239.0,6.7,0.169031,0.26739
986,GoodFellas,Martin Scorsese,"[Robert De Niro, Ray Liotta, Joe Pesci]","[Drama, Crime]","[prison, based on novel, florida, 1970s, mass ...",3211.0,8.2,0.116091,0.255626
2808,Midnight Express,Alan Parker,"[Brad Davis, Irene Miracle, Bo Hopkins]","[Drama, Crime]","[prison, drug smuggle, attempt to escape, esca...",309.0,7.6,0.135225,0.253718
2742,...And Justice for All,Norman Jewison,"[Al Pacino, Jack Warden, John Forsythe]","[Crime, Drama, Mystery, Thriller]","[suspense, lawyer, courtroom, extramarital aff...",118.0,7.1,0.140981,0.248029
1204,Donnie Brasco,Mike Newell,"[Johnny Depp, Al Pacino, Michael Madsen]","[Crime, Drama, Thriller]","[undercover, colombia, mafia, mobster, dirty c...",1175.0,7.4,0.117698,0.24039
154,Kids,Larry Clark,"[Leo Fitzpatrick, Rosario Dawson, Chloë Sevigny]","[Drama, Crime]","[puberty, first time]",280.0,6.8,0.130931,0.238512
3003,Serpico,Sidney Lumet,"[Al Pacino, John Randolph Jones, Jack Kehoe]","[Crime, Drama, History]","[corruption, hippie, money]",429.0,7.5,0.116335,0.238455


In [168]:
# Show up to 15 of the best-scoring recommendations for 'Batman'.
getrecommendation(
    movie_title = 'Batman',
    sim_matrix = cosine_sim,
    df = df_md_small,
).head(15)

Unnamed: 0,title,director,cast,genres,keywords,vote_count,vote_average,similarity,score
1260,Batman & Robin,Joel Schumacher,"[George Clooney, Chris O'Donnell, Arnold Schwa...","[Action, Crime, Fantasy]","[double life, dc comics, dual identity, crime ...",1447.0,4.2,0.484881,0.485865
1134,Batman Returns,Tim Burton,"[Michael Keaton, Danny DeVito, Michelle Pfeiffer]","[Action, Fantasy]","[holiday, corruption, double life, dc comics, ...",1706.0,6.6,0.291807,0.362374
6981,The Dark Knight,Christopher Nolan,"[Christian Bale, Michael Caine, Heath Ledger]","[Drama, Action, Crime, Thriller]","[dc comics, crime fighter, secret identity, sc...",12269.0,8.3,0.153043,0.285708
6521,Superman Returns,Bryan Singer,"[Brandon Routh, Kevin Spacey, Kate Bosworth]","[Adventure, Fantasy, Action, Science Fiction]","[saving the world, dc comics, invulnerability,...",1429.0,5.4,0.183169,0.259846
2131,Superman,Richard Donner,"[Christopher Reeve, Marlon Brando, Margot Kidder]","[Action, Adventure, Fantasy, Science Fiction]","[saving the world, journalist, dc comics, crim...",1042.0,6.9,0.156015,0.256001
8419,Man of Steel,Zack Snyder,"[Henry Cavill, Amy Adams, Michael Shannon]","[Action, Adventure, Fantasy, Science Fiction]","[saving the world, dc comics, superhero, based...",6462.0,6.5,0.156941,0.254713
6218,Batman Begins,Christopher Nolan,"[Christian Bale, Michael Caine, Liam Neeson]","[Action, Crime, Drama]","[himalaya, martial arts, dc comics, crime figh...",7511.0,7.5,0.134131,0.254598
9024,Batman v Superman: Dawn of Justice,Zack Snyder,"[Ben Affleck, Henry Cavill, Gal Gadot]","[Action, Adventure, Fantasy]","[dc comics, vigilante, superhero, based on com...",7189.0,5.7,0.166759,0.248284
8031,The Dark Knight Rises,Christopher Nolan,"[Christian Bale, Michael Caine, Gary Oldman]","[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...",9263.0,7.6,0.12298,0.247985
4119,Spider-Man,Sam Raimi,"[Tobey Maguire, Willem Dafoe, Kirsten Dunst]","[Fantasy, Action]","[loss of lover, spider, thanksgiving, bad boss...",5398.0,6.8,0.129302,0.237665
