In [2]:
import pandas as pd
import ast
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer #porterstemmer?
import numpy as np

In [3]:
def getdirector( crew ):
    """
    Return the name of the first crew entry whose job is 'Director'.

    crew: iterable of mappings; each entry is read through its 'job' key
          and, for the matching entry, its 'name' key.

    Returns np.nan when no entry has job == 'Director' (including an
    empty crew).
    """
    # Lazy generator: entries are inspected in order and iteration stops at
    # the first director, exactly like an early-returning loop.
    director_names = ( member['name'] for member in crew if member['job'] == 'Director' )
    return next( director_names, np.nan )

In [4]:
def getrecommendation( movie_title, sim_matrix, df, perc = 0.80, n_candidates = 40, vote_quantile = 0.5 ):
    """
    Recommend movies similar to `movie_title`, ranked by a blend of content
    similarity and an IMDB-style weighted rating.

    Row k of `sim_matrix` must describe the movie stored at positional row k
    of `df`. Only positions are used (never index labels), so `df` does not
    need a freshly reset index.

    Parameters
    ----------
    movie_title   : value looked up in df['title']; the first match is used.
    sim_matrix    : 2-D array of pairwise similarities, one row per row of df.
    df            : DataFrame with the columns 'title', 'director', 'cast',
                    'genres', 'keywords', 'vote_count' and 'vote_average'.
    perc          : weight given to the similarity; the weighted rating gets
                    the remaining ( 1 - perc ).
    n_candidates  : how many of the most similar movies are considered
                    before the vote-count filter is applied.
    vote_quantile : quantile of the candidates' vote_count used as the
                    minimum number of votes `m`.

    Returns
    -------
    DataFrame holding the surviving candidates with two extra columns,
    'similarity' and 'score', sorted by 'score' in descending order.

    Raises
    ------
    IndexError when `movie_title` is not present in df['title'].

    imdb weighted rating:  WR = v / ( v + m ) * R  +  m / ( v + m ) * C
    v is the number of votes for the movie
    m is the minimum votes required to be listed in the chart
    R is the average rating of the movie
    C is the mean vote across the whole report (here: across the candidates)
    """
    matches = np.flatnonzero( ( df['title'] == movie_title ).values )
    if matches.size == 0:
        raise IndexError( "movie title not found in df['title']: {!r}".format( movie_title ) )
    idx = int( matches[0] )

    sims = np.asarray( sim_matrix[ idx ] ).ravel()
    # Drop the queried movie by position instead of assuming it ranks first:
    # an exact duplicate (similarity 1.0) or an all-zero content row would
    # otherwise let the movie recommend itself.
    order = sims.argsort()[::-1]
    order = order[ order != idx ][:n_candidates]

    columns = ['title','director','cast','genres','keywords','vote_count', 'vote_average']
    # .copy() so the column assignments below never write into a view of df.
    df_top = df[ columns ].iloc[ order ].copy()
    df_top['similarity'] = sims[ order ]

    m = df_top['vote_count'].dropna().quantile( vote_quantile )
    C = df_top['vote_average'].dropna().mean()

    # A missing vote_count compares False against m, so those rows are
    # dropped here together with the rows that have too few votes.
    keep = ( df_top['vote_count'] >= m ) & ( df_top['vote_average'].notnull() )
    df_top = df_top.loc[ keep ].copy()

    v = df_top['vote_count'].values
    R = df_top['vote_average'].values
    # Dividing by 10 rescales the rating before blending
    # (assumes vote_average is on a 0-10 scale -- TODO confirm).
    r = ( v * R + m * C ) / ( v + m ) / 10. * ( 1 - perc )
    df_top['score'] = df_top['similarity'].values * perc + r
    return df_top.sort_values( by='score', ascending=False )

In [5]:
# Read the metadata CSV from the movielens_small directory; the path is
# relative to the notebook's working directory.
df_md_small = pd.read_csv( filepath_or_buffer = '../movielens_small/metadata_clean.csv' )

In [6]:
# The CSV stores these columns as the text form of Python lists; turn them
# back into real lists. Values that are already lists are passed through so
# the cell can be re-run safely (ast.literal_eval rejects a list argument);
# any other non-string value still raises, as before.
df_md_small['genres'] = df_md_small['genres'].apply(
    lambda x: x if isinstance( x, list ) else ast.literal_eval( x ) )
df_md_small['cast'] = df_md_small['cast'].apply(
    lambda x: x if isinstance( x, list ) else ast.literal_eval( x ) )
df_md_small['keywords'] = df_md_small['keywords'].apply(
    lambda x: x if isinstance( x, list ) else ast.literal_eval( x ) )

In [7]:
# Collapse each director's name into one lowercase token without spaces so
# the vectorizer treats the full name as a single feature. The token is
# listed twice to give the director double weight in the word counts.
# A missing director (NaN) becomes an empty list: casting it with
# astype(str) would emit the literal token "nan", making every movie with
# an unknown director look like it shares a director with all the others.
df_md_small['director_'] = ( df_md_small['director'].
                            apply( lambda x: [] if pd.isna( x )
                                   else [ str( x ).lower().replace( " ", "") ] * 2 ) )

In [8]:
# Each cast member's name becomes one lowercase token with the spaces
# removed, so a full name counts as a single word downstream.
df_md_small['cast_'] = df_md_small['cast'].apply(
    lambda names: [ name.lower().replace( " ", "") for name in names ] )

In [9]:
# Stem every keyword, then lowercase it and strip spaces so a multi-word
# keyword becomes a single token. The stemmer is built once and reused:
# constructing a SnowballStemmer for every keyword of every row is
# loop-invariant work.
keyword_stemmer = SnowballStemmer( 'english' )
df_md_small['keywords_'] = ( df_md_small['keywords'].
                            apply( lambda x: [ keyword_stemmer.stem( i ).lower().replace( " ", "")
                                               for i in x ] ) )

In [10]:
# Concatenate the per-movie token lists (cast, director, keywords, genres)
# and flatten each combined list into one space-separated string.
df_md_small['content'] = (
    df_md_small['cast_']
    + df_md_small['director_']
    + df_md_small['keywords_']
    + df_md_small['genres']
).apply( ' '.join )

In [11]:
# Bag-of-words counts over unigrams and bigrams of the content strings.
# min_df = 1 keeps every term, exactly as min_df = 0 did (a vocabulary term
# always occurs in at least one document), but recent scikit-learn releases
# reject an integer min_df below 1 during parameter validation.
content_matrix = CountVectorizer(
    analyzer = 'word',
    ngram_range = (1, 2),
    min_df = 1,
    stop_words = 'english').fit_transform(df_md_small['content'])
# Pairwise cosine similarity between all movies; with a single argument the
# matrix is compared against itself. The result is a dense square array
# with one row and one column per movie.
cosine_sim = cosine_similarity(content_matrix)

In [12]:
#getrecommendation( 'The Godfather', cosine_sim, df_md_small ).head(15)

In [13]:
# Top 10 recommendations for 'Up' using the default similarity/rating blend.
getrecommendation( 'Up', sim_matrix = cosine_sim, df = df_md_small ).head( 10 )

Unnamed: 0,title,director,cast,genres,keywords,vote_count,vote_average,similarity,score
8873,Inside Out,Pete Docter,"[Amy Poehler, Phyllis Smith, Richard Kind, Bil...","[Drama, Comedy, Animation, Family]","[dream, cartoon, imaginary friend, animation, ...",6737,7.9,0.250706,0.355577
3798,"Monsters, Inc.",Pete Docter,"[John Goodman, Billy Crystal, Mary Gibbs, Stev...","[Animation, Comedy, Family]","[monster, infant, energy supply, company, riva...",6150,7.5,0.250706,0.348236
7243,Ice Age: Dawn of the Dinosaurs,Carlos Saldanha,"[Ray Romano, John Leguizamo, Denis Leary, Quee...","[Animation, Comedy, Family, Adventure]","[ice age, bridge, insanity, jungle, dinosaur, ...",2330,6.5,0.210599,0.298365
1641,One Hundred and One Dalmatians,Hamilton Luske,"[Rod Taylor, J. Pat O'Malley, Betty Lou Gerson...","[Adventure, Animation, Comedy, Family]","[puppy, animation, dog, dalmatian]",1643,6.8,0.200327,0.294167
6884,Horton Hears a Who!,Jimmy Hayward,"[Jim Carrey, Steve Carell, Carol Burnett, Will...","[Animation, Comedy, Family, Adventure, Fantasy]","[elephant, rescue, jungle]",927,6.3,0.206689,0.292985
543,The Aristocats,Wolfgang Reitherman,"[Phil Harris, Sterling Holloway, Scatman Croth...","[Animation, Comedy, Family, Adventure]","[paris, cat, butler, return, suspension, music...",1287,7.1,0.189189,0.288614
3212,The Emperor's New Groove,Mark Dindal,"[David Spade, John Goodman, Eartha Kitt, Patri...","[Adventure, Animation, Comedy, Family, Fantasy]","[central and south america, birthday, emperor,...",1544,7.2,0.184274,0.286526
9045,Finding Dory,Andrew Stanton,"[Ellen DeGeneres, Albert Brooks, Hayden Rolenc...","[Adventure, Animation, Comedy, Family]","[fish, amnesia, sequel, animation, talking ani...",4333,6.8,0.184274,0.282424
2776,The Road to El Dorado,Don Michael Paul,"[Kenneth Branagh, Kevin Kline, Rosie Perez, Ar...","[Adventure, Animation, Comedy, Family]","[gold, horse, sword fight]",892,7.0,0.183169,0.281625
8483,The Lego Movie,Phil Lord,"[Chris Pratt, Will Ferrell, Elizabeth Banks, W...","[Adventure, Animation, Comedy, Family, Fantasy]","[father son relationship, creativity, friendsh...",3127,7.5,0.164399,0.277393


In [14]:
# perc = 1 gives the rating term zero weight, so the surviving candidates
# are ordered by content similarity alone.
getrecommendation( 'Terminator 2: Judgment Day', sim_matrix = cosine_sim, df = df_md_small, perc = 1 ).head( 10 )

Unnamed: 0,title,director,cast,genres,keywords,vote_count,vote_average,similarity,score
990,The Terminator,James Cameron,"[Arnold Schwarzenegger, Michael Biehn, Linda H...","[Action, Thriller, Science Fiction]","[saving the world, artificial intelligence, re...",4208,7.4,0.367653,0.367653
7404,The Book of Eli,Albert Hughes,"[Denzel Washington, Gary Oldman, Mila Kunis, R...","[Action, Thriller, Science Fiction]","[book, post-apocalyptic, dystopia, faith, blind]",2207,6.6,0.284029,0.284029
6821,I Am Legend,Francis Lawrence,"[Will Smith, Alice Braga, Charlie Tahan, Salli...","[Drama, Horror, Action, Thriller, Science Fict...","[saving the world, lost civilisation, post-apo...",4977,6.9,0.240048,0.240048
953,Aliens,James Cameron,"[Sigourney Weaver, Michael Biehn, James Remar,...","[Horror, Action, Thriller, Science Fiction]","[android, extraterrestrial technology, space m...",3282,7.7,0.235294,0.235294
2951,Mad Max,George Miller,"[Mel Gibson, Joanne Samuel, Hugh Keays-Byrne, ...","[Adventure, Action, Thriller, Science Fiction]","[chain, baby, bridge, post-apocalyptic, dystop...",1235,6.6,0.222566,0.222566
7205,Terminator Salvation,McG,"[Christian Bale, Sam Worthington, Anton Yelchi...","[Action, Science Fiction, Thriller]","[saving the world, artificial intelligence, pr...",2496,5.9,0.215686,0.215686
8735,Terminator Genisys,Alan Taylor,"[Arnold Schwarzenegger, Jason Clarke, Emilia C...","[Science Fiction, Action, Thriller, Adventure]","[saving the world, artificial intelligence, cy...",3677,5.8,0.20004,0.20004
6766,Resident Evil: Extinction,Russell Mulcahy,"[Milla Jovovich, Oded Fehr, Ali Larter, Iain G...","[Horror, Action, Science Fiction]","[clone, mutant, post-apocalyptic, dystopia, co...",1308,6.1,0.196818,0.196818
4683,Terminator 3: Rise of the Machines,Jonathan Mostow,"[Arnold Schwarzenegger, Nick Stahl, Claire Dan...","[Action, Thriller, Science Fiction]","[saving the world, artificial intelligence, ma...",2177,5.9,0.196078,0.196078
7293,9,Shane Acker,"[Christopher Plummer, Martin Landau, John C. R...","[Action, Adventure, Animation, Science Fiction...","[man vs machine, hope, post-apocalyptic, dysto...",1291,6.6,0.196078,0.196078
