In [21]:
import pandas as pd
import ast
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer #porterstemmer?
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
import numpy as np
import re

In [22]:
def getdirector( crew ):
    for i in crew:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [23]:
def getrecommendations(title, cosine_sim, df ):
    titles = df['title']
    indices = pd.Series(df.index, index=df['title'])
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [24]:
def getrecommendation( movie_title, sim_matrix, df ):
    idx = np.asscalar( 
        df.loc[ df['title'] == movie_title ].index.values )
    sim_idx = sim_matrix[ idx ].argsort()[::-1][1:21].flatten().tolist()
    df_top = df[ ['title', 'genres', 'keywords', 'director', 'cast'] ].iloc[ sim_idx ]
    return( df_top )
    

In [25]:
df_md = pd. read_csv('../movielens_small/movies_metadata.csv')
df_link = pd.read_csv('../movielens_small/links_small.csv')
df_credits = pd.read_csv( '../movielens_small/credits.csv' )
df_keywords = pd.read_csv( '../movielens_small/keywords.csv' )

  interactivity=interactivity, compiler=compiler, result=result)


In [26]:
df_link = df_link[df_link['tmdbId'].notnull()]['tmdbId'].astype('int')

In [27]:
df_md['genres'] = ( df_md['genres'].fillna('[]').
                   apply(ast.literal_eval).
                   apply( lambda x: [i['name'] for i in x] if isinstance(x, list) else []) )

In [28]:
df_md = df_md.drop([19730, 29503, 35587]) # invalid input in cols id & imdb_id

In [29]:
df_md['id'] = df_md['id'].astype('int')
df_keywords['id'] = df_keywords['id'].astype('int')
df_credits['id'] = df_credits['id'].astype('int')

In [30]:
df_md_small = ( df_md[ df_md['id'].isin(df_link)].
               merge(df_credits, on='id').
               merge(df_keywords, on='id').
               reset_index() )
df_md_so = df_md_small

In [31]:
df_md_small['director'] = ( df_md_small['crew'].
                           apply(ast.literal_eval).
                           apply(getdirector).astype('str').
                           apply( lambda x: x.lower().replace( " ", "") ).
                           apply( lambda x: [x]) ) #more weights

In [32]:
df_md_small['cast'] = ( df_md_small['cast'].
                       apply(ast.literal_eval).
                       apply( lambda x: [i['name'] for i in x] if isinstance(x, list) else []).
                       apply( lambda x: x[:3] if len(x) >=3 else x).
                       apply( lambda x: [i.lower().replace( " ", "") for i in x ]) )

In [33]:
df_md_small['keywords'] = ( df_md_small['keywords'].
                           apply(ast.literal_eval).
                           apply( lambda x: [i['name'] for i in x] if isinstance(x, list) else []).
                           apply( lambda x: [SnowballStemmer( 'english' ).stem( i ) for i in x]).
                           apply( lambda x: [i.lower().replace( " ", "") for i in x ]) )

In [34]:
df_md_small['content'] = ( df_md_small['cast'] +
                          df_md_small['director'] +
                          df_md_small['keywords'] +
                          df_md_small['genres'] ).apply( lambda x: ' '.join( x ) ) # combine all content features

In [35]:
content_matrix = CountVectorizer(
    analyzer = 'word',
    ngram_range = (1, 2),
    min_df = 0,
    stop_words = 'english').fit_transform(df_md_small['content'])
cosine_sim = cosine_similarity(content_matrix, content_matrix)

In [36]:
cosine_sim

array([[ 1.        ,  0.03456506,  0.03919309, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.03456506,  1.        ,  0.        , ...,  0.046676  ,
         0.03456506,  0.        ],
       [ 0.03919309,  0.        ,  1.        , ...,  0.05292561,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.046676  ,  0.05292561, ...,  1.        ,
         0.13068205,  0.        ],
       [ 0.        ,  0.03456506,  0.        , ...,  0.13068205,
         1.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ]])

In [37]:
getrecommendations( 'The Godfather', cosine_sim, df_md_small ).head(15)

994                             The Godfather: Part II
1602                           The Godfather: Part III
4148                                       The Gambler
5436                                          Mitchell
1346                                     The Rainmaker
3705                                   The Cotton Club
1765                                 The Paradine Case
146                                      Feast of July
284                           The Shawshank Redemption
2742                            ...And Justice for All
7464    The Bad Lieutenant: Port of Call - New Orleans
1412                                     Men with Guns
4663                                 Love the Hard Way
7698                   Wall Street: Money Never Sleeps
2808                                  Midnight Express
Name: title, dtype: object

In [38]:
getrecommendation( 'The Godfather', cosine_sim, df_md_so ).head(15)

Unnamed: 0,title,genres,keywords,director,cast
994,The Godfather: Part II,"[Drama, Crime]","[italo-american, cuba, vorort, melancholi, pra...",[francisfordcoppola],"[alpacino, robertduvall, dianekeaton]"
1602,The Godfather: Part III,"[Crime, Drama, Thriller]","[itali, christian, newyork, assassin, italo-am...",[francisfordcoppola],"[alpacino, dianekeaton, andygarcía]"
4148,The Gambler,"[Drama, Crime]","[gambl, professor, mafia, money, debt]",[karelreisz],"[jamescaan, paulsorvino, laurenhutton]"
5436,Mitchell,"[Crime, Drama, Action]","[drama, crime]",[andrewv.mclaglen],"[joedonbaker, martinbalsam, johnsaxon]"
3705,The Cotton Club,"[Music, Drama, Crime, Romance]","[jazz, jazzmusician, music, mafia]",[francisfordcoppola],"[richardgere, gregoryhines, dianelane]"
1346,The Rainmaker,"[Drama, Crime, Thriller]","[juror, proof, courtcas, leukemia, lawyer, cou...",[francisfordcoppola],"[mattdamon, dannydevito, jonvoight]"
1765,The Paradine Case,"[Drama, Crime, Thriller]","[femmefatal, lawyer]",[alfredhitchcock],"[gregorypeck, anntodd, charleslaughton]"
146,Feast of July,"[Drama, Crime]",[],[christophermenaul],"[embethdavidtz, tombell, gemmajones]"
284,The Shawshank Redemption,"[Drama, Crime]","[prison, corrupt, policebrut, prisoncel, delin...",[frankdarabont],"[timrobbins, morganfreeman, bobgunton]"
2742,...And Justice for All,"[Crime, Drama, Mystery, Thriller]","[suspens, lawyer, courtroom, extramaritalaffair]",[normanjewison],"[alpacino, jackwarden, johnforsythe]"
