In [1]:
import pandas as pd
import ast
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer #porterstemmer?
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
import numpy as np
import re

In [2]:
def getdirector( crew ):
    for i in crew:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [69]:
def getrecommendations(title, cosine_sim, df ):
    titles = df['title']
    indices = pd.Series(df.index, index=df['title'])
    idx = indices[title]
    print( 'idx', idx )
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    print( 'sim_scores', sim_scores )
    movie_indices = [i[0] for i in sim_scores]
    print( 'movie_indices', movie_indices)
    return titles.iloc[movie_indices]

In [77]:
def getrecommendation( movie_title, sim_matrix, df ):
    idx = np.asscalar( 
        df.loc[ df['title'] == movie_title ].index.values )
    
    print( 'idx', idx )
    sim_idx = sim_matrix[ idx ].argsort()[::-1][1:21].flatten().tolist()
    print( 'movie_indices', sim_idx )
    df_top = df[ ['title', 'genres', 'keywords', 'director', 'cast'] ].iloc[ sim_idx ]
    return( df_top )
    

In [4]:
df_md = pd. read_csv('../movielens_small/movies_metadata.csv')
df_link = pd.read_csv('../movielens_small/links_small.csv')
df_credits = pd.read_csv( '../movielens_small/credits.csv' )
df_keywords = pd.read_csv( '../movielens_small/keywords.csv' )

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
df_link = df_link[df_link['tmdbId'].notnull()]['tmdbId'].astype('int')

In [6]:
df_md['genres'] = ( df_md['genres'].fillna('[]').
                   apply(ast.literal_eval).
                   apply( lambda x: [i['name'] for i in x] if isinstance(x, list) else []) )

In [7]:
df_md = df_md.drop([19730, 29503, 35587]) # invalid input in cols id & imdb_id

In [8]:
df_md['id'] = df_md['id'].astype('int')
df_keywords['id'] = df_keywords['id'].astype('int')
df_credits['id'] = df_credits['id'].astype('int')

In [9]:
df_md_small = ( df_md[ df_md['id'].isin(df_link)].
               merge(df_credits, on='id').
               merge(df_keywords, on='id').
               reset_index() )

In [10]:
df_md_small['director'] = ( df_md_small['crew'].
                           apply(ast.literal_eval).
                           apply(getdirector).astype('str').
                           apply( lambda x: x.lower().replace( " ", "") ).
                           apply( lambda x: [x]) ) #more weights

In [11]:
df_md_small['cast'] = ( df_md_small['cast'].
                       apply(ast.literal_eval).
                       apply( lambda x: [i['name'] for i in x] if isinstance(x, list) else []).
                       apply( lambda x: x[:3] if len(x) >=3 else x).
                       apply( lambda x: [i.lower().replace( " ", "") for i in x ]) )

In [12]:
df_md_small['keywords'] = ( df_md_small['keywords'].
                           apply(ast.literal_eval).
                           apply( lambda x: [i['name'] for i in x] if isinstance(x, list) else []).
                           apply( lambda x: [SnowballStemmer( 'english' ).stem( i ) for i in x]).
                           apply( lambda x: [i.lower().replace( " ", "") for i in x ]) )

In [13]:
df_md_small['content'] = ( df_md_small['cast'] +
                          df_md_small['director'] +
                          df_md_small['keywords'] +
                          df_md_small['genres'] ).apply( lambda x: ' '.join( x ) ) # combine all content features

In [15]:
content_matrix = CountVectorizer(
    analyzer = 'word',
    ngram_range = (1, 2),
    min_df = 0,
    stop_words = 'english').fit_transform(df_md_small['content'])
cosine_sim = cosine_similarity(content_matrix, content_matrix)

In [19]:
cosine_sim

(9219, 9219)

In [72]:
getrecommendations( 'The Godfather', cosine_sim, df_md_small ).head(15)

idx 699
sim_scores [(994, 0.30009379396821317), (1602, 0.27508597780419541), (1346, 0.26761546505252365), (3705, 0.26761546505252365), (3616, 0.26646935501059649), (3300, 0.22417941532712204), (2998, 0.21571674297647797), (4518, 0.20965696734438366), (1691, 0.20814536170751841), (5867, 0.20131905799006777), (1992, 0.20033416898825337), (642, 0.1849000654084097), (981, 0.17505471314812435), (4148, 0.16012815380508716), (1765, 0.13977131156292244), (5436, 0.13937366833451514), (284, 0.13533299049019168), (146, 0.12403473458920844), (2742, 0.12326671027227314), (7464, 0.12326671027227314)]
movie_indices [994, 1602, 1346, 3705, 3616, 3300, 2998, 4518, 1691, 5867, 1992, 642, 981, 4148, 1765, 5436, 284, 146, 2742, 7464]


994            The Godfather: Part II
1602          The Godfather: Part III
1346                    The Rainmaker
3705                  The Cotton Club
3616    Tucker: The Man and His Dream
3300                 Gardens of Stone
2998                 The Conversation
4518               One from the Heart
1691                    The Outsiders
5867                      Rumble Fish
1992            Peggy Sue Got Married
642                              Jack
981                    Apocalypse Now
4148                      The Gambler
1765                The Paradine Case
Name: title, dtype: object

In [78]:
getrecommendation( 'The Godfather', cosine_sim, df_md_small ).head(15)

idx 699
movie_indices [994, 1602, 1346, 3705, 3616, 3300, 2998, 4518, 1691, 5867, 1992, 642, 981, 4148, 1765, 5436, 284, 146, 7464, 2742]


Unnamed: 0,title,genres,keywords,director,cast
994,The Godfather: Part II,"[Drama, Crime]","[italo-american, cuba, vorort, melancholi, pra...","[francisfordcoppola, francisfordcoppola]","[alpacino, robertduvall, dianekeaton]"
1602,The Godfather: Part III,"[Crime, Drama, Thriller]","[itali, christian, newyork, assassin, italo-am...","[francisfordcoppola, francisfordcoppola]","[alpacino, dianekeaton, andygarcía]"
1346,The Rainmaker,"[Drama, Crime, Thriller]","[juror, proof, courtcas, leukemia, lawyer, cou...","[francisfordcoppola, francisfordcoppola]","[mattdamon, dannydevito, jonvoight]"
3705,The Cotton Club,"[Music, Drama, Crime, Romance]","[jazz, jazzmusician, music, mafia]","[francisfordcoppola, francisfordcoppola]","[richardgere, gregoryhines, dianelane]"
3616,Tucker: The Man and His Dream,[Drama],[],"[francisfordcoppola, francisfordcoppola]","[jeffbridges, joanallen, martinlandau]"
3300,Gardens of Stone,"[Drama, History]","[vietnamveteran, washingtond.c., cemeteri, vie...","[francisfordcoppola, francisfordcoppola]","[jamescaan, anjelicahuston, jamesearljones]"
2998,The Conversation,"[Crime, Drama, Mystery]","[sanfrancisco, paranoia, audiotap, wiretap, sh...","[francisfordcoppola, francisfordcoppola]","[genehackman, johncazale, fredericforrest]"
4518,One from the Heart,"[Drama, Music, Romance]","[fourthofjuli, circusperform]","[francisfordcoppola, francisfordcoppola]","[terigarr, fredericforrest, raúljuliá]"
1691,The Outsiders,"[Crime, Drama]","[streetgang, children'shom, comingofag, gang, ...","[francisfordcoppola, francisfordcoppola]","[mattdillon, ralphmacchio, c.thomashowell]"
5867,Rumble Fish,"[Action, Adventure, Crime, Drama, Romance]","[streetgang, billard, colour-blind, gang]","[francisfordcoppola, francisfordcoppola]","[mattdillon, mickeyrourke, dianelane]"


In [27]:
df_ = df_md_small

In [63]:
 idx = df_.loc[ df_['title'] == 'The Godfather' ].index.values
idx

array([699], dtype=int64)

In [64]:
k = cosine_sim[idx].argsort()[::-1][:20].flatten().tolist()

In [65]:
df_.iloc[k]

Unnamed: 0,index,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,tagline,title,video,vote_average,vote_count,cast,crew,keywords,director,content
0,0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,...,,Toy Story,False,7.7,5415.0,"[tomhanks, timallen, donrickles]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[jealousi, toy, boy, friendship, friend, rival...","[johnlasseter, johnlasseter]",tomhanks timallen donrickles johnlasseter john...
5003,5003,False,,0,"[Fantasy, Horror, Comedy]",,6069,tt0094332,en,The Witches of Eastwick,...,Three Beautiful Women. One Lucky Devil.,The Witches of Eastwick,False,6.2,334.0,"[jacknicholson, cher, susansarandon]","[{'credit_id': '52fe443bc3a36847f8089cab', 'de...","[witch, newengland]","[georgemiller, georgemiller]",jacknicholson cher susansarandon georgemiller ...
5002,5002,False,,0,"[Adventure, Fantasy, Horror]",,10166,tt0100944,en,The Witches,...,Saving the world from witches is a tall order ...,The Witches,False,6.7,224.0,"[anjelicahuston, maizetterling, jasenfisher]","[{'credit_id': '55d22d17c3a368342c00010f', 'de...","[witch, chocol, mous, grandmothergrandsonrelat...","[nicolasroeg, nicolasroeg]",anjelicahuston maizetterling jasenfisher nicol...
4996,4996,False,"{'id': 43066, 'name': 'Teen Wolf Collection', ...",0,"[Comedy, Fantasy, Romance]",,11824,tt0090142,en,Teen Wolf,...,He always wanted to be special... but he never...,Teen Wolf,False,6.0,292.0,"[michaelj.fox, jameshampton, susanursitti]","[{'credit_id': '52fe448c9251416c75038b75', 'de...","[trainer, train, supernaturalpow, highschool, ...","[roddaniel, roddaniel]",michaelj.fox jameshampton susanursitti roddani...
4994,4994,False,,530000,[Horror],,29437,tt0076590,en,Rabid,...,Pray it doesn't happen to you.,Rabid,False,6.4,90.0,"[marilynchambers, frankmoore, joesilver]","[{'credit_id': '52fe45e0c3a368484e073eed', 'de...","[nuditi, vampir, montreal, motorcyclecrash, mu...","[davidcronenberg, davidcronenberg]",marilynchambers frankmoore joesilver davidcron...
4991,4991,False,,0,[Comedy],,14671,tt0094072,en,Summer School,...,"At Ocean Front High, what do they call a guy w...",Summer School,False,6.3,68.0,"[markharmon, kirstiealley, courtneythorne-smith]","[{'credit_id': '5403f2ba0e0a2658ee008f89', 'de...","[highschool, misfit, teacher, school, teenmovi...","[carlreiner, carlreiner]",markharmon kirstiealley courtneythorne-smith c...
4984,4984,False,,15000000,"[Action, Adventure, Thriller]",,10538,tt0105104,en,Passenger 57,...,"He's an ex-cop with a bad mouth, a bad attitud...",Passenger 57,False,5.6,264.0,"[wesleysnipes, brucepayne, tomsizemore]","[{'credit_id': '52fe43839251416c75013485', 'de...","[airport, florida, fbi, hijack, ex-cop, losang...","[kevinhooks, kevinhooks]",wesleysnipes brucepayne tomsizemore kevinhooks...
4981,4981,False,,20000000,"[Fantasy, Comedy, Science Fiction, Romance]",,2612,tt0100201,en,Mr. Destiny,...,Larry Burrows Wished For It All... Until All T...,Mr. Destiny,False,6.1,60.0,"[jimbelushi, michaelcaine, lindahamilton]","[{'credit_id': '52fe4360c3a36847f804f873', 'de...",[wish],"[jamesorr, jamesorr]",jimbelushi michaelcaine lindahamilton jamesorr...
4976,4976,False,,29000000,"[Action, Thriller]",,9319,tt0102266,en,The Last Boy Scout,...,Everyone had counted them out. But they're abo...,The Last Boy Scout,False,6.5,502.0,"[brucewillis, damonwayans, chelseafield]","[{'credit_id': '52fe44e7c3a36847f80b0e39', 'de...","[bomb, corrupt, assassin, sniper, antihero, co...","[tonyscott, tonyscott]",brucewillis damonwayans chelseafield tonyscott...
4974,4974,False,"{'id': 397444, 'name': 'Kindergarten Cop Colle...",15000000,[Comedy],,951,tt0099938,en,Kindergarten Cop,...,"Go ahead, you tell him you didn't do your home...",Kindergarten Cop,False,5.8,643.0,"[arnoldschwarzenegger, penelopeannmiller, pame...","[{'credit_id': '52fe4292c3a36847f8029389', 'de...","[crimefight, cook, drugdeal, dyinganddeath, ki...","[ivanreitman, ivanreitman]",arnoldschwarzenegger penelopeannmiller pamelar...
