In [92]:
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.snowball import SnowballStemmer #porterstemmer?
import numpy as np
from scipy.sparse.linalg import svds

In [93]:
def getratingmat( df_rating ):
    """Pivot a long-format ratings table into a movie-by-user rating matrix.

    Parameters
    ----------
    df_rating : pandas.DataFrame
        Needs 'movieId', 'userId' and 'rating' columns. Each
        (movieId, userId) pair may appear at most once, otherwise
        ``DataFrame.pivot`` raises.

    Returns
    -------
    df : pandas.DataFrame
        Rows are movieId values (index named 'Movie ID'), columns are userId
        values (columns named 'User ID'); missing ratings are filled with 0.
    Y : numpy.ndarray
        The values of ``df`` as a 2-D array.
    R : numpy.ndarray of int
        1 where ``Y > 0`` and 0 elsewhere. A stored rating of exactly 0 is
        therefore indistinguishable from "not rated".
    """
    df = df_rating.pivot(index='movieId', columns='userId', values='rating').fillna(0)
    df.index.name = 'Movie ID'
    df.columns.name = 'User ID'
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available on
    # both old and current pandas releases and returns the same ndarray.
    Y = df.values
    R = ( Y > 0. ).astype(int)
    return( df, Y, R )

def normalizeratings2( Y, R ):
    """Mean-centre each column of ``Y`` using only its rated entries.

    Parameters
    ----------
    Y : 2-D array-like
        Rating matrix. In this notebook rows are movies and columns are users.
    R : 2-D array-like, same shape as ``Y``
        Indicator matrix; an entry equal to 1 marks a rated cell.

    Returns
    -------
    Y_norm : numpy.ndarray of float
        ``Y - Y_mean`` (broadcast over columns) where ``R == 1``, 0 elsewhere.
    Y_mean : numpy.ndarray of float, one value per column
        Column sum of ``Y`` divided by the number of rated entries in that
        column. A column with no rated entries gets mean 0 instead of the
        NaN (0/0) the unguarded division would produce.
    """
    Y = np.asarray( Y, dtype = float )
    R = np.asarray( R )
    Y_sum = np.sum( Y, axis = 0 )
    R_sum = np.sum( R, axis = 0 )
    # Only divide where at least one rating exists; other slots keep the 0.0
    # they were initialised with, so no NaN leaks into later arithmetic.
    Y_mean = np.divide( Y_sum, R_sum, out = np.zeros( Y.shape[1] ), where = R_sum > 0 )
    # Vectorised form of the per-column loop: centre rated cells, zero the rest.
    Y_norm = np.where( R == 1, Y - Y_mean, 0.0 )
    return( Y_norm, Y_mean )

In [94]:
# Load the two input tables (paths are relative to the notebook's directory):
#   df_md_small - per-movie metadata; later cells read its title, director,
#                 cast, keywords and genres columns.
#   df_small    - ratings; passed to getratingmat(), which pivots it on
#                 movieId / userId / rating.
df_md_small = pd.read_csv('../movielens_small/metadata_clean.csv')
df_small = pd.read_csv('../movielens_small/ratings_clean.csv')

In [95]:
# These columns come out of the CSV as strings holding Python list literals;
# turn each one back into a real list. This cell is not idempotent: running
# it a second time hands lists (not strings) to ast.literal_eval.
for list_col in ( 'genres', 'cast', 'keywords' ):
    df_md_small[list_col] = df_md_small[list_col].apply( ast.literal_eval )

In [96]:
# Reduce every director name to one lowercase token without spaces and list
# it twice, so it is counted twice once the content string is vectorized.
# NOTE(review): astype(str) turns a missing director (NaN) into the literal
# token 'nan' - confirm the cleaned metadata has no missing directors.
director_names = df_md_small['director'].astype( str )
df_md_small['director_'] = director_names.apply(
    lambda name: [ name.lower().replace( " ", "" ) ] * 2 )

In [97]:
# One token per cast member: lowercase the name and strip its spaces so that
# e.g. "Tom Hanks" becomes the single word "tomhanks".
df_md_small['cast_'] = df_md_small['cast'].apply(
    lambda names: [ actor.lower().replace( " ", "" ) for actor in names ] )

In [98]:
# Build the stemmer once and reuse it; constructing a SnowballStemmer inside
# the comprehension created a fresh instance for every single keyword.
keyword_stemmer = SnowballStemmer( 'english' )

# Stem each keyword, then lowercase it and drop its spaces so a multi-word
# keyword becomes a single token (e.g. "board game" -> "boardgam").
df_md_small['keywords_'] = df_md_small['keywords'].apply(
    lambda kws: [ keyword_stemmer.stem( kw ).lower().replace( " ", "" ) for kw in kws ] )

In [99]:
# Concatenate the per-movie token lists (cast, doubled director, stemmed
# keywords, genres) and flatten each into one space-separated string, the
# form the text vectorizer consumes.
content_tokens = ( df_md_small['cast_']
                   + df_md_small['director_']
                   + df_md_small['keywords_']
                   + df_md_small['genres'] )
df_md_small['content'] = content_tokens.apply( ' '.join )

In [100]:
# Preview the first rows to sanity-check the derived director_/cast_/keywords_/content columns.
df_md_small.head()

Unnamed: 0,index,movieId,tmdbId,title,vote_average,vote_count,cast,director,keywords,genres,director_,cast_,keywords_,content
0,0,1,862,Toy Story,7.7,5415,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",John Lasseter,"[jealousy, toy, boy, friendship, friends, riva...","[Animation, Comedy, Family]","[johnlasseter, johnlasseter]","[tomhanks, timallen, donrickles, jimvarney, wa...","[jealousi, toy, boy, friendship, friend, rival...",tomhanks timallen donrickles jimvarney wallace...
1,1,2,8844,Jumanji,6.9,2413,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...",Joe Johnston,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]","[joejohnston, joejohnston]","[robinwilliams, jonathanhyde, kirstendunst, br...","[boardgam, disappear, basedonchildren'sbook, n...",robinwilliams jonathanhyde kirstendunst bradle...
2,2,3,15602,Grumpier Old Men,6.5,92,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...",Howard Deutch,"[fishing, best friend, duringcreditsstinger, o...","[Romance, Comedy]","[howarddeutch, howarddeutch]","[waltermatthau, jacklemmon, ann-margret, sophi...","[fish, bestfriend, duringcreditssting, oldmen]",waltermatthau jacklemmon ann-margret sophialor...
3,3,4,31357,Waiting to Exhale,6.1,34,"[Whitney Houston, Angela Bassett, Loretta Devi...",Forest Whitaker,"[based on novel, interracial relationship, sin...","[Comedy, Drama, Romance]","[forestwhitaker, forestwhitaker]","[whitneyhouston, angelabassett, lorettadevine,...","[basedonnovel, interracialrelationship, single...",whitneyhouston angelabassett lorettadevine lel...
4,4,5,11862,Father of the Bride Part II,5.7,173,"[Steve Martin, Diane Keaton, Martin Short, Kim...",Charles Shyer,"[baby, midlife crisis, confidence, aging, daug...",[Comedy],"[charlesshyer, charlesshyer]","[stevemartin, dianekeaton, martinshort, kimber...","[babi, midlifecrisi, confid, age, daughter, mo...",stevemartin dianekeaton martinshort kimberlywi...


In [101]:
# Bag-of-words over the content strings, counting unigrams and bigrams.
# min_df must be an integer >= 1 (or a float in [0, 1]) on recent
# scikit-learn releases; the previous integer 0 is rejected there. 1 means
# "keep every term that occurs in at least one document", i.e. no filtering.
content_vectorizer = CountVectorizer(
    analyzer = 'word',
    ngram_range = (1, 2),
    min_df = 1,
    stop_words = 'english' )
content_matrix = content_vectorizer.fit_transform( df_md_small['content'] )

# Pairwise cosine similarity between all movies. With a single argument the
# matrix is compared against itself. The result is a dense n_movies x n_movies
# array, so memory grows quadratically with the catalogue size.
cosine_sim = cosine_similarity( content_matrix )

In [102]:
# Expect a square matrix with one row and one column per row of df_md_small.
cosine_sim.shape

(9082, 9082)

In [103]:
# Pivot the ratings into a movie x user matrix (Y) with its rated-cell mask (R),
# then mean-centre each user's column over the movies that user actually rated.
df_ratings, Y, R = getratingmat( df_small )
Y_norm, Y_mean = normalizeratings2( Y, R )

In [104]:
# Rank-50 truncated SVD of the mean-centred rating matrix. svds returns the
# singular values as a 1-D array; expand them into a diagonal matrix so the
# factors can be multiplied back together.
U, singular_values, Vt = svds( Y_norm, k = 50 )
sigma = np.diag( singular_values )

In [105]:
# Low-rank reconstruction of the centred ratings, with each user's mean added
# back (Y_mean has one entry per column, so it broadcasts across the rows).
all_user_predicted_ratings = U.dot( sigma ).dot( Vt ) + Y_mean

In [106]:
# Query for the hybrid recommendation built in the cells below:
# user_id selects a column of the SVD predictions, movie_title selects the
# row of the content-similarity matrix.
user_id = 50
movie_title = 'Pulp Fiction'

In [107]:
# Wrap the predicted-rating array in a frame that carries the same movie
# (row) and user (column) labels as the original rating matrix.
df_svd_pred = pd.DataFrame(
    data = all_user_predicted_ratings,
    index = df_ratings.index,
    columns = df_ratings.columns )

In [108]:
# Pull out the chosen user's predicted ratings as a two-column frame
# (movieId, svd_pred) so it can be merged on movieId later.
df_userp = ( df_svd_pred[ user_id ]
             .rename( 'svd_pred' )
             .rename_axis( 'movieId' )
             .reset_index() )
df_userp.shape

(9025, 2)

In [109]:
# Largest predicted rating for this user; used below to rescale svd_pred.
highest_svd = df_userp['svd_pred'].max()

In [110]:
# Row position of the query movie. cosine_sim is indexed by position, so take
# the position of the first title match directly rather than an index label.
# (np.asscalar, used here before, was removed in NumPy 1.23.)
title_hits = np.flatnonzero( ( df_md_small['title'] == movie_title ).values )
if title_hits.size == 0:
    raise ValueError( "no movie titled %r found in df_md_small" % movie_title )
idx = int( title_hits[0] )

# Content-based frame: every movie with its similarity to the query movie.
# reset_index keeps the rows positionally aligned with cosine_sim[idx].
df_cbp = df_md_small[['movieId', 'title']].reset_index( drop = True )
df_cbp[ 'sim' ] = cosine_sim[ idx ].tolist()
df_cbp.shape

(9082, 3)

In [111]:
# Second-largest similarity. The largest is presumably the query movie's
# similarity to itself, so the runner-up is used to rescale 'sim' below.
highest_sim = df_cbp['sim'].sort_values( ascending = False ).iloc[1]

In [115]:
# movieId of the query movie, so it can be removed from its own recommendations.
query_movie_id = df_cbp.loc[ idx, 'movieId' ]

# Inner join: only movies present in both the SVD predictions and the
# content-similarity table get a hybrid score.
df_c = df_userp.merge( df_cbp, on = 'movieId' )

# Equal-weight blend of the two signals after rescaling: svd_pred so that its
# maximum maps to 5, sim so that the best non-self similarity maps to 3.5.
# NOTE(review): 5 looks like the top of the rating scale and 3.5 a chosen cap
# for the content signal - confirm the intent behind these constants.
df_c['score'] = df_c['svd_pred'] * ( 5 / highest_svd ) * 0.5 + df_c[ 'sim'] * ( 3.5 / highest_sim ) * 0.5

# Top 10 by score, excluding the query movie explicitly. Slicing with
# .iloc[1:11] only works if the query movie survived the merge AND ranks
# first; otherwise it silently drops a genuine recommendation.
( df_c[ df_c['movieId'] != query_movie_id ]
    .sort_values( 'score', ascending = False )
    .head( 10 ) )

Unnamed: 0,movieId,svd_pred,title,sim,score
1360,1729,3.316758,Jackie Brown,0.201498,3.968442
8743,128360,3.279542,The Hateful Eight,0.186171,3.810439
5132,7438,3.246035,Kill Bill: Vol. 2,0.161591,3.574551
4700,6595,3.303196,S.W.A.T.,0.138343,3.410871
4532,6263,3.28495,Basic,0.138215,3.397554
8169,99114,3.294345,Django Unchained,0.1335,3.362894
6830,57401,3.287759,Cleaner,0.132453,3.349395
4837,6874,3.267465,Kill Bill: Vol. 1,0.12983,3.31304
231,259,3.283971,Kiss of Death,0.127453,3.303436
6995,61401,3.280374,The Spirit,0.124114,3.272033
