In [155]:
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.snowball import SnowballStemmer #porterstemmer?
import numpy as np
from scipy.sparse.linalg import svds
from scipy.stats import percentileofscore

In [156]:
def getratingmat( df_rating ):
    """Pivot a long-format ratings table into a movie-by-user rating matrix.

    Parameters
    ----------
    df_rating : pandas.DataFrame
        Long-format ratings with 'movieId', 'userId' and 'rating' columns,
        one row per (movie, user) pair.

    Returns
    -------
    df : pandas.DataFrame
        Pivoted ratings: one row per movie (index named 'Movie ID'), one
        column per user (columns named 'User ID'); missing ratings are 0.
    Y : numpy.ndarray
        The values of ``df`` as a 2-D array.
    R : numpy.ndarray
        Integer indicator with the same shape as ``Y``: 1 where ``Y > 0``
        (a rating exists), 0 elsewhere.
    """
    df = df_rating.pivot(index='movieId', columns='userId', values='rating').fillna(0)
    df.index.name = 'Movie ID'
    df.columns.name = 'User ID'
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available
    # in every pandas release and returns the same ndarray.
    Y = df.values
    R = ( Y > 0. ).astype(int)
    return( df, Y, R )

def normalizeratings2( Y, R ):
    """Mean-centre each column of Y over its rated entries only.

    Parameters
    ----------
    Y : numpy.ndarray
        2-D rating matrix; unrated entries are expected to be 0.
    R : numpy.ndarray
        Indicator matrix with the same shape as ``Y``; 1 marks a rated entry.

    Returns
    -------
    Y_norm : numpy.ndarray
        Float matrix with the shape of ``Y``. Rated entries hold the rating
        minus that column's mean; unrated entries are 0.
    Y_mean : numpy.ndarray
        1-D array with one mean per column (column sum of ``Y`` divided by the
        number of rated entries). Columns with no rated entry get a mean of 0
        rather than NaN, so adding ``Y_mean`` back later cannot inject NaNs.
    """
    Y_sum = np.sum( Y, axis = 0 )
    R_sum = np.sum( R, axis = 0 )
    # Guarded division: positions where R_sum == 0 keep the initial 0.
    Y_mean = np.zeros( Y.shape[1] )
    np.divide( Y_sum, R_sum, out = Y_mean, where = R_sum > 0 )
    # Broadcasting subtracts each column's mean; the mask keeps unrated cells at 0.
    Y_norm = np.where( R == 1, Y - Y_mean, 0. )
    return( Y_norm, Y_mean )

In [157]:
# Load the movie metadata and the user ratings from the small MovieLens
# directory (paths are relative to the current working directory).
df_md_small = pd.read_csv('../movielens_small/metadata_clean.csv')
df_small = pd.read_csv('../movielens_small/ratings_clean.csv')

In [158]:
# These columns arrive from the CSV as text; parse each cell as a Python
# literal so later cells can treat the values as real lists.
for list_column in ( 'genres', 'cast', 'keywords' ):
    df_md_small[list_column] = df_md_small[list_column].apply( ast.literal_eval )

In [159]:
# Director token list: lower-cased with spaces removed so the full name is a
# single token, repeated twice so it weighs double in the bag-of-words.
# A missing director yields an empty list; astype(str) alone would turn NaN
# into the literal token 'nan' and make all director-less films look similar.
df_md_small['director_'] = (
    df_md_small['director']
    .apply( lambda name: [] if pd.isna( name )
            else [ str( name ).lower().replace( " ", "" ) ] * 2 )
)

In [160]:
# Collapse every cast member's name into one lower-case, space-free token.
df_md_small['cast_'] = df_md_small['cast'].apply(
    lambda names: [ name.lower().replace( " ", "" ) for name in names ]
)

In [161]:
# Build the stemmer once; constructing a new SnowballStemmer for every single
# keyword is loop-invariant work.
_keyword_stemmer = SnowballStemmer( 'english' )

# Stem each keyword, then lower-case it and strip spaces so a multi-word
# keyword becomes a single token (same result as the former two-pass version).
df_md_small['keywords_'] = df_md_small['keywords'].apply(
    lambda words: [ _keyword_stemmer.stem( word ).lower().replace( " ", "" )
                    for word in words ]
)

In [162]:
# Concatenate the per-movie token lists (cast, director, keywords, genres) ...
content_tokens = ( df_md_small['cast_']
                   + df_md_small['director_']
                   + df_md_small['keywords_']
                   + df_md_small['genres'] )
# ... and flatten each list into one space-separated string per movie.
df_md_small['content'] = content_tokens.apply( ' '.join )

In [163]:
# Bag-of-words over unigrams and bigrams of each movie's content string.
# min_df = 1 keeps every term: an integer min_df of 0 is rejected by the
# parameter validation of recent scikit-learn releases, and 1 selects exactly
# the same vocabulary (every term occurs in at least one document).
content_matrix = CountVectorizer(
    analyzer = 'word',
    ngram_range = (1, 2),
    min_df = 1,
    stop_words = 'english').fit_transform(df_md_small['content'])
# Pairwise cosine similarity between all movies; row i holds movie i's
# similarity to every movie, in the row order of df_md_small.
cosine_sim = cosine_similarity(content_matrix, content_matrix)

In [164]:
# Sanity check: the similarity matrix should be square, one row and one column per movie.
cosine_sim.shape

(9082, 9082)

In [165]:
# Pivot the ratings into a movie x user matrix Y with its rated-entry mask R,
# then subtract each user's mean rating from the entries that user rated.
df_ratings, Y, R = getratingmat( df_small )
Y_norm, Y_mean = normalizeratings2( Y, R )

In [166]:
# Truncated SVD of the mean-centred rating matrix, keeping k = 100 singular triplets.
U, sigma, Vt = svds(Y_norm, k = 100, maxiter = 30 )
# svds returns the singular values as a 1-D array; expand them into a diagonal
# matrix so the low-rank product U . sigma . Vt can be formed directly.
sigma = np.diag(sigma)

In [167]:
# Low-rank reconstruction of the centred ratings; adding Y_mean (broadcast
# across columns) restores each user's mean rating.
all_user_predicted_ratings = U.dot( sigma ).dot( Vt ) + Y_mean

In [168]:
# Query parameters: the user to recommend for, and the seed movie whose
# content similarity is blended into the score.
user_id = 50
movie_title = 'Terminator 2: Judgment Day'

In [169]:
# Every rating given by the chosen user, enriched with the movie metadata.
df_user_rating = (
    df_small[ df_small['userId'] == user_id ]
    .merge( df_md_small, on = 'movieId' )
    .reset_index( drop = True )
)
# Display the user's ten highest-rated movies.
df_user_rating.sort_values( 'rating', ascending = False )[ ['movieId', 'title', 'rating'] ].head( 10 )

Unnamed: 0,movieId,title,rating
41,589,Terminator 2: Judgment Day,5.0
0,10,GoldenEye,4.0
15,292,Outbreak,4.0
44,780,Independence Day,4.0
37,527,Schindler's List,4.0
35,480,Jurassic Park,4.0
31,440,Dave,4.0
26,368,Maverick,4.0
25,367,The Mask,4.0
24,357,Four Weddings and a Funeral,4.0


In [170]:
# Label the reconstructed matrix with the row and column labels of the rating pivot.
df_svd_pred = pd.DataFrame(
    data = all_user_predicted_ratings,
    index = df_ratings.index,
    columns = df_ratings.columns,
)

In [171]:
# Predicted ratings of the chosen user as a two-column frame: movieId, rating_pred.
df_userp = (
    df_svd_pred[ user_id ]
    .rename_axis( 'movieId' )
    .rename( 'rating_pred' )
    .reset_index()
)
df_userp.shape

(9025, 2)

In [172]:
# Locate the seed movie. cosine_sim is addressed by row position, so take the
# position of the first title match rather than its index label.
# (np.asscalar, used here previously, was removed in NumPy 1.23.)
title_mask = ( df_md_small['title'] == movie_title ).values
if not title_mask.any():
    raise ValueError( "No movie titled {!r} found in df_md_small".format( movie_title ) )
idx = int( title_mask.argmax() )  # argmax on a boolean array gives the first True

# Content-based table: each movie with its similarity to the seed movie.
# reset_index keeps the rows positionally aligned with the rows of cosine_sim.
df_cbp = df_md_small[['movieId', 'title']].reset_index( drop = True )
df_cbp[ 'sim' ] = cosine_sim[ idx ]
df_cbp.shape

(9082, 3)

In [173]:
# Candidate movies: those with both an SVD prediction and a content similarity,
# excluding everything the user has already rated. The .copy() makes df_c an
# independent frame, so the column assignments below do not write into a slice
# (avoids SettingWithCopyWarning).
df_c = df_userp.merge( df_cbp, on = 'movieId' )
df_c = df_c[ ~df_c['movieId'].isin(df_user_rating['movieId']) ].copy()
x = df_c['rating_pred'].values
y = df_c['sim'].values

# Percentile ranks within the candidate set, computed with one sort each
# instead of one full scan per element:
#   'strict' -> share of values strictly below (searchsorted side='left')
#   'weak'   -> share of values less than or equal (searchsorted side='right')
n_candidates = float( len( df_c ) )
x_p = np.searchsorted( np.sort( x ), x, side = 'left' ) / n_candidates * 100
y_p = np.searchsorted( np.sort( y ), y, side = 'right' ) / n_candidates * 100
df_c['rating_p'] = x_p
df_c['sim_p'] = y_p

In [174]:
# Hybrid score: weighted blend of the two percentile ranks, where weight_cb is
# the share given to content similarity and the rest goes to the SVD prediction.
weight_cb = 0.5
score = ( 1 - weight_cb ) * x_p + weight_cb * y_p
df_c['score'] = score.tolist()
# Show the eleven best-scoring candidates.
df_c.sort_values( by = 'score', ascending = False ).head( 11 )

Unnamed: 0,movieId,rating_pred,title,sim,rating_p,sim_p,score
990,1240,3.433612,The Terminator,0.367653,99.933177,100.0,99.966589
31,32,3.393068,Twelve Monkeys,0.187867,99.788395,99.721573,99.754984
2864,3593,3.36381,Battlefield Earth,0.192187,99.476556,99.777258,99.626907
953,1200,3.350868,Aliens,0.235294,99.220403,99.955452,99.587927
1476,1917,3.368883,Armageddon,0.127611,99.565653,98.774919,99.170286
2813,3527,3.348657,Predator,0.134535,99.153581,98.997661,99.075621
4359,5903,3.338866,Equilibrium,0.131212,98.774919,98.930839,98.852879
7365,72998,3.328616,Avatar,0.153964,98.140105,99.38746,98.763782
1328,1676,3.326458,Starship Troopers,0.156863,97.995322,99.432008,98.713665
7170,68319,3.321785,X-Men Origins: Wolverine,0.153081,97.560976,99.376323,98.468649


In [175]:
# Inspect the full metadata row (including the derived token columns) of the seed movie.
df_md_small.loc[ df_md_small['title'] == 'Terminator 2: Judgment Day']

Unnamed: 0,index,movieId,tmdbId,title,vote_average,vote_count,cast,director,keywords,genres,director_,cast_,keywords_,content
522,522,589,280,Terminator 2: Judgment Day,7.7,4274,"[Arnold Schwarzenegger, Linda Hamilton, Robert...",James Cameron,"[cyborg, shotgun, post-apocalyptic, dystopia, ...","[Action, Thriller, Science Fiction]","[jamescameron, jamescameron]","[arnoldschwarzenegger, lindahamilton, robertpa...","[cyborg, shotgun, post-apocalypt, dystopia, mo...",arnoldschwarzenegger lindahamilton robertpatri...
