In [3]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

In [4]:
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 99

# Reading the 'tags' data

We start by reading the 'tags' data from the tags.csv file. Each row in the file consists of entry for a movie (specified by 'movieId'), the user reviewing the movie ('userId') and the user-generated tag ('tag') along with a timestamp. After reading the file, we delete the timestamp and the userId. 

In [19]:
cwd = os.getcwd()
tags = pd.read_csv(os.path.join(cwd,"..","MovieLens_Data", "MovieLens", "ml-25m", "tags.csv"))
tags.drop('timestamp', axis= 1, inplace= True)
tags.drop('userId', axis=1, inplace=True)
tags.head()

Unnamed: 0,movieId,tag
0,260,classic
1,260,sci-fi
2,1732,dark comedy
3,1732,great dialogue
4,7569,so bad it's good


In [6]:
tag_movie_ids = tags['movieId'].nunique()
print(tag_movie_ids)

45251


We find that there are roughly 45k movies with user generated tags

Next, we read the movies.csv file that contains the mapping between 'movieId' in tags.csv and the title of the movie. We also have a column specifying the genre(s) that the movie belongs to.

In [7]:
movies = pd.read_csv(os.path.join(cwd, "..", "MovieLens_Data", "MovieLens", "ml-25m", "movies.csv"))
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


We shall add the information concerning the genre(s) of the movie to its respective tag document. To that end, we replace the '|' character with a space. In cases where no genre information ('no genres listed') is present, we simply leave the cell empty.  

In [9]:
movies['genres'] = movies['genres'].str.replace(pat='|', repl = ' ', regex = False)
movies['genres'] = movies['genres'].str.replace(pat='(no genres listed)', repl = ' ', regex = False) 
#regex = False treats () as a part of the string
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


We similarly replace all tags marked #NA with empty spaces. Then, we concatenate all the user generated tags for each movieId into a single document. 

In [20]:
tags.fillna("", inplace=True)   #searches for tags marked #NA and replaces them with empty spaces
tags = pd.DataFrame(tags.groupby('movieId')['tag'].apply(lambda x: "%s" % ' '.join(x)))
tags.reset_index(inplace=True)
tags.head()

Unnamed: 0,movieId,tag
0,1,Owned imdb top 250 Pixar Pixar time travel children comedy funny witty rated-G animation Pixar ...
1,2,Robin Williams time travel fantasy based on children's book board game disappearance giant inse...
2,3,funny best friend duringcreditsstinger fishing old man sequel fever moldy old sequel NO_FA_GANE...
3,4,based on novel or book chick flick divorce interracial relationship single mother CLV chick fli...
4,5,aging baby confidence contraception daughter gynecologist midlife crisis parent child relations...


To this concatenated document, we add the genre information for each movie from 'movies' dataframe.

In [21]:
tags = pd.merge(movies, tags, on = 'movieId')
tags['document'] = tags[['tag','genres']].apply(lambda x: ' '.join(x), axis= 1)
tags.head()

Unnamed: 0,movieId,title,genres,tag,document
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,Owned imdb top 250 Pixar Pixar time travel children comedy funny witty rated-G animation Pixar ...,Owned imdb top 250 Pixar Pixar time travel children comedy funny witty rated-G animation Pixar ...
1,2,Jumanji (1995),Adventure Children Fantasy,Robin Williams time travel fantasy based on children's book board game disappearance giant inse...,Robin Williams time travel fantasy based on children's book board game disappearance giant inse...
2,3,Grumpier Old Men (1995),Comedy Romance,funny best friend duringcreditsstinger fishing old man sequel fever moldy old sequel NO_FA_GANE...,funny best friend duringcreditsstinger fishing old man sequel fever moldy old sequel NO_FA_GANE...
3,4,Waiting to Exhale (1995),Comedy Drama Romance,based on novel or book chick flick divorce interracial relationship single mother CLV chick fli...,based on novel or book chick flick divorce interracial relationship single mother CLV chick fli...
4,5,Father of the Bride Part II (1995),Comedy,aging baby confidence contraception daughter gynecologist midlife crisis parent child relations...,aging baby confidence contraception daughter gynecologist midlife crisis parent child relations...


# TF-IDF and PCA

Now that the document for each movie is ready, we use the sklearn library to calculate the tf-idf matrix

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(ngram_range=(0,1), min_df = 0.0001, stop_words = 'english')
tfidf_matrix = tfidf.fit_transform(tags['document'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(),index=tags.index.tolist())
print(tfidf_df.shape)

(45251, 11514)


Once we have successfully computed the tf-idf matrix, we seek to reduce the dimension of the "movie-space". Currently, as we can see, each movie is a point in 11514-dimensional space. One good way to reduce dimensions would be to use an autoencoder-decoder neural net. However, for the sake of simplicity, we choose to use principal component analysis with 20 principal components. 

In [31]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 100)
principalComponents = pca.fit_transform(tfidf_df)
principalComponents_df = pd.DataFrame(data=principalComponents, index=tags.index.tolist())
principalComponents_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,-0.053902,0.001941,-0.090299,0.038624,-0.091975,0.213907,-0.005662,-0.176703,-0.002587,0.001484,-0.142637,-0.129378,-0.079494,-0.017493,-0.014489,0.040153,0.011691,-0.003693,0.014503,-0.024713,-0.009831,-0.031223,-0.005833,0.061307,-0.065518,0.167363,-0.030957,0.099275,-0.008962,-0.004856,-0.007229,-0.001281,-0.002724,-0.009079,0.02383,-0.007117,0.018565,-0.011594,-0.039704,0.02638,-0.014405,0.004799,-0.017357,0.056247,-0.00555,-0.0204,-0.02197,-0.039279,0.048986,-0.017062,-0.030888,0.008318,0.00875,0.002632,-0.020732,-0.00863,0.012169,0.003271,0.000345,-0.008942,-0.00575,0.002714,0.025589,-0.010355,-0.000839,0.000324,0.064098,0.038948,0.012707,0.039054,-0.090701,-0.029568,0.014212,0.047605,0.054343,-0.011833,0.01142,-0.038195,-0.002848,-0.074078,0.016931,-0.027356,-0.015927,0.005926,-0.027365,0.033409,0.074901,-0.049445,0.018791,-0.019187,-0.021048,0.024351,-0.023327,0.017804,0.005138,0.026323,0.026088,0.012011,0.022491,-0.016462
1,-0.050437,-0.039635,-0.048211,0.027431,-0.056871,0.08703,-0.011774,-0.029512,-0.011044,0.009921,0.003061,-0.008971,0.038399,0.022337,0.030094,-0.044842,0.091753,-0.02584,-0.016673,-0.016511,0.037359,0.023448,0.004385,-0.035554,0.072769,-0.066631,-0.025597,0.04144,0.064447,-0.048008,0.009643,-0.007415,-0.013958,-0.010509,-0.003672,-0.004365,-0.001043,-0.011314,-0.040035,-0.040429,-0.003547,-0.0312,-0.031486,0.027587,0.028501,-0.037709,-0.060696,-0.178235,0.129527,-0.144315,-0.064606,0.050366,0.008543,0.058318,-0.012003,-0.00071,0.046044,-0.009765,-0.009161,-0.010997,-0.000437,0.004898,0.007941,0.001997,0.002477,-0.046248,0.005331,0.016044,-0.030778,0.029102,-0.026602,-0.012301,-0.002558,-0.013272,-0.010525,-0.009104,-0.012111,-0.004857,-0.022295,0.015286,-0.011892,-0.027192,-0.049991,-0.004189,-0.00893,-0.031242,0.023147,-0.022727,0.000757,-0.003416,-0.043174,0.000338,0.006149,0.037176,-0.024148,-0.005073,-0.010364,0.030464,-0.004218,0.078701
2,-0.041464,0.014659,-0.032051,-0.002705,-0.028249,0.01032,-0.02537,0.015745,-0.023086,-0.02228,0.033749,-0.02271,0.009204,-0.025819,-0.007561,-0.044778,-0.011158,-0.016536,-0.007122,0.002041,-0.006886,-0.002818,-0.015256,-0.007977,-0.000767,0.003285,-0.03355,-0.000419,-0.006622,-0.009244,-0.005153,-0.004653,0.006325,-0.014924,-0.026375,0.00313,-0.000529,-0.015185,-0.020428,0.001939,-0.002603,-0.020661,-0.011908,-0.011223,0.009968,-0.011659,-0.002934,-0.003491,0.015535,0.031989,-0.026164,-0.004254,-0.004876,0.010071,-1.6e-05,0.009755,-0.026896,-0.008211,0.002028,-0.004919,-0.0189,-0.012994,0.010864,-0.036516,0.002691,-0.031774,0.032698,0.030251,-0.043365,0.023278,-0.117252,-0.00253,-0.057592,-0.054358,-0.015718,0.03038,-0.021353,0.056494,0.018134,-0.055271,-0.001967,0.034729,0.075715,-0.051609,-0.045976,-0.012301,-0.015226,0.018644,0.013059,-0.016501,-0.011628,0.017752,-0.00535,0.068206,0.00098,0.009078,0.01697,0.015576,-0.051262,-0.060396
3,-0.032691,0.040974,0.027383,-0.03492,-0.058024,0.011567,-0.051395,0.014621,-0.042757,0.004052,0.02091,-0.024359,0.129875,0.104643,0.0026,-0.005805,-0.049043,0.003106,0.015289,-0.001348,-0.019067,-0.009113,0.011071,0.003356,-0.011101,-0.012209,-0.008684,0.001018,-0.01387,0.005636,-0.018443,-0.005962,-0.078829,-0.010702,0.038423,0.012352,0.006462,-0.003357,0.024544,0.000269,0.032501,-0.034684,-0.008611,-0.024009,-0.03174,0.028389,0.012791,-0.008123,0.05568,0.014456,0.057642,-0.054226,0.003482,0.007307,0.022695,-0.041644,-0.003782,-0.038207,0.013115,0.006805,-0.003679,0.006594,0.016835,-0.003776,-0.020227,0.000676,0.018813,0.010066,-0.010447,0.005863,-0.004999,0.008975,0.000711,0.011536,-0.000197,-0.002732,-0.024778,0.012069,-0.031622,-0.014761,0.037687,-0.011603,0.021866,0.01804,0.060991,0.01478,0.010137,-0.025646,-0.02582,0.011096,-0.013171,0.005473,-0.023975,0.018354,-0.01953,0.004318,0.00231,-0.003647,0.015942,-0.000985
4,-0.04099,0.060512,-0.049049,0.012286,-0.026016,0.018287,-0.014618,-0.015119,-0.056137,0.003816,0.024843,-0.014091,0.006584,-0.025411,0.014291,-0.061792,0.024507,-0.032075,0.012876,-0.000151,-0.043499,0.005007,0.02991,-0.002406,0.023426,-0.042744,-0.023115,0.013742,-0.003818,-0.042212,-0.028089,-0.048031,-0.111075,-0.052328,0.148414,-0.026056,0.0526,-0.032564,-0.040366,-0.025258,0.033314,-0.036258,-0.031571,0.073428,-0.032782,-0.018838,-0.066596,0.013684,-0.01743,0.006949,0.003597,0.012179,0.00096,0.014175,-0.004747,0.003595,0.010985,-0.027273,0.000389,-0.007776,-0.021605,-0.021307,-0.003155,-0.02943,-0.003697,-0.020221,0.022072,0.0164,-0.045549,0.039506,-0.098618,-0.001642,-0.046797,-0.052727,0.000278,0.018388,-0.009005,0.062038,0.042651,-0.019056,0.018774,0.048243,0.055088,-0.024844,-0.020419,0.006352,-0.024149,0.009121,-0.000438,-0.014749,-0.016557,0.01418,-0.041603,0.056145,0.022874,0.031342,0.062705,0.029793,-0.037505,-0.034923


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
# similarity_matrix_df = pd.DataFrame(cosine_similarity(principalComponents), index=tags.index.tolist())
similarity_matrix_df = pd.DataFrame(euclidean_distances(principalComponents), index=tags.index.tolist())
# similarity_matrix_df = pd.DataFrame(cosine_similarity(tfidf_matrix), index=tags.index.tolist())
# similarity_matrix_df = pd.DataFrame(euclidean_distances(tfidf_matrix), index=tags.index.tolist())
similarity_matrix_df.head()

In [38]:
#input a numpy array of movieids and provide recommendations of those movies that have the highest average similarity score

def get_content_recommendations(movie_ids, movies, similarity_matrix_df, num = 5):
    movie_titles = pd.DataFrame(index=[movie_ids], columns=['title'])
    index_from_movie_ids = np.zeros(len(movie_ids))
    
    for i in range(len(movie_ids)):
        movie_titles['title'].iloc[i] = movies[movies['movieId'] == movie_ids[i]]['title'].iloc[0]
        index_from_movie_ids[i] = movies[movies['movieId'] == movie_ids[i]].index.values[0]
        
    similarity_overall_score = np.zeros((similarity_matrix_df.shape[0],1))
    
    for i in range(len(movie_ids)):
        similarity_overall_score=np.add(similarity_matrix_df[[index_from_movie_ids[i]]].values, similarity_overall_score)    
    
    similar_movies_df = pd.DataFrame(similarity_overall_score, index=similarity_matrix_df.index.tolist())
    similar_movies_df.drop(labels=index_from_movie_ids, axis=0, inplace=True)
    similar_movies_df.columns = ['overall_cosine_similarity']
    similar_movies_df.sort_values(by = ['overall_cosine_similarity'], ascending=False, inplace=True)
    similar_movies_df = similar_movies_df.head(num)
    similar_movies_df = similar_movies_df.join(movies, how='left')
    
    return similar_movies_df
    
#     for i in range(num):
#         similar_movies_df['title'].iloc[i] = movies[movies['index'] == similar_movies_df['index'].iloc[i]]['title'].iloc[0]
#     similar_movies_df.reset_index(inplace=True)
#     similar_movies_df.insert(1, column='title', value=np.nan)
#     similar_movies_df.insert(2, column='movieId', value=np.nan)
    
#     movie_rec_titles = pd.DataFrame(index=range(num), columns=['movieId','title'])
#     movie_rec_titles['movieId'] = similar_movies_df['movieId']
    
#     for i in range(num):
#         movie_rec_titles['title'].iloc[i] = movies[movies['movieId'] == movie_rec_titles['movieId'].iloc[i]]['title'].iloc[0]
        
    

In [None]:
def lookup_movie_id_by_title(movie_title):
    return movies[movies.title.str.contains(movie_title)]

In [44]:
lookup_movie_id_by_title("Star Wars")

Unnamed: 0,movieId,title,genres
257,260,Star Wars: Episode IV - A New Hope (1977),Action Adventure Sci-Fi
1166,1196,Star Wars: Episode V - The Empire Strikes Back (1980),Action Adventure Sci-Fi
1179,1210,Star Wars: Episode VI - Return of the Jedi (1983),Action Adventure Sci-Fi
2537,2628,Star Wars: Episode I - The Phantom Menace (1999),Action Adventure Sci-Fi
5270,5378,Star Wars: Episode II - Attack of the Clones (2002),Action Adventure Sci-Fi IMAX
9952,33493,Star Wars: Episode III - Revenge of the Sith (2005),Action Adventure Sci-Fi
12593,61160,Star Wars: The Clone Wars (2008),Action Adventure Animation Sci-Fi
14912,79006,Empire of Dreams: The Story of the 'Star Wars' Trilogy (2004),Documentary
21250,109713,Star Wars: Threads of Destiny (2014),Action Adventure Sci-Fi
25055,122886,Star Wars: Episode VII - The Force Awakens (2015),Action Adventure Fantasy Sci-Fi IMAX


In [53]:
movie_ids = [2571]
movie_titles = pd.DataFrame(index=[movie_ids], columns=['title'])
index_from_movie_ids = np.zeros(len(movie_ids))

for i in range(len(movie_ids)):
        movie_titles['title'].iloc[i] = movies[movies['movieId'] == movie_ids[i]]['title'].iloc[0]
        index_from_movie_ids[i] = movies[movies['movieId'] == movie_ids[i]].index.values[0]

# movie_titles.head()
print(index_from_movie_ids)
# movies[movies['movieId'] == 51540].index.values[0]

[2480.]


In [54]:
np.round(cosine_similarity(tfidf_matrix[2480,:], tfidf_matrix[6247,:]),10)

array([[0.]])

In [40]:
movie_ids = [1196]
alpha = get_content_recommendations(movie_ids, movies, similarity_matrix_cosine_df, num=5)
print(alpha)
# print(alpha['movieId'].values)
# movies.query('movieId == 24501')

       overall_cosine_similarity  movieId  \
27034                   0.814767   128155   
26243                   0.802151   125485   
37684                   0.764345   153178   
39925                   0.740150   158288   
39647                   0.730017   157681   

                                         title                genres  
27034  The Last Days of Lehman Brothers (2009)     Documentary Drama  
26243                 We Were Strangers (1949)  Action Drama Romance  
37684                           Insiang (1976)                 Drama  
39925                             Saamy (2003)          Action Drama  
39647                  Gotta Kick It Up (2002)          Comedy Drama  


In [49]:
tags.query('movieId== 89763').tag.values

0        0.000000
1        0.000000
2        0.000000
3        0.000000
4        0.000000
5        0.000000
6        0.000000
7        0.000000
8        0.000000
9        0.000000
10       0.000000
11       0.000000
12       0.000000
13       0.000000
14       0.000000
15       0.000000
16       0.000000
17       0.000000
18       0.000000
19       0.000000
20       0.000000
21       0.000000
22       0.000000
23       0.000000
24       0.000000
25       0.000000
26       0.000000
27       0.000000
28       0.000000
29       0.000000
30       0.000000
31       0.000000
32       0.000000
33       0.000000
34       0.000000
35       0.000000
36       0.000000
37       0.000000
38       0.212194
39       0.000000
40       0.000000
41       0.000000
42       0.000000
43       0.000000
44       0.000000
45       0.000000
46       0.000000
47       0.000000
48       0.000000
49       0.000000
50       0.000000
51       0.000000
52       0.000000
53       0.000000
54       0.000000
55       0

In [50]:
movie_ids = [260, 234, 456, 789]
similar_overall_score = np.zeros((similarity_matrix_df.shape[0],1))
similar_overall_score = np.add(similarity_matrix_df[[movie_ids[0]]].values, similar_overall_score)
similar_overall_score = np.add(similarity_matrix_df[[movie_ids[1]]].values, similar_overall_score)
similar_overall_score = np.add(similarity_matrix_df[[movie_ids[2]]].values, similar_overall_score)
similar_overall_score = np.add(similarity_matrix_df[[movie_ids[3]]].values, similar_overall_score)
similar_movies_new_df = pd.DataFrame(similar_overall_score)
similar_movies_new_df.columns = ['cosine_similarity']
# similar_movies_new_df.insert(1, column='title', value=np.nan)
# similar_movies_new_df.insert(2, column='movieId', value=np.nan)
similar_movies_new_df.sort_values(by = ['cosine_similarity'], ascending=False, inplace=True)
# print(similar_overall_score.shape)
similar_movies_new_df = similar_movies_new_df.head()
print(similar_movies_new_df)
similar_movies_new_df = similar_movies_new_df.join(movies, how = 'left')
similar_movies_new_df
# for i in range(3):
#         similar_movies_new_df['title'].iloc[i] = movies[movies.index == similar_movies_df.index.iloc[i]]['title'].iloc[0]
# similar_movies_new_df.reset_index
# similar_movies_new_df.index.tolist(1,1)
# print(len(movie_ids))
# type(similar_movies_new_df)
# print(similar_overall_score)

       cosine_similarity
456             1.069405
260             1.051592
234             1.029656
789             1.022092
32614           0.806088


Unnamed: 0,cosine_similarity,movieId,title,genres
456,1.069405,461,Go Fish (1994),Drama Romance
260,1.051592,263,Ladybird Ladybird (1994),Drama
234,1.029656,237,Forget Paris (1995),Comedy Romance
789,1.022092,805,"Time to Kill, A (1996)",Drama Thriller
32614,0.806088,141347,Watusi (1959),Action Adventure


In [119]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
similar_movies_df.sort_values(by = ['cosine_similarity'], ascending=False, inplace=True)


In [127]:
# movie_ids = [260, 234, 456, 789]
# movie_titles = pd.DataFrame(index=[movie_ids], columns=['title'])
# print(movie_titles)
# for i in range(len(movie_ids)):
#         movie_titles['title'].iloc[i] = movies[movies['movieId'] == movie_ids[i]]['title'].iloc[0]
# print(movie_titles)

    title
260   NaN
234   NaN
456   NaN
789   NaN
                                                 title
260          Star Wars: Episode IV - A New Hope (1977)
234                                Exit to Eden (1994)
456                                       Fresh (1994)
789  I, the Worst of All (Yo, la peor de todas) (1990)


In [22]:
movies.query('movieId == 234')

Unnamed: 0,movieId,title,genres
231,234,Exit to Eden (1994),Comedy


In [61]:
movie_rec_titles = pd.DataFrame(index=range(5), columns=['movieId','title'])
print(movie_rec_titles)
movie_rec_titles['movieId'] = alpha['movieId'] 
# for i in range(5):
#         movie_rec_titles.title.iloc[i] = movies[movies['movieId'] == movie_rec_titles.movieId.iloc[i]]['title'].iloc[0]
print(movie_rec_titles)
movie_rec_titles['title'].iloc[0] = movies[movies['movieId'] == movie_rec_titles['movieId'].iloc[0]]['title'].iloc[0]
movie_rec_titles['title'].iloc[1] = movies[movies['movieId'] == movie_rec_titles['movieId'].iloc[1]]['title'].iloc[0]
# movie_rec_titles['title'].iloc[2] = movies[movies['movieId'] == movie_rec_titles['movieId'].iloc[2]]['title'].iloc[0]
# movie_rec_titles['title'].iloc[3] = movies[movies['movieId'] == movie_rec_titles['movieId'].iloc[3]]['title'].iloc[0]
# movie_rec_titles['title'].iloc[4] = movies[movies['movieId'] == movie_rec_titles['movieId'].iloc[4]]['title'].iloc[0]

print(movie_rec_titles)

  movieId title
0     NaN   NaN
1     NaN   NaN
2     NaN   NaN
3     NaN   NaN
4     NaN   NaN
   movieId title
0     8999   NaN
1     8784   NaN
2    14552   NaN
3     9323   NaN
4    24501   NaN
   movieId                                 title
0     8999  That's Entertainment, Part II (1976)
1     8784                   Garden State (2004)
2    14552                                   NaN
3     9323                                   NaN
4    24501                                   NaN


In [93]:
# print(movies[movies['movieId'] == movie_rec_titles['movieId'].iloc[2]]['title'].iloc[0])
# print(movies[movies['movieId'] == 14552]['title'].iloc[0])
# print(movie_rec_titles['movieId'].iloc[2])
# movies.query('movieId == 14552')['title'].iloc[0]
movies.query('movieId == 8784')

Unnamed: 0,movieId,title,genres
8070,8784,Garden State (2004),Comedy|Drama|Romance


In [280]:
movie_ids = [260, 234, 456, 789]
movie_titles = pd.DataFrame(index=[movie_ids], columns=['title'])

movie_rec_titles = pd.DataFrame(index=range(5), columns=['movieId','title'])

# movie_titles.reset_index(inplace=True)
# movie_titles.rename(columns = {'movieId'}, inplace = True)
# movie_titles.index = movie_titles['movieId']
#     movie_titles = pd.Series(index=[movie_ids])

for i in range(len(movie_ids)):
    movie_titles.iloc[i] = movies[movies['movieId'] == movie_ids[i]]['title'].iloc[0]

# for i in range(5):
#     movie_rec_titles['movidId'].iloc[i] = get
    
# print(movie_titles)
# print(pd.Series(movie_titles['title'].values))    
alpha = get_content_recommendations(movie_ids, similarity_matrix_df)
print(alpha)
print(movie_titles)
# print(movie_rec_titles)

   movieId  overall_cosine_similarity
0    14810                   1.681837
1    14552                   1.666567
2     8999                   1.666377
3    11335                   1.658928
4    15330                   1.650560
                                                 title
260          Star Wars: Episode IV - A New Hope (1977)
234                                Exit to Eden (1994)
456                                       Fresh (1994)
789  I, the Worst of All (Yo, la peor de todas) (1990)


In [160]:
movie_ids = [260, 234, 456, 789]
similarity_overall_score=np.zeros((similarity_matrix_df.shape[0], 1))
similarity_overall_score_df = pd.DataFrame(similarity_overall_score)

# similarity_overall_score=np.add(similarity_matrix_df[[movie_ids[i]]].values for i in range(len(movie_ids)))
print(similarity_overall_score_df)
# print(similarity_overall_score)

         0
0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
5      0.0
6      0.0
7      0.0
8      0.0
9      0.0
10     0.0
11     0.0
12     0.0
13     0.0
14     0.0
15     0.0
16     0.0
17     0.0
18     0.0
19     0.0
20     0.0
21     0.0
22     0.0
23     0.0
24     0.0
25     0.0
26     0.0
27     0.0
28     0.0
29     0.0
30     0.0
31     0.0
32     0.0
33     0.0
34     0.0
35     0.0
36     0.0
37     0.0
38     0.0
39     0.0
40     0.0
41     0.0
42     0.0
43     0.0
44     0.0
45     0.0
46     0.0
47     0.0
48     0.0
49     0.0
50     0.0
51     0.0
52     0.0
53     0.0
54     0.0
55     0.0
56     0.0
57     0.0
58     0.0
59     0.0
60     0.0
61     0.0
62     0.0
63     0.0
64     0.0
65     0.0
66     0.0
67     0.0
68     0.0
69     0.0
70     0.0
71     0.0
72     0.0
73     0.0
74     0.0
75     0.0
76     0.0
77     0.0
78     0.0
79     0.0
80     0.0
81     0.0
82     0.0
83     0.0
84     0.0
85     0.0
86     0.0
87     0.0
88     0.0
89     0.0