In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
# Read the preprocessed movie table and keep only the columns used in this notebook.
movie_data = pd.read_csv('./movie_preprocessed.csv').loc[:, ['id', 'title', 'genres', 'keywords']]

print(movie_data.shape)
movie_data.head()

(2001, 4)


Unnamed: 0,id,title,genres,keywords
0,8844,Jumanji,Adventure Fantasy Family,board game disappearance based on children's b...
1,949,Heat,Action Crime Drama Thriller,robbery detective bank obsession chase shootin...
2,710,GoldenEye,Adventure Action Thriller,cuba falsely accused secret identity computer ...
3,21032,Balto,Family Animation Adventure,wolf dog-sledding race alaska dog goose bear a...
4,524,Casino,Drama Crime,poker drug abuse 1970s overdose illegal prosti...


In [17]:
# Print each column's dtype and non-null count to spot missing values.
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2001 entries, 0 to 2000
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        2001 non-null   int64 
 1   title     2001 non-null   object
 2   genres    2000 non-null   object
 3   keywords  1987 non-null   object
dtypes: int64(1), object(3)
memory usage: 62.7+ KB


In [18]:
# Convert the text columns to pandas' nullable "string" dtype.
# `astype` returns a new frame, so rebinding `movie_data` guarantees the dtype
# change takes effect. Assigning the converted values through `.loc[:, cols]`
# writes into the existing columns and can leave them as `object`.
movie_data = movie_data.astype({'title': 'string', 'genres': 'string', 'keywords': 'string'})

In [20]:
# Drop every row that has a missing value in any column.
# Row labels are not renumbered here; later cells access rows by position.
movie_data = movie_data.dropna()

## TF-IDF vectorization (genres + keywords)

In [31]:
# Join each movie's genres and keywords into one text document,
# then turn the documents into a dense TF-IDF array.
tfidf_vector = TfidfVectorizer()
tfidf_matrix = tfidf_vector.fit_transform(
    movie_data['genres'].str.cat(movie_data['keywords'], sep=" ")
).toarray()
# Vocabulary terms, in the same order as the columns of `tfidf_matrix`.
tfidf_matrix_feature = tfidf_vector.get_feature_names_out()

In [32]:
# Dimensions of the dense TF-IDF array: (documents, vocabulary terms).
tfidf_matrix.shape

(1986, 5581)

In [33]:
# Label the TF-IDF array: one row per movie title, one column per vocabulary term.
tfidf_matrix = pd.DataFrame(
    data=tfidf_matrix,
    index=movie_data['title'],
    columns=tfidf_matrix_feature,
)
print(tfidf_matrix.shape)
tfidf_matrix.head()

(1986, 5581)


Unnamed: 0_level_0,16th,17th,18th,1910s,1920s,1930s,1940s,1950s,1960s,1970s,...,zombification,zone,zoo,zoom,zoophilia,zurich,øverste,卧底肥妈,绝地奶霸,超级妈妈
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Jumanji,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Heat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GoldenEye,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Balto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Casino,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.363428,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Calculate cosine similarity

In [34]:
%%time
# Pairwise cosine similarity between the TF-IDF rows of every pair of movies.
cosine_sim = cosine_similarity(tfidf_matrix)

CPU times: user 1.95 s, sys: 21.6 ms, total: 1.97 s
Wall time: 575 ms


In [35]:
# Dimensions of the similarity array (one row and one column per movie).
cosine_sim.shape

(1986, 1986)

In [27]:
# Label both axes of the similarity array with movie titles so it can be queried by title.
cosine_sim_df = pd.DataFrame(
    data=cosine_sim,
    index=movie_data['title'],
    columns=movie_data['title'],
)
print(cosine_sim_df.shape)
cosine_sim_df.head()

(1986, 1986)


title,Jumanji,Heat,GoldenEye,Balto,Casino,Sense and Sensibility,Four Rooms,Ace Ventura: When Nature Calls,Get Shorty,Assassins,...,Legend of the Guardians: The Owls of Ga'Hoole,It's Kind of a Funny Story,Let Me In,Devil,You Again,Life As We Know It,Inside Job,Monsters,Never Let Me Go,Jackass 3D
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Jumanji,1.0,0.0,0.009178,0.025863,0.0,0.039444,0.029209,0.021456,0.04347,0.010352,...,0.114952,0.048266,0.0,0.0,0.0,0.0,0.0,0.0,0.055495,0.0
Heat,0.0,1.0,0.007901,0.0,0.03104,0.019528,0.035415,0.027496,0.197204,0.135589,...,0.0,0.05444,0.004015,0.008908,0.0,0.004122,0.073449,0.008517,0.0052,0.008122
GoldenEye,0.009178,0.007901,1.0,0.006828,0.0,0.0,0.0,0.012642,0.005697,0.015748,...,0.01834,0.0,0.0,0.009645,0.0,0.0,0.0,0.005198,0.0,0.008793
Balto,0.025863,0.0,0.006828,1.0,0.0,0.0,0.0,0.015964,0.0,0.007702,...,0.100168,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Casino,0.0,0.03104,0.0,0.0,1.0,0.016911,0.019166,0.031453,0.085627,0.015175,...,0.0,0.010347,0.009186,0.0,0.0,0.00943,0.0,0.008502,0.011897,0.0


## Content-based recommendation

#### Look up the target title's column in the similarity matrix
#### Return the k most similar movie titles

In [28]:
def genre_recommendations(target_title, matrix, items, k=10):
    """Recommend the ``k`` movies most similar to ``target_title``.

    Parameters
    ----------
    target_title : str
        Title to look up among the columns of ``matrix``. If the title occurs
        more than once, the first occurrence is used.
    matrix : pandas.DataFrame
        Square similarity matrix whose columns are labelled by title. Rows are
        read by position, so row ``i`` must describe the same movie as column
        ``i`` and as row ``i`` of ``items``.
    items : pandas.DataFrame
        Movie table with ``title`` and ``genres`` columns, in the same row
        order as ``matrix``.
    k : int, default 10
        Maximum number of recommendations. Fewer rows are returned when fewer
        than ``k`` other movies exist.

    Returns
    -------
    pandas.DataFrame
        Columns ``target_title``, ``target_genre``, ``recom_title`` and
        ``recom_genre``, ordered from most to least similar.

    Raises
    ------
    KeyError
        If ``target_title`` is not a column of ``matrix``.
    """
    # Resolve the target by position: a label lookup would return several
    # columns when the same title appears more than once.
    matches = np.flatnonzero(np.asarray(matrix.columns == target_title))
    if matches.size == 0:
        raise KeyError(target_title)
    target_pos = int(matches[0])

    scores = matrix.iloc[:, target_pos].to_numpy()

    # Stable descending sort keeps tied movies in their original order.
    ranked = np.argsort(-scores, kind="stable")
    # Remove the target explicitly instead of assuming it sorts first; another
    # movie with similarity 1.0 could otherwise push the target into the results.
    ranked = ranked[ranked != target_pos][:max(int(k), 0)]

    recommended = items.iloc[ranked]
    n_rows = len(ranked)  # may be smaller than k when few movies are available
    target_genre = items.iloc[target_pos]["genres"]

    return pd.DataFrame({
        'target_title': [target_title] * n_rows,
        'target_genre': [target_genre] * n_rows,
        'recom_title': recommended["title"].to_numpy(),
        'recom_genre': recommended["genres"].to_numpy(),
    })

In [36]:
# Show the five movies most similar to "Jumanji".
genre_recommendations(target_title='Jumanji', matrix=cosine_sim_df, items=movie_data, k=5)

Unnamed: 0,target_title,target_genre,recom_title,recom_genre
0,Jumanji,Adventure Fantasy Family,Where the Wild Things Are,Family Fantasy
1,Jumanji,Adventure Fantasy Family,Clue,Comedy Thriller Crime Mystery
2,Jumanji,Adventure Fantasy Family,James and the Giant Peach,Adventure Animation Family
3,Jumanji,Adventure Fantasy Family,The Cat in the Hat,Comedy Fantasy Family
4,Jumanji,Adventure Fantasy Family,The NeverEnding Story,Drama Family Fantasy Adventure
