This notebook was completed by following the tutorial at: https://medium.com/rahasak/recommendation-system-with-content-based-filtering-500231e31a60

In [1]:
import pandas as pd
from ast import literal_eval

In [4]:
# Read only the five columns the recommender uses from the hosted CSV.
df = pd.read_csv(
    'https://gitlab.com/rahasak-labs/dot/-/raw/master/src/main/resources/movie.csv',
    usecols=['title', 'director', 'genres', 'cast', 'keywords'],
)

# These columns arrive as stringified Python literals; parse them back into
# real Python objects (lists, per the preview below).
df['genres'] = df['genres'].map(literal_eval)
df['cast'] = df['cast'].map(literal_eval)
df['keywords'] = df['keywords'].map(literal_eval)

# Quick sanity check: first rows and overall (rows, columns).
display(df.head())
print(df.shape)

Unnamed: 0,title,cast,genres,keywords,director
0,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[Action, Adventure, Fantasy]","[culture clash, future, space war]",James Cameron
1,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley]","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island]",Gore Verbinski
2,Spectre,"[Daniel Craig, Christoph Waltz, Léa Seydoux]","[Action, Adventure, Crime]","[spy, based on novel, secret agent]",Sam Mendes
3,The Dark Knight Rises,"[Christian Bale, Michael Caine, Gary Oldman]","[Action, Crime, Drama]","[dc comics, crime fighter, terrorist]",Christopher Nolan
4,John Carter,"[Taylor Kitsch, Lynn Collins, Samantha Morton]","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion]",Andrew Stanton


(4803, 5)


In [5]:
def clean_feature(x):
    """Normalise a feature value by stripping spaces and lower-casing it.

    Args:
        x: A list of strings, a single string, or any other value.

    Returns:
        A new list with every item normalised when ``x`` is a list, the
        normalised string when ``x`` is a string, and ``''`` for anything
        else (e.g. ``None`` or NaN).
    """
    if isinstance(x, list):
        # normalise each entry of the list
        return [item.replace(" ", "").lower() for item in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    # neither list nor string: fall back to an empty string
    return ''

# Normalise (lower-case, no spaces) every column that feeds the feature text.
features = ['cast', 'keywords', 'director', 'genres']
for feature in features:
    df[feature] = df[feature].map(clean_feature)

In [6]:
# preview the frame after clean_feature was applied to cast/keywords/director/genres
df.head()

Unnamed: 0,title,cast,genres,keywords,director
0,Avatar,"[samworthington, zoesaldana, sigourneyweaver]","[action, adventure, fantasy]","[cultureclash, future, spacewar]",jamescameron
1,Pirates of the Caribbean: At World's End,"[johnnydepp, orlandobloom, keiraknightley]","[adventure, fantasy, action]","[ocean, drugabuse, exoticisland]",goreverbinski
2,Spectre,"[danielcraig, christophwaltz, léaseydoux]","[action, adventure, crime]","[spy, basedonnovel, secretagent]",sammendes
3,The Dark Knight Rises,"[christianbale, michaelcaine, garyoldman]","[action, crime, drama]","[dccomics, crimefighter, terrorist]",christophernolan
4,John Carter,"[taylorkitsch, lynncollins, samanthamorton]","[action, adventure, sciencefiction]","[basedonnovel, mars, medallion]",andrewstanton


In [8]:
def add_feature_col(x):
    """Combine one row's keywords, cast, director and genres into one string.

    Args:
        x: A mapping-like row (e.g. a DataFrame row) with list-valued
            'keywords', 'cast' and 'genres' entries and a string 'director'.

    Returns:
        The four parts joined by single spaces, in the order
        keywords, cast, director, genres.
    """
    parts = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(parts)

# build the combined text column row by row (axis=1 passes each row to add_feature_col)
df['features'] = df.apply(add_feature_col, axis=1)
df.head(5)

Unnamed: 0,title,cast,genres,keywords,director,features
0,Avatar,"[samworthington, zoesaldana, sigourneyweaver]","[action, adventure, fantasy]","[cultureclash, future, spacewar]",jamescameron,cultureclash future spacewar samworthington zo...
1,Pirates of the Caribbean: At World's End,"[johnnydepp, orlandobloom, keiraknightley]","[adventure, fantasy, action]","[ocean, drugabuse, exoticisland]",goreverbinski,ocean drugabuse exoticisland johnnydepp orland...
2,Spectre,"[danielcraig, christophwaltz, léaseydoux]","[action, adventure, crime]","[spy, basedonnovel, secretagent]",sammendes,spy basedonnovel secretagent danielcraig chris...
3,The Dark Knight Rises,"[christianbale, michaelcaine, garyoldman]","[action, crime, drama]","[dccomics, crimefighter, terrorist]",christophernolan,dccomics crimefighter terrorist christianbale ...
4,John Carter,"[taylorkitsch, lynncollins, samanthamorton]","[action, adventure, sciencefiction]","[basedonnovel, mars, medallion]",andrewstanton,basedonnovel mars medallion taylorkitsch lynnc...


In [20]:
def get_recommendations(title, cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Reads the module-level ``indices`` (title -> row position lookup) and
    ``df`` (movie data frame), which must be defined before calling.

    Args:
        title: Movie title to look up in ``indices``.
        cosine_sim: Square pairwise similarity matrix whose row/column order
            matches the row order of ``df``.
        top_n: Number of recommendations to return (default 10).

    Returns:
        A Series of up to ``top_n`` titles, most similar first, that never
        contains the queried movie itself.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    # get the index of the movie that matches the title
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # several movies share this title: use the first one instead of
        # failing later on a 2-D similarity slice
        idx = idx.iloc[0]

    # pair every movie position with its similarity score to that movie,
    # most similar first (sorted() is stable, so ties keep row order)
    sim_scores = sorted(
        enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True
    )

    # Drop the queried movie explicitly rather than assuming it sorts first:
    # another movie with an identical feature string ties at the top score and
    # may precede it, which previously leaked the query into its own results.
    movie_indices = [pos for pos, _ in sim_scores if pos != idx][:top_n]

    # return the most similar movies' titles from the data frame
    return df['title'].iloc[movie_indices]

# Count Vectorizer

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# create count matrix (bag of words over the combined feature strings) and
# the pairwise cosine similarity between all movies
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['features'])
cosine_sim = cosine_similarity(count_matrix, count_matrix)

# reset index of the data frame and construct reverse mapping (title -> row).
# drop=True discards the old index instead of inserting it as a new column, so
# re-running this cell no longer piles up 'index' / 'level_0' columns (and
# eventually fails once both names are taken).
df = df.reset_index(drop=True)
indices = pd.Series(df.index, index=df['title'])

In [22]:
# movies most similar to 'JFK' according to the count-based similarity matrix
get_recommendations('JFK', cosine_sim)

884              Zero Dark Thirty
1528                     Criminal
647            World Trade Center
737     Jack Ryan: Shadow Recruit
2008        In the Valley of Elah
3172                The Contender
940                       Syriana
991                     Fair Game
1091                        Nixon
1187              Bridge of Spies
Name: title, dtype: object

# TF-IDF Vectorizer

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighting of the combined feature strings, ignoring English stop words
tfidf = TfidfVectorizer(stop_words='english')

# fit the vocabulary on df['features'] and transform it in one step
tfidf_matrix = tfidf.fit_transform(df['features'])

# one row per entry of df['features']; columns are presumably the learned vocabulary terms
tfidf_matrix.shape

(4803, 11520)

In [24]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise dot products of the TF-IDF rows. With TfidfVectorizer's default
# L2 row normalisation this should equal cosine similarity -- confirm if the
# vectorizer's `norm` setting is ever changed.
# NOTE: this rebinds `cosine_sim`, replacing the count-based matrix built earlier.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# rebuild the title -> row lookup that get_recommendations reads
indices = pd.Series(df.index, index=df['title'])

In [25]:
# movies most similar to 'JFK' according to the TF-IDF similarity matrix
get_recommendations('JFK', cosine_sim)

1528                     Criminal
884              Zero Dark Thirty
737     Jack Ryan: Shadow Recruit
188                          Salt
969                     Assassins
2141                         Milk
2503                 The Homesman
484                   The Postman
2008        In the Valley of Elah
2278           Dances with Wolves
Name: title, dtype: object