In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem.porter import PorterStemmer

cv = CountVectorizer(max_features=5000, stop_words="english")


In [2]:
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

In [3]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [4]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [5]:
new_moviews = movies.merge(credits, on="title")

In [None]:
new_moviews.head(1)

In [None]:
new_moviews["genres"][0]


In [None]:
new_moviews.info()

In [None]:
# id
# genres
# keywords
# overview
# title
# cast
# crew

In [7]:
new_moviews = new_moviews[["id", "title", "overview", "genres", "keywords", "cast", "crew" ]]

In [8]:
new_moviews.isnull().sum()

id          0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [9]:
new_moviews.dropna(inplace=True)

In [10]:
new_moviews.isnull().sum()

id          0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [12]:
import ast

def convert(obj_instance):
    result = []
    for obj in ast.literal_eval(obj_instance):
        result.append(obj["name"])
    
    return result

def fetch_director(obj_instance):
    result = []
    for obj in ast.literal_eval(obj_instance):
        if obj["job"] == "Director":
            result.append(obj["name"])
            break
    return result

def get_and_convert_three_records(obj_instance):
    result = []
    count = 0
    for obj in ast.literal_eval(obj_instance):
        if count != 3:
            result.append(obj["name"])
            count += 1
        else:
            break

    return result

In [13]:
new_moviews["genres"] = new_moviews["genres"].apply(convert)
new_moviews["keywords"] = new_moviews["keywords"].apply(convert)

In [14]:
new_moviews["crew"] = new_moviews["crew"].apply(fetch_director)


In [15]:
new_moviews["cast"] = new_moviews["cast"].apply(get_and_convert_three_records)


In [16]:
new_moviews["genres"] = new_moviews["genres"].apply(lambda x: [i.replace(" ", "") for i in x])
new_moviews["keywords"] = new_moviews["keywords"].apply(lambda x: [i.replace(" ", "") for i in x])
new_moviews["crew"] = new_moviews["crew"].apply(lambda x: [i.replace(" ", "") for i in x])
new_moviews["cast"] = new_moviews["cast"].apply(lambda x: [i.replace(" ", "") for i in x])

In [19]:
new_moviews["overview"] = new_moviews["overview"].apply(lambda x:x.split())

In [20]:
new_moviews["tags"] = new_moviews["overview"] + new_moviews["genres"]  + new_moviews["keywords"] + new_moviews["crew"] + new_moviews["cast"]

In [21]:
movies = new_moviews[["id", "title", "tags"]]

In [22]:
movies["tags"] = movies["tags"].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies["tags"] = movies["tags"].apply(lambda x:" ".join(x))


In [23]:
movies["tags"][0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d JamesCameron SamWorthington ZoeSaldana SigourneyWeaver'

In [24]:
movies["tags"] = movies["tags"].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies["tags"] = movies["tags"].apply(lambda x:x.lower())


In [25]:
movies.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [30]:
ps = PorterStemmer()

In [31]:
def stem(text):
    res = []

    for word in text.split():
        res.append(ps.stem(word))
    
    return " ".join(res)

In [32]:
movies["tags"][0]

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d jamescameron samworthington zoesaldana sigourneyweaver'

In [33]:
stem('in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d jamescameron samworthington zoesaldana sigourneyweaver')

'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d jamescameron samworthington zoesaldana sigourneyweav'

In [34]:
movies["tags"] = movies["tags"].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies["tags"] = movies["tags"].apply(stem)


In [35]:
vectors = cv.fit_transform(movies["tags"]).toarray()

In [38]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      shape=(5000,), dtype=object)

In [39]:
from sklearn.metrics.pairwise import cosine_similarity

In [41]:
similarity = cosine_similarity(vectors)

In [42]:
similarity[0]

array([1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,
       0.        ], shape=(4806,))

In [72]:
movies.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."


In [None]:
int(movies[movies["title"] == "Batman"].index[0])

np.int64(1361)

In [59]:
sorted(enumerate(similarity[0]), reverse=True, key=lambda x:x[1])[1:6]

[(1214, np.float64(0.28676966733820225)),
 (2405, np.float64(0.26901379342448517)),
 (3728, np.float64(0.2605130246476754)),
 (507, np.float64(0.255608593705383)),
 (539, np.float64(0.25038669783359574))]

In [60]:
def recommend(movie):
    movie_index = int(movies[movies["title"] == movie].index[0])
    distance = similarity[movie_index]
    movie_list = sorted(enumerate(distance), reverse=True, key=lambda x:x[1])[1:6]

    for x in movie_list:
        print(movies.iloc[x[0]].title)

In [69]:
recommend("Aliens")

Alien
Alien³
Meet Dave
Star Trek: Insurrection
Escape from Planet Earth


In [70]:
import pickle

In [78]:
pickle.dump(movies, open("movies.pkl", 'wb'))

In [75]:
pickle.dump(similarity, open("similarity.pkl", 'wb'))
