### importing libraries

# Data preprocessing

In [1]:
import pandas as pd
import numpy as np


In [2]:
credits=pd.read_csv("data/tmdb_5000_credits.csv")
movies=pd.read_csv("data/tmdb_5000_movies.csv")

In [3]:
# credits.head()

In [4]:
# credits.info()


In [5]:
# movies.info()

In [6]:
# movies.head(1)

In [7]:
# credits.head(1)

# Merging the dataframes

In [8]:
df=movies.merge(credits, on='title')

In [9]:
# df.info()

In [10]:
# df.shape

In [11]:
# df.info()

# Feature engineering

In [12]:
data=df.drop(['budget','homepage','original_language', 'original_title', 'popularity', 'production_companies' , 'production_countries','release_date', 'revenue', 'runtime', 'spoken_languages', 'status','tagline', 'vote_average',
 'vote_count','movie_id'], axis=1)

In [13]:
# data.info()

In [14]:
# data.head()

checking for null values

In [15]:
data.isnull().sum()

genres      0
id          0
keywords    0
overview    3
title       0
cast        0
crew        0
dtype: int64

In [16]:
data.dropna(inplace=True)

# Checking for duplicates

In [17]:
movies.duplicated().sum()

0

In [18]:
l=data.iloc[0]['genres']
l

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

Here we can see that the 'genres' have a weird format. We'll have to fix it using some string manipulations.

In [19]:
import ast
def convert(l):
    L=[]
    for i in ast.literal_eval(l):
        L.append(i["name"])
    return L
    

Applying the above function to 'genres', 'cast' and 'keywords' columns.

In [20]:
data['genres']=data['genres'].apply(convert)
data['keywords']=data['keywords'].apply(convert)
data['cast']=data['cast'].apply(convert)

In [21]:
# data.head()

In [22]:
# data['keywords'][0]

We won't include the entire cast of the movie in the recommendation system. Instead, we'll just use the first three names of the actors.

In [23]:
def fetch_first_three(obj):
    a=[]
    for i in range(len(obj)):
        if i<3:
            a.append(obj[i])
    return a

data['cast']=data['cast'].apply(fetch_first_three)

Fetching the name of the director.


In [24]:
# data['crew'][1]

In [25]:
def get_director(obj):
    ls=[]
    for i in ast.literal_eval(obj):
        if i['job']=='Director':
            ls.append(i["name"])
    return ls

data['crew']=data['crew'].apply(get_director)

In [26]:
data.head()

Unnamed: 0,genres,id,keywords,overview,title,cast,crew
0,"[Action, Adventure, Fantasy, Science Fiction]",19995,"[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...",Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,"[Adventure, Fantasy, Action]",285,"[ocean, drug abuse, exotic island, east india ...","Captain Barbossa, long believed to be dead, ha...",Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,"[Action, Adventure, Crime]",206647,"[spy, based on novel, secret agent, sequel, mi...",A cryptic message from Bond’s past sends him o...,Spectre,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,"[Action, Crime, Drama, Thriller]",49026,"[dc comics, crime fighter, terrorist, secret i...",Following the death of District Attorney Harve...,The Dark Knight Rises,"[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,"[Action, Adventure, Science Fiction]",49529,"[based on novel, mars, medallion, space travel...","John Carter is a war-weary, former military ca...",John Carter,"[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


We will also need to remove spaces between the names of the cast and crew.

In [27]:
def remove_space(obj):
    no_sapce_lst=[]
    for i in obj:
        no_sapce_lst.append(i.replace(" ", ""))
    return no_sapce_lst

# Modifying the overview column
def modify_overview(obj):
    return obj.split()



In [28]:
data['cast']=data['cast'].apply(remove_space)
data['crew']=data['crew'].apply(remove_space)
data['keywords']=data['keywords'].apply(remove_space)
data['overview']=data['overview'].apply(modify_overview)

# Concatenating all the features to create a single feature

In [29]:
data['tags']=data['overview']+data['genres']+data['keywords']+data['cast']+data['crew']

In [30]:
data.head()

Unnamed: 0,genres,id,keywords,overview,title,cast,crew,tags
0,"[Action, Adventure, Fantasy, Science Fiction]",19995,"[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...",Avatar,"[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,"[Adventure, Fantasy, Action]",285,"[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d...",Pirates of the Caribbean: At World's End,"[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d..."
2,"[Action, Adventure, Crime]",206647,"[spy, basedonnovel, secretagent, sequel, mi6, ...","[A, cryptic, message, from, Bond’s, past, send...",Spectre,"[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes],"[A, cryptic, message, from, Bond’s, past, send..."
3,"[Action, Crime, Drama, Thriller]",49026,"[dccomics, crimefighter, terrorist, secretiden...","[Following, the, death, of, District, Attorney...",The Dark Knight Rises,"[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan],"[Following, the, death, of, District, Attorney..."
4,"[Action, Adventure, Science Fiction]",49529,"[basedonnovel, mars, medallion, spacetravel, p...","[John, Carter, is, a, war-weary,, former, mili...",John Carter,"[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton],"[John, Carter, is, a, war-weary,, former, mili..."


In [31]:
# data=data.drop(['genres', 'keywords', 'overview','cast', 'crew'], axis=1)

In [32]:
# data.head()

In [33]:
new_df=data[['id', 'title', 'tags']]
new_df.head()


Unnamed: 0,id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."


Converting tags to list

In [34]:


new_df['tags']=new_df['tags'].apply(lambda x: " ".join(x))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(lambda x: " ".join(x))


In [35]:
new_df['tags'][0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy Science Fiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d SamWorthington ZoeSaldana SigourneyWeaver JamesCameron'

In [36]:
new_df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


# Convert everything to lowercase (recommendation)

In [37]:
# def lowercase(obj):
#     lower=[]
#     for i in obj:
#         lower.append(i.lower())
#     return lower
# new_df['tags']=new_df['tags'].apply(lambda x: x.lower())

In [38]:
new_df['tags'][0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy Science Fiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d SamWorthington ZoeSaldana SigourneyWeaver JamesCameron'

In [39]:

new_df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [40]:
new_df['tags']=new_df['tags'].apply(lambda x: x.lower())
new_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(lambda x: x.lower())


Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [41]:
pip freeze requirements.txt

actionlib==1.14.0
angles==1.9.13
anyio==3.6.2
apturl==0.5.2
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
asttokens==2.2.1
attrs==19.3.0
autobahn==17.10.1
Automat==0.8.0
backcall==0.1.0
bcrypt==3.1.7
beautifulsoup4==4.11.2
bleach==6.0.0
blinker==1.4
bondpy==1.8.6
breezy==3.0.2
Brlapi==0.7.0
camera-calibration==1.17.0
camera-calibration-parsers==1.12.0
catkin==0.8.10
catkin-pkg==0.5.2
catkin-pkg-modules==0.5.2
cbor==1.0.0
certifi==2019.11.28
cffi==1.15.1
chardet==3.0.4
Click==7.0
colorama==0.4.3
comm==0.1.2
command-not-found==0.3
configobj==5.0.6
constantly==15.1.0
controller-manager==0.19.6
controller-manager-msgs==0.19.6
cryptography==2.8
cupshelpers==1.0
cv-bridge==1.16.2
cycler==0.10.0
Cython==0.29.14
dbus-python==1.2.16
debugpy==1.6.6
decorator==4.4.2
defer==1.0.6
defusedxml==0.6.0
Deprecated==1.2.7
diagnostic-analysis==1.11.0
diagnostic-common-diagnostics==1.11.0
diagnostic-updater==1.11.0
distlib==0.3.6
distro==1.4.0
distro-info===0.23ubuntu1
docutils==0.16
dulwich==0.19.15
du

In [42]:
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()


In [43]:
def stem_(text):
    ls=[]
    for i in text.split():
        ls.append(ps.stem(i))
    return " ".join(ls)
new_df['tags']=new_df['tags'].apply(stem_)
        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(stem_)


In [44]:
from sklearn.feature_extraction.text import CountVectorizer

In [45]:
cv=CountVectorizer(max_features=5000, stop_words='english')

In [46]:
vectors=cv.fit_transform(new_df['tags']).toarray()

In [47]:
print(vectors)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [48]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0])

In [49]:
pip install nltk


Note: you may need to restart the kernel to use updated packages.


In [53]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

In [54]:
from sklearn.metrics.pairwise import cosine_similarity
similarity=cosine_similarity(vectors)

In [55]:
similarity

array([[1.        , 0.08238526, 0.08492078, ..., 0.04399059, 0.        ,
        0.        ],
       [0.08238526, 1.        , 0.06063391, ..., 0.02355714, 0.        ,
        0.02615329],
       [0.08492078, 0.06063391, 1.        , ..., 0.02428215, 0.        ,
        0.        ],
       ...,
       [0.04399059, 0.02355714, 0.02428215, ..., 1.        , 0.03924588,
        0.04189458],
       [0.        , 0.        , 0.        , ..., 0.03924588, 1.        ,
        0.08714204],
       [0.        , 0.02615329, 0.        , ..., 0.04189458, 0.08714204,
        1.        ]])

In [74]:
def recommend(movie):
    movie_index=new_df[new_df['title']==movie].index[0]

0

In [73]:
new_df[new_df['title']=='Batman Begins'].index[0]

119