In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [3]:
movies = pd.read_csv('data/tmdb_5000_movies.csv')
credits = pd.read_csv('data/tmdb_5000_credits.csv')

In [5]:
movie = movies.merge(credits, on ='title')

In [7]:
movie=movie[['movie_id','title','overview','genres','keywords','cast', 'crew']]

In [9]:
movie.dropna(inplace=True)

In [11]:
import ast

def convert (text):
    l=[]
    for i in ast.literal_eval(text):
        l.append(i['name'])
    return l

In [13]:
movie['genres'] = movie['genres'].apply(convert)

In [15]:
movie['keywords'] = movie['keywords'].apply(convert)

In [17]:
def convert_cast (text):
    l=[]
    counter=0
    for i in ast.literal_eval(text):
        if counter<3:
            l.append(i['name'])
        counter +=1
    return l

In [19]:
movie['cast'] = movie['cast'].apply(convert_cast)

In [21]:
def fetch_director(text):
    l=[]
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            l.append(i['name'])
            break
            
    return l

In [23]:
movie['crew'] = movie['crew'].apply(fetch_director)

In [25]:
movie['overview'] = movie['overview'].apply(lambda x:x.split())

In [27]:
def remove_space(word):
    l = []
    for i in word:
        l.append(i.replace(" ", ""))
    return l

In [29]:
movie['cast'] = movie['cast'].apply(remove_space)
movie['crew'] = movie['crew'].apply(remove_space)
movie['genres'] = movie['genres'].apply(remove_space)

In [31]:
movie['keywords'] = movie['keywords'].apply(remove_space)

In [33]:
movie['tags']= movie['overview'] + movie['genres'] + movie['keywords'] + movie['cast'] + movie['crew']

In [35]:
new_df = movie[['movie_id','title','tags']]

In [37]:
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))


In [39]:
new_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [41]:
new_df.iloc[0]['tags']

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d SamWorthington ZoeSaldana SigourneyWeaver JamesCameron'

In [43]:
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())


In [45]:
new_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [51]:
import nltk
from nltk.stem import PorterStemmer 

In [53]:
ps = PorterStemmer()

In [59]:
def stems (text):
    l=[]
    for i in text.split():
        l.append(ps.stem(i))
    return " ".join(l)

In [61]:
new_df['tags'] = new_df['tags'].apply(stems)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stems)


In [63]:
new_df.iloc[0]['tags']

'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav jamescameron'

In [65]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')

In [67]:
vector =cv.fit_transform(new_df['tags']).toarray()

In [69]:
vector

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [71]:
vector.shape

(4806, 5000)

In [73]:
from sklearn.metrics.pairwise import  cosine_similarity

In [81]:
similarity = cosine_similarity(vector)

In [82]:
similarity

array([[1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,
        0.        ],
       [0.08346223, 1.        , 0.06063391, ..., 0.02378257, 0.        ,
        0.02615329],
       [0.0860309 , 0.06063391, 1.        , ..., 0.02451452, 0.        ,
        0.        ],
       ...,
       [0.04499213, 0.02378257, 0.02451452, ..., 1.        , 0.03962144,
        0.04229549],
       [0.        , 0.        , 0.        , ..., 0.03962144, 1.        ,
        0.08714204],
       [0.        , 0.02615329, 0.        , ..., 0.04229549, 0.08714204,
        1.        ]])

In [85]:
similarity.shape

(4806, 4806)

In [91]:
new_df[new_df['title']=='Spider-Man'].index[0]

159

In [93]:
def recommend(movie):
    index = new_df[new_df['title']==movie].index[0]
    distance = sorted(list(enumerate(similarity[index])), reverse = True, key = lambda x:x[1])
    for i in distance [1:11]:
        print(new_df.iloc[i[0]].title)

In [95]:
recommend('Spider-Man')

Spider-Man 3
Spider-Man 2
The Amazing Spider-Man 2
Arachnophobia
Kick-Ass
The Amazing Spider-Man
21 Jump Street
X-Men
Eight Legged Freaks
Light It Up


In [97]:
recommend('The Dark Knight Rises')

The Dark Knight
Batman Returns
Batman
Batman Forever
Batman Begins
Batman
Batman & Robin
Nighthawks
Slow Burn
Amidst the Devil's Wings


In [99]:
import pickle

pickle.dump(new_df, open('Artificats/movie_list.pkl','wb'))
pickle.dump(similarity, open('Artificats/similarity.pkl','wb'))