In [1]:
import numpy as np
import pandas as pd

# Step 1. Data Gathering Process


In [3]:
#loading dataset(credits amd movies tables)

movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

# Step 2. Understanding dataset 


In [5]:
#movies table


'''
columns 
1. budget : the total amount of money allocated for the production of the film
2. genres : genres refer to categories or classifications that define the overall style, theme, and narrative elements of a film
3. homepage : The homepage acts as a one-stop destination for fans and potential viewers to learn more about the film and engage with its content.
4. id : ID (short for "identifier") can refer to movie_id
5. keywords : keywords refer to specific terms or phrases associated with the film that describe its key elements, themes, or content
6. original_title : original title in that particular language
7. overview : a brief summary or description of the film's plot, themes, and key elements. I
8. popularity : how much liked by audeince 
9. production_companies : list of production companies involved in the movie
10.production_countries : funding sources
11.release_date : The date when the movie was officially released to the public, either in theaters or on a particular platform.
12.revenue : The total amount of money earned from the film
13.runtime : The total length of the movie, measured in minutes.           
14.spoken_languages :  The languages spoken in the movie   
15.status : The current status of the movie, such as "Released," "Post-production," "In Production," or "Cancelled."            
16.tagline : A short, memorable phrase or slogan associated with the movie             
17.title :  The name of the movie        
18.vote_average : The average rating given to the movie by viewers or critics (scale 1 to 10)
19.vote_count : no. of votes gained
'''
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [6]:
#credits table
'''
columns 
1. movie_id : ID (short for "identifier") can refer to movie_id
2. title : The name of the movie
3. cast : the group of actors and actresses who perform in the film.
4. crew : the group of people who work behind the scenes to create the film
'''
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   title     4803 non-null   object
 2   cast      4803 non-null   object
 3   crew      4803 non-null   object
dtypes: int64(1), object(3)
memory usage: 150.2+ KB


# Step 3 . Feature selection


In [8]:
#merging credits and movies table

movies = movies.merge(credits,on='title')


In [9]:
#required columns for the content based recommendation are (selct features suitable for creating tags)
'''
1.genres                
2.movie_id
3. keywords
4. overview
5. tagline
6. title
7. cast
8. crew
'''
movies = movies[['movie_id','title','overview','genres','keywords','tagline','cast','crew']]


# Step 4 . Preprocessing 

In [11]:
#missing data check
movies.isna().sum()

movie_id      0
title         0
overview      3
genres        0
keywords      0
tagline     844
cast          0
crew          0
dtype: int64

In [12]:
movies.drop(columns=['tagline'] ,  inplace=True)

In [13]:
movies.dropna(inplace=True)

In [14]:
movies.isnull().sum()

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [15]:
movies.shape


(4806, 7)

In [16]:
#checking duplicated data
#movies.duplicated().sum()
movies.drop_duplicates(keep='first').shape


(4806, 7)

In [17]:
#movie_id
#title
#tags = concatination  'overview'+'genres'+'keywords'+'cast'+'crew'


In [18]:
'''def fetch_genres(obj):
    genres = []
    for i in obj:
        genres.append(i['name'])
    return genres

fetch_genres('[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]
')
'''
import ast

def fetch_genres(obj):
    genres = []
    for i in ast.literal_eval(obj):
        genres.append(i['name'])
    return genres

movies['genres']= movies['genres'].apply(fetch_genres)



In [19]:
def fetch_keywords(obj):
    keywords = []
    for i in ast.literal_eval(obj):
        keywords.append(i['name'])
    return keywords

movies['keywords']= movies['keywords'].apply(fetch_keywords)

In [20]:
def fetch_acters(obj):
    acters =[]
    counter=0
    for i in ast.literal_eval(obj):
        if counter!=3:
            acters.append(i['name'])
            counter+=1
        else:
            break
    return acters

movies['cast']=movies['cast'].apply(fetch_acters)

In [21]:
'''counter=0
for i in ast.literal_eval(movies['crew'][0]):
    print(counter , i)
    counter+=1
'''
def fetch_director(obj):
    director = []
    for i in ast.literal_eval(obj):
        if i['job']=='Director':
            director.append(i['name'])
            break 
    return director     
            
movies['crew'] = movies['crew'].apply(fetch_director)

In [22]:
movies['overview']= movies['overview'].apply(lambda x : x.split())
movies

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]
...,...,...,...,...,...,...,...
4804,9367,El Mariachi,"[El, Mariachi, just, wants, to, play, his, gui...","[Action, Crime, Thriller]","[united states–mexico barrier, legs, arms, pap...","[Carlos Gallardo, Jaime de Hoyos, Peter Marqua...",[Robert Rodriguez]
4805,72766,Newlyweds,"[A, newlywed, couple's, honeymoon, is, upended...","[Comedy, Romance]",[],"[Edward Burns, Kerry Bishé, Marsha Dietlein]",[Edward Burns]
4806,231617,"Signed, Sealed, Delivered","[""Signed,, Sealed,, Delivered"", introduces, a,...","[Comedy, Drama, Romance, TV Movie]","[date, love at first sight, narration, investi...","[Eric Mabius, Kristin Booth, Crystal Lowe]",[Scott Smith]
4807,126186,Shanghai Calling,"[When, ambitious, New, York, attorney, Sam, is...",[],[],"[Daniel Henney, Eliza Coupe, Bill Paxton]",[Daniel Hsia]


In [23]:
movies['overview']= movies['overview'].apply(lambda x :[i.replace(' ','') for i in x])
movies['genres']= movies['genres'].apply(lambda x :[i.replace(' ','') for i in x])
movies['keywords']= movies['keywords'].apply(lambda x :[i.replace(' ','') for i in x])
movies['cast']= movies['cast'].apply(lambda x :[i.replace(' ','') for i in x])
movies['crew']= movies['crew'].apply(lambda x :[i.replace(' ','') for i in x])

In [24]:
movies['tags'] = movies['overview']+movies['genres']+movies['keywords']+movies['cast']+movies['crew']

In [25]:
new_df = movies[['movie_id','title','tags']]

In [26]:
new_df['tags']= new_df['tags'].apply(lambda x :' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']= new_df['tags'].apply(lambda x :' '.join(x))


In [27]:
new_df['tags']= new_df['tags'].apply(lambda x :x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']= new_df['tags'].apply(lambda x :x.lower())


In [28]:
import nltk

In [29]:
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

In [30]:
def stem(text):
    y =[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)
        
    
    

In [31]:
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [32]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=5000,stop_words='english')

In [33]:
vectors = cv.fit_transform(new_df['tags']).toarray()

In [34]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [35]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

In [36]:
vectors.shape

(4806, 5000)

In [37]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(vectors)

In [38]:
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)),reverse=True ,key=lambda x:x[1])[1:6]

    for i in movies_list:
        print(new_df.iloc[i[0]].title)
    


In [82]:
recommend('Avatar')

Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Titan A.E.


In [84]:
import pickle

In [86]:
pickle.dump(new_df.to_dict(),open('movies_dict.pkl','wb'))

In [88]:
pickle.dump(similarity,open('similarity.pkl','wb'))