In [49]:
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [2]:
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

In [3]:
movies.head(1) #printing one random movie so that we will know what columns are present
#for data filtering or data preprocessing

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [4]:
credits.head(1) #printing one random movie so that we will know what columns are present
#for data filtering or data preprocessing

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [5]:
movies = movies.merge(credits,on='title') # merging two dataframes

In [6]:
movies['release_date'].head()

0    2009-12-10
1    2007-05-19
2    2015-10-26
3    2012-07-16
4    2012-03-07
Name: release_date, dtype: object

In [7]:
#data preprocessing
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew','release_date']]

In [8]:
movies.isnull().sum()

movie_id        0
title           0
overview        3
genres          0
keywords        0
cast            0
crew            0
release_date    1
dtype: int64

In [9]:
#there are 3 rows in overview category which are not known
#we shall drop those rows as 3 is not a big number
# and 1 in release_date
movies = movies.dropna()

In [10]:
#searching if any dupliactes present
movies.duplicated().sum()

np.int64(0)

In [11]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [12]:
#string of list of dictionaries -> List of dictionaries -> list
#we can create a helper function
def converter(obj):
    List = []
    for i in ast.literal_eval(obj):
        List.append(i['name'])
    return List

In [13]:
movies['genres'] = movies['genres'].apply(converter)

In [14]:
# movies.head()

In [15]:
movies['keywords'] = movies['keywords'].apply(converter)

In [16]:
# i'm taking first 5 actors of each movie and extracting name from all
def converter5(obj):
    List = []
    counter=0
    for i in ast.literal_eval(obj):
        if counter!=5:
            List.append(i['name'])
            counter+=1
        else:
            break
            
    return List

In [17]:
movies['cast'] = movies['cast'].apply(converter5)

In [18]:
# movies.head()

In [19]:
def fetch_director(obj):
    List = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            List.append(i['name'])
            break
            
    return List

In [20]:
movies['crew'] = movies['crew'].apply(fetch_director)

In [21]:
# movies.head()

In [22]:
#overview is a string. Let's convert it into list so that we can concatenate with other columns
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [23]:
# movies.head()

In [24]:
#removing spaces in overview,genres,keywords,cast and crew
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [25]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,release_date
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron],2009-12-10
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...",[GoreVerbinski],2007-05-19
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux, Ralp...",[SamMendes],2015-10-26
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman, Anne...",[ChristopherNolan],2012-07-16
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton, Wi...",[AndrewStanton],2012-03-07


In [26]:
movies['release_date'] = pd.to_datetime(movies['release_date'], errors='coerce')  # Convert to datetime
movies['release_year'] = movies['release_date'].dt.year  # Extract the year

In [27]:
movies['release_year'] = movies['release_year'].fillna(movies['release_year'].median())  # Fill NaN with the median

In [28]:
movies['release_year'].head()

0    2009
1    2007
2    2015
3    2012
4    2012
Name: release_year, dtype: int32

In [29]:
# Define the bins for the 10-year ranges
bins = [1900,1910,1920,1930,1940,1950,1960,1970,1980, 1990, 2000, 2010, 2020, 2030]  # Adjust as per your data range
labels = ['1900-1909','1910-1919','1920-1929','1930-1939','1940-1949','1950-1959','1960-1969','1970-1979','1980-1989', '1990-1999', '2000-2009', '2010-2019', '2020-2029']

# Create a new column for decade ranges
movies['year_range'] = pd.cut(movies['release_year'], bins=bins, labels=labels, right=False)

# Count the number of movies in each range
year_range_counts = movies['year_range'].value_counts()

# Display the counts
print(year_range_counts)


year_range
2000-2009    2050
2010-2019    1444
1990-1999     778
1980-1989     280
1970-1979     109
1960-1969      72
1950-1959      27
1940-1949      25
1930-1939      15
1920-1929       4
1910-1919       1
1900-1909       0
2020-2029       0
Name: count, dtype: int64


In [30]:
# Standardization for 'release_year'
mean_year = movies['release_year'].mean()   # Calculate mean = 2000
std_year = movies['release_year'].std()     # Calculate standard deviation = sqrt(200) 

movies['release_year_standardized'] = (movies['release_year'] - mean_year) / std_year

In [31]:
movies['release_year_standardized'].head()

0    0.527194
1    0.366225
2    1.010101
3    0.768648
4    0.768648
Name: release_year_standardized, dtype: float64

In [32]:
def categorize_years(standardized_year):
    if standardized_year < -1.5:
        return '1900s'
    elif standardized_year < -0.5:
        return '1930s-1950s'
    elif standardized_year < 0.5:
        return '1960s-1990s'
    elif standardized_year < 1.5:
        return '2000s-2010s'
    else:
        return '2020s+'

In [33]:
movies['year_categories'] = movies['release_year_standardized'].apply(categorize_years)

In [34]:
movies['year_categories'] = movies['year_categories'].apply(lambda x:[x])

In [35]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [36]:
movies_new = movies[['movie_id','title','tags','year_categories']]

In [37]:
# movies_new.head()

In [38]:
# converting tag from list to string
# movies_new['tags'] = movies_new['tags'].apply(lambda x:" ".join(x))
# gives error as movies_new is likely a subset of another DataFrame, movies
# and pandas in unsure if we want to change original DataFrame as well
# thus we use .loc which expliciltly selects the tags columns for all rows to modify df inplace
# movies_new.loc[:,'tags'] = movies_new['tags'].apply(lambda x:" ".join(x))
# this raised an issue
movies_new.loc[:,'tags'] = movies_new['tags'].apply(lambda x: " ".join(x) if isinstance(x, list) else x)
# checks if x is of type list or not, if it is then it's executed

In [39]:
movies_new.head()

Unnamed: 0,movie_id,title,tags,year_categories
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",[2000s-2010s]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",[1960s-1990s]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,[2000s-2010s]
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,[2000s-2010s]
4,49529,John Carter,"John Carter is a war-weary, former military ca...",[2000s-2010s]


In [40]:
movies_new.loc[:,'tags'] = movies_new['tags'].apply(lambda x:x.lower())

In [41]:
movies_new.head()

Unnamed: 0,movie_id,title,tags,year_categories
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di...",[2000s-2010s]
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha...",[1960s-1990s]
2,206647,Spectre,a cryptic message from bond’s past sends him o...,[2000s-2010s]
3,49026,The Dark Knight Rises,following the death of district attorney harve...,[2000s-2010s]
4,49529,John Carter,"john carter is a war-weary, former military ca...",[2000s-2010s]


Text Vectorization

In [42]:
ps = PorterStemmer()

In [43]:
# we are steeming as we want to convert all tenses into simple present

def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))

    return " ".join(y)

In [44]:
movies_new.loc[:,'tags'] = movies_new['tags'].apply(stem)

In [45]:
# calculating similarity score between two tags to know if they are similar, and should be recommended
# remove stopwords
cv = CountVectorizer(max_features=5000,stop_words='english')

In [46]:
# convert movies into vectors
tag_vectors = cv.fit_transform(movies_new['tags']).toarray()

In [47]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

In [50]:
scaler = MinMaxScaler()

In [51]:
movies_new['normalized_year'] = scaler.fit_transform(movies_new[['year_categories']])

ValueError: setting an array element with a sequence.

In [None]:
year_vector = movies_new['normalized_year'].values.reshape(-1, 1)

In [None]:
combined_vectors = np.hstack((tag_vectors,year_vector))

In [None]:
movies_new.shape

In [None]:
# 4806 movies and 5000 features
# we shall calculate cosine distance (angle) not euclidean distance
# distance increases similarity decreases
# cosine similarity
cosine_similarity(combined_vectors).shape

In [None]:
similarity = cosine_similarity(combined_vectors)

In [None]:
similarity[1]

In [None]:
# Assuming the recommend function is defined and similairty matrix is pre-computed
def recommend(movie):
    # Find the index of the movie in the DataFrame
    movie_index = movies_new[movies_new['title'] == movie].index[0]
    
    # Assuming similairty is a pre-computed matrix of distances or similarities
    distances = similarity[movie_index]
    
    # Get the top 5 similar movies (excluding the input movie itself)
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
    
    # Get the movie titles for the top 5 recommendations
    # recommended_movies = [movies_new.iloc[i[0]]['title'] for i in movies_list]
    
    # return recommended_movies
    for i in movies_list:
        print(movies_new.iloc[i[0]].title)

In [None]:
# Now print the recommendations for 'Avatar'
# print(recommend('Avatar'))
recommend('Avatar')

In [None]:
import pickle

In [None]:
pickle.dump(movies_new.to_dict(),open('movies_dict.pkl','wb'))

In [None]:
pickle.dump(similarity,open('similarity.pkl','wb'))