In [1]:
import pandas as pd

In [2]:
# Load the TMDB movie export (path is relative to the notebook's working directory)
# and preview the first rows to check the columns.
movies = pd.read_csv('data/top10K-TMDB-movies.csv')
movies.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,23/09/1994,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,19/10/1995,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,14/03/1972,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,15/12/1993,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,20/12/1974,8.6,9811


In [3]:
# Column dtypes and non-null counts; used to spot columns with missing values.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 10000 non-null  int64  
 1   title              10000 non-null  object 
 2   genre              9997 non-null   object 
 3   original_language  10000 non-null  object 
 4   overview           9987 non-null   object 
 5   popularity         10000 non-null  float64
 6   release_date       10000 non-null  object 
 7   vote_average       10000 non-null  float64
 8   vote_count         10000 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 703.3+ KB


In [4]:
def split_and_join_genres(genre_string):
    """Convert a comma-separated genre string into a space-separated one.

    Args:
        genre_string: Genres such as ``"Drama,Crime"``, or a missing value.

    Returns:
        The same genres separated by single spaces (``"Drama Crime"``), or
        ``''`` when ``pd.isna`` reports the input as missing.
    """
    # Swapping each comma for a space is the same as splitting on ',' and
    # re-joining the pieces with ' '.
    return '' if pd.isna(genre_string) else genre_string.replace(',', ' ')

# Normalise every genre entry element-wise; missing values are passed through to
# the helper, which maps them to ''.
movies['genre'] = movies['genre'].map(split_and_join_genres)

In [5]:
# Keep only the columns the recommender needs. The explicit .copy() makes this an
# independent frame, so adding a column below does not trigger SettingWithCopyWarning.
movies = movies[['id', 'title', 'genre', 'overview']].copy()
# Treat a missing genre/overview as empty text. Without this, string + NaN yields
# NaN and a movie with no overview would lose its genres from `tags` as well.
movies['tags'] = movies['genre'].fillna('') + " " + movies['overview'].fillna('')
# Rebind instead of dropping in place so the statement has no hidden side effects.
movies = movies.drop(columns=['genre', 'overview'])

In [6]:
# Preview the frame after genre and overview were merged into `tags`.
movies.head()

Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,Drama Crime Framed in the 1940s for the double...
1,19404,Dilwale Dulhania Le Jayenge,"Comedy Drama Romance Raj is a rich, carefree, ..."
2,238,The Godfather,"Drama Crime Spanning the years 1945 to 1955, a..."
3,424,Schindler's List,Drama History War The true story of how busine...
4,240,The Godfather: Part II,Drama Crime In the continuing saga of the Corl...


In [7]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')

In [8]:
# NLTK resources shared by every clean_text call; filled on first use so the
# stop-word corpus is read once instead of once per row.
_CLEAN_TEXT_RESOURCES = {}


def _clean_text_resources():
    """Return the (lemmatizer, stop-word set) pair, building it on first use."""
    if not _CLEAN_TEXT_RESOURCES:
        # Load the corpus before touching the cache so a failed lookup does not
        # leave a half-initialised cache behind.
        stop_words = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES.update(lemmatizer=WordNetLemmatizer(), stop_words=stop_words)
    return _CLEAN_TEXT_RESOURCES['lemmatizer'], _CLEAN_TEXT_RESOURCES['stop_words']


def clean_text(text, return_tokens=False):
    """Lowercase, strip punctuation, drop English stop words and lemmatize.

    Args:
        text: Raw text. Any non-string value (e.g. NaN) is treated as empty.
        return_tokens: When True return the list of tokens, otherwise return
            the tokens joined by single spaces.

    Returns:
        A list of tokens if ``return_tokens`` is True, else a string. Non-string
        input yields the empty value of the requested type (``[]`` or ``''``).
    """
    if not isinstance(text, str):
        return [] if return_tokens else ''

    lemmatizer, stop_words = _clean_text_resources()

    # Every character other than ASCII letters, digits and whitespace is deleted
    # (not replaced by a space), so "happy-go-lucky" becomes "happygolucky".
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())
    words = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]

    return words if return_tokens else ' '.join(words)

In [9]:
# Run the text normaliser over every row's tags.
movies['tags'] = movies['tags'].map(clean_text)

In [10]:
# Preview the cleaned tags.
movies.head()

Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,drama crime framed 1940s double murder wife lo...
1,19404,Dilwale Dulhania Le Jayenge,comedy drama romance raj rich carefree happygo...
2,238,The Godfather,drama crime spanning year 1945 1955 chronicle ...
3,424,Schindler's List,drama history war true story businessman oskar...
4,240,The Godfather: Part II,drama crime continuing saga corleone crime fam...


In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF

In [12]:
# TF-IDF features over the cleaned tags, with the vocabulary capped at 5000 terms.
# NOTE: despite its name, `cvt` is a TfidfVectorizer, not a CountVectorizer.
cvt = TfidfVectorizer(stop_words='english', max_features=5000)
vector = (
    cvt
    .fit_transform(movies['tags'].values.astype('U'))
    .toarray()  # convert the sparse result into a dense ndarray
)

# Optional dimensionality reduction, currently disabled:
# nmf = NMF(n_components=50, random_state=42)
# vectorized_data_nmf = nmf.fit_transform(vector)

### Content-Based Filtering

In [13]:
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

In [14]:
# Pairwise movie-to-movie similarity; with a single argument both helpers compare
# the matrix against itself.
# NOTE(review): TfidfVectorizer L2-normalises rows by default, so the two matrices
# should agree up to rounding — confirm before keeping both in memory.
csm = cosine_similarity(vector)
lrk = linear_kernel(vector)

In [25]:
def recommend(movie_title, similarity, top_n=10):
    """Return the titles most similar to ``movie_title``.

    Args:
        movie_title: Exact title to look up in the global ``movies`` frame. If
            several rows share the title, the first one is used.
        similarity: Square, row-indexable similarity matrix whose row/column
            order matches the row order of ``movies``.
        top_n: Maximum number of titles to return (default 10).

    Returns:
        A list of up to ``top_n`` titles ordered from most to least similar, or
        a "not found" message string when the title is not in ``movies``.
    """
    titles = movies['title'].tolist()
    if movie_title not in titles:
        return f"Movie titled {movie_title} not found...."

    # Positional lookup: `similarity` is addressed by row position, so an index
    # label would be wrong whenever `movies` does not carry a default RangeIndex.
    query_pos = titles.index(movie_title)

    # Exclude the query movie explicitly instead of assuming it sorts first; with
    # tied scores (e.g. duplicate tags) another movie can take the top slot.
    ranked = sorted(
        (pair for pair in enumerate(similarity[query_pos]) if pair[0] != query_pos),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [titles[pos] for pos, _ in ranked[:top_n]]

In [26]:
# Look up a title that is not in the dataset; recommend() then returns a message
# string rather than a list, so it is printed as a single value.
title = 'Iron'
recommended = recommend(movie_title=title, similarity=csm)
print(f'Top 10 recommended movies after watching {title}:\n')
print(recommended)

Top 10 recommended movies after watching Iron:

Movie titled Iron not found....


In [27]:
# Recommendations for a known title, ranked with the linear-kernel matrix.
title = 'The Avengers'
recommended = recommend(movie_title=title, similarity=lrk)
print(f'Top 10 recommended movies after watching {title}:\n')
for recommended_title in recommended:
    print(recommended_title)

Top 10 recommended movies after watching The Avengers:

Marvel One-Shot: The Consultant
Kingsman: The Secret Service
Kingsman: The Golden Circle
Spider-Man: Far From Home
The Fountain
Living in Oblivion
Allegiant
Timecop
Avengers: Age of Ultron
Echelon Conspiracy


In [28]:
# Recommendations for a second known title, again using the linear-kernel matrix.
title = 'The Godfather'
recommended = recommend(movie_title=title, similarity=lrk)
print(f'Top 10 recommended movies after watching {title}:\n')
for recommended_title in recommended:
    print(recommended_title)

Top 10 recommended movies after watching The Godfather:

Blood Ties
The Godfather: Part II
Extremely Wicked, Shockingly Evil and Vile
Xtreme
The Gangster, the Cop, the Devil
Furious 7
Proud Mary
Joe
The Color Purple
The House That Jack Built
