Installing the necessary packages for this notebook

In [None]:
! pip install pandas sklearn nltk

Loading the csv files

In [None]:
import pandas as pd

In [None]:
# Read the credits and movies CSV files from the local data/ directory
# into two DataFrames.
credits = pd.read_csv("data/tmdb_5000_credits.csv")
movies = pd.read_csv("data/tmdb_5000_movies.csv")

In [None]:
# Join each movie row with its credits row by matching on the "title" column.
# NOTE(review): if titles are not unique, this join yields extra rows — confirm.
movies = movies.merge(credits, on="title")

### Data Preprocessing 
1. Feature Selection (Dimensionality Reduction)

In [None]:
# Keep only the columns used to build the recommendation tags:
#   id, title, overview, genres, keywords, cast, crew
#
# .copy() turns the column selection into an independent DataFrame, so the
# modifications made in later cells do not operate on a frame pandas has
# flagged as a slice of the merged data (SettingWithCopyWarning).
movies = movies[["id", "title", "overview", "genres", "keywords", "cast", "crew"]].copy()

2. Feature cleaning

In [None]:
# Drop every row that has a missing value. Rebinding the name (instead of
# inplace=True) avoids in-place mutation, and resetting the index keeps row
# labels equal to row positions, which matters because the similarity matrix
# computed later in this notebook is addressed by position.
movies = movies.dropna().reset_index(drop=True)

3. Checking for Duplicate values

In [None]:
# Count rows flagged as duplicates; the count is shown as the cell output.
movies.duplicated().sum()

4. Applying functions to the features to convert them to lists

In [None]:
import ast # Abstract Syntax Tree


def convert_to_list(column: str) -> list:
    """Extract the ``'name'`` value from every entry of a serialized list.

    Args:
        column: String containing a Python-literal sequence of dicts, each
            of which has a ``'name'`` key.

    Returns:
        The ``'name'`` values, in the order they appear in ``column``.
    """
    names = []
    for entry in ast.literal_eval(column):
        names.append(entry['name'])
    return names

In [None]:
# Replace each serialized genres value with the list of its 'name' entries.
movies['genres'] = movies['genres'].apply(convert_to_list)

In [None]:
# Replace each serialized keywords value with the list of its 'name' entries.
movies['keywords'] = movies['keywords'].apply(convert_to_list)

In [None]:
# Convert the cast value to a list of names and keep only the first three.
movies['cast'] = movies['cast'].apply(lambda casts: convert_to_list(casts)[: 3])

In [None]:
# Keep only the names of crew entries whose 'job' is exactly 'Director'.
movies['crew'] = movies['crew'].apply(lambda crews: [crew['name'] for crew in ast.literal_eval(crews) if crew['job'] == 'Director'])

In [None]:
# Split the overview text on whitespace so it becomes a list of words,
# like the other list-valued columns.
movies['overview'] = movies['overview'].apply(lambda text: text.split())

In [None]:
def concatenate_words(strings: list[str]) -> list[str]:
    """Remove every space character from each string in ``strings``.

    Args:
        strings: Strings that may contain spaces, e.g. multi-word names.

    Returns:
        A new list of the same length with the spaces removed from each
        item, so that a multi-word name becomes a single token.
    """
    collapsed = []
    for text in strings:
        # Splitting on the literal space and re-joining drops exactly the
        # space characters and nothing else.
        collapsed.append("".join(text.split(" ")))
    return collapsed


In [None]:
# Collapse multi-word names into single tokens in each list-valued column,
# processing the columns in the same order as before.
for column_name in ('genres', 'keywords', 'cast', 'crew'):
    movies[column_name] = movies[column_name].apply(concatenate_words)

5. New Feature Creation

In [None]:
# Concatenate the five list-valued columns row by row into one combined
# list, stored in the new 'tags' column.
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [None]:
# Build a separate three-column frame (id, title, tags) for the remaining steps.
new_df = movies[['id', 'title', 'tags']].copy()

In [None]:
# Preview the first rows using the notebook's rich display rather than
# printing the whole frame as plain text.
new_df.head()

In [None]:
# Lowercase every token and join each tag list into one space-separated string.
new_df['tags'] = new_df['tags'].apply(
    lambda words: ' '.join([word.lower() for word in words])
)

6. Apply stemming on features

In [None]:
from nltk.stem.porter import PorterStemmer

# Create one PorterStemmer instance, bound to `ps`, for the stemming step.
ps = PorterStemmer()

In [None]:
# Stem every whitespace-separated token of each tag string and re-join the
# stems with single spaces.
new_df['tags'] = new_df['tags'].apply(
    lambda tags: ' '.join(map(ps.stem, tags.split()))
)

In [None]:
# Show the first rows to inspect the stemmed tag strings.
new_df.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer


# Count-based vectorizer limited to 10000 features, with the built-in
# 'english' stop-word list enabled.
# NOTE(review): the tags are stemmed before vectorization, so stemmed forms of
# stop words (e.g. "thi", "wa") may not match the stop-word list — confirm.
cv = CountVectorizer(max_features=10000, stop_words='english')

In [None]:
# Fit the vectorizer on the tag strings and convert the resulting count
# matrix to a dense array via .toarray().
vectors = cv.fit_transform(new_df['tags']).toarray()

In [None]:
# Inspect the first 500 feature names learned by the vectorizer.
cv.get_feature_names_out()[:500]

7. Calculating similarity using cosine similarity instead of Euclidean distance

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity between the rows of `vectors`; entry [i][j] is
# the score between the movies at row positions i and j.
similarity = cosine_similarity(vectors)

In [None]:
def recommend(movie):
    """Return the titles of the five movies most similar to ``movie``.

    Args:
        movie: Exact title to look up in ``new_df['title']``. If several rows
            share the title, the first one is used.

    Returns:
        A list of up to five titles, ordered from most to least similar,
        never including the queried row itself.

    Raises:
        IndexError: If no row of ``new_df`` has the given title.
    """
    # Locate the movie by row *position*, not by index label: ``similarity``
    # is addressed positionally, and index labels stop matching positions as
    # soon as rows have been dropped from the frame.
    positions = (new_df['title'] == movie).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"movie not found in new_df: {movie!r}")
    movie_pos = int(positions[0])

    scores = similarity[movie_pos]
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

    # Exclude the queried row explicitly instead of assuming it sorts first;
    # another row with identical tags can tie with it at the top.
    top_5_positions = [pos for pos, _ in ranked if pos != movie_pos][:5]

    titles = new_df['title']
    return [titles.iloc[pos] for pos in top_5_positions]

8. Testing on a sample movie

In [None]:
# Smoke test: display the recommendations for one title.
recommend('Avatar')

9. Exporting new Datasets


In [None]:
import pickle


# Write the lookup frame and the similarity matrix to disk. The context
# managers make sure each file is flushed and closed once it has been written,
# instead of leaving the handles open until garbage collection.
with open('movies.pkl', 'wb') as movies_file:
    pickle.dump(new_df, movies_file)

with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)