Installing the necessary packages for this notebook

In [31]:
! pip install pandas scikit-learn nltk



Loading the CSV files

In [32]:
import pandas as pd

In [33]:
# Read the two raw TMDB tables from the local data/ directory.
movies = pd.read_csv("data/tmdb_5000_movies.csv")
credits = pd.read_csv("data/tmdb_5000_credits.csv")

In [34]:
# Join the movie metadata with its credits; rows are matched on the "title"
# column that both tables share.
# NOTE(review): titles are not guaranteed to be unique, so this join can
# produce extra rows — confirm whether an id-based key would be safer.
movies = pd.merge(movies, credits, on="title")

### Data Preprocessing 
1. Feature Selection (Dimensionality Reduction)

In [35]:
# Keep only the columns that feed the recommender: the identifiers
# (id, title) plus the descriptive fields that are later combined into tags.
movies = movies[
    [
        "id",
        "title",
        "overview",
        "genres",
        "keywords",
        "cast",
        "crew",
    ]
]

2. Feature cleaning

In [36]:
# Drop every row that has a missing value in any of the selected columns.
# Rebinding the result (rather than inplace=True) avoids mutating a frame
# that was derived from another one and keeps the step chainable.
movies = movies.dropna()

3. Checking for Duplicate values

In [37]:
# Count rows that are exact repeats of an earlier row (all columns compared).
movies.duplicated().sum()

0

4. Applying functions to the features to convert them to lists

In [38]:
import ast # Abstract Syntax Tree


def convert_to_list(column: str) -> list:
    """Extract the ``name`` field from a stringified list of records.

    Args:
        column: Text holding a Python/JSON-style literal list of dicts,
            each of which has a ``'name'`` key.

    Returns:
        The ``'name'`` values, in their original order.
    """
    names = []
    for record in ast.literal_eval(column):
        names.append(record['name'])
    return names

In [39]:
# Parse the stringified records in "genres" and keep only each record's name.
movies['genres'] = movies['genres'].apply(convert_to_list)

In [40]:
# Parse the stringified records in "keywords" and keep only each record's name.
movies['keywords'] = movies['keywords'].apply(convert_to_list)

In [41]:
# Keep only the first three names listed for each movie.
movies['cast'] = movies['cast'].apply(
    lambda raw_cast: convert_to_list(raw_cast)[:3]
)

In [42]:
# From the crew records keep only the names of people whose job is "Director".
movies['crew'] = movies['crew'].apply(
    lambda raw_crew: [
        member['name']
        for member in ast.literal_eval(raw_crew)
        if member['job'] == 'Director'
    ]
)

In [43]:
# Tokenise the overview on whitespace so it is a list like the other features.
movies['overview'] = movies['overview'].apply(str.split)

In [44]:
def concatenate_words(strings: list[str]) -> list[str]:
    """Remove every space character from each string.

    Multi-word values such as ``"Science Fiction"`` become a single token
    (``"ScienceFiction"``).

    Args:
        strings: The strings to compact.

    Returns:
        A new list with the space-free strings, in the same order.
    """
    compacted = []
    for text in strings:
        compacted.append(text.replace(" ", ""))
    return compacted


In [45]:
# Strip the spaces inside every entry of the list-valued feature columns.
for column in ('genres', 'keywords', 'cast', 'crew'):
    movies[column] = movies[column].apply(concatenate_words)

5. New Feature Creation

In [46]:
# Series of lists add element-wise, so "+" concatenates the five token lists
# of each movie into a single list of tags.
movies['tags'] = (
    movies['overview']
    + movies['genres']
    + movies['keywords']
    + movies['cast']
    + movies['crew']
)

In [47]:
# Keep only what the recommender needs. The index is reset because the earlier
# dropna step may have removed rows, leaving gaps in the labels; the vectors
# and the similarity matrix built from this frame are addressed by row
# position, so labels must equal positions for label-based look-ups to hit
# the right row.
new_df = movies[['id', 'title', 'tags']].reset_index(drop=True)

In [48]:
# Show a truncated text preview of the reduced frame.
print(new_df)

          id                                     title  \
0      19995                                    Avatar   
1        285  Pirates of the Caribbean: At World's End   
2     206647                                   Spectre   
3      49026                     The Dark Knight Rises   
4      49529                               John Carter   
...      ...                                       ...   
4804    9367                               El Mariachi   
4805   72766                                 Newlyweds   
4806  231617                 Signed, Sealed, Delivered   
4807  126186                          Shanghai Calling   
4808   25975                         My Date with Drew   

                                                   tags  
0     [In, the, 22nd, century,, a, paraplegic, Marin...  
1     [Captain, Barbossa,, long, believed, to, be, d...  
2     [A, cryptic, message, from, Bond’s, past, send...  
3     [Following, the, death, of, District, Attorney...  
4     [John, 

In [49]:
# Collapse each tag list into one lower-cased, space-separated string.
new_df['tags'] = new_df['tags'].apply(
    lambda words: ' '.join(word.lower() for word in words)
)

6. Apply stemming on features

In [50]:
from nltk.stem.porter import PorterStemmer

# Single stemmer instance, reused for every token when the tags are stemmed.
ps = PorterStemmer()

In [None]:
# Stem every whitespace-separated token of each tag string and re-join them.
new_df['tags'] = new_df['tags'].apply(
    lambda text: ' '.join(map(ps.stem, text.split()))
)

In [None]:
# Print the processed tags of two related titles for a side-by-side look.
for title in ('Final Destination', 'Final Destination 2'):
    print(*new_df[new_df['title'] == title]['tags'])

# Must be the last expression of the cell: a bare expression anywhere else
# is evaluated and silently discarded instead of being displayed.
new_df.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer


# Bag-of-words vectoriser capped at 10,000 vocabulary terms, with
# scikit-learn's built-in English stop-word list removed.
cv = CountVectorizer(max_features=10000, stop_words='english')

In [None]:
# Learn the vocabulary and count terms per movie, then densify the result.
# NOTE(review): .toarray() materialises the full dense matrix; the sparse
# result could probably be passed to cosine_similarity directly — confirm.
vectors = cv.fit_transform(new_df['tags']).toarray()

In [None]:
# Peek at the first 500 terms of the learned vocabulary.
cv.get_feature_names_out()[:500]

7. Measuring similarity with cosine similarity instead of Euclidean distance

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity between all rows of `vectors`; entry [i, j]
# compares the movie at row i with the movie at row j.
similarity = cosine_similarity(vectors)

In [None]:
def recommend(movie, top_n=5, *, catalog=None, scores=None):
    """Return the titles of the movies most similar to ``movie``.

    Args:
        movie: Exact title to look up in the ``title`` column.
        top_n: Maximum number of recommendations to return (default 5).
        catalog: DataFrame with a ``title`` column. Defaults to the
            notebook-level ``new_df``.
        scores: Square similarity matrix whose row ``i`` belongs to the row at
            *position* ``i`` of ``catalog``. Defaults to the notebook-level
            ``similarity``.

    Returns:
        A list of up to ``top_n`` titles, most similar first. The queried
        movie itself is never part of the result.

    Raises:
        IndexError: If no row of ``catalog`` carries the given title (same
            exception type the previous implementation raised).
    """
    if catalog is None:
        catalog = new_df
    if scores is None:
        scores = similarity

    # Work with row positions, not index labels: the similarity matrix is
    # positional, and labels stop matching positions as soon as rows have
    # been dropped from the frame.
    positions = (catalog['title'] == movie).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"no movie titled {movie!r} in the catalog")
    movie_pos = int(positions[0])

    ranked = sorted(
        enumerate(scores[movie_pos]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Exclude the query by position instead of assuming it sorts first; a
    # movie with an identical score could otherwise push it out of slot 0.
    similar_positions = [pos for pos, _ in ranked if pos != movie_pos][:top_n]

    return [catalog['title'].iloc[pos] for pos in similar_positions]

8. Testing on a sample movie

In [None]:
# Smoke test: list the recommendations for one known title.
recommend('Avatar')

9. Exporting the new datasets


In [None]:
import pickle


# Context managers guarantee the files are flushed and closed, which bare
# open() calls passed straight to pickle.dump() do not.
with open('movies.pkl', 'wb') as movies_file:
    pickle.dump(new_df, movies_file)

with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)