# 1. Import libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import joblib
import os

# 2. Load dataset

In [3]:
# Read the raw CSV, keep just the title and overview columns, discard rows
# that have no overview, and renumber the remaining rows from 0.
movies = (
    pd.read_csv("imdb_movies.csv")[["names", "overview"]]
    .dropna(subset=["overview"])
    .reset_index(drop=True)
)

# Quick sanity check of what was loaded
print("✅ Dataset loaded with shape:", movies.shape)
print(movies.head())


✅ Dataset loaded with shape: (10178, 2)
                         names  \
0                    Creed III   
1     Avatar: The Way of Water   
2  The Super Mario Bros. Movie   
3                      Mummies   
4                    Supercell   

                                            overview  
0  After dominating the boxing world, Adonis Cree...  
1  Set more than a decade after the events of the...  
2  While working underground to fix a water main,...  
3  Through a series of unfortunate events, three ...  
4  Good-hearted teenager William always lived in ...  


# 3. TF-IDF Vectorization

In [4]:
# Vectorizer settings: English stop words are removed and the vocabulary
# is capped at 5000 features.
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words="english",
)

# Learn the vocabulary from the overviews and encode them in a single call
overview_text = movies["overview"]
tfidf_matrix = tfidf.fit_transform(overview_text)

print("✅ TF-IDF matrix shape:", tfidf_matrix.shape)

✅ TF-IDF matrix shape: (10178, 5000)


# 4. Train KNN model

In [5]:
# Nearest-neighbour index over the TF-IDF rows: cosine metric, brute-force
# search, and a default of 16 neighbours per query (callers may override
# the count when calling kneighbors).
knn = NearestNeighbors(
    algorithm="brute",
    metric="cosine",
    n_neighbors=16,
)
knn.fit(tfidf_matrix)


# 5. Test recommendation

In [6]:
def recommend(movie_title, top_n=5):
    """Print the movies whose overviews are closest to ``movie_title``'s.

    Relies on the notebook-level ``movies`` frame, the ``tfidf_matrix`` built
    from its overviews, and the fitted ``knn`` model from earlier cells.

    Parameters
    ----------
    movie_title : str
        Title to look up. It must equal a value in ``movies["names"]``
        exactly; if several rows share the title, the first one is used.
    top_n : int, default 5
        How many recommendations to print.

    Returns
    -------
    str or None
        An error message when the title is not in the dataset; otherwise
        ``None`` (the recommendations are printed, not returned).

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError("top_n must be at least 1")

    # Scan the title column once. Row *positions* (not index labels) are used
    # so they line up with the rows of tfidf_matrix and with .iloc below.
    matches = np.flatnonzero((movies["names"] == movie_title).to_numpy())
    if matches.size == 0:
        return f"❌ Movie '{movie_title}' not found in dataset."
    idx = int(matches[0])

    # Ask for one extra neighbour to leave room for the queried row itself,
    # but never for more neighbours than there are rows.
    n_query = min(top_n + 1, tfidf_matrix.shape[0])
    _, indices = knn.kneighbors(tfidf_matrix[idx], n_neighbors=n_query)

    # Remove the queried row by position instead of blindly dropping the first
    # hit: when distances tie (e.g. duplicate overviews) the queried row is
    # not guaranteed to be returned first.
    picks = [int(i) for i in indices.flatten() if int(i) != idx][:top_n]

    print(f"🎬 Recommendations for '{movie_title}':")
    for i in picks:
        print("-", movies.iloc[i]["names"])

# Smoke test: look up one known title and print its recommendations
example_title = "The Dark Knight"
recommend(example_title)

🎬 Recommendations for 'The Dark Knight':
- Batman: The Long Halloween, Part Two
- Batman: The Long Halloween, Part One
- The Dark Knight Rises
- Batman
- Batman Forever


# 6. Save model + vectorizer + dataset

In [None]:
# Make sure the output folder exists (no error if it is already there)
os.makedirs("pickle_model", exist_ok=True)

# Persist the fitted estimators with joblib, one file per object
for target_path, fitted_obj in (
    ("pickle_model/knn_model.pkl", knn),
    ("pickle_model/tfidf_vectorizer.pkl", tfidf),
):
    joblib.dump(fitted_obj, target_path)

# The title/overview table is written next to them as CSV, without the index
movies.to_csv("pickle_model/movies_metadata.csv", index=False)

print("✅ Model, vectorizer, and metadata saved in pickle_model/")