In [None]:
!unzip movies.zip -d ./tmp

In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
ds = pd.read_csv('./tmp/movies.csv')
print(ds.shape, ds.columns)
selected_features=['genres','keywords','tagline','cast','director']
for feature in selected_features:
    # Handle NaN and convert to string
    ds[feature] = ds[feature].fillna('').astype(str)
    # Remove extra spaces and replace pipes with spaces
    ds[feature] = ds[feature].str.strip()
    ds[feature] = ds[feature].str.replace(r'\s+', ' ', regex=True)  # Multiple spaces -> single space
    ds[feature] = ds[feature].str.replace('|', ' ')

# Combine features with better handling
def combine_features(row):
    """Safely combine movie features"""
    parts = []
    for feature in selected_features:
        value = str(row[feature]).strip()
        if value and value.lower() != 'nan':
            parts.append(value)
    
    combined = ' '.join(parts).strip()
    # Ensure non-empty result
    return combined if combined else f"movie: {row['title']}"

_combined_features = ds.apply(combine_features, axis=1)

# compute embeddings
model = SentenceTransformer('Qwen/Qwen3-0.6B')
ft_embeddings = model.encode(_combined_features, batch_size=128, show_progress_bar=True)
np.save('./tmp/movie_embeddings.npy', ft_embeddings)

(4803, 24) Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')


No sentence-transformers model found with name Qwen/Qwen3-0.6B. Creating a new one with mean pooling.
    Found GPU0 NVIDIA GB10 which is of cuda capability 12.1.
    Minimum and Maximum cuda capability supported by this version of PyTorch is
    (8.0) - (12.0)
    


Batches:   0%|          | 0/38 [00:00<?, ?it/s]

In [2]:
import difflib
from sklearn.metrics.pairwise import cosine_similarity
from numpy import load
import pandas as pd

ds = pd.read_csv('./tmp/movies.csv')
title_list=ds['title'].tolist()
similarity = cosine_similarity(load('./tmp/movie_embeddings.npy'))
movie_name = input("What's your favorite movie?")
candidates = difflib.get_close_matches(movie_name, title_list)
if len(candidates)==0:
    print("No movie found. Please check your input.")
    exit()
movie_name = candidates[0]
print(f"Did you mean: {movie_name}?")
index_movie = ds[ds.title==movie_name]["index"].values[0]
similarity_score=list(enumerate(similarity[index_movie])) 
#sorting the movies based on their similarity score
sorted_similar_movies=sorted(similarity_score,key= lambda x:x[1], reverse=True)
_recomended= sorted_similar_movies[:10]
#finding title of movie with index
print(f'\nRecommended if you like "{movie_name}":')
print("=" * 100)

for rank, (index, score) in enumerate(_recomended, 1):
    movie = ds[ds.index==index].iloc[0]
    title = movie['title']
    genres = movie.get('genres', 'N/A')
    
    print(f"{rank:2d}. Title: {title}")    
    print(f"    Genres: {genres}")
    print(f"    Similarity Score: {score:.4f}")
    print()

Did you mean: Rocky?

Recommended if you like "Rocky":
 1. Title: Rocky
    Genres: Drama
    Similarity Score: 1.0000

 2. Title: Rocky Balboa
    Genres: Drama
    Similarity Score: 0.9540

 3. Title: Black Snake Moan
    Genres: Drama
    Similarity Score: 0.9534

 4. Title: Creed
    Genres: Drama
    Similarity Score: 0.9533

 5. Title: Raising Helen
    Genres: Drama Comedy Romance
    Similarity Score: 0.9524

 6. Title: The Karate Kid
    Genres: Drama
    Similarity Score: 0.9523

 7. Title: The Perfect Storm
    Genres: Drama
    Similarity Score: 0.9493

 8. Title: Disaster Movie
    Genres: Action Comedy
    Similarity Score: 0.9488

 9. Title: Mommie Dearest
    Genres: Drama
    Similarity Score: 0.9486

10. Title: The Manchurian Candidate
    Genres: Drama Thriller Mystery
    Similarity Score: 0.9474

