# Step 1: Import necessary Librabries

In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util

# Step 2: Load Dataset (movies dataset)

In [4]:
movies = pd.read_csv("imdb_movies.csv")   # replace with your file
movies = movies.dropna(subset=["names", "overview"])  # ensure text available

print("Dataset shape:", movies.shape)
print(movies.head())

Dataset shape: (10178, 12)
                         names       date_x  score  \
0                    Creed III  03/02/2023    73.0   
1     Avatar: The Way of Water  12/15/2022    78.0   
2  The Super Mario Bros. Movie  04/05/2023    76.0   
3                      Mummies  01/05/2023    70.0   
4                    Supercell  03/17/2023    61.0   

                                           genre  \
0                                  Drama, Action   
1             Science Fiction, Adventure, Action   
2  Animation, Adventure, Family, Fantasy, Comedy   
3  Animation, Comedy, Family, Adventure, Fantasy   
4                                         Action   

                                            overview  \
0  After dominating the boxing world, Adonis Cree...   
1  Set more than a decade after the events of the...   
2  While working underground to fix a water main,...   
3  Through a series of unfortunate events, three ...   
4  Good-hearted teenager William always lived in ...   

# Step 3: Load Pretrained Sentence Transformer Model

In [5]:
model = SentenceTransformer("all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Step 4: Encode Movie Overviews


In [6]:
movie_embeddings = model.encode(
    movies["overview"].tolist(),
    convert_to_tensor=True,
    show_progress_bar=True
)

print("Embeddings shape:", movie_embeddings.shape)

Batches:   0%|          | 0/319 [00:00<?, ?it/s]

Embeddings shape: torch.Size([10178, 384])


# Step 5: Build Recommendation Function

In [9]:
def recommend(movie_title, top_k=5):
    # find index
    idx = movies[movies["names"].str.lower() == movie_title.lower()].index
    if len(idx) == 0:
        return f"Movie '{movie_title}' not found."
    idx = idx[0]

    query_embedding = movie_embeddings[idx]

    # cosine similarity
    scores = util.cos_sim(query_embedding, movie_embeddings)[0].cpu().numpy()

    # top results
    top_indices = np.argsort(scores)[::-1][1:top_k+1]  # skip the movie itself

    recommendations = movies.iloc[top_indices][["title", "overview"]]
    return recommendations

In [10]:
# Example
print(recommend("The Matrix"))

KeyError: "['title'] not in index"