In [1]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
from src.model.embeddings import load_embedding_model, generate_embeddings
from src.config.model_config import MODEL_CONFIG

In [2]:
# Load the preprocessed movies dataset (the path is relative to the notebook's folder)
# NOTE(review): the saved df.info() output lists an "Unnamed: 0" column, which looks
# like a leftover index written by an earlier to_csv - confirm before dropping it.
df = pd.read_csv("../data/processed/movies_processed.csv")
print(f"Loaded {len(df)} movies")

# List the model names declared in MODEL_CONFIG
available_models = list(MODEL_CONFIG.keys())
print("\nAvailable models:")
# The loop variable is deliberately not called `model`: top-level loop variables stay
# in the kernel namespace, and `model` is the name the embedding cells use for the
# loaded model object. Leaving a string under that name invites hidden-state bugs.
for available_model in available_models:
    print(f"- {available_model}")

Loaded 8531 movies

Available models:
- paraphrase-MiniLM-L6-v2
- my-sagemaker-model_v2
- my_custom_onnx_model
- all-MiniLM-L6-v2
- bert-base-uncased


In [3]:
# Count missing values per column
df.isnull().sum()

Unnamed: 0      0
id              0
title           0
overview        0
release_date    0
popularity      0
vote_average    0
vote_count      0
dtype: int64

In [4]:
# NOTE: one-off cleanup, intentionally left commented out. Running it rewrites
# ../data/processed/movies_processed.csv in place.
# # Remove rows where overview is NaN
# print(f"Original shape: {df.shape}")
# df = df.dropna(subset=['overview'])
# print(f"Shape after removing NaN overviews: {df.shape}")

# # Save back to same file, overwriting it
# df.to_csv("../data/processed/movies_processed.csv", index=False)
# print("Saved cleaned dataset")


Original shape: (8551, 8)
Shape after removing NaN overviews: (8531, 8)
Saved cleaned dataset


## Embeddings for model paraphrase-MiniLM-L6-v2

In [4]:
# Step 3 - choose the embedding model and check that it is configured
model_name = "paraphrase-MiniLM-L6-v2"  # Change this to your desired model

# Look up the model's settings, failing fast on an unknown name
if model_name in MODEL_CONFIG:
    config = MODEL_CONFIG[model_name]
else:
    raise ValueError(f"Model {model_name} not found in configuration")

print(f"\nSelected model: {model_name}")
print(f"Embeddings will be saved to: {config['embeddings_path']}")


Selected model: paraphrase-MiniLM-L6-v2
Embeddings will be saved to: s3/embeddings/embeddings_paraphrase_MiniLM.npy


In [16]:
# Sanity check: zero-padded YYYY-MM-DD strings compare in date order as plain text
"1900-01-01" < "1900-01-03"

True

In [14]:
# Summarise column names, non-null counts and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8531 entries, 0 to 8530
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    8531 non-null   int64  
 1   id            8531 non-null   int64  
 2   title         8531 non-null   object 
 3   overview      8531 non-null   object 
 4   release_date  8531 non-null   object 
 5   popularity    8531 non-null   float64
 6   vote_average  8531 non-null   float64
 7   vote_count    8531 non-null   int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 533.3+ KB


In [17]:
# Report the observed bounds of release_date, popularity and vote_average.
# Each section is emitted with one print call; sep="\n" puts every part on its own line.
print(
    "Release date range:",
    f"Min: {df['release_date'].min()}",
    f"Max: {df['release_date'].max()}\n",
    sep="\n",
)

print(
    "Popularity range:",
    f"Min: {df['popularity'].min():.2f}",
    f"Max: {df['popularity'].max():.2f}\n",
    sep="\n",
)

print(
    "Vote average range:",
    f"Min: {df['vote_average'].min():.2f}",
    f"Max: {df['vote_average'].max():.2f}",
    sep="\n",
)

Release date range:
Min: 1902-04-17
Max: 2021-03-24

Popularity range:
Min: 0.60
Max: 11701.43

Vote average range:
Min: 2.20
Max: 8.70


In [5]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,id,title,overview,release_date,popularity,vote_average,vote_count
0,0,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...",1995-10-20,18.433,8.7,2763
1,1,724089,Gabriel's Inferno Part II,Professor Gabriel Emerson finally learns the t...,2020-07-31,8.439,8.7,1223
2,2,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,1994-09-23,65.57,8.7,18637
3,3,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",1972-03-14,63.277,8.7,14052
4,4,761053,Gabriel's Inferno Part III,The final part of the film adaption of the ero...,2020-11-19,26.691,8.7,773


In [13]:
# Display the release_date column (stored as text, see the dtype in the output)
df.loc[:, "release_date"]

0       1995-10-20
1       2020-07-31
2       1994-09-23
3       1972-03-14
4       2020-11-19
           ...    
8546    2008-08-29
8547    2000-05-12
8548    2009-03-12
8549    2003-04-11
8550    2010-02-27
Name: release_date, Length: 8551, dtype: object

In [12]:
# Number of distinct movie ids; compare with the row count to spot duplicates
df["id"].nunique(dropna=True)

8551

In [11]:
# (rows, columns) of the frame.
# NOTE(review): the stored output reports 8551 rows while the load cell at the top
# reports 8531 - this output looks stale; re-run after Restart & Run All.
df.shape

(8551, 8)

In [6]:
# Instantiate the embedding model chosen in the selection cell
print("Loading model...")
model = load_embedding_model(model_name)

# Encode the overview text of every row in df
print("\nGenerating embeddings...")
texts = df['overview'].to_list()
embeddings = generate_embeddings(model, texts, show_progress_bar=True)

print(f"\nGenerated embeddings shape: {embeddings.shape}")

Loading model...


  from .autonotebook import tqdm as notebook_tqdm



Generating embeddings...


Batches: 100%|██████████| 267/267 [00:08<00:00, 30.16it/s]



Generated embeddings shape: (8531, 384)


In [7]:
import os

# Step up one level unless the kernel already runs from the Movie_Recommender folder
# (presumably so relative paths such as the embeddings path resolve from the project
# root - TODO confirm).
current_dir = os.getcwd()
already_at_root = current_dir.endswith("Movie_Recommender")
if not already_at_root:
    os.chdir("..")
os.getcwd()

'd:\\Projects_D\\Movie_Recommender'

In [8]:
# Persist the embedding matrix at the location configured for the selected model.
# The path comes from MODEL_CONFIG and is resolved against the current working directory.
embeddings_path = config['embeddings_path']
print(f"Saving embeddings to {embeddings_path}")
np.save(file=embeddings_path, arr=embeddings)
print("Done!")

Saving embeddings to s3/embeddings/embeddings_paraphrase_MiniLM.npy...
Done!


## Embeddings for model all-MiniLM-L6-v2

In [10]:
# Step 3 - choose the embedding model and check that it is configured
model_name = "all-MiniLM-L6-v2"  # Change this to your desired model

# Stop here when MODEL_CONFIG has no entry for the chosen name
if not (model_name in MODEL_CONFIG):
    raise ValueError(f"Model {model_name} not found in configuration")

In [11]:
# Fetch the settings of the selected model, announce where its embeddings will be
# written, then instantiate the model. The three messages are emitted by one print
# call; sep="\n" keeps each on its own line.
config = MODEL_CONFIG[model_name]
print(
    f"\nSelected model: {model_name}",
    f"Embeddings will be saved to: {config['embeddings_path']}",
    "Loading model...",
    sep="\n",
)
model = load_embedding_model(model_name)


Selected model: all-MiniLM-L6-v2
Embeddings will be saved to: s3/embeddings/embeddings_all_miniLM.npy
Loading model...


In [12]:
# Encode the overview text of every row in df with the currently loaded model
print("\nGenerating embeddings...")
texts = df['overview'].to_list()
embeddings = generate_embeddings(model, texts, show_progress_bar=True)

print(f"\nGenerated embeddings shape: {embeddings.shape}")


Generating embeddings...


Batches: 100%|██████████| 267/267 [00:07<00:00, 33.67it/s]


Generated embeddings shape: (8531, 384)





In [13]:
import os

# Step up one level unless the kernel already runs from the Movie_Recommender folder
# (presumably so the relative embeddings path resolves from the project root - TODO confirm).
current_dir = os.getcwd()
if not current_dir.endswith("Movie_Recommender"):
    os.chdir("..")
# A bare os.getcwd() expression used to sit here. Only the last expression of a cell
# is displayed, so it had no effect and was removed.

# Persist the embedding matrix at the location configured for the selected model
embeddings_path = config['embeddings_path']
print(f"Saving embeddings to {embeddings_path}")
np.save(embeddings_path, embeddings)
print("Done!")

Saving embeddings to s3/embeddings/embeddings_all_miniLM.npy...
Done!


# Filters

In [3]:
# Preview the first five rows before building the filters
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,id,title,overview,release_date,popularity,vote_average,vote_count
0,0,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...",1995-10-20,18.433,8.7,2763
1,1,724089,Gabriel's Inferno Part II,Professor Gabriel Emerson finally learns the t...,2020-07-31,8.439,8.7,1223
2,2,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,1994-09-23,65.57,8.7,18637
3,3,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",1972-03-14,63.277,8.7,14052
4,4,761053,Gabriel's Inferno Part III,The final part of the film adaption of the ero...,2020-11-19,26.691,8.7,773


In [4]:
# Boolean row filter: release date, popularity and vote average must each fall inside
# an inclusive [lower, upper] window. Dates are compared as text, which matches date
# order for zero-padded YYYY-MM-DD strings.
# NOTE(review): the saved run of df[mask].shape shows 0 rows for these bounds -
# TODO confirm the date window is the intended one.
mask = (
    df['release_date'].between("2020-01-01", "2021-01-01")
    & df['popularity'].between(3000, 11702)
    & df['vote_average'].between(2.5, 10)
)

In [6]:
# (rows, columns) of the subset selected by the filter mask
df.loc[mask].shape

(0, 8)

In [9]:
# Check the date condition on its own: first rows released on or after 2020-01-01
df.loc[df['release_date'].ge("2020-01-01")].head()

Unnamed: 0.1,Unnamed: 0,id,title,overview,release_date,popularity,vote_average,vote_count
1,1,724089,Gabriel's Inferno Part II,Professor Gabriel Emerson finally learns the t...,2020-07-31,8.439,8.7,1223
4,4,761053,Gabriel's Inferno Part III,The final part of the film adaption of the ero...,2020-11-19,26.691,8.7,773
5,5,696374,Gabriel's Inferno,An intriguing and sinful exploration of seduct...,2020-05-29,10.51,8.7,1993
6,6,791373,Zack Snyder's Justice League,Determined to ensure Superman's ultimate sacri...,2021-03-18,7337.834,8.6,4179
7,7,399566,Godzilla vs. Kong,"In a time when monsters walk the Earth, humani...",2021-03-24,11701.435,8.7,1236
