# Embeddings

In [None]:
# Load IPython's autoreload extension and switch it to mode 2, presumably so
# that edits to the imported project modules are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from sentence_transformers import SentenceTransformer
import umap
import plotly.express as px
import pandas as pd

from movie_buddy.preprocessing.movies_dataset import get_movies_dataset

In [None]:
# Identifier of the pretrained sentence-embedding model used throughout
# this notebook.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Shared encoder instance; later cells call `encoder.encode(...)` on it.
encoder = SentenceTransformer(MODEL_NAME)

In [None]:
# Example sentences grouped by the topic ("field") they illustrate.
# Dict insertion order determines the row order of the resulting table.
sentences_by_field = {
    "Nature and Environment": [
        "The sun rises in the east and sets in the west.",
        "Forests play a crucial role in maintaining the earth's ecosystem.",
        "Pollution is one of the biggest threats to marine life.",
        "Climate change affects weather patterns globally.",
    ],
    "Animals": [
        "Cats are known for their independence and agility.",
        "Dogs are loyal companions that can understand human emotions.",
        "Elephants have complex social structures and remarkable memory.",
    ],
    "Technology": [
        "Technology has revolutionized the way we communicate.",
        "Artificial intelligence is transforming industries by automating tasks.",
        "Cybersecurity is essential in protecting online data from threats.",
    ],
    "Health and Lifestyle": [
        "Cooking at home allows for healthier dietary choices.",
        "Regular exercise contributes to both physical and mental well-being.",
        "Meditation can reduce stress and improve focus.",
    ],
    "Science and Education": [
        "Mathematics is fundamental to the understanding of the universe.",
        "Physics explores the fundamental principles governing the natural world.",
        "History teaches us about the successes and failures of past civilizations.",
    ],
    "Literature and Psychology": [
        "Reading fiction can improve empathy and emotional intelligence.",
        "Writing poetry is a way to express emotions and explore creativity.",
        "Studying psychology helps us understand human behavior and mental processes.",
    ],
    "Sports": [
        "The Olympic Games bring together athletes from around the world.",
        "Football is one of the most popular sports globally, with a vast fan base.",
        "Marathon running challenges individuals to push their physical limits.",
    ],
    "Energy and Sustainability": [
        "Renewable energy sources are key to sustainable development.",
        "Recycling reduces waste and conserves natural resources.",
        "Urban planning that includes green spaces contributes to environmental health.",
    ],
}

# Flatten the grouping into (sentence, field) pairs, one pair per table row.
my_sentences_list = [
    (sentence, field)
    for field, sentences in sentences_by_field.items()
    for sentence in sentences
]

# Two-column table: the raw sentence and the field it was written for.
sentences_df = pd.DataFrame(my_sentences_list, columns=["sentences", "field"])

In [None]:
# Display the table of example sentences and their field labels.
sentences_df

In [None]:
# Embed every example sentence with the shared encoder.
# The column is converted to a plain list (as the movie-overview cell does)
# so the encoder receives positional data rather than a label-indexed pandas
# Series, whose integer lookups only work with a default RangeIndex.
encoded_sentences = encoder.encode(sentences_df["sentences"].tolist())

In [None]:
# Inspect the dimensions of the encoder output
# (presumably one row per sentence — TODO confirm).
encoded_sentences.shape

In [None]:
# Project the sentence embeddings with UMAP using its default settings.
# The result is later unpacked into "x" and "y" columns for plotting.
# NOTE(review): no random_state is passed, so the layout may differ between
# runs — confirm whether reproducibility matters here.
reducer = umap.UMAP()
reduced_encoded_sentences = reducer.fit_transform(encoded_sentences)

In [None]:
# Show the reduced embeddings as nested Python lists.
reduced_encoded_sentences.tolist()

In [None]:
# Materialise the reduced embeddings once as a list of per-sentence rows.
coordinate_rows = reduced_encoded_sentences.tolist()

# Keep the raw coordinate list on each row ...
sentences_df["reduced_encoded_sentences"] = coordinate_rows

# ... and also expose the two components as separate "x" / "y" columns,
# which is the layout the scatter plot expects.
split = pd.DataFrame(coordinate_rows, columns=["x", "y"])
sentences_df = pd.concat([sentences_df, split], axis=1)
sentences_df

In [None]:
# Build a compact label for the plot: the first 20 characters of each
# sentence followed by an ellipsis.
sentence_prefix = sentences_df["sentences"].str[:20]
sentences_df["short_sentences"] = sentence_prefix + "..."
sentences_df

In [None]:
# Hover configuration: show only the full sentence, hide every other column.
sentence_hover_columns = dict(
    sentences=True,
    x=False,
    y=False,
    field=False,
    short_sentences=False,
)

# Scatter plot of the 2-D sentence projections, coloured by field and
# labelled with the shortened sentence text.
fig = px.scatter(
    data_frame=sentences_df,
    x="x",
    y="y",
    color="field",
    text="short_sentences",
    hover_name="field",
    hover_data=sentence_hover_columns,
    height=512,
)
fig.update_layout(template="plotly_white", title_text="Which Vector are close?")
# Put the labels above the (enlarged) markers so they stay readable.
fig.update_traces(marker={"size": 15}, textposition="top center")
fig.show()

## What About Movies? 

In [None]:
# Load the movies table through the project's preprocessing helper.
# Later cells read its "overview", "genre" and "title" columns.
movies_df = get_movies_dataset()

In [None]:
# Display the movies table as loaded.
movies_df

In [None]:
# Count how many overview entries will be passed to the encoder.
len(movies_df["overview"].tolist())

In [None]:
%time
encoded_movies = encoder.encode(movies_df["overview"].tolist())

In [None]:
# Project the movie-overview embeddings with a fresh default UMAP instance.
# The result is later unpacked into "x" and "y" columns for plotting.
# NOTE(review): no random_state is passed, so the layout may differ between
# runs — confirm whether reproducibility matters here.
reducer = umap.UMAP()
reduced_encoded_movies = reducer.fit_transform(encoded_movies)

In [None]:
# Count the reduced points (for comparison with the number of overviews).
len(reduced_encoded_movies.tolist())

In [None]:
# Store each movie's reduced coordinates as one list per row in a single
# column; the list is assigned positionally, row by row.
movies_df["encoded_overview"] = reduced_encoded_movies.tolist()

In [None]:
# Display the movies table, now including the "encoded_overview" column.
movies_df

In [None]:
# Pull the per-movie coordinate lists back out of the table ...
movie_coordinate_rows = movies_df["encoded_overview"].tolist()

# ... and spread them over two named columns for plotting.
split = pd.DataFrame(data=movie_coordinate_rows, columns=["x", "y"])

In [None]:
# Display the two-column ("x", "y") coordinate table.
split

In [None]:
# Attach the "x" / "y" coordinates to the movies table, column-wise.
# `movies_df` is re-indexed to 0..n-1 (its previous index is kept as a column)
# so rows pair up positionally with `split`, which already carries a fresh
# default index from its construction. Resetting `split` as well would only
# add a second, redundant "index" column and leave the result with duplicate
# column names.
movies_df = pd.concat([movies_df.reset_index(), split], axis=1)
movies_df

In [None]:
# Hover configuration: show the movie title, hide the overview and the
# raw coordinates.
movie_hover_columns = dict(overview=False, title=True, x=False, y=False)

# Scatter plot of the 2-D movie projections, coloured by genre.
fig = px.scatter(
    data_frame=movies_df,
    x="x",
    y="y",
    color="genre",
    hover_name="genre",
    hover_data=movie_hover_columns,
    height=512,
)
fig.update_layout(template="plotly_white", title_text="Which Movies Are Close?")
# Small markers for this plot; the text position is set exactly as before.
fig.update_traces(marker={"size": 5}, textposition="top center")
fig.show()