## Installing Libraries

In [None]:
%pip install datasets pandas pymongo sentence_transformers
%pip install -U transformers
%pip install accelerate

## Downloading the dataset

In [2]:
from datasets import load_dataset
import pandas as pd

In [3]:
dataset = load_dataset("AIatMongoDB/embedded_movies")

# Convert the dataset to a pandas dataframe
dataset_df = pd.DataFrame(dataset["train"])

dataset_df.head(2)

Unnamed: 0,num_mflix_comments,genres,runtime,poster,fullplot,writers,awards,directors,countries,plot_embedding,rated,type,cast,languages,metacritic,imdb,title,plot
0,0,[Action],199.0,https://m.media-amazon.com/images/M/MV5BMzgxOD...,Young Pauline is left a lot of money when her ...,"[Charles W. Goddard (screenplay), Basil Dickey...","{'nominations': 0, 'text': '1 win.', 'wins': 1}","[Louis J. Gasnier, Donald MacKenzie]",[USA],"[0.00072939653, -0.026834568, 0.013515796, -0....",,movie,"[Pearl White, Crane Wilbur, Paul Panzer, Edwar...",[English],,"{'id': 4465, 'rating': 7.6, 'votes': 744}",The Perils of Pauline,Young Pauline is left a lot of money when her ...
1,0,"[Comedy, Short, Action]",22.0,https://m.media-amazon.com/images/M/MV5BNzE1OW...,As a penniless man worries about how he will m...,[H.M. Walker (titles)],"{'nominations': 1, 'text': '1 nomination.', 'w...","[Alfred J. Goulding, Hal Roach]",[USA],"[-0.022837115, -0.022941574, 0.014937485, -0.0...",TV-G,movie,"[Harold Lloyd, Mildred Davis, 'Snub' Pollard, ...",[English],,"{'id': 10146, 'rating': 7.0, 'votes': 639}",From Hand to Mouth,A penniless young man tries to save an heiress...


In [4]:
dataset_df.shape

(1500, 18)

The dataset contains 18 features and 1500 samples.

In [5]:
dataset_df.isna().sum()

num_mflix_comments      0
genres                  0
runtime                15
poster                 89
fullplot               48
writers                13
awards                  0
directors              13
countries               0
plot_embedding         28
rated                 308
type                    0
cast                    1
languages               1
metacritic            928
imdb                    0
title                   0
plot                   27
dtype: int64

## Data Preparation
The `fullplot` feature is what we'll use for out embedding process. Since there are 48 missing values, we'll drop them.
There's also a `plot_embedding` feature which needs to be removed since we'll be creating our own embeddings.

In [6]:
# Remove data point where plot coloumn is missing
dataset_df = dataset_df.dropna(subset=["fullplot"])

dataset_df.isna().sum()

num_mflix_comments      0
genres                  0
runtime                14
poster                 78
fullplot                0
writers                13
awards                  0
directors              12
countries               0
plot_embedding          1
rated                 279
type                    0
cast                    1
languages               1
metacritic            893
imdb                    0
title                   0
plot                    0
dtype: int64

In [7]:
dataset_df = dataset_df.drop(columns=["plot_embedding"])

In [8]:
dataset_df.head(2)

Unnamed: 0,num_mflix_comments,genres,runtime,poster,fullplot,writers,awards,directors,countries,rated,type,cast,languages,metacritic,imdb,title,plot
0,0,[Action],199.0,https://m.media-amazon.com/images/M/MV5BMzgxOD...,Young Pauline is left a lot of money when her ...,"[Charles W. Goddard (screenplay), Basil Dickey...","{'nominations': 0, 'text': '1 win.', 'wins': 1}","[Louis J. Gasnier, Donald MacKenzie]",[USA],,movie,"[Pearl White, Crane Wilbur, Paul Panzer, Edwar...",[English],,"{'id': 4465, 'rating': 7.6, 'votes': 744}",The Perils of Pauline,Young Pauline is left a lot of money when her ...
1,0,"[Comedy, Short, Action]",22.0,https://m.media-amazon.com/images/M/MV5BNzE1OW...,As a penniless man worries about how he will m...,[H.M. Walker (titles)],"{'nominations': 1, 'text': '1 nomination.', 'w...","[Alfred J. Goulding, Hal Roach]",[USA],TV-G,movie,"[Harold Lloyd, Mildred Davis, 'Snub' Pollard, ...",[English],,"{'id': 10146, 'rating': 7.0, 'votes': 639}",From Hand to Mouth,A penniless young man tries to save an heiress...


## Generating embeddings
We use General Text Embeddings (GTE) model to generate embeddings for the `fullplot` feature. The embeddings are then stored in a new column called `embedding`.
The GTE models are trained by Alibaba DAMO Academy. They are mainly based on the BERT framework and currently offer three different sizes of models, including GTE-large, GTE-base, and GTE-small.

In [10]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("thenlper/gte-large")


def get_embedding(text: str) -> list[float]:
    if not text.strip():
        print("Attempted to get embedding for empty text.")
        return []

    embedding = embedding_model.encode(text)

    return embedding.tolist()


dataset_df["embedding"] = dataset_df["fullplot"].apply(get_embedding)

dataset_df.head(2)

TypeError: SentenceTransformer.__init__() got an unexpected keyword argument 'force_download'

## Database setup and connection
The database is setup using MongoDB. The Dockerfile can be found in this repository.

In [9]:
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

In [10]:
def get_mongo_client(mongo_uri):
    try:
        client = MongoClient(mongo_uri, server_api=ServerApi('1'))
        client.admin.command('ping')
        print("Connected to MongoDB")
        return client
    except Exception as e:
        print(f"Error connecting to MongoDB: {e}")
        return None

In [11]:
import os
from dotenv import load_dotenv

load_dotenv()

USER = os.getenv('MONGO_USERNAME')
PASS = os.getenv('MONGO_PASSWORD')

In [12]:
print(USER, PASS)

cherubs01edifice 5PbdfFpeurhrga1x


In [35]:
mongo_uri = f"mongodb+srv://${USER}:${PASS}@rag.5argruh.mongodb.net/?retryWrites=true&w=majority&appName=rag"
mongo_client = get_mongo_client(mongo_uri)

Connected to MongoDB


In [36]:
db = mongo_client["movies"]
collection = db["movie_collection_2"]

In [37]:
documents = dataset_df.to_dict("records")
collection.insert_many(documents)

InsertManyResult([ObjectId('6653d36c4d9a8611d1e7c8eb'), ObjectId('6653d36c4d9a8611d1e7c8ec'), ObjectId('6653d36c4d9a8611d1e7c8ed'), ObjectId('6653d36c4d9a8611d1e7c8ee'), ObjectId('6653d36c4d9a8611d1e7c8ef'), ObjectId('6653d36c4d9a8611d1e7c8f0'), ObjectId('6653d36c4d9a8611d1e7c8f1'), ObjectId('6653d36c4d9a8611d1e7c8f2'), ObjectId('6653d36c4d9a8611d1e7c8f3'), ObjectId('6653d36c4d9a8611d1e7c8f4'), ObjectId('6653d36c4d9a8611d1e7c8f5'), ObjectId('6653d36c4d9a8611d1e7c8f6'), ObjectId('6653d36c4d9a8611d1e7c8f7'), ObjectId('6653d36c4d9a8611d1e7c8f8'), ObjectId('6653d36c4d9a8611d1e7c8f9'), ObjectId('6653d36c4d9a8611d1e7c8fa'), ObjectId('6653d36c4d9a8611d1e7c8fb'), ObjectId('6653d36c4d9a8611d1e7c8fc'), ObjectId('6653d36c4d9a8611d1e7c8fd'), ObjectId('6653d36c4d9a8611d1e7c8fe'), ObjectId('6653d36c4d9a8611d1e7c8ff'), ObjectId('6653d36c4d9a8611d1e7c900'), ObjectId('6653d36c4d9a8611d1e7c901'), ObjectId('6653d36c4d9a8611d1e7c902'), ObjectId('6653d36c4d9a8611d1e7c903'), ObjectId('6653d36c4d9a8611d1e7c9

## Perform Vector Search on User Queries

The following step implements a function that returns a vector search result by generating a query embedding and defining a MongoDB aggregation pipeline.

The pipeline, consisting of the $vectorSearch and $project stages, executes queries using the generated vector and formats the results to include only the required information, such as plot, title, and genres while incorporating a search score for each result.

In [48]:
def vector_search(user_query, collection):
    """
    Perform a vector search in the MongoDB collection based on the user query.

    Args:
    user_query (str): The user's query string.
    collection (MongoCollection): The MongoDB collection to search.

    Returns:
    list: A list of matching documents.
    """

    # Generate embedding for the user query
    query_embedding = get_embedding(user_query)

    if query_embedding is None:
        return "Invalid query or embedding generation failed."

    # Define the vector search pipeline
    # the $vectorSearch is only available in MongoDB Atlas.
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "queryVector": query_embedding,
                "path": "embedding",
                "numCandidates": 300,
                "limit": 4,
            }
        },
        {
            "$project": {
                "_id": 0,
                "fullplot": 1,
                "title": 1,
                "genres": 1,
                "score": {"$meta": "vectorSearchScore"},
            }
        },
    ]

    # Execute the search
    results = collection.aggregate(pipeline)
    return list(results)

## Handling user queries and loading Gemma

In [49]:
def get_search_result(query, collection):

    get_knowledge = vector_search(query, collection)

    search_result = ""
    for result in get_knowledge:
        search_result += f"Title: {result.get('title', 'N/A')}, Plot: {result.get('fullplot', 'N/A')}\n"

    return search_result

In [51]:
query = "Young Pauline is left a lot of money"
search_result = get_search_result(query, collection)

combined_information = (
    f"Query: {query}\nContinue to answer the query by using the Search Results:\n{search_result}."
)

In [52]:
print(combined_information)

Query: Young Pauline is left a lot of money
Continue to answer the query by using the Search Results:
.
