In [2]:
from dotenv import load_dotenv
from openai import OpenAI
import os

from glob import glob
from math import ceil
import os
from pathlib import Path
from random import choices
import re

import cv2
import matplotlib.pyplot as plt
from PIL import Image

# I'm using MongoDB as my vector database:
from pymongo import MongoClient
from pymongo.errors import CollectionInvalid, DuplicateKeyError
from pymongo.operations import SearchIndexModel

from sentence_transformers import SentenceTransformer # The transformer used to execute the clip model.
from tqdm.notebook import tqdm     

import certifi

load_dotenv(dotenv_path="../.env.local")

True

In [3]:
MONGO_URI = os.getenv("MONGO_URI")

DB_NAME = os.getenv("DB_NAME")

COLLECTION_NAME = os.getenv("COLLECTION_NAME")

ATLAS_INDEX = os.getenv("ATLAS_INDEX")

In [4]:
ca = certifi.where()

client = MongoClient(MONGO_URI, tlsCAFile=ca)
# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("\nPinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)


Pinged your deployment. You successfully connected to MongoDB!


In [5]:
model = SentenceTransformer("clip-ViT-L-14")


In [6]:
db = client[DB_NAME]

COLLECTION_NAME = "mongo-img"

collection = db[COLLECTION_NAME]

In [8]:
from PIL import Image
import requests
from io import BytesIO

def emb_img(url):
    response = requests.get(url)

    img = Image.open(BytesIO(response.content))
    
    return model.encode(img)

In [9]:
img_1 = "https://image.tmdb.org/t/p/w500/rr7E0NoGKxvbkb89eR1GwfoYjpA.jpg"

img_2 = "https://image.tmdb.org/t/p/w500/pB8BM7pdSp6B6Ih7QZ4DrQ3PmJK.jpg"


collection.insert_one(
  {
    "embeddings": emb_img(img_1).tolist(),
    "url": img_1,
    "source": "fight club"
  }
)

collection.insert_one(
  {
    "embeddings": emb_img(img_2).tolist(),
    "url": img_2,
    "source": "fight club"
  }
)

InsertOneResult(ObjectId('666f48c0aca9c039dc0ac3fd'), acknowledged=True)

In [10]:
ATLAS_INDEX = os.getenv("ATLAS_INDEX")

emb = emb_img("https://image.tmdb.org/t/p/w500/rr7E0NoGKxvbkb89eR1GwfoYjpA.jpg")

pipeline = [
    {
        "$vectorSearch": {
            "index": f"{ATLAS_INDEX}",
            "path": "embeddings",
            "queryVector": emb.tolist(),
            "numCandidates": 5,
            "limit": 5,
        }
    },
    {
        "$project": {
            "_id": 0,
            "url": 1,
            "source": 1,
            "score": {
                "$meta": "vectorSearchScore"
            }
        }
    }
]

In [11]:
results = list(collection.aggregate(pipeline))

In [12]:
results

[{'url': 'https://image.tmdb.org/t/p/w500/rr7E0NoGKxvbkb89eR1GwfoYjpA.jpg',
  'source': 'fight club',
  'score': 1.0},
 {'url': 'https://image.tmdb.org/t/p/w500/pB8BM7pdSp6B6Ih7QZ4DrQ3PmJK.jpg',
  'source': 'fight club',
  'score': 0.8855478167533875}]

In [13]:
ATLAS_INDEX = os.getenv("ATLAS_INDEX")

emb = model.encode("fight club")

pipeline = [
    {
        "$vectorSearch": {
            "index": f"{ATLAS_INDEX}",
            "path": "embeddings",
            "queryVector": emb.tolist(),
            "numCandidates": 5,
            "limit": 5,
        }
    },
    {
        "$project": {
            "_id": 0,
            "url": 1,
            "source": 1,
            "score": {
                "$meta": "vectorSearchScore"
            }
        }
    }
]

In [14]:
results = list(collection.aggregate(pipeline))

In [15]:
results

[{'url': 'https://image.tmdb.org/t/p/w500/rr7E0NoGKxvbkb89eR1GwfoYjpA.jpg',
  'source': 'fight club',
  'score': 0.6294015645980835},
 {'url': 'https://image.tmdb.org/t/p/w500/pB8BM7pdSp6B6Ih7QZ4DrQ3PmJK.jpg',
  'source': 'fight club',
  'score': 0.6255523562431335}]

In [16]:
ATLAS_INDEX = os.getenv("ATLAS_INDEX")

emb = model.encode("fight club") + emb_img("https://image.tmdb.org/t/p/w500/rr7E0NoGKxvbkb89eR1GwfoYjpA.jpg")

pipeline = [
    {
        "$vectorSearch": {
            "index": f"{ATLAS_INDEX}",
            "path": "embeddings",
            "queryVector": emb.tolist(),
            "numCandidates": 5,
            "limit": 5,
        }
    },
    {
        "$project": {
            "_id": 0,
            "url": 1,
            "source": 1,
            "score": {
                "$meta": "vectorSearchScore"
            }
        }
    }
]


In [17]:
results = list(collection.aggregate(pipeline))

In [18]:
results

[{'url': 'https://image.tmdb.org/t/p/w500/rr7E0NoGKxvbkb89eR1GwfoYjpA.jpg',
  'source': 'fight club',
  'score': 0.9218599796295166},
 {'url': 'https://image.tmdb.org/t/p/w500/pB8BM7pdSp6B6Ih7QZ4DrQ3PmJK.jpg',
  'source': 'fight club',
  'score': 0.8396159410476685}]