In [1]:
import os
import sys

from rich import print

from dotenv import load_dotenv, find_dotenv

# Load variables from the .env file located by find_dotenv() into the process environment.
_ = load_dotenv(dotenv_path=find_dotenv())

# Treat the parent of the current working directory as the project root and make sure it is
# on sys.path (inserted first, only once) so project-local packages can be imported below.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

print(f"✅ Project root added to path: {project_root}")

In [2]:
import numpy as np

# Probe texts embedded by test_embeddings(); the trailing number is each entry's list index.
# Every entry needs its own trailing comma: adjacent string literals without one are silently
# concatenated by Python (previously "King" "Queen" became the single entry "KingQueen").
texts = [
    "What is the capital of France?",  # 0
    "What is the capital of Norway?",  # 1
    "Man",  # 2
    "Woman",  # 3
    "男人",  # 4 ("man" in Chinese, used for the cross-language comparison)
    "King",  # 5
    "Queen",  # 6
]

def cosine_similarity(a, b):
    """
    Return the cosine of the angle between two vectors.

    Parameters:
    - a (numpy array): First vector.
    - b (numpy array): Second vector, same length as ``a``.

    Returns:
    - float: Dot product of ``a`` and ``b`` divided by the product of their
      Euclidean norms.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product

from langchain_core.embeddings.embeddings import Embeddings

def test_embeddings(embeddings: Embeddings):
    """
    Print a quick sanity report for an embedding model.

    Embeds the module-level ``texts`` list with ``embeddings.embed_documents`` and prints the
    model's class hierarchy, the type and length of the first vector, and cosine similarities
    for a few text pairs, ending with the ``king - man + woman`` analogy compared to "Queen".

    Parameters:
    - embeddings (Embeddings): Embedding model to exercise.

    Returns:
    - None: All results are printed.
    """
    print(embeddings.__class__.__mro__)
    vectorized_texts = embeddings.embed_documents(texts)
    print(vectorized_texts[0].__class__.__mro__)
    print(f"dimension: {len(vectorized_texts[0])}")
    print(f"semantic: {cosine_similarity(vectorized_texts[0], vectorized_texts[1])}")
    print(f"opposite : {cosine_similarity(vectorized_texts[2], vectorized_texts[3])}")
    print(f"multilang: {cosine_similarity(vectorized_texts[2], vectorized_texts[4])}")

    # Resolve the analogy words by text rather than by position. The previous code compared
    # the result against index 5 ("King") although the label says "Queen"; a lookup cannot
    # drift out of sync with the list and never indexes past its end.
    vector_by_text = {text: np.array(vector) for text, vector in zip(texts, vectorized_texts)}
    analogy_words = ("King", "Man", "Woman", "Queen")
    missing_words = [word for word in analogy_words if word not in vector_by_text]
    if missing_words:
        # Report the gap instead of printing a similarity for the wrong pair of vectors.
        print(f"king - man + woman vs. Queen: skipped, texts has no entry for {', '.join(missing_words)}")
        return

    king, man, woman, queen = (vector_by_text[word] for word in analogy_words)
    vector_cal = king - man + woman  # king - man + woman = queen?
    print(f"king - man + woman vs. Queen: {cosine_similarity(vector_cal, queen)}")


## `common.models.embedding_models`

In [12]:
from src.app.common.models import init_embedding_model

# Build the model through the project's own factory, then run the shared checks on it.
embeddings = init_embedding_model(
    model="jina-embeddings-v4",
    model_provider="jinaai",
)
test_embeddings(embeddings)

## `langchain.embeddings`

- [ ] `langchain_openai.OpenAIEmbeddings`
- [ ] `langchain_ollama.OllamaEmbeddings`
- [ ] `langchain_xinference.XinferenceEmbeddings`
- [ ] `langchain_google_genai.GoogleGenerativeAIEmbeddings`
- [ ] `langchain_siliconflow.SiliconFlowEmbeddings`
- [ ] `langchain_community.embeddings.dashscope.DashScopeEmbeddings`
- [ ] `langchain_community.embeddings.jina.JinaEmbeddings`


In [None]:
# package: langchain_openai
from langchain_openai import OpenAIEmbeddings

openai_embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
# Run the shared checks once; the cell previously repeated this call, which re-embedded the
# same texts and printed the same report a second time.
test_embeddings(openai_embeddings)

In [None]:
# package: langchain_ollama
from langchain_ollama import OllamaEmbeddings

# Embedding model served by Ollama; run the shared checks against it.
ollama_embeddings = OllamaEmbeddings(
    model="qwen3-embedding:latest",
)
test_embeddings(ollama_embeddings)

In [None]:
# package: langchain_xinference
from langchain_xinference import XinferenceEmbeddings

# The server address is machine-specific: read it from XINFERENCE_SERVER_URL when that
# variable is set (for example through the .env file loaded in the first cell) and fall back
# to the original LAN address otherwise.
xinference_embeddings = XinferenceEmbeddings(
    model_uid="qwen3-embedding-4b",
    server_url=os.getenv("XINFERENCE_SERVER_URL", "http://192.168.100.10:9997"),
)
test_embeddings(xinference_embeddings)

In [None]:
# package: langchain_google_genai
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Google Generative AI embedding model; run the shared checks against it.
google_embeddings = GoogleGenerativeAIEmbeddings(
    model="gemini-embedding-001",
)
test_embeddings(google_embeddings)

In [None]:
# package: langchain_siliconflow
from langchain_siliconflow import SiliconFlowEmbeddings

# SiliconFlow-hosted embedding model; run the shared checks against it.
siliconflow_embeddings = SiliconFlowEmbeddings(
    model="BAAI/bge-m3",
)
test_embeddings(siliconflow_embeddings)

In [None]:
# package: langchain_community (DashScopeEmbeddings); requires: pip install dashscope
from langchain_community.embeddings import DashScopeEmbeddings

# NOTE(review): "tongyi-embedding-vision-plus" was kept here as a commented-out alternative
# model name; whether it works with this class is not shown in this notebook.
dashscope_embeddings = DashScopeEmbeddings(
    model="text-embedding-v4",
)
test_embeddings(dashscope_embeddings)

In [None]:
# package: langchain_community (JinaEmbeddings)
from langchain_community.embeddings import JinaEmbeddings

# Jina embedding model via the community integration; run the shared checks against it.
jinaai_embeddings = JinaEmbeddings(
    model="jina-embeddings-v4",
)
test_embeddings(jinaai_embeddings)