In [1]:
! which python3

/bin/python3


In [2]:
! /bin/pip install -r ../requirements.txt

Collecting langchain-community==0.0.34
  Downloading langchain_community-0.0.34-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting streamlit==1.33.0
  Downloading streamlit-1.33.0-py2.py3-none-any.whl (8.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.1/8.1 MB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting langchain==0.1.16
  Downloading langchain-0.1.16-py3-none-any.whl (817 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.7/817.7 KB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting chromadb==0.5.0
  Downloading chromadb-0.5.0-py3-none-any.whl (526 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m526.8/526.8 KB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pypdf==4.2.0
  Downloading pypdf-4.2.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━

In [3]:
import sys 
sys.path.append('/workspaces/AI_Chatbot')

In [4]:
import logging
import os

from langchain_community.embeddings import HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata

from constants import SOURCE_DIRECTORY, CHROMA_SETTINGS, PERSIST_DIRECTORY
from langchain.document_loaders import PDFPlumberLoader, DirectoryLoader

from semantic_chunking_helper import SematicChunkingHelper

import torch

In [5]:
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s",
    level=logging.INFO,
)

source_dir = '/workspaces/AI_Chatbot/.DOCS.t/'

model_name='dangvantuan/vietnamese-embedding'
persist_dir=PERSIST_DIRECTORY + "_" + model_name,
device = "cuda" if torch.cuda.is_available() else "cpu"

In [6]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(model_name, device=device)
test_sentence = ["Xin chào, đây là thử nghiệm"]
try:
    embedding_output = model.encode(test_sentence)
    print(f"Embedding output: {embedding_output}")
except RuntimeError as e:
    print(f"Error during embedding: {e}")


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
2024-10-02 08:28:53,484 - INFO - SentenceTransformer.py:107 - Load pretrained SentenceTransformer: dangvantuan/vietnamese-embedding
  _torch_pytree._register_pytree_node(
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.94it/s]

Embedding output: [[ 6.46414682e-02 -2.79253796e-02  1.80540472e-01 -1.94376945e-01
  -1.48897991e-01  2.75223851e-02  2.05813363e-01 -5.98897338e-01
  -4.46321845e-01 -1.76655903e-01  3.60757351e-01  4.81040299e-01
   2.27497533e-01  4.06815171e-01 -4.94004041e-02  4.34967816e-01
   5.55163383e-01  3.12171012e-01 -3.57532114e-01 -4.05472875e-01
   6.68190300e-01 -1.50099978e-01  1.32202199e-02  5.47649622e-01
  -1.10528517e+00 -1.68719545e-01 -3.90341312e-01 -4.53529835e-01
   4.44119096e-01  5.26073538e-02 -3.65168065e-01 -6.98505715e-02
  -8.66786689e-02  2.25236908e-01  4.20317203e-01 -7.87826002e-01
  -2.32527956e-01  8.07268560e-01  8.83637369e-02 -1.72254398e-01
   1.68198735e-01 -1.41508698e-01 -5.95090762e-02 -7.30464682e-02
  -2.53133848e-02  7.49091804e-02 -2.63811201e-01  2.31989726e-01
   9.39007755e-03  1.70397371e-01 -2.56641418e-01  5.94984442e-02
  -2.43142381e-01  5.56152165e-01 -1.36461377e-01  6.24050163e-02
   5.56508154e-02 -4.96622585e-02  6.10720694e-01 -8.39585




In [7]:
! export CUDA_LAUNCH_BLOCKING=1

# PyPDFDirectoryLoader, supports loading dpf files
docs = DirectoryLoader(
    source_dir, glob="**/*.pdf", loader_cls=PDFPlumberLoader
).load()
chunks = filter_complex_metadata(docs)

# model_name = "dangvantuan/vietnamese-embedding"
embeddings = HuggingFaceInstructEmbeddings(
    model_name=model_name,
    model_kwargs={"device": device},  # note for Apple Silicon Chip use "mps"
)
semantic_chunking = SematicChunkingHelper(
    docs=chunks, embeddings=embeddings, buffer_size=2, breakpoint_threshold=50
)
Chroma.from_texts(
    texts=semantic_chunking.text_chunks,
    embedding=embeddings,
    client_settings=CHROMA_SETTINGS,
    persist_directory=persist_dir,
)


2024-10-02 08:29:29,438 - INFO - SentenceTransformer.py:107 - Load pretrained SentenceTransformer: dangvantuan/vietnamese-embedding


TypeError: INSTRUCTOR._load_sbert_model() got an unexpected keyword argument 'token'