In [2]:
from langchain_cohere import CohereEmbeddings
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
from langchain.document_loaders import TextLoader, PyPDFLoader, Docx2txtLoader
from langchain.text_splitter import CharacterTextSplitter
import os
from langchain_core.documents import Document
import mimetypes
import uuid
from tqdm import tqdm


# Set up Azure OpenAI credentials
os.environ["OPENAI_API_VERSION"] = "2023-12-01-preview"
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://cog-kguqugfu5p2ki.openai.azure.com/"
os.environ["AZURE_OPENAI_API_KEY"] = "4657af893faf48e5bd81208d9f87f271"

connection_string="postgresql://root:root@localhost:5433/langchain_demo_faiss"
# Initialize Azure OpenAI embedding and LLM
embeddings = AzureOpenAIEmbeddings(
    azure_deployment="embedding",
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    openai_api_version=os.environ["OPENAI_API_VERSION"]
)

collection_name = "my_docs"
vectorstore = PGVector(
    embeddings=embeddings,
    collection_name=collection_name,
    connection=connection_string,
    use_jsonb=True,
)


## Insert Document to Vector

In [3]:

UPLOAD_DIR = "uploaded_files"

# Get a list of files in the directory
files = os.listdir(UPLOAD_DIR)
# Filter out directories
files = [file for file in files if os.path.isfile(os.path.join(UPLOAD_DIR, file))]

texts_collections=[]
for file in files:
    file_path = os.path.join(UPLOAD_DIR, file)
    mime_type, encoding = mimetypes.guess_type(file_path)

    if mime_type == "text/plain":
        loader = TextLoader(file_path)
    elif mime_type == "application/pdf":
        loader = PyPDFLoader(file_path)
    else:  # DOCX
        loader = Docx2txtLoader(file_path)

    documents = loader.load()
    # Split documents into chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = text_splitter.split_documents(documents)
    texts_collections.extend(texts)
    

incorrect startxref pointer(3)


In [22]:
len(texts_collections)

1622

In [21]:
import re
def preprocess_text(text):
    # Replace unique characters with spaces
    text = re.sub(r'\u2003\x00', '', text)
    text = re.sub(r'\n', '\n ', text)

    return text

In [15]:
for texts_collection in tqdm(texts_collections[1511:], desc="Processing files", unit="file"):
    texts_collection.metadata["id"] = str(uuid.uuid4())
    cleaned_string = preprocess_text(texts_collection.page_content)
    texts_collection.page_content = cleaned_string
    vectorstore.add_documents([texts_collection], ids=[texts_collection.metadata["id"]])

Processing files: 100%|██████████| 111/111 [02:26<00:00,  1.32s/file]


In [27]:
results  = vectorstore.similarity_search_with_score("kitty", k=10)
results

[(Document(metadata={'id': 1, 'topic': 'animals', 'location': 'pond'}, page_content='there are cats in the pond'),
  0.14601593339344443),
 (Document(metadata={'id': 'c6641fd7-a221-4ac4-a950-e83a5fa5a71f', 'page': 604, 'source': 'uploaded_files\\The Wine Bible, 3rd Edition.pdf'}, page_content='******ebook converter DEMO W atermarks*******\nAn old Sicilian terra-cotta amphora sits near the vines.'),
  0.22807908580801273),
 (Document(metadata={'id': 'c02a1adc-a1e2-4dc2-bfc7-3c3d538fc3d6', 'page': 861, 'source': 'uploaded_files\\The Wine Bible, 3rd Edition.pdf'}, page_content='******ebook converter DEMO W atermarks*******\nKirschwasser\nTHE MOSEL'),
  0.2298462849946702),
 (Document(metadata={'id': '52ad19e1-477d-47b4-b25c-cac19166525c', 'page': 266, 'source': 'uploaded_files\\The Wine Bible, 3rd Edition.pdf'}, page_content='******ebook converter DEMO W atermarks*******\nThe stunning crayères  deep underneath the Champagne house T aittinger .'),
  0.23495811582609682),
 (Document(metadat