In [11]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

def create_and_save_vector_db(
    xls_path='Статьи.xls',
    title_index_path='title_faiss_index',
    chunk_index_path='chunk_faiss_index',
    chunk_size=1000
):
    """
    Create and save FAISS vector stores for a two-step RAG pipeline.

    Args:
        xls_path (str): Path to the Excel file.
        title_index_path (str): Directory to save the title FAISS index.
        chunk_index_path (str): Directory to save the description chunk FAISS index.
        chunk_size (int): Size of description chunks in characters.
    """
    # Load the Excel file
    df = pd.read_excel(xls_path)
    if 'Заголовок статьи' not in df.columns or 'Описание' not in df.columns:
        raise ValueError("Excel file must contain 'Заголовок статьи' and 'Описание' columns")
    
    # Initialize embeddings
    embeddings = HuggingFaceEmbeddings(model_name='bert-base-multilingual-cased')
    
    # Create title documents
    title_docs = [
        Document(page_content=row['Заголовок статьи'], metadata={'article': row['Заголовок статьи']})
        for _, row in df.iterrows()
    ]
    
    # Create description chunk documents with error handling
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
    chunk_docs = []
    for _, row in df.iterrows():
        article = row['Заголовок статьи']
        description = row['Описание']
        
        # Convert description to string and handle NaN or non-string types
        if pd.isna(description):
            description = ""  # Replace NaN with empty string
        else:
            description = str(description)  # Ensure it's a string
        
        # Only split if description has content
        if description.strip():
            chunks = text_splitter.split_text(description)
            for chunk in chunks:
                doc = Document(page_content=chunk, metadata={'article': article})
                chunk_docs.append(doc)
        else:
            print(f"Skipping empty or invalid description for article: {article}")
    
    # Create and save the title vector store
    title_vector_store = FAISS.from_documents(title_docs, embeddings)
    title_vector_store.save_local(title_index_path)
    
    # Create and save the description chunk vector store
    chunk_vector_store = FAISS.from_documents(chunk_docs, embeddings)
    chunk_vector_store.save_local(chunk_index_path)
    
    print(f"Title vector store saved to '{title_index_path}'")
    print(f"Description chunk vector store saved to '{chunk_index_path}'")

# Run the function
create_and_save_vector_db()

No sentence-transformers model found with name bert-base-multilingual-cased. Creating a new one with mean pooling.
Created a chunk of size 1031, which is longer than the specified 1000
Created a chunk of size 1109, which is longer than the specified 1000
Created a chunk of size 1680, which is longer than the specified 1000
Created a chunk of size 1850, which is longer than the specified 1000
Created a chunk of size 3113, which is longer than the specified 1000
Created a chunk of size 1439, which is longer than the specified 1000
Created a chunk of size 1448, which is longer than the specified 1000
Created a chunk of size 1080, which is longer than the specified 1000
Created a chunk of size 1181, which is longer than the specified 1000
Created a chunk of size 2125, which is longer than the specified 1000
Created a chunk of size 1079, which is longer than the specified 1000
Created a chunk of size 1102, which is longer than the specified 1000
Created a chunk of size 1009, which is longer

Skipping empty or invalid description for article: Участие в котировочной сессии
Skipping empty or invalid description for article: Уведомления о наступающих сроках исполнения этапа
Skipping empty or invalid description for article: Видео
Skipping empty or invalid description for article: Заказчику не приходит УПД
Skipping empty or invalid description for article: Карта сайта
Skipping empty or invalid description for article: Несанкционированное вмешательство в Подсистему
Skipping empty or invalid description for article: Настройка MS Word перед работой
Skipping empty or invalid description for article: Работа с контрактами, Мои контракты
Skipping empty or invalid description for article: Список аккредитованных удостоверяющих центров
Skipping empty or invalid description for article: Как в Исполнении контрактов, электронно актируемых через ЕИС, указать КПП крупнейшем налогоплательщике, и что делать, если у организации 2 и более КПП
Skipping empty or invalid description for article: Тес