In [1]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import UnstructuredExcelLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
import threading

# Paths used by the indexing functions below (both are relative paths).
# data_dir: folder that is scanned for .xlsx workbooks to index.
# faiss_db: root folder under which each workbook's FAISS index is saved
#           in its own "<file name>_db" sub-directory.
data_dir = 'data'
faiss_db = 'vectorstore/db_faiss'

def process_excel_file(excel_path):
    """Build and save a FAISS vector store for a single Excel workbook.

    The workbook is loaded with ``UnstructuredExcelLoader`` (``mode="elements"``),
    split with ``RecursiveCharacterTextSplitter`` (``chunk_size=1000``,
    ``chunk_overlap=100``), embedded with the
    ``sentence-transformers/all-MiniLM-L6-v2`` model on CPU and indexed with
    FAISS. The index is written below ``faiss_db`` in a directory named
    ``<file name>_db``.

    A workbook that produces no text chunks is reported and skipped instead of
    being handed to FAISS.

    Args:
        excel_path: Path of the Excel file to index.

    Returns:
        None. The result is the index saved on disk plus progress messages on
        standard output.
    """

    def report(message):
        # Emit the message and its newline in a single write. A plain print()
        # writes the newline separately, which lets progress lines from
        # workers running at the same time run together on one line.
        print(f"{message}\n", end="")

    # Load the Excel file using UnstructuredExcelLoader
    loader = UnstructuredExcelLoader(excel_path, mode="elements")
    document = loader.load()
    report(f"Loaded Excel file: {excel_path}")

    # Divide the loaded documents into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    texts = text_splitter.split_documents(document)
    report(f"Texts splitted for Excel file: {excel_path}")

    if not texts:
        # Nothing to index: an empty chunk list gives FAISS no vector to size
        # the index from, so skip before loading the embedding model.
        report(f"No text found in Excel file: {excel_path}, skipping")
        return

    # Initialize HuggingFaceEmbeddings using a specific model
    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs={'device': 'cpu'},
    )
    report(f"Embeddings created for Excel file: {excel_path}")

    # Create a vector store using FAISS from the text chunks and embeddings
    db = FAISS.from_documents(texts, embeddings)
    report(f"Vector store created for Excel file: {excel_path}")

    # Save the vector store locally, one directory per workbook
    db.save_local(os.path.join(faiss_db, f"{os.path.basename(excel_path)}_db"))

def process_excel_files_in_parallel(max_workers=None):
    """Index every Excel workbook in ``data_dir`` using a pool of worker threads.

    Creates ``faiss_db`` if needed, collects the ``.xlsx`` files found directly
    in ``data_dir`` and runs ``process_excel_file`` on each of them. The call
    returns once every file has been processed. A failure on one file is
    reported on standard output and does not stop the remaining files.

    Args:
        max_workers: Maximum number of files processed at the same time.
            Defaults to the smaller of the number of files and the CPU count.
            Each worker loads its own embedding model, so this also bounds
            how many models are held in memory at once.

    Returns:
        None.
    """
    # Local import keeps this function self-contained with respect to the
    # file's import block.
    from concurrent.futures import ThreadPoolExecutor, as_completed

    # Ensure the output directory exists (no separate existence check, which
    # could race with another process creating it).
    os.makedirs(faiss_db, exist_ok=True)

    # Collect the workbooks: match the extension case-insensitively and skip
    # the "~$..." lock files Excel keeps next to a workbook that is open.
    excel_paths = [
        os.path.join(data_dir, name)
        for name in sorted(os.listdir(data_dir))
        if name.lower().endswith('.xlsx') and not name.startswith('~$')
    ]
    print("excel_files_loaded")

    if not excel_paths:
        # Nothing to do; also avoids creating a pool with zero workers.
        return

    if max_workers is None:
        max_workers = min(len(excel_paths), os.cpu_count() or 1)

    # Leaving the ``with`` block waits for all submitted work to finish.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        pending = {
            executor.submit(process_excel_file, path): path
            for path in excel_paths
        }
        for future in as_completed(pending):
            error = future.exception()
            if error is not None:
                # Report the failure with the file it belongs to and carry on
                # with the other files.
                print(f"Failed to process Excel file: {pending[future]}: {error!r}\n", end="")

# Entry point: build the vector stores for all workbooks when this code is
# executed directly rather than imported as a module.
if __name__ == "__main__":
    process_excel_files_in_parallel()


excel_files_loaded
Loaded Excel file: data\Book.xlsx
Texts splitted for Excel file: data\Book.xlsx
Loaded Excel file: data\Superstore 2023.xlsx
Texts splitted for Excel file: data\Superstore 2023.xlsx


  from .autonotebook import tqdm as notebook_tqdm


Embeddings created for Excel file: data\Book.xlsxEmbeddings created for Excel file: data\Superstore 2023.xlsx

Vector store created for Excel file: data\Book.xlsx
Vector store created for Excel file: data\Superstore 2023.xlsx
