## Creating an index and populating it with documents using FAISS

A simple example of how to ingest PDF documents, and then web page content, into a FAISS vector store.


### Base parameters

In [1]:
# Name under which the FAISS index is stored when it is saved to disk.
index_name = "pdf_docs"

#### Imports

In [2]:
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores.faiss import FAISS

## Initial index creation and document ingestion

#### Document loading from a folder containing PDFs

In [8]:
# Folder (relative path) that contains the PDF files to ingest.
pdf_folder_path = 'pdf'

# Point a directory loader at that folder, then read its content into `docs`.
loader = PyPDFDirectoryLoader(path=pdf_folder_path)
docs = loader.load()

#### Split documents into chunks with some overlap

In [6]:
# Chunks of at most 1024 units with a 40-unit overlap between neighbours.
# NOTE(review): units are presumably characters (splitter default) - confirm.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=40,
)

# Split the loaded PDF documents into chunks ready for embedding.
all_splits = text_splitter.split_documents(docs)

#### Create the index and ingest the documents

In [9]:
# Embedding model built with library defaults; the first run downloads the
# model files, as the progress output of this cell shows.
embeddings = HuggingFaceEmbeddings()

# Build a new FAISS index from the PDF chunks, embedding them with `embeddings`.
faiss_index = FAISS.from_documents(
    documents=all_splits,
    embedding=embeddings,
)

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

#### Save the index to disk to be able to open it later on

In [10]:
# Persist the index into the "pdf" folder under the name held in `index_name`.
faiss_index.save_local(folder_path="pdf", index_name=index_name)

## Ingesting new documents

#### Example with Web pages

In [8]:
from langchain.document_loaders import WebBaseLoader

In [10]:
# Loader for the web pages to ingest; nothing is stored in `data` until
# `loader.load()` is called in the next cell.
loader = WebBaseLoader(
    [
        "https://ai-on-openshift.io/getting-started/openshift/",
        "https://ai-on-openshift.io/getting-started/opendatahub/",
        "https://ai-on-openshift.io/getting-started/openshift-data-science/",
        "https://ai-on-openshift.io/odh-rhods/configuration/",
        "https://ai-on-openshift.io/odh-rhods/custom-notebooks/",
        "https://ai-on-openshift.io/odh-rhods/nvidia-gpus/",
        "https://ai-on-openshift.io/odh-rhods/custom-runtime-triton/",
        "https://ai-on-openshift.io/odh-rhods/openshift-group-management/",
        "https://ai-on-openshift.io/tools-and-applications/minio/minio/",
    ]
)

In [11]:
# Load the pages listed in the loader above; the result is split into chunks next.
data = loader.load()

In [13]:
# Same chunking parameters as for the PDFs: 1024-unit chunks, 40-unit overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=40,
)

# `all_splits` now holds the web-page chunks (it no longer refers to the PDF chunks).
all_splits = text_splitter.split_documents(data)

In [14]:
import inspect

# Re-open the index that was saved from the PDFs instead of building a new one.
# Calling FAISS.from_documents(all_splits, ...) here would create an index that
# holds only the web-page chunks (dropping the PDF vectors), and the
# add_documents call that follows would then insert those same chunks again.
embeddings = HuggingFaceEmbeddings()

# Recent LangChain releases refuse to unpickle the saved docstore unless the
# caller opts in, while older releases do not know the flag; pass it only when
# the installed version declares it. Opting in is acceptable here only because
# the files were written by this notebook - never do it for untrusted files.
load_kwargs = {}
if "allow_dangerous_deserialization" in inspect.signature(FAISS.load_local).parameters:
    load_kwargs["allow_dangerous_deserialization"] = True

# Same folder and index name that were used when the index was saved.
faiss_index = FAISS.load_local("pdf", embeddings, index_name=index_name, **load_kwargs)

In [11]:
# Add the web-page chunks to the index; the list displayed below is the
# call's return value (one id string per added chunk).
faiss_index.add_documents(all_splits)

['f6cd0a8a-9fb8-4e96-b739-1e471e4fa91e',
 '1e5906f5-f391-47f2-88c8-55c7407987f4',
 '6734579a-17da-49b6-b843-655a4b2c6c74',
 'e7972a9d-67aa-4b82-bb6e-9939b96065a6',
 '50e81553-f392-4f76-b0c9-d84b20772ca2',
 'f88b4b9f-73e9-4b73-8289-eece688591f2',
 'c9d61914-860f-4a72-bcc5-2aee42aac734',
 '5af02199-1172-439b-b388-c5c3c77f7e16',
 'ca704d9f-5e42-4dce-bee6-0531fa6b5d60',
 '27e9b43f-2087-46bd-a90a-21ea829c16f2',
 '04a1dd4a-2f75-47d6-8fe1-4c831b84a01d',
 'fd0d4b65-c7e4-4819-86fd-c04598150809',
 'f89abe3b-1a0e-4a50-9c1a-677d06665fd2',
 '439a0f6e-8245-4382-ae55-5639b58d8243',
 'ae93ff22-0cfc-4123-a743-f69d9e7f3515',
 '6f6a2ec9-b1fb-4493-a81b-49a539a71af9',
 '8d9c5260-cf93-4db3-91a9-59fbfe5b7aee',
 '491d2b10-9ba2-474c-b872-a4eae2536a0f',
 '0a623895-ba01-40a3-8974-fece0f519006',
 '3e14aaca-f687-49ec-a632-38e8345a5259',
 '786150a9-78d0-49cb-938b-8637184e9d53',
 '82bea82a-7ccd-4734-a3c6-42c730f15421',
 '690e38c4-d711-4512-957e-7830da024c2d',
 '979f1089-5c42-488a-96c6-47464900a728',
 'dd467ca0-e5f6-