## Creating an index and populating it with documents using FAISS

A simple example of how to ingest PDF documents, and then web page content, into a FAISS vector store.


### Base parameters

In [2]:
# Name under which the FAISS index is saved on disk (passed to save_local).
index_name: str = "pdf_docs"

#### Imports

In [3]:
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores.faiss import FAISS

## Initial index creation and document ingestion

#### Document loading from a folder containing PDFs

In [4]:
# Folder that is scanned for PDF files (relative to the notebook's working directory).
pdf_folder_path = 'pdf'

# The directory loader is handed the folder path; load() returns the documents
# that are split into chunks further down.
# NOTE(review): presumably one document per PDF page - confirm against the loader docs.
loader = PyPDFDirectoryLoader(pdf_folder_path)
docs = loader.load()

#### Split documents into chunks with some overlap

In [5]:
# Cut the loaded PDF documents into chunks of at most 1024 characters, with a
# 40-character overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=40,
)
all_splits = text_splitter.split_documents(docs)

#### Create the index and ingest the documents

In [7]:
# Embedding model with its default settings; it is used to vectorize every chunk.
embeddings = HuggingFaceEmbeddings()

# Build the FAISS index directly from the PDF chunks.
faiss_index = FAISS.from_documents(documents=all_splits, embedding=embeddings)

#### Save the index to disk to be able to open it later on

In [8]:
# Persist the index into the "pdf" folder under the name held in index_name so
# that it can be opened again later on.
# The chunks in all_splits were already indexed by FAISS.from_documents, so they
# are deliberately not added a second time here: another add_documents call
# would store every chunk twice and leave the in-memory index out of sync with
# the copy that was just saved.
faiss_index.save_local("pdf", index_name)


## Ingesting new documents

#### Example with Web pages

In [8]:
from langchain.document_loaders import WebBaseLoader

In [10]:
# Web pages whose content is ingested in addition to the PDFs.
web_page_urls = [
    "https://ai-on-openshift.io/getting-started/openshift/",
    "https://ai-on-openshift.io/getting-started/opendatahub/",
    "https://ai-on-openshift.io/getting-started/openshift-data-science/",
    "https://ai-on-openshift.io/odh-rhods/configuration/",
    "https://ai-on-openshift.io/odh-rhods/custom-notebooks/",
    "https://ai-on-openshift.io/odh-rhods/nvidia-gpus/",
    "https://ai-on-openshift.io/odh-rhods/custom-runtime-triton/",
    "https://ai-on-openshift.io/odh-rhods/openshift-group-management/",
    "https://ai-on-openshift.io/tools-and-applications/minio/minio/",
]

# Note: this rebinds `loader`, which previously referred to the PDF directory loader.
loader = WebBaseLoader(web_page_urls)

In [11]:
# Load the pages configured on the loader; the result is split into chunks below.
# NOTE(review): load() presumably issues a network request per URL - confirm.
data = loader.load()

In [13]:
# Split the web page documents with the same settings as the PDFs: chunks of at
# most 1024 characters with a 40-character overlap. This rebinds `all_splits`,
# which from here on holds the web page chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=40,
)
all_splits = text_splitter.split_documents(data)

In [14]:
# Add the web page chunks to the index that already holds the PDF chunks.
# The existing `faiss_index` (and the embedding model it was built with) is
# reused on purpose: rebuilding the index here with FAISS.from_documents would
# discard the PDF documents, and calling add_documents afterwards would then
# store every web page chunk twice.
# The call is the last expression of the cell so that the ids of the newly
# added chunks are displayed.
# NOTE: the copy saved on disk earlier is not updated by this call; run
# faiss_index.save_local("pdf", index_name) again to persist the additions.
faiss_index.add_documents(all_splits)

['d8de92ad-8f69-4bee-8b9a-7aff775fcf40',
 'ec8df273-9c6e-41d7-8e20-faa43afd41a4',
 '89a84fd2-02c8-4047-81c4-e357baef32e6',
 '48b5fed9-524c-4a74-8a94-9ccd9df2886a',
 '51af36ff-008f-49ac-b158-6a8bac4e5ebf',
 'bba9828b-4db5-4cb3-b51c-8ed35e36f60c',
 '7c1d9831-33f7-49c8-9707-59d5746ad281',
 '11bc4fb1-8651-4945-9df0-69f49893c816',
 '7d1e7967-8635-4c5d-9cc9-edf9cef5890c',
 '7bb29970-b59b-43c0-8adc-cef27f80e214',
 '88ed9b21-abb1-4e8a-b831-a744bf6547e0']