## Creating an index and populating it with documents using FAISS

Simple example on how to ingest PDF documents, then web pages content into a FAISS VectorStore.


### Base parameters

In [8]:
index_name = "pdf_docs"

#### Imports

In [11]:
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

## Initial index creation and document ingestion

#### Document loading from a folder containing PDFs

In [12]:
pdf_folder_path = 'pdf'

loader = PyPDFDirectoryLoader(pdf_folder_path)
docs = loader.load()

#### Split documents into chunks with some overlap

In [13]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024,
                                               chunk_overlap=40)
all_splits = text_splitter.split_documents(docs)

#### Create the index and ingest the documents

In [14]:
embeddings = HuggingFaceEmbeddings()
faiss_index = FAISS.from_documents(all_splits, embeddings)



#### Write the schema to a yaml file to be able to open the index later on

In [15]:
faiss_index.save_local("pdf", index_name)

faiss_index.add_documents(all_splits)


['5ccee31b-a2a9-4e87-9b0d-6e60f16c07a0',
 'd22241b6-c2ab-4a36-a5d0-f78efa1c54d1',
 '56c37ebf-382b-4147-a91b-7f2a8174145a',
 '6b10a531-ccd6-4c89-ad85-581dc777845e',
 '931fb08f-246b-4de1-b2b9-66256ab6e4f1',
 'd26e166d-8192-4ad4-bfbe-602f74d1e05b',
 '2fbd7e06-e4f3-4353-ac61-831ee0101f93',
 '82350946-64f1-45f5-bb4c-0a7d701586bd',
 'a864abd8-b6e5-4ac3-9efa-86c75d5ce582',
 '0bfb8409-95af-4468-96a8-5335a43745da',
 'c2bbfa0f-8e13-413f-87df-66edf650b3d5']

## Ingesting new documents

#### Example with Web pages

In [8]:
from langchain.document_loaders import WebBaseLoader

In [10]:
loader = WebBaseLoader(["https://ai-on-openshift.io/getting-started/openshift/",
                        "https://ai-on-openshift.io/getting-started/opendatahub/",
                        "https://ai-on-openshift.io/getting-started/openshift-data-science/",
                        "https://ai-on-openshift.io/odh-rhods/configuration/",
                        "https://ai-on-openshift.io/odh-rhods/custom-notebooks/",
                        "https://ai-on-openshift.io/odh-rhods/nvidia-gpus/",
                        "https://ai-on-openshift.io/odh-rhods/custom-runtime-triton/",
                        "https://ai-on-openshift.io/odh-rhods/openshift-group-management/",
                        "https://ai-on-openshift.io/tools-and-applications/minio/minio/"
                       ])

In [11]:
data = loader.load()

In [13]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024,
                                               chunk_overlap=40)
all_splits = text_splitter.split_documents(data)

In [14]:
embeddings = HuggingFaceEmbeddings()
faiss_index = FAISS.from_documents(all_splits, embeddings)

In [7]:
faiss_index.add_documents(all_splits)

['f4c03e9c-4ba3-481c-91f6-11af99745e26',
 '09a19392-ab4c-4eb8-ab76-10afbe08954b',
 '15ad6c60-a630-4139-ae87-730b5617fd7d',
 'df35721f-cb4e-412c-a0dd-87a86ecd49fe',
 '49189c0c-56e2-4b3c-b3d5-9b991f080845',
 '46fecf43-8139-41ff-879c-40375a5405b9',
 'ea8affe0-3161-42d8-9260-dcd72e1326fe',
 '96745a91-626d-453e-b482-a36950c52f6f',
 'e038d2cd-7095-4030-af87-2f46cda83a1c',
 'ed4aacd7-c9af-4529-bac6-99e7b7bd79ad',
 'a9b62fef-c4c2-4c6c-bb04-8378291c118e']