In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the project paths used later in this
# notebook live under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing Libraries

In [2]:
!pip install -q langchain pypdf sentence-transformers faiss-gpu

In [3]:
from glob import glob
from os import path
from langchain.document_loaders import PyPDFLoader
import pickle
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Extracting Data From Documents

## Extracting Data

In [4]:
# Project root on the mounted Drive; edit this to point at your own copy.
root_path = '/content/drive/MyDrive/Colab Notebooks/NVMe-QABot/'

# Every PDF inside the NVMe folder under root_path, in sorted path order.
file_list = glob(path.join(root_path, 'NVMe', '*.pdf'))
file_list.sort()

# 1-based first page to keep for each PDF, matched by position to file_list.
page_start = [14, 5, 6, 7, 1, 5, 6]

In [5]:
def get_pdf_data(file_list, page_start):
    """Load several PDFs and return their retained pages as one flat list.

    Each file is read with ``PyPDFLoader(...).load()`` and the result is
    sliced so that everything before the file's starting page is dropped
    (assumes ``load()`` yields one entry per PDF page, in page order —
    TODO confirm against the LangChain version in use).

    Args:
        file_list: Paths of the PDF files to load.
        page_start: 1-based number of the first page to keep for each file,
            matched by position to ``file_list``.

    Returns:
        A list with the retained pages of every file, in ``file_list`` order.

    Raises:
        ValueError: If the two arguments differ in length, or if any entry
            of ``page_start`` is smaller than 1.
    """
    # A length mismatch means the offsets no longer line up with the files;
    # fail loudly instead of indexing past the end or ignoring extra entries.
    if len(file_list) != len(page_start):
        raise ValueError(
            f"file_list has {len(file_list)} entries but page_start has "
            f"{len(page_start)}; they must be the same length"
        )

    # Validate every offset before any PDF is parsed. A value of 0 would turn
    # into the slice [-1:] and silently keep only the last page.
    for pdf_path, first_page in zip(file_list, page_start):
        if first_page < 1:
            raise ValueError(
                f"page_start for {pdf_path!r} must be >= 1, got {first_page}"
            )

    all_pages_data = []
    for pdf_path, first_page in zip(file_list, page_start):
        loader = PyPDFLoader(pdf_path)
        # Convert the 1-based page number to a 0-based slice start.
        all_pages_data.extend(loader.load()[first_page - 1:])
    return all_pages_data

In [6]:
# Gather the retained pages of every PDF into a single flat list.
all_pages_data = get_pdf_data(file_list=file_list, page_start=page_start)

# Data Preprocessing

## Splitting into Chunks Using Text Splitters

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded pages into chunks, using a chunk size of 1000 and an
# overlap of 20 between chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
)
all_splits = text_splitter.split_documents(all_pages_data)

## Creating Embeddings and Storing in Vector Store

In [8]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import torch

model_name = "sentence-transformers/all-mpnet-base-v2"
# Run the embedding model on the GPU when one is attached and fall back to
# the CPU otherwise, so this cell does not crash on a CPU-only runtime.
model_kwargs = {"device": "cuda" if torch.cuda.is_available() else "cpu"}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# Build the FAISS vector store from the chunks using the embedding model above.
vectorstore = FAISS.from_documents(all_splits, embeddings)

## Saving and loading FAISS index

In [9]:
# Save the index to the faiss_index folder under root_path. path.join is used
# instead of string concatenation so the target stays correct whether or not
# root_path ends with a path separator.
vectorstore.save_local(path.join(root_path, "faiss_index"))