## Document Question Answering with a FAISS vector store: embed once, persist locally, reuse

### Use FAISS and LangChain to answer questions over documents with a locally persisted vector store. The embeddings and documents are stored once and then reloaded from disk later.

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import GCSDirectoryLoader
import os
import configparser
import pandas as pd
from google.cloud import storage

In [None]:
# Load the project settings from an INI file; the path is resolved relative to
# the current working directory. Note that ConfigParser.read() silently skips
# files it cannot open and returns the list of files it actually parsed.
CONFIG_PATH = '../config.ini'
config = configparser.ConfigParser()
config.read(CONFIG_PATH)

In [None]:
# Look up the `openai` option in the `[api_key]` section of the parsed config.
# Raises configparser.NoSectionError / NoOptionError when the entry is absent.
OPENAI_API_KEY = config.get(section='api_key', option='openai')

## Load and process documents

### Load documents to do question answering over.
### Next, we split the documents into small chunks so that we can find the chunks most relevant to a query and pass only those to the LLM.

In [None]:
# Tell the Google Cloud client libraries which service-account key file to use.
# A value that is already present in the environment is respected, so the
# notebook also runs on machines where the credentials live elsewhere; the
# original developer-specific path is only applied as a fallback.
if 'GOOGLE_APPLICATION_CREDENTIALS' not in os.environ:
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/Users/teekyboy/code/gcp/chatbot-t1-firebase.json'

In [None]:
# Pull every object stored under the "data" prefix of the GCS bucket into
# LangChain documents.
loader = GCSDirectoryLoader(
    project_name="chatbot",
    bucket="chatbot-t1.appspot.com",
    prefix="data",
)
data = loader.load()

# Break the loaded documents into chunks (chunk_size=1000, chunk_overlap=20)
# so that individual chunks can be embedded and retrieved.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
)
texts = text_splitter.split_documents(data)

# Folder in which the FAISS index is persisted and later reloaded from.
vector_directory = "../raw_data/embeddings/"

## Initialize FAISS DB

### Create embeddings for each chunk and insert into the FAISS vector database.

In [None]:
# Embedding client authenticated with the OpenAI key read from the config.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Embed all chunks and build the in-memory FAISS vector store from them.
vectordb = FAISS.from_documents(documents=texts, embedding=embeddings)

## Saving the Database

In [None]:
# Write the vector store to disk under vector_directory.
vectordb.save_local(folder_path=vector_directory)

# Drop the in-memory reference; the store is loaded back from disk further
# down to demonstrate reuse of the persisted data.
vectordb = None

## Load the Database from disk, and create the chain

### Initialize the chain that we will use for question answering.

In [None]:
# OpenAI completion model used to generate answers; temperature=0 asks for the
# least random output.
llm = OpenAI(
    openai_api_key=OPENAI_API_KEY,
    temperature=0,
)

In [None]:
# Restore the persisted vector store from vector_directory, passing the
# embeddings object that load_local() requires.
vectordb = FAISS.load_local(vector_directory, embeddings)

# Expose the store as a retriever and wire it, together with the LLM, into a
# RetrievalQA chain of type "stuff".
retriever = vectordb.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)

## Ask questions!

### Now we can use the chain to ask questions!

In [None]:
# The original line `query = {user_input})` was an unfilled template
# placeholder with an unbalanced parenthesis and did not parse. Read the
# question interactively instead, then run it through the RetrievalQA chain.
user_input = input("Enter your question: ")
query = user_input.strip()
qa.run(query)