In [46]:
import os
import streamlit as st
import pickle
import time
import langchain
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

In [47]:
#load openAI api key
os.environ['OPENAI_API_KEY'] = 'TODO'

#Initialize LLM
llm = OpenAI(temperature=0.8, max_tokens=500)

1. Load Data

In [71]:
loader = UnstructuredURLLoader(
    urls = [
        "https://en.wikipedia.org/wiki/India",
        "https://en.wikipedia.org/wiki/Bangalore"
    ]
)

data = loader.load()
len(data)

2

2. Split data to create chunks

In [72]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

docs = text_splitter.split_documents(data)

In [73]:
len(docs)

371

In [74]:
docs[0]

Document(page_content="Toggle the table of contents\n\nIndia\n\n295 languages\n\nAcèh\n\nАдыгэбзэ\n\nАдыгабзэ\n\nAfrikaans\n\nAlemannisch\n\nአማርኛ\n\nAnarâškielâ\n\nअंगिका\n\nÆnglisc\n\nАԥсшәа\n\nالعربية\n\nAragonés\n\nܐܪܡܝܐ\n\nԱրեւմտահայերէն\n\nArmãneashti\n\nArpetan\n\nঅসমীয়া\n\nAsturianu\n\nAtikamekw\n\nअवधी\n\nAvañe'ẽ\n\nАвар\n\nAymar aru\n\nAzərbaycanca\n\nتۆرکجه\n\nBasa Bali\n\nবাংলা\n\nBanjar\n\nBân-lâm-gú\n\nBasa Banyumasan\n\nБашҡортса\n\nБеларуская\n\nБеларуская (тарашкевіца)\n\nभोजपुरी\n\nBikol Central\n\nBislama\n\nБългарски\n\nBoarisch\n\nབོད་ཡིག\n\nBosanski\n\nBrezhoneg\n\nБуряад\n\nCatalà\n\nЧӑвашла\n\nCebuano\n\nČeština\n\nChamoru\n\nChavacano de Zamboanga\n\nChi-Chewa\n\nChiShona\n\nChiTumbuka\n\nCorsu\n\nCymraeg\n\nDagbanli\n\nDansk\n\nالدارجة\n\nDavvisámegiella\n\nDeitsch\n\nDeutsch\n\nދިވެހިބަސް\n\nDiné bizaad\n\nDolnoserbski\n\nडोटेली\n\nཇོང་ཁ\n\nEesti\n\nΕλληνικά\n\nЭрзянь\n\nEspañol\n\nEsperanto\n\nEstremeñu\n\nEuskara\n\nEʋegbe\n\nفارسی\n\nFiji Hindi\n\nFøroyskt

3. Create embeddings for these chunks and save the embedding to FAISS index

In [75]:
# create openai embeddings
embeddings = OpenAIEmbeddings()

vectorindex = FAISS.from_documents(docs, embeddings)

In [76]:
# store vector index locally in a file. So later we load from that file.
file_path = "faiss_index.pkl"
with open(file_path, "wb") as f:
    pickle.dump(vectorindex, f)

TypeError: cannot pickle '_thread.RLock' object

In [None]:
if os.path.exists(file_path):
    with open(file_path, "rd") as f:
        vectorindex = pickle.load(f);

4. Retrieve similar embeddings for a given question and call LLM to retrieve final answer

In [77]:
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorindex.as_retriever())
chain



In [80]:
query = "How many union territories are there in India? List them."

langchain.debug = True

chain({"question": query}, return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "How many union territories are there in India? List them."
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQAWithSourcesChain > 3:chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQAWithSourcesChain > 3:chain:MapReduceDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "Related topics v t e States and union territories of India States Andhra Pradesh Arunachal Pradesh Assam Bihar Chhattisgarh Goa Gujarat Haryana Himachal Pradesh Jharkhand Karnataka Kerala Madhya Pradesh Maharashtra Manipur Meghalaya Mizoram Nagaland Odisha Punjab Rajasthan Sikkim Tamil Nadu Telangana Tripura Uttar Pradesh Uttarakhand West Bengal Union territories Andaman and Nicobar Islands Chandigarh Dadra and Nagar Haveli and Daman and Diu Delhi Jammu 

{'answer': ' There are 8 union territories in India: Jammu and Kashmir, Puducherry, National Capital Territory of Delhi, Andaman and Nicobar Islands, Chandigarh, Dadra and Nagar Haveli and Daman and Diu, Ladakh, and Lakshadweep.\n',
 'sources': 'https://en.wikipedia.org/wiki/India'}