In [13]:
import os
from dotenv import load_dotenv

import pandas as pd
import matplotlib.pyplot as plt
from transformers import GPT2TokenizerFast
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain

In [14]:
# Load environment variables via python-dotenv; the OPENAI_API_KEY read in the
# next cell is expected to come from here (presumably a local .env file).
load_dotenv()

True

In [19]:
# Fail fast with a readable message when the key is missing. os.getenv returns
# None for an unset variable, and assigning None into os.environ raises an
# opaque TypeError. When the variable is set, no re-assignment is needed: the
# value is already in os.environ.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

## First Get chunks of the PDF ready

In [16]:
# Easy way: let the PDF loader read the file and split it in a single call.
loader = PyPDFLoader("./book-no-6.pdf")
pages = loader.load_and_split()
chunks = pages  # same list under both names; the embedding cell reads `chunks`

In [None]:
# advanced way
# choose your chunk size
import textract

# textract.process returns the extracted text as bytes. Decode that result
# directly; re-opening the PDF itself in text mode would read raw binary PDF
# data (garbage or a UnicodeDecodeError), not the document's text.
doc = textract.process("./book-no-6.pdf")
text = doc.decode("utf-8")

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")


def count_tokens(text: str) -> int:
    """Return the number of token ids the module-level ``tokenizer`` yields for ``text``."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)

# Split the extracted text into overlapping chunks whose size is measured with
# count_tokens. The keyword must be spelled `length_function`; the previous
# spelling `length_funtion` is not a parameter the splitter recognises.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=24,
    length_function=count_tokens,
)

chunks = text_splitter.create_documents([text])

## Embed text and store embeddings

In [20]:
# Embed every chunk with the OpenAI embedding model and index the vectors in FAISS.
embeddings = OpenAIEmbeddings()
db = FAISS.from_documents(documents=chunks, embedding=embeddings)

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: You exceeded your current quota, please check your plan and billing details..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: You exceeded your current quota, please check your plan and billing details..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: You exceeded your current quota, please check your plan and billing details..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 8.0 seconds as it raised RateLimitError: You exceeded your current quota, please check your plan and billing details..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 10.0 seconds as it raised RateLimitError: You exceeded your current quota, please check your plan and bi

RateLimitError: You exceeded your current quota, please check your plan and billing details.

## Now Retrieval Function

In [None]:
# Look up the chunks most similar to the question in the vector store.
query = "Who created transformers?"
docs = db.similarity_search(query)

# Build a "stuff" question-answering chain and run it over the retrieved chunks.
chain = load_qa_chain(OpenAI(temperature=0), chain_type="stuff")
output = chain.run(input_documents=docs, question=query)
output

## Chatbot with conversation history

In [None]:
from IPython.display import display
import ipywidgets as widgets

# Conversational chain: the LLM answers using documents fetched through the
# vector store's retriever.
qa = ConversationalRetrievalChain.from_llm(
    llm=OpenAI(temperature=0.1),
    retriever=db.as_retriever(),
)

In [None]:
import html

# Running list of (question, answer) pairs passed back to the chain on every turn.
chat_history = []


def on_submit(_):
    """Handle one submitted message: query the chain and render the exchange.

    Reads the text from ``input_box`` and clears the box. Blank input is
    ignored. Entering "exit" (any casing) prints a goodbye and skips the
    query. Otherwise the question is sent to ``qa`` together with
    ``chat_history``, the (question, answer) pair is appended to
    ``chat_history``, and both sides of the exchange are displayed as HTML.

    The callback argument is unused.
    """
    query = input_box.value.strip()
    input_box.value = ""

    # Nothing to ask: avoid sending an empty question to the LLM.
    if not query:
        return

    if query.lower() == "exit":
        print("Bye bye")
        return

    result = qa({"question": query, "chat_history": chat_history})
    answer = result["answer"]
    chat_history.append((query, answer))

    # Escape both strings so characters such as "<" or "&" in the user's text
    # or the model's answer are shown literally instead of being parsed as
    # markup. The "User:" label now closes its <b> tag.
    display(widgets.HTML(f"<b>User:</b> {html.escape(query)}"))
    display(widgets.HTML(f"<b>Chatbot:</b> {html.escape(answer)}"))

# Build the text box and register on_submit as its submit callback.
# NOTE(review): newer ipywidgets releases may flag Text.on_submit as
# deprecated — confirm against the installed version.
input_box = widgets.Text(placeholder="Enter your message")
input_box.on_submit(on_submit)

# Greet the user, then show the input box.
print("Welcome to custom ChatGPT")
display(input_box)