In [None]:
# Install and Import all required libraries

%pip install openai pandas cassandra-driver cassio langchain PyMuPDF
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
from cassandra.query import dict_factory
from cassandra.query import SimpleStatement
from getpass import getpass
import os

In [None]:
# Collect the information required to connect to your Astra Database and to call the OpenAI API.
# The database token and the OpenAI key are read with getpass so they are not echoed while typing.
# Every value is stripped of leading/trailing whitespace: these values are usually pasted, and a
# stray space or newline would otherwise be passed on verbatim to the driver / API client.
# The OpenAI key is exported through the OPENAI_API_KEY environment variable.

ASTRA_DB_SECURE_BUNDLE_PATH = input("Please provide the full path to your Secure Connect Bundle zipfile: ").strip()
ASTRA_DB_APPLICATION_TOKEN = getpass("Please provide your Database Token ('AstraCS:...' string): ").strip()
ASTRA_DB_KEYSPACE = input("Please provide the Keyspace name for your Database: ").strip()
os.environ["OPENAI_API_KEY"] = getpass("Please enter your OpenAI API Key: ").strip()

In [None]:

# Connect to your Astra Database.
# The secure connect bundle path is passed as the driver's cloud configuration, and the
# application token is supplied as the password of the fixed "token" user name.

astra_cloud_config = {"secure_connect_bundle": ASTRA_DB_SECURE_BUNDLE_PATH}
astra_auth_provider = PlainTextAuthProvider("token", ASTRA_DB_APPLICATION_TOKEN)

cluster = Cluster(cloud=astra_cloud_config, auth_provider=astra_auth_provider)

# The session is kept in a notebook-level name so later cells can reuse it.
session = cluster.connect()

In [None]:
# Read the PDF file and convert it into loadable format.
# The path is requested interactively, handed to LangChain's PyMuPDFLoader, and the
# result of load() is kept in `documents` for the text-splitting step.
from langchain.document_loaders import PyMuPDFLoader

# The path is used exactly as typed (no trimming or expansion is applied here).
pdffilepath = input("Please provide the full path to your PDF file: ")
loader = PyMuPDFLoader(pdffilepath)
# NOTE(review): presumably yields one document per PDF page — confirm against the loader docs.
documents = loader.load()

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Chunking parameters: split on blank lines, aim for chunks of up to CHUNK_SIZE,
# and let consecutive chunks share up to CHUNK_OVERLAP (half of the chunk size).
# NOTE(review): sizes are presumably measured in characters (the splitter's default) — confirm.
CHUNK_SEPARATOR = "\n\n"
CHUNK_SIZE = 4000
CHUNK_OVERLAP = 2000

text_splitter = CharacterTextSplitter(
    separator=CHUNK_SEPARATOR,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# `texts` holds the chunked documents that are embedded and stored in the next step.
texts = text_splitter.split_documents(documents)


In [None]:
from langchain.vectorstores import Cassandra
from langchain.embeddings import OpenAIEmbeddings

# Embedding model used both when storing the chunks and when searching them.
embeddings = OpenAIEmbeddings()

# Where the vectors live: the open session, the chosen keyspace and a fixed table name.
table_name = "pdftexttable"
store_location = dict(
    session=session,
    keyspace=ASTRA_DB_KEYSPACE,
    table_name=table_name,
)

# Build the vector store from the chunked documents.
pdftextsearchdb = Cassandra.from_documents(
    documents=texts,
    embedding=embeddings,
    **store_location,
)

In [None]:
# How many of the most similar chunks to fetch for the question.
TOP_K = 3

# Ask for the question, then look up the TOP_K closest chunks in the vector store.
query = input("Please enter your search query: ")
docs = pdftextsearchdb.similarity_search(query, k=TOP_K)

In [None]:
# Concatenate the text of every retrieved chunk, each one preceded by a blank-line
# separator (so a non-empty result starts with "\n\n"; no chunks gives an empty string).
supporting_text = "".join("\n\n" + doc.page_content for doc in docs)

print(supporting_text)

In [None]:
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI


# (role, template) pairs that make up the conversation sent to the model.
# {assistant_supporting_text} and {customer_question} are filled in below.
prompt_parts = [
    ("system", "You are a helpful sales person. You are helping a customer with a question. The customer asks you a question. You answer the question."),
    ("system", "An assistant will provide you with some supporting text. You will have to answer the question based on the supporting text."),
    ("system", "If the assistant does not provide you with relevant supporting text, you can ask the customer to rephrase the question."),
    ("assistant", "The following are some supporting text: {assistant_supporting_text}"),
    ("human", "Hi, I have a question. {customer_question}"),
]
chat_template = ChatPromptTemplate.from_messages(prompt_parts)

# Chat model used to answer; temperature is pinned to 0.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Fill the template with the retrieved text and the user's question.
messages = chat_template.format_messages(
    customer_question=query,
    assistant_supporting_text=supporting_text,
)

response = llm(messages)

print(response.content)
