In [1]:
! pip install -q ipykernel

In [12]:
import os
from dotenv import load_dotenv


from langchain_community.vectorstores import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain_community.llms import OpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter

from datasets import load_dataset

import cassio   # initialize DB connection

from PyPDF2 import PdfReader
from typing_extensions import Concatenate

In [9]:
# Load variables from a .env file (python-dotenv) before reading them below.
load_dotenv()

# Credentials read from the environment; each is None when its variable is unset.
ASTRADB_APP_TOKEN = os.environ.get("ASTRA_DB_TOKEN")
ASTRADB_ID = os.environ.get("ASTRA_DB_ID")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [4]:
# Open the source PDF; the path is relative, so it is resolved against the working directory.
pdfReader = PdfReader("dinosaurs.pdf")

In [5]:
# Concatenate the extracted text of every page, skipping pages that yield nothing.
page_texts = (page.extract_text() for page in pdfReader.pages)
raw_text = "".join(text for text in page_texts if text)

In [6]:
# Preview the first 100 characters to sanity-check the extraction.
raw_text[:100]

'BASIC DINOSAUR FACTS\n•Dinosaurs are a group of reptiles that have lived \non Earth for about 165 mill'

In [None]:
# Initialize the cassio database connection with the Astra DB token and database id.
cassio.init(token=ASTRADB_APP_TOKEN, database_id=ASTRADB_ID)

In [10]:
# Build the two OpenAI-backed objects, both authenticated with OPENAI_API_KEY:
# an embedding model and an LLM.

embed = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
llm = OpenAI(openai_api_key=OPENAI_API_KEY)

  llm = OpenAI(openai_api_key=OPENAI_API_KEY)


In [14]:
# Vector store over the "dino_pdf_chat" table, embedding text with `embed`.
# NOTE(review): session/keyspace are passed as None — presumably resolved from the
# connection set up by cassio.init(); confirm against the vector store docs.
astra_vector_store = Cassandra(
    table_name="dino_pdf_chat",
    embedding=embed,
    keyspace=None,
    session=None,
)

In [36]:
# Chunk the raw text into overlapping pieces before embedding.

CHUNK_SIZE = 400     # target chunk length, measured in characters via len
CHUNK_OVERLAP = 100  # characters of overlap requested between consecutive chunks

text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)

# Split on newlines into chunks targeting CHUNK_SIZE (400) characters each.
texts = text_splitter.split_text(raw_text)

In [37]:
# Number of chunks produced by the splitter.
len(texts)

105

In [None]:
# Load the text chunks into the vector store, then wrap the store in an index object.
# NOTE(review): re-running this cell calls add_texts again with the same chunks; whether
# that inserts duplicate rows depends on how the store assigns ids — confirm before re-running.

astra_vector_store.add_texts(texts)
# The wrapper is what the question loop calls .query(..., llm=llm) on.
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

In [None]:
# Interactive question/answer loop over the indexed PDF
# (example question: "What is an Olorotitan?").
# Type "quit" (any casing) to exit; blank input is ignored and the prompt repeats.

k = 4  # number of closest documents to list after each answer
first_question = True
while True:
    if first_question:
        query_text = input("\nEnter your question (or type quit to exit):").strip()
    else:
        query_text = input("\nWhat's your next question?").strip()

    if query_text.lower() == "quit":
        break
    if query_text == "":
        continue

    # Must be an assignment: the original `first_question == False` was a no-op
    # comparison, so the follow-up prompt above was never shown.
    first_question = False

    print("\nQUESTION: \"%s\"" % query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("ANSWER: \"%s\"\n" % answer)

    # Show the k best-matching chunks with their similarity scores,
    # truncated to 84 characters each.
    print(f"FIRST {k} DOCUMENTS BY RELEVANCE:") # first k vectors
    for doc, score in astra_vector_store.similarity_search_with_score(query_text, k=k):
        print("     [%0.4f] \"%s ...\"" % (score, doc.page_content[:84]))
    


QUESTION: "ask question (What is an OLORITAN)"
ANSWER: "Olorotitan was a duckbilled dinosaur from the Late Cretaceous period, whose remains were found in far Eastern Russia. It was one of the last non-avian dinosaurs and went extinct. It grew up to 26 feet long and weighed 3.4 tons. Its name means "helmet lizard" and its skull, including the crest, was 28 inches tall."

FIRST DOCUMENTS BY RELEVANCE:
     [0.8897] "America.
Its name means 
"helmet lizard.  
It grew  to 30 feet long  
and had a skul ..."
     [0.8745] "The Cincinnatian layer is in the Ordovician period range 
and was here 451 to 443 MY ..."
     [0.8678] "Bristol palaeo -biologist and fossil 
color expert, Dr Jakob Vinther . 
Analysis of  ..."
     [0.8651] "The red line in each image shows the plane of the 
ankle hinge. 
THE CINCINNATIAN TI ..."
