In [28]:
# Standard library
import os
import warnings

# LangChain components to use
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain_groq import ChatGroq
from langchain.embeddings import HuggingFaceEmbeddings

# Support for dataset retrieval with Hugging Face
from datasets import load_dataset

# With CassIO, the engine powering the Astra DB integration in LangChain,
# you will also initialize the DB connection:
import cassio

In [1]:
from PyPDF2 import PdfReader

In [22]:
# Connection settings for Astra DB and Groq.
# Values are read from the process environment so secrets never have to be
# typed into (or committed with) the notebook; the literals are only fallbacks.
# NOTE: load_dotenv() runs in a later cell, so only variables that are already
# exported in the environment are picked up here.
ASTRA_DB_APPLICATION_TOKEN = os.environ.get("ASTRA_DB_APPLICATION_TOKEN", "")
ASTRA_DB_ID = os.environ.get(
    "ASTRA_DB_ID", "41e053ea-c871-46d8-87d2-b769f70befe6"
)  # enter your Database ID
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")


In [10]:
# Path of the PDF to index. Set the RAG_PDF_PATH environment variable to point
# at your own file; the literal below is only the fallback default.
PDF_PATH = os.environ.get(
    "RAG_PDF_PATH",
    r"C:\Users\Uyama\Downloads\LANGCHAIN-PROJECTS\RAG-Document\research_papers\LLM.pdf",
)
pdfreader = PdfReader(PDF_PATH)

In [11]:
from typing_extensions import Concatenate
# Read text from the PDF: concatenate the extracted text of every page, in
# page order. Pages whose extraction yields nothing (None or "") contribute
# an empty string, i.e. they are skipped.
raw_text = "".join(page.extract_text() or "" for page in pdfreader.pages)

In [12]:
# Echo the extracted text as the cell output.
raw_text

'A Comprehensive Overview of Large Language Models\nHumza Naveeda, Asad Ullah Khana,∗, Shi Qiub,∗, Muhammad Saqibc,d,∗, Saeed Anware,f, Muhammad Usmane,f, Naveed Akhtarg,i,\nNick Barnesh, Ajmal Miani\naUniversity of Engineering and Technology (UET), Lahore, Pakistan\nbThe Chinese University of Hong Kong (CUHK), HKSAR, China\ncUniversity of Technology Sydney (UTS), Sydney, Australia\ndCommonwealth Scientific and Industrial Research Organisation (CSIRO), Sydney, Australia\neKing Fahd University of Petroleum and Minerals (KFUPM), Dhahran, Saudi Arabia\nfSDAIA-KFUPM Joint Research Center for Artificial Intelligence (JRCAI), Dhahran, Saudi Arabia\ngThe University of Melbourne (UoM), Melbourne, Australia\nhAustralian National University (ANU), Canberra, Australia\niThe University of Western Australia (UWA), Perth, Australia\nAbstract\nLarge Language Models (LLMs) have recently demonstrated remarkable capabilities in natural language processing tasks and\nbeyond. This success of LLMs has led 

Initialize the connection to your database

In [13]:
import cassio

In [23]:
# Initialize the CassIO database connection with the Astra DB token and
# database ID defined in the configuration cell.
cassio.init(
    database_id=ASTRA_DB_ID,
    token=ASTRA_DB_APPLICATION_TOKEN,
)

In [24]:
from dotenv import load_dotenv
import os
# Load variables from a local .env file so that the os.getenv() lookups in the
# following cells can find the API keys.
# NOTE(review): the recorded output reports statements python-dotenv could not
# parse -- the .env file likely contains malformed lines worth cleaning up.
load_dotenv()

Python-dotenv could not parse statement starting at line 17
Python-dotenv could not parse statement starting at line 18
Python-dotenv could not parse statement starting at line 19
Python-dotenv could not parse statement starting at line 20
Python-dotenv could not parse statement starting at line 21
Python-dotenv could not parse statement starting at line 24
Python-dotenv could not parse statement starting at line 25


True

In [25]:
## Load the Groq API key from the environment.
# os.environ only accepts str values, so re-exporting the result of
# os.getenv() unconditionally raises TypeError when the variable is missing.
# Only re-export a real value, and warn (rather than crash with an obscure
# TypeError) when the key is absent.
groq_api_key = os.getenv("GROQ_API_KEY")
if groq_api_key:
    os.environ["GROQ_API_KEY"] = groq_api_key
else:
    warnings.warn(
        "GROQ_API_KEY is not set; add it to your .env file or environment "
        "before creating the ChatGroq model."
    )

In [31]:
## Hugging Face sentence embeddings used to vectorize the text chunks.
from langchain_huggingface import HuggingFaceEmbeddings

# os.environ only accepts str values, so only re-export HF_TOKEN when it is
# actually defined; assigning the None returned by os.getenv() would raise
# TypeError and make the token mandatory.
# NOTE(review): presumably the token is only needed for gated/private models --
# confirm for "all-MiniLM-L6-v2".
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    os.environ["HF_TOKEN"] = hf_token

embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")





In [29]:
# Groq chat model; it is passed to the index wrapper's query() call in the
# QA loop below.
llm = ChatGroq(
    model_name="Llama3-8b-8192",
    groq_api_key=groq_api_key,
)

Create your LangChain vector store ... backed by Astra DB!

In [32]:
# Vector store over the "qa_mini_demo" table, using `embedding` to vectorize texts.
# session and keyspace are left as None -- presumably the connection registered
# by cassio.init() is used instead; confirm against the Cassandra store docs.
astra_vector_store = Cassandra(
    table_name="qa_mini_demo",
    embedding=embedding,
    keyspace=None,
    session=None,
)

In [33]:
from langchain.text_splitter import CharacterTextSplitter

# Split the raw text on newlines into overlapping chunks so that no single
# piece grows too large; sizes are measured in characters (length_function=len).
text_splitter = CharacterTextSplitter(
    length_function=len,
    separator="\n",
    chunk_size=800,
    chunk_overlap=200,
)
texts = text_splitter.split_text(raw_text)

In [34]:
# Preview the first 50 chunks (the same slice that is loaded into the vector store below).
texts[:50]

['A Comprehensive Overview of Large Language Models\nHumza Naveeda, Asad Ullah Khana,∗, Shi Qiub,∗, Muhammad Saqibc,d,∗, Saeed Anware,f, Muhammad Usmane,f, Naveed Akhtarg,i,\nNick Barnesh, Ajmal Miani\naUniversity of Engineering and Technology (UET), Lahore, Pakistan\nbThe Chinese University of Hong Kong (CUHK), HKSAR, China\ncUniversity of Technology Sydney (UTS), Sydney, Australia\ndCommonwealth Scientific and Industrial Research Organisation (CSIRO), Sydney, Australia\neKing Fahd University of Petroleum and Minerals (KFUPM), Dhahran, Saudi Arabia\nfSDAIA-KFUPM Joint Research Center for Artificial Intelligence (JRCAI), Dhahran, Saudi Arabia\ngThe University of Melbourne (UoM), Melbourne, Australia\nhAustralian National University (ANU), Canberra, Australia',
 'gThe University of Melbourne (UoM), Melbourne, Australia\nhAustralian National University (ANU), Canberra, Australia\niThe University of Western Australia (UWA), Perth, Australia\nAbstract\nLarge Language Models (LLMs) have rec

Load the dataset into the vector store

In [35]:
# Load only the first 50 chunks into the vector store, report how many were
# sent, then wrap the store in an index wrapper for querying.
# NOTE(review): the message says "headlines", but the items are PDF text chunks.
chunks_to_index = texts[:50]
astra_vector_store.add_texts(chunks_to_index)

print("Inserted %i headlines." % len(chunks_to_index))

astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

Inserted 50 headlines.


Run the QA cycle

In [36]:
# Interactive QA loop: keep prompting until the user types "quit".
first_question = True
while True:
    # The wording of the prompt changes once a first question has been asked.
    prompt = (
        "\nEnter your question (or type 'quit' to exit): "
        if first_question
        else "\nWhat's your next question (or type 'quit' to exit): "
    )
    query_text = input(prompt).strip()

    if query_text.lower() == "quit":
        break
    if not query_text:
        # Blank input: ask again without switching to the follow-up prompt.
        continue

    first_question = False

    # Ask the index wrapper to answer the query with the configured LLM.
    print("\nQUESTION: \"%s\"" % query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("ANSWER: \"%s\"\n" % answer)

    # List the 4 best-matching documents with their scores and a short excerpt.
    print("FIRST DOCUMENTS BY RELEVANCE:")
    hits = astra_vector_store.similarity_search_with_score(query_text, k=4)
    for doc, score in hits:
        print("    [%0.4f] \"%s ...\"" % (score, doc.page_content[:84]))


QUESTION: "what is tokenization"
ANSWER: "According to the provided context, tokenization is an essential pre-processing step in LLM (Large Language Model) training that parses the text into non-decomposing units called tokens. Tokens can be characters, subwords, symbols, or words, depending on the tokenization process. Some common tokenization schemes used in LLMs include wordpiece, byte pair encoding (BPE), and unigramLM."

FIRST DOCUMENTS BY RELEVANCE:
    [0.7522] "in details to the original works.
2.1. Tokenization
Tokenization [59] is an essentia ..."
    [0.7440] "utilization in di fferent domains. Section 4 highlights the config-
uration and para ..."
    [0.6600] "favors using recent tokens for attention.
RoPE [66]: It rotates query and key repres ..."
    [0.6550] "independently of each other. Moreover, the attention mod-
ule in the transformer doe ..."
