In [78]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
import os
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_huggingface import HuggingFaceEndpoint
from langchain_core.prompts import ChatPromptTemplate
from transformers import pipeline
import warnings


In [45]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()

PINECONE_API_KEY = os.environ.get('Pinecone_Key')
HF_API_KEY = os.environ.get('HF_Key')

# Fail fast with a readable message: assigning None into os.environ below
# would otherwise raise an opaque "str expected, not NoneType" TypeError.
_missing_keys = [
    name
    for name, value in (("Pinecone_Key", PINECONE_API_KEY), ("HF_Key", HF_API_KEY))
    if not value
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Define them in your .env file or shell environment."
    )

# Re-export the keys under the names the later cells rely on implicitly
# (no API key is passed explicitly to the vector store or the LLM endpoint;
# presumably those clients read these variables - confirm against their docs).
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HF_API_KEY


# Suppress specific deprecation warning from Hugging Face
warnings.filterwarnings("ignore", category=FutureWarning, module="huggingface_hub")


In [88]:
def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``; files are
            selected with the glob pattern ``*.pdf`` and read with
            ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that directory.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

def text_split(extracted_data):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split, passed straight to
            ``RecursiveCharacterTextSplitter.split_documents``.

    Returns:
        The chunks produced by the splitter, configured with
        ``chunk_size=500`` and ``chunk_overlap=20``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_overlap=20,
        chunk_size=500,
    )
    return splitter.split_documents(extracted_data)


In [89]:
# Read the PDFs under ../Data, then cut them into chunks for embedding.
data = load_pdf_file("../Data")
text_chunks = text_split(data)

# Report how many chunks were produced.
print(len(text_chunks))

10160


In [90]:
# Sentence-transformer model used to embed both the chunks and the queries.
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the index only when it does not exist yet, so this cell can be
# re-run (e.g. Restart & Run All) without failing on an existing "neu-bot".
if "neu-bot" not in pc.list_indexes().names():
    pc.create_index(
        name="neu-bot",
        # Must equal the embedding model's output size (assumed 384 for
        # all-MiniLM-L6-v2 - confirm against the model card).
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

# Display the index description whether it was just created or already existed.
pc.describe_index("neu-bot")

{
    "name": "neu-bot",
    "metric": "cosine",
    "host": "neu-bot-e9p6iyz.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [93]:
# from_documents embeds and upserts every chunk. Re-running it against an
# already-populated index would upload all chunks a second time (no explicit
# ids are passed, so the new vectors would not overwrite the old ones) and
# pollute retrieval with duplicates. Only upload when the index is empty;
# otherwise attach to the existing vectors.
# NOTE: a partially populated index is treated as populated - clear the index
# manually if an earlier upload was interrupted.
index_stats = pc.Index("neu-bot").describe_index_stats()

if index_stats.total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=text_chunks,
        index_name="neu-bot",
        embedding=embeddings,
    )
else:
    docsearch = PineconeVectorStore.from_existing_index(
        index_name="neu-bot",
        embedding=embeddings,
    )

In [96]:
# Similarity-search retriever over the vector store, configured with k=20.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 20},
)

# Try the retriever on a sample question; the trailing expression makes the
# notebook display the retrieved documents.
retrieved_docs = retriever.invoke(
    "What is course number for Foundations of Artificial Intelligence??"
)
retrieved_docs

[Document(id='cba09122-e1c6-4637-8c2d-f037170a71e9', metadata={'author': 'CourseLeaf', 'creationdate': '2024-08-07T11:36:54-04:00', 'creator': 'Adobe Acrobat Pro (32-bit) 24.2.20965', 'moddate': '2025-02-11T21:18:58-05:00', 'page': 763.0, 'page_label': '764', 'producer': 'Adobe Acrobat Pro (32-bit) 24.2.20965', 'source': '..\\Data\\Northeastern University 2024-2025 Course Descriptions.pdf', 'title': '', 'total_pages': 939.0}, page_content='realize the beneﬁts of AI while responsibly developing and implementing it, it is necessary to identify the ethical issues at stake and work to resolve\nthem. This course takes up the philosophical and ethical questions essential to this project.\nPrerequisite(s): PHIL\xa01145 with a minimum grade of D- or PHIL\xa01300 with a minimum grade of D- or IS\xa01300 with a minimum grade of D- or graduate\nprogram admission'),
 Document(id='703e7424-efc2-480e-a43c-05f649e1dc9f', metadata={'author': 'CourseLeaf', 'creationdate': '2024-08-07T11:36:54-04:00', '

In [97]:
def clean_and_process_input(user_input):
    """Normalize a user question before it is sent to the RAG chain.

    Surrounding whitespace is removed first, so an input such as
    ``"What is X? "`` is recognized as already ending in a question mark
    instead of becoming ``"What is X? ?"``. A ``'?'`` is then appended when
    the text does not already end with one.

    Args:
        user_input: Raw question text (``str``).

    Returns:
        The trimmed question, guaranteed to end with ``'?'``. An empty or
        whitespace-only input yields ``"?"``.
    """
    question = user_input.strip()
    if not question.endswith('?'):
        question += '?'

    return question

def clean_chatbot_response(response):
    """Extract the answer text from a chain response and strip role labels.

    Every occurrence of ``"Assistant:"`` is removed (as before). In addition,
    any leading ``"Answer:"`` label is removed, because the system prompt asks
    the model to avoid both prefixes and either may still appear in practice.
    An ``"Answer:"`` appearing later in the text is left untouched.

    Args:
        response: Mapping with an ``"answer"`` key holding the generated text.

    Returns:
        The cleaned answer string with surrounding whitespace removed.

    Raises:
        KeyError: If ``response`` has no ``"answer"`` key.
    """
    answer_label = "Answer:"
    cleaned_response = response["answer"].replace("Assistant:", "").strip()

    # Loop so that a repeated label ("Answer: Answer: ...") is fully removed.
    while cleaned_response.startswith(answer_label):
        cleaned_response = cleaned_response[len(answer_label):].lstrip()

    return cleaned_response

In [None]:
# Text-generation LLM served through the Hugging Face endpoint integration.
# Sampling is disabled (do_sample=False) and output is capped at 512 new tokens.
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    task="text-generation",
    max_new_tokens=512,
    do_sample=False,
    repetition_penalty=1.03,
)

# Adjacent string literals are concatenated verbatim, so each fragment must
# carry its own trailing space; "concise." previously ran straight into
# "Do not" ("concise.Do not").
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise. "
    "Do not include any prefixes like "
    "'Assistant:' or 'Answer:', just the content."
    "\n\n"
    "{context}"
)

# NOTE(review): not referenced anywhere in this cell (the prompt below uses
# "{input}"); kept only in case other cells rely on it - confirm and remove.
user_prompt="{user_input}"

# (role, template) tuples are the documented input form for from_messages.
# The human template must use "{input}" because that is the key passed to
# rag_chain.invoke below; "{context}" in the system prompt is where the
# retrieved documents are inserted.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])


# Stuff the retrieved documents into the prompt, then put the retriever in front.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)
response = rag_chain.invoke({"input": clean_and_process_input("What is course number for Foundations of Artificial Intelligence?")})

print(clean_chatbot_response(response))