In [None]:
# Pinecone and Hugging Face API keys (fill in the values below before running; never commit real keys)
%env PINECONE_API_KEY=
%env HUGGINGFACE_API_KEY=

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.llms.huggingface import HuggingFaceInferenceAPI
from llama_index.core.settings import Settings
import os

# Build the embedding model once and share the instance between the global
# Settings and the `embeddings` name, instead of constructing the same
# "BAAI/bge-small-en-v1.5" model twice.
embeddings = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.embed_model = embeddings

# The Hugging Face token is read from the environment; a missing variable
# raises KeyError here rather than failing later inside an API call.
HF_TOKEN = os.environ["HUGGINGFACE_API_KEY"]
repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Default LLM for the whole notebook: the hosted inference endpoint for
# `repo_id`, configured with a low temperature.
Settings.llm = HuggingFaceInferenceAPI(
    model_name=repo_id,
    model_kwargs={"temperature": 0.1},
    token=HF_TOKEN,
)

In [None]:
from llama_index.core import SimpleDirectoryReader

# Load the INPT information PDF into a list of documents.
# NOTE(review): the path is absolute and looks like a mounted Colab drive —
# confirm the drive is mounted before running this cell.
documents = SimpleDirectoryReader(input_files=["/content/drive/MyDrive/INPT _infos.pdf"]).load_data()

In [None]:
from llama_index.core.node_parser import SentenceWindowNodeParser

# Sentence-window parser. The two metadata keys chosen here are the names
# under which the parser records the window and the original sentence text;
# the "window" key is reused by the metadata-replacement post-processor.
node_parser = SentenceWindowNodeParser.from_defaults(
    original_text_metadata_key="original_text",
    window_metadata_key="window",
    window_size=3,
)

In [None]:
import pinecone
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.pinecone import PineconeVectorStore
from pinecone import Pinecone


# Split the loaded documents into nodes using the sentence-window parser.
nodes = node_parser.get_nodes_from_documents(documents)

index_name = "chatbot"
api_key = os.environ["PINECONE_API_KEY"]
pc = Pinecone(api_key=api_key)
# Connect to the Pinecone index named `index_name`; nothing here creates it,
# so it is expected to exist already.
pinecone_index = pc.Index(index_name)

# Wrap the Pinecone index as a vector store.
# add_sparse_vector=True makes the store attach a sparse vector to every
# upserted node; the query engine in this notebook requests
# vector_store_query_mode="hybrid", which relies on those sparse vectors.
# NOTE: Pinecone hybrid (dense + sparse) search requires the index to have
# been created with the "dotproduct" metric — verify the "chatbot" index.
vector_store = PineconeVectorStore(
    pinecone_index=pinecone_index,
    add_sparse_vector=True,
)
# Route index storage to the Pinecone-backed vector store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# Build the index from the already-parsed nodes (no further chunking happens
# here); the nodes are embedded and written to the vector store for retrieval.
index = VectorStoreIndex(
    nodes,
    storage_context=storage_context,
)

In [None]:
from llama_index.core.postprocessor import MetadataReplacementPostProcessor


# Post-processor keyed on the "window" metadata entry — the same key the
# sentence-window node parser is configured to write.
postproc = MetadataReplacementPostProcessor(target_metadata_key="window")

In [None]:
from llama_index.core.postprocessor import SentenceTransformerRerank

# Reranker built on the "BAAI/bge-reranker-base" model, keeping the top 2 nodes.
rerank = SentenceTransformerRerank(model="BAAI/bge-reranker-base", top_n=2)

In [None]:
from llama_index.core import PromptTemplate

def create_prompt_with_context(query, context):
    """Build the final prompt sent to the LLM for an INPT question.

    Args:
        query: The user's question; substituted for the query placeholder.
        context: Supporting material; substituted for the context placeholder.
            Any object is accepted and rendered through ``str.format``.

    Returns:
        The INPT-assistant instruction template, as a string, with both
        placeholders filled in.
    """
    template = """
    You are an assistant specializing in the Institut National des Postes et Télécommunications (INPT).
    Your role is to provide information and answer questions  related only to INPT, including its programs, research, and organizational details.
    If a question is outside your area of expertise, politely inform the user that you can only assist with questions related to INPT.


    Context: {context}
    Question: {query}

    """
    # Only the template's own placeholders are interpreted; braces inside the
    # supplied values are inserted verbatim.
    filled = template.format(context=context, query=query)
    return filled

In [None]:
# Query engine over the index: fetch 6 candidates in hybrid mode (alpha=0.5),
# then run the post-processors in list order — `postproc` first, `rerank` second.
query_engine = index.as_query_engine(
    node_postprocessors=[postproc, rerank],
    similarity_top_k=6,
    vector_store_query_mode="hybrid",
    alpha=0.5,
)

In [None]:
def chat_bot_rag(query):
    """Answer `query` with the RAG pipeline.

    The query engine's result for `query` is used as the context of the
    INPT prompt, and that prompt is then completed by the LLM configured in
    ``Settings.llm``.

    Args:
        query: The user's question.

    Returns:
        Whatever ``Settings.llm.complete`` returns for the assembled prompt.
    """
    # NOTE(review): the engine result is formatted into the prompt as-is
    # (via str.format) — presumably its string form is the engine's answer
    # text; confirm this is the intended context.
    engine_result = query_engine.query(query)
    prompt = create_prompt_with_context(query, engine_result)
    return Settings.llm.complete(prompt)

In [None]:
# Example run: ask one question end to end and print the LLM's reply.
query = "What are the fields in INPT?"
final_response = chat_bot_rag(query)
# print() already renders its argument with str(), so no explicit cast is needed.
print(final_response)