### Imports & Environment Variables 

In [18]:
from llama_index.core.readers import SimpleDirectoryReader
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.readers.file import PagedCSVReader
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core import VectorStoreIndex
from llama_index.llms.huggingface import HuggingFaceLLM
import faiss
import os
import pandas as pd
from dotenv import load_dotenv
import torch
import os

# Load environment variables from a .env file
load_dotenv()

EMBED_DIMENSION=384
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

embedding=Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
    #dimensions=EMBED_DIMENSION
)
print(embedding)
from llama_index.llms.huggingface import HuggingFaceLLM
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch


from llama_index.llms.deepseek import DeepSeek


from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
Settings.llm = HuggingFaceInferenceAPI(
    model_name="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", token=os.environ["HUGGINGFACE_API_KEY"]
)


model_name='BAAI/bge-small-en-v1.5' embed_batch_size=10 callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x0000019B80570F50> num_workers=None max_length=512 normalize=True query_instruction=None text_instruction=None cache_folder=None


In [19]:
file_path = ('new_custom.csv') # insert the path of the csv file
data = pd.read_csv(file_path)

# Preview the csv file
data.head()

Unnamed: 0,id,key,project,summary,description,status,assignee,reporter,priority,issuetype,...,components,sprint,sprintId,sprintState,sprintStartDate,sprintEndDate,storyPoints,epicLink,rank,column
0,1,LOG-1,Logistics Platform,Task 1 for Logistics Platform,Description for task 1 in Logistics Platform,To Do,Charlie,Charlie,Medium,Epic,...,['Database'],Sprint 3,32.0,Active,2024-02-01,2024-02-14,1.0,,3,To Do
1,2,E-C-2,E-Commerce,Task 2 for E-Commerce,Description for task 2 in E-Commerce,Done,David,Alice,Medium,Task,...,['Database'],Sprint 1,10.0,Completed,2024-01-01,2024-01-14,,EPIC-1,21,Done
2,3,E-C-3,E-Commerce,Task 3 for E-Commerce,Description for task 3 in E-Commerce,To Do,Charlie,Alice,Medium,Epic,...,['UI'],Sprint 6,26.0,Future,2025-04-03,2025-04-17,8.0,,15,To Do
3,4,LOG-4,Logistics Platform,Task 4 for Logistics Platform,Description for task 4 in Logistics Platform,To Do,Eve,Bob,Low,Epic,...,['Payment Service'],Sprint 6,43.0,Future,2025-04-03,2025-04-17,5.0,,45,To Do
4,5,E-C-5,E-Commerce,Task 5 for E-Commerce,Description for task 5 in E-Commerce,To Do,David,Alice,Low,Story,...,['UI'],Sprint 4,30.0,Active,2024-02-15,2024-02-28,8.0,EPIC-1,36,To Do


### Vector Store

In [20]:
# Create FaisVectorStore to store embeddings
fais_index = faiss.IndexFlatL2(EMBED_DIMENSION)
vector_store = FaissVectorStore(faiss_index=fais_index)

### Load and Process CSV Data as Document

In [21]:
csv_reader = PagedCSVReader()

reader = SimpleDirectoryReader( 
    input_files=[file_path],
    file_extractor= {".csv": csv_reader}
    )

docs = reader.load_data()

In [22]:
# Check a sample chunk
print(docs[0].text)

id: 1
key: LOG-1
project: Logistics Platform
summary: Task 1 for Logistics Platform
description: Description for task 1 in Logistics Platform
status: To Do
assignee: Charlie
reporter: Charlie
priority: Medium
issuetype: Epic
created: 2025-02-04
updated: 2025-03-02
resolution: 
labels: ['backend']
components: ['Database']
sprint: Sprint 3
sprintId: 32.0
sprintState: Active
sprintStartDate: 2024-02-01
sprintEndDate: 2024-02-14
storyPoints: 1.0
epicLink: 
rank: 3
column: To Do


### Ingestion Pipeline

In [23]:
pipeline = IngestionPipeline(
    vector_store=vector_store,
    documents=docs
)

nodes = pipeline.run()

### Create Query Engine

In [24]:
vector_store_index = VectorStoreIndex(nodes)
query_engine = vector_store_index.as_query_engine(similarity_top_k=2)

### Query the rag bot with a question based on the CSV data

In [25]:
response = query_engine.query("what are the currently open epics")
print(response.response)

'Alright, so I need to figure out what the currently open epics are based on the provided context information. Let me start by understanding what the context includes.\n\nFirst, I see two files: new_custom.csv with id 3 and id 188. Each of these files has several fields like id, key, project, summary, description, status, assignee, reporter, priority, isotype, created, updated, resolution, labels, components, etc.\n\nLooking at the first file with id 3, the status is "To Do," the reporter is Alice, and the priority is Medium. The label is \'bugfix\' and the component is \'UI\'. The second file with id 188 has a status of "Done," no reporter or reporter, priority is Critical, label is \'bugfix\', and the component is \'Database\'.\n\nNow, the query is asking for the currently open epics. From the context, I can see that both entries have a status of "To Do" and "Done." Since "To Do" implies that the task is currently open and needs to be prioritized, and "Done" suggests it\'s completed,

In [26]:
print(response.response)

Alright, so I need to figure out what the currently open epics are based on the provided context information. Let me start by understanding what the context includes.

First, I see two files: new_custom.csv with id 3 and id 188. Each of these files has several fields like id, key, project, summary, description, status, assignee, reporter, priority, isotype, created, updated, resolution, labels, components, etc.

Looking at the first file with id 3, the status is "To Do," the reporter is Alice, and the priority is Medium. The label is 'bugfix' and the component is 'UI'. The second file with id 188 has a status of "Done," no reporter or reporter, priority is Critical, label is 'bugfix', and the component is 'Database'.

Now, the query is asking for the currently open epics. From the context, I can see that both entries have a status of "To Do" and "Done." Since "To Do" implies that the task is currently open and needs to be prioritized, and "Done" suggests it's completed, it seems like b