In [2]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex



## Load the files

In [3]:
loader = SimpleDirectoryReader(input_dir="data")
raw_data = loader.load_data()

In [4]:
from llama_index.llms.ollama import Ollama
llm = Ollama(model="llama3.2:1b", base_url = "http://localhost:11434", request_timeout=120.0)

In [6]:
len(raw_data) 

21

In [10]:
from llama_index.core import Settings
from llama_index.core.node_parser import SentenceSplitter

In [11]:
parser = SentenceSplitter(
    chunk_size=128,
    chunk_overlap=16,
    paragraph_separator="\n\n"
)

nodes = parser.get_nodes_from_documents(raw_data, show_progress=True)

  from .autonotebook import tqdm as notebook_tqdm
Parsing nodes: 100%|██████████| 21/21 [00:00<00:00, 98.21it/s] 


In [16]:
len(nodes)
nodes[0].text


'Attention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N.'

In [17]:
%time
resp = llm.complete('what are transformers')
print(resp)

CPU times: total: 0 ns
Wall time: 0 ns
Transformers are a fictional franchise of science fiction, created by Japanese designer and filmmaker Shoji Kawamori. The first series, "The Transformers," was produced from 1984 to 1987 and has since become a global phenomenon.

In the Transformers universe, robots called "Autobots" are led by Optimus Prime, who fights against an evil force known as the Decepticons. Autobots are equipped with advanced technology that allows them to transform into various forms, such as cars, trucks, planes, and even animals like dinosaurs and bees.

The core concept of Transformers is based on a battle between two factions:

1. **Autobots**: These are the heroic robots who fight against the Decepticons. The Autobots possess advanced technology that allows them to transform into different forms.
2. **Decepticons**: These are the evil robots led by Megatron, who seek to destroy the Autobots and rule the universe.

The Transformers have appeared in various forms of 

In [18]:
%pip install llama-index-embeddings-ollama

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [19]:
from llama_index.embeddings.ollama import OllamaEmbedding

In [20]:
ollama_embedding = OllamaEmbedding(
    model_name="all-minilm:latest",
    base_url="http://localhost:11434",
    ollama_additional_kwargs={"mirostat": 0},
)

In [48]:
raw_data[0]

Document(id_='67901e4f-7a91-469b-8089-ed61f980d86e', embedding=None, metadata={'page_label': '1', 'file_name': 'attention.pdf', 'file_path': 'c:\\GenAI Projects\\final-rag-project\\data\\attention.pdf', 'file_type': 'application/pdf', 'file_size': 569417, 'creation_date': '2025-01-26', 'last_modified_date': '2025-01-25'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='Attention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nA

In [21]:
embeddings = []
for document in nodes:
    embedding = ollama_embedding.get_text_embedding(document.text)
    embeddings.append(embedding)

In [22]:
embeddings

[[0.05185085907578468,
  -0.0659915879368782,
  0.08417318016290665,
  -0.02065650001168251,
  0.0053813825361430645,
  0.02666342630982399,
  0.11952488124370575,
  -0.03976438567042351,
  -0.012597658671438694,
  0.14364466071128845,
  0.10965606570243835,
  -0.10348615050315857,
  -0.12096141278743744,
  0.006700817029923201,
  0.05043504759669304,
  0.014132274314761162,
  0.06699415296316147,
  0.07531709223985672,
  -0.1646629124879837,
  0.044057149440050125,
  -0.07559619098901749,
  0.014202933758497238,
  0.11654205620288849,
  -0.09253348410129547,
  0.033537670969963074,
  0.06690312176942825,
  -0.10279379785060883,
  -0.13330890238285065,
  0.03196153789758682,
  -0.054135773330926895,
  0.10982208698987961,
  -0.009621725417673588,
  -0.05160133168101311,
  0.07605405896902084,
  -0.03428881615400314,
  0.10549502819776535,
  -0.16402897238731384,
  0.08211789280176163,
  -0.0543692484498024,
  -0.027513671666383743,
  -0.13371877372264862,
  -0.037171587347984314,
  -0.

In [23]:
%pip install chromadb





[notice] A new release of pip is available: 23.1.2 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [28]:
import chromadb
# Initialize ChromaDB client
client = chromadb.Client()

# Create a collection
collection = client.get_or_create_collection("my_collection")

# Add embeddings and documents to ChromaDB
for idx, embedding in enumerate(embeddings):
    collection.add(
        documents=[nodes[idx].text],  # The document object (containing metadata and text)
        embeddings=[embedding],      # The embedding vector of the document
        metadatas=[{"id": f"doc_{idx}"}],  # Optional metadata for each document (e.g., document ID)
        ids=[f"doc_{idx}"]          # Unique ID for each document (e.g., doc_0, doc_1, etc.)
    )

Insert of existing embedding ID: doc_0
Add of existing embedding ID: doc_0
Insert of existing embedding ID: doc_1
Add of existing embedding ID: doc_1
Insert of existing embedding ID: doc_2
Add of existing embedding ID: doc_2
Insert of existing embedding ID: doc_3
Add of existing embedding ID: doc_3
Insert of existing embedding ID: doc_4
Add of existing embedding ID: doc_4
Insert of existing embedding ID: doc_5
Add of existing embedding ID: doc_5
Insert of existing embedding ID: doc_6
Add of existing embedding ID: doc_6
Insert of existing embedding ID: doc_7
Add of existing embedding ID: doc_7
Insert of existing embedding ID: doc_8
Add of existing embedding ID: doc_8
Insert of existing embedding ID: doc_9
Add of existing embedding ID: doc_9
Insert of existing embedding ID: doc_10
Add of existing embedding ID: doc_10
Insert of existing embedding ID: doc_11
Add of existing embedding ID: doc_11
Insert of existing embedding ID: doc_12
Add of existing embedding ID: doc_12
Insert of existing 

In [29]:
query = "What are transformers?"

# Get the embedding for the query
query_embedding = ollama_embedding.get_text_embedding(query)

# Perform the query and get the top 5 most relevant documents
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5
)

# Display the results
for result in results['documents']:
    print(result)

['Search [34] generates potential bounding boxes, a convolu-\ntional network extracts features, an SVM scores the boxes, a\nlinear model adjusts the bounding boxes, and non-max sup-\npression eliminates duplicate detections. Each stage of this\ncomplex pipeline must be precisely tuned independently\nand the resulting system is very slow, taking more than 40\nseconds per image at test time [14].\nYOLO shares some similarities with R-CNN. Each grid\ncell proposes a potential bounding boxes and scores those\nboxes using convolutional features. However, our system\nputs spatial constraints on the grid cell proposals which\nhelps mitigate multiple detections of the same object. Our\nsystem also proposes far fewer bounding boxes, only 98\nper image compared to about 2000 from Selective Search.\nFinally, our system combines these individual components\ninto a single, jointly optimized model.\nOther Fast Detectors Fast and Faster R-CNN focus on\nspeeding up the R-CNN framework by sharing compu

In [30]:
type(results) 

dict

In [31]:
# Assume you get the most relevant documents
relevant_docs = results['documents']

# Generate a response based on the context
response = llm.complete(
    prompt= relevant_docs[0][0][:100] + " " + query
)
print(response)


It seems like you're discussing the architecture of a deep learning model. 

Transformers are a type of neural network architecture introduced in 2017 by Vaswani et al. They were designed to handle sequential data such as text, speech, and time series data. The primary goal of a transformer is to process input sequences one token at a time.

A typical transformer model consists of an encoder and a decoder. 

1. **Encoder**: The encoder takes in the input sequence and processes it into a fixed-length vector representation called the embedding layer.
2. **Encoder-Decoder Path**: After embedding, the encoder generates key vectors (K) and value vectors (V) through self-attention mechanisms. These key vectors are then used to compute attention weights between each pair of tokens in the input sequence.
3. The attention mechanism allows the model to focus on different parts of the input sequence when computing the output.

The outputs from the encoder are then passed through a linear layer, w

In [66]:
# " ".join(relevant_docs[0]) + " " + query

'Recurrent models typically factor computation along the symbol positions of the input and output\nsequences. Aligning the positions to steps in computation time, they generate a sequence of hidden\nstates ht, as a function of the previous hidden state ht−1 and the input for position t. This inherently\nsequential nature precludes parallelization within training examples, which becomes critical at longer\nsequence lengths, as memory constraints limit batching across examples. Recent work has achieved\nsigniﬁcant improvements in computational efﬁciency through factorization tricks [18] and conditional\ncomputation [26], while also improving model performance in case of the latter. The fundamental\nconstraint of sequential computation, however, remains.\nAttention mechanisms have become an integral part of compelling sequence modeling and transduc-\ntion models in various tasks, allowing modeling of dependencies without regard to their distance in\nthe input or output sequences [2, 16]. 

In [34]:
# len(relevant_docs[0][0])

5587

In [35]:
relevant_docs

[['Search [34] generates potential bounding boxes, a convolu-\ntional network extracts features, an SVM scores the boxes, a\nlinear model adjusts the bounding boxes, and non-max sup-\npression eliminates duplicate detections. Each stage of this\ncomplex pipeline must be precisely tuned independently\nand the resulting system is very slow, taking more than 40\nseconds per image at test time [14].\nYOLO shares some similarities with R-CNN. Each grid\ncell proposes a potential bounding boxes and scores those\nboxes using convolutional features. However, our system\nputs spatial constraints on the grid cell proposals which\nhelps mitigate multiple detections of the same object. Our\nsystem also proposes far fewer bounding boxes, only 98\nper image compared to about 2000 from Selective Search.\nFinally, our system combines these individual components\ninto a single, jointly optimized model.\nOther Fast Detectors Fast and Faster R-CNN focus on\nspeeding up the R-CNN framework by sharing comp

In [38]:
# len(relevant_docs[0][1])

4194