In [None]:
import marimo as mo

Install the ChromaDB Evaluation Framework package

For `pip` users:


```
pip install git+https://github.com/brandonstarxel/chunking_evaluation.git
```

In [None]:
import torch

# Pick the compute device. `torch.cuda.is_available` is a function and must be
# *called*; the bare function object is always truthy and would select 'cuda'
# even on CPU-only machines.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using CUDA ? {'YES' if device == 'cuda' else 'NO'}")
# Only query the GPU name when CUDA was actually selected; `device` is a
# non-empty string in both cases, so a plain truthiness check is always true.
if device == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")

Using CUDA ? YES
GPU: NVIDIA GeForce GTX 1660 Ti


### Import and dependencies

In [None]:
import os
import pandas as pd

from chunking_evaluation import GeneralEvaluation, SyntheticEvaluation, BaseChunker
from chromadb.utils import embedding_functions

### Naive chunking strategy

We'll use the `BaseChunker` class to define our own chunker. At its core, `BaseChunker` is very simple:

```python
class BaseChunker(ABC):
    @abstractmethod
    def split_text(self, text: str) -> list[str]:
        pass
```

It expects only a `split_text` method that takes in a string and returns a list of strings, which are our chunks. The transformation along the way can be defined more creatively.

We'll use this as a base to reimplement our naive chunker.

In [None]:
from spacy.lang.en import English


class SentenceChunker(BaseChunker):
    """Naive chunker that groups consecutive sentences into fixed-size chunks.

    Sentences are detected with spaCy's `sentencizer` pipe on a blank English
    pipeline, then joined `sentences_per_chunk` at a time.
    """

    def __init__(self, sentences_per_chunk: int = 3):
        """Set up the sentence splitter.

        Args:
            sentences_per_chunk: Number of sentences joined into each chunk.
                Must be at least 1.

        Raises:
            ValueError: If `sentences_per_chunk` is smaller than 1.
        """
        # Fail fast: a size of 0 would make range() raise inside split_text,
        # and a negative size would silently yield no chunks at all.
        if sentences_per_chunk < 1:
            raise ValueError(
                f"sentences_per_chunk must be >= 1, got {sentences_per_chunk}"
            )
        self.sentences_per_chunk = sentences_per_chunk
        self.nlp = English()
        self.nlp.add_pipe("sentencizer")

    def split_text(self, text: str) -> list[str]:
        """Split `text` into chunks of `sentences_per_chunk` sentences each.

        Args:
            text: The document to chunk.

        Returns:
            The chunks in document order; each chunk is its sentences joined
            by a single space. The last chunk may hold fewer sentences.
            An empty (or otherwise falsy) `text` gives an empty list.
        """
        if not text:
            return []

        chunk_size = self.sentences_per_chunk
        sentences = [str(sent) for sent in self.nlp(text).sents]

        # Step through the sentence list one window of `chunk_size` at a time.
        return [
            ' '.join(sentences[start:start + chunk_size])
            for start in range(0, len(sentences), chunk_size)
        ]

### Defining an embedding function and chunker

In [None]:
# Sentence-transformer embedding function used by the evaluation.
# Pass the `device` variable computed in the CUDA-check cell, not the literal
# string 'device', which is not a valid device name.
sent_trans_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name='all-mpnet-base-v2',
    device=device
)

# Naive chunker configured to put 10 sentences in each chunk.
sentence_chunker = SentenceChunker(sentences_per_chunk = 10)

# Display the embedding function's class name as the cell output.
sent_trans_ef.__class__.__name__

### Start General Evaluation

In [None]:

# Evaluation object built with its default arguments.
evaluation = GeneralEvaluation()

# Run the evaluation for the naive sentence chunker and the sentence-transformer
# embedding function; `db_to_save_chunks` is given the path below.
results = evaluation.run(sentence_chunker, sent_trans_ef, db_to_save_chunks="datasets/general_evaluation/naive-configuration")


<span class="codehilite"><div class="highlight"><pre><span></span><span class="gt">Traceback (most recent call last):</span>
  File <span class="nb">&quot;/home/saladass/crafts/rag-int14124-final/.venv/lib/python3.11/site-packages/marimo/_runtime/executor.py&quot;</span>, line <span class="m">138</span>, in <span class="n">execute_cell</span>
<span class="w">    </span><span class="n">exec</span><span class="p">(</span><span class="n">cell</span><span class="o">.</span><span class="n">body</span><span class="p">,</span> <span class="n">glbls</span><span class="p">)</span>
  File <span class="nb">&quot;/tmp/marimo_8279/__marimo__cell_HpiH_.py&quot;</span>, line <span class="m">5</span>, in <span class="n">&lt;module&gt;</span>
<span class="w">    </span><span class="n">results</span> <span class="o">=</span> <span class="n">evaluation</span><span class="o">.</span><span class="n">run</span><span class="p">(</span>
<span class="w">              </span><span class="pm">^^^^^^^^^^^^^^^</sp

> /home/saladass/crafts/rag-int14124-final/.venv/lib/python3.11/site-packages/chunking_evaluation/evaluation_framework/base_evaluation.py(272)_scores_from_dataset_and_retrievals()
-> precision_score = numerator_value / precision_denominator
(Pdb) 

 where


  /home/saladass/crafts/rag-int14124-final/.venv/lib/python3.11/site-packages/marimo/_runtime/executor.py(138)execute_cell()
-> exec(cell.body, glbls)
  /tmp/marimo_8279/__marimo__cell_HpiH_.py(5)<module>()
-> results = evaluation.run(
  /home/saladass/crafts/rag-int14124-final/.venv/lib/python3.11/site-packages/chunking_evaluation/evaluation_framework/base_evaluation.py(417)run()
-> iou_scores, recall_scores, precision_scores = self._scores_from_dataset_and_retrievals(retrievals['metadatas'], highlighted_chunks_count)
> /home/saladass/crafts/rag-int14124-final/.venv/lib/python3.11/site-packages/chunking_evaluation/evaluation_framework/base_evaluation.py(272)_scores_from_dataset_and_retrievals()
-> precision_score = numerator_value / precision_denominator
(Pdb) 

 vals


*** NameError: name 'vals' is not defined
(Pdb) 

 v


*** NameError: name 'v' is not defined
(Pdb) 

 help



Documented commands (type help <topic>):
EOF    c          d        h         list      q        rv       undisplay
a      cl         debug    help      ll        quit     s        unt      
alias  clear      disable  ignore    longlist  r        source   until    
args   commands   display  interact  n         restart  step     up       
b      condition  down     j         next      return   tbreak   w        
break  cont       enable   jump      p         retval   u        whatis   
bt     continue   exit     l         pp        run      unalias  where    

Miscellaneous help topics:
exec  pdb

(Pdb) 

 rv


*** Not yet returned!
(Pdb) 

 p


*** SyntaxError: invalid syntax
(Pdb) 

 precision_denominator


0
(Pdb) 

 denomiator_chunks_sets


*** NameError: name 'denomiator_chunks_sets' is not defined
(Pdb) 

 numerator_sets


[]
(Pdb) 

 denominator_chunks_sets


[]
(Pdb) 