## Prerequisites
- Download llama-index documentation
- Create chunks with markdown parsing
- Embed chunks (we used `BAAI/bge-small-en-v1.5`)
- Store in LanceDB

In [None]:
#%pip install llama-index llama-index-vector-stores-lancedb numpy pandas scikit-learn plotly

In [6]:
from llama_index.vector_stores.lancedb import LanceDBVectorStore
import numpy as np
import pandas as pd

In [7]:
def extract_embeddings(_vec_store: LanceDBVectorStore) -> pd.DataFrame:
    tbl = _vec_store._table
    # work-around, see: https://github.com/lancedb/lancedb/issues/2046
    df = tbl.head(tbl.count_rows()).to_pandas()
    return df

In [8]:
vec_store = LanceDBVectorStore(
    uri="./data/lancedb", 
    mode="overwrite", 
    query_type="hybrid", 
    refine_factor=30, 
    nprobes=100,
)

In [9]:
df = extract_embeddings(vec_store)
df.head()

Unnamed: 0,id,doc_id,vector,text,metadata
0,30e18eec-8c2e-4737-b640-ef5f24684708,129885b3-f6bb-4fd1-85e9-8af445bb69e3,"[-0.045516003, -0.025388127, 0.004847408, -0.0...",::: llama_index.question_gen.openai\n optio...,"{'_node_content': '{""id_"": ""30e18eec-8c2e-4737..."
1,eb273b19-15d4-4f3d-994b-1705d450d409,d77cbb09-ee2a-4450-a8ac-19e48a5f8dbe,"[-0.079640314, 0.0019300836, 0.018389827, -0.0...",::: llama_index.readers.agent_search\n opti...,"{'_node_content': '{""id_"": ""eb273b19-15d4-4f3d..."
2,998ba7bc-0623-49e7-b4cb-7df944b9e0bf,742bebe3-2bf4-4321-bb7a-201c54ec4edf,"[-0.08911939, -0.012804543, 0.051525857, -0.02...",::: llama_index.readers.airbyte_cdk\n optio...,"{'_node_content': '{""id_"": ""998ba7bc-0623-49e7..."
3,d3dd102d-4ee9-47cc-ab96-04bc13c3f7a9,21883516-2c9c-4ddc-b15b-3e8551b2378c,"[-0.08306652, -0.03686305, 0.05229051, -0.0256...",::: llama_index.readers.airbyte_gong\n opti...,"{'_node_content': '{""id_"": ""d3dd102d-4ee9-47cc..."
4,87c36450-ece1-4689-8c78-a391415f9663,d37225d5-9ccb-4cd6-bf6f-186ec85ca24c,"[-0.057715178, -0.024075592, 0.03144333, -0.02...",::: llama_index.readers.airbyte_hubspot\n o...,"{'_node_content': '{""id_"": ""87c36450-ece1-4689..."


In [10]:
chunk = df.sample(1, random_state=44).text.values[0]
print(chunk)

## Custom Installation from Pip

If you aren't using OpenAI, or want a more selective installation, you can install individual packages as needed.

For example, for a local setup with Ollama and HuggingFace embeddings, the installation might look like:

```
pip install llama-index-core llama-index-readers-file llama-index-llms-ollama llama-index-embeddings-huggingface
```

[Check out our Starter Example with Local Models](starter_example_local.md)

A full guide to using and configuring LLMs is available [here](../module_guides/models/llms.md).

A full guide to using and configuring embedding models is available [here](../module_guides/models/embeddings.md).


## Cluster embeddings

In [11]:
from sklearn.cluster import KMeans

In [12]:
X = np.vstack(df.vector.to_numpy())
assert X.shape == (df.shape[0], 384) # change this if you use different embedding size than 384

In [13]:
kmeans = KMeans(n_clusters=20)
c = kmeans.fit_predict(X)
df["cluster"] = c
df.head()

Unnamed: 0,id,doc_id,vector,text,metadata,cluster
0,30e18eec-8c2e-4737-b640-ef5f24684708,129885b3-f6bb-4fd1-85e9-8af445bb69e3,"[-0.045516003, -0.025388127, 0.004847408, -0.0...",::: llama_index.question_gen.openai\n optio...,"{'_node_content': '{""id_"": ""30e18eec-8c2e-4737...",2
1,eb273b19-15d4-4f3d-994b-1705d450d409,d77cbb09-ee2a-4450-a8ac-19e48a5f8dbe,"[-0.079640314, 0.0019300836, 0.018389827, -0.0...",::: llama_index.readers.agent_search\n opti...,"{'_node_content': '{""id_"": ""eb273b19-15d4-4f3d...",2
2,998ba7bc-0623-49e7-b4cb-7df944b9e0bf,742bebe3-2bf4-4321-bb7a-201c54ec4edf,"[-0.08911939, -0.012804543, 0.051525857, -0.02...",::: llama_index.readers.airbyte_cdk\n optio...,"{'_node_content': '{""id_"": ""998ba7bc-0623-49e7...",2
3,d3dd102d-4ee9-47cc-ab96-04bc13c3f7a9,21883516-2c9c-4ddc-b15b-3e8551b2378c,"[-0.08306652, -0.03686305, 0.05229051, -0.0256...",::: llama_index.readers.airbyte_gong\n opti...,"{'_node_content': '{""id_"": ""d3dd102d-4ee9-47cc...",2
4,87c36450-ece1-4689-8c78-a391415f9663,d37225d5-9ccb-4cd6-bf6f-186ec85ca24c,"[-0.057715178, -0.024075592, 0.03144333, -0.02...",::: llama_index.readers.airbyte_hubspot\n o...,"{'_node_content': '{""id_"": ""87c36450-ece1-4689...",2


In [14]:
df.cluster.value_counts()

cluster
2     238
15    198
18    136
7     131
19    121
11    100
3      88
0      67
5      66
8      59
12     56
4      55
10     53
17     38
16     36
9      32
1      30
14     29
13     22
6      17
Name: count, dtype: int64

### Viz

In [15]:
import plotly.express as px
from plotly.express.colors import qualitative
from plotly.graph_objs import FigureWidget
from sklearn.manifold import TSNE

In [16]:
def print_samples(df, indices, n_samples=10):
    for _, row in df.iloc[indices][['metadata', 'text']].head(n_samples).iterrows():
        print(f"""[Filename]: {row.metadata['filename']}
[Text]: 
{row.text[:500]}
{"-"*50}
""")

In [17]:
X_tsne = TSNE(n_components=2).fit_transform(X)

In [18]:
kmeans_tsne = KMeans(n_clusters=20)
c_tsne = kmeans_tsne.fit_predict(X_tsne)

In [19]:
df["cluster_tsne"] = c_tsne

In [20]:
df["cluster_tsne"].value_counts()

cluster_tsne
2     112
16    110
11    108
12    107
0      98
4      95
18     91
6      85
19     79
9      77
13     76
10     72
3      71
17     69
5      65
1      62
8      60
14     51
15     47
7      37
Name: count, dtype: int64

In [21]:
# Create a scatter plot with hover annotations
annos = df.text.str.slice(0, 50)
fig = px.scatter(
    x=X_tsne[:, 0], 
    y=X_tsne[:, 1], 
    hover_name=annos,
    # color=c,
    color=c_tsne,
    color_discrete_sequence=qualitative.Set1,
    labels={"color": "Cluster"},
    title="Chunk Embeddings",
    width=800, height=800,
)

# Enable selection of points
fig.update_traces(marker=dict(size=5), selector=dict(mode='markers'))

fig_widget = FigureWidget(fig)

# Global variable to store selected indices
selected_indices = []

# Define a callback to capture selected points
def on_selection(trace, points, state):
    global selected_indices
    selected_indices = points.point_inds  # Store selected indices

# Attach the callback to the scatter trace
scatter_trace = fig_widget.data[0]
scatter_trace.on_selection(on_selection)

# Display the interactive plot
fig_widget

FigureWidget({
    'data': [{'hovertemplate': ('<b>%{hovertext}</b><br><br>x=%' ... '%{marker.color}<extra></extra>'),
              'hovertext': array(['::: llama_index.question_gen.openai\n    options:\n ',
                                  '::: llama_index.readers.agent_search\n    options:\n',
                                  '::: llama_index.readers.airbyte_cdk\n    options:\n ', ...,
                                  '# Querying CSVs\n\nTODO', '# Parsing Tables and Charts\n\nTODO',
                                  '# Text to SQL\n\nTODO'], dtype=object),
              'legendgroup': '',
              'marker': {'color': array([ 2, 11, 12, ...,  2,  0,  2], dtype=int32),
                         'coloraxis': 'coloraxis',
                         'size': 5,
                         'symbol': 'circle'},
              'mode': 'markers',
              'name': '',
              'showlegend': False,
              'type': 'scattergl',
              'uid': '92b29f44-e937-4b2e-b715-3ddc8

In [22]:
df.loc[selected_indices].cluster_tsne.value_counts()

Series([], Name: count, dtype: int64)

In [23]:
print_samples(df, selected_indices, n_samples=10)

## Cluster sampling

In [24]:
# Group by cluster, sample 10 chunks from each
df_samples = df.groupby("cluster_tsne").apply(
    lambda x: x.sample(10, random_state=44).index.values,
    include_groups=False,
)#.reset_index(name='samples')#.explode('samples').reset_index(drop=True)

In [25]:
df_samples.head()

cluster_tsne
0    [783, 851, 600, 1432, 746, 1081, 525, 599, 597...
1    [1511, 992, 365, 1501, 991, 1188, 990, 1496, 1...
2    [103, 170, 324, 264, 175, 80, 21, 311, 1088, 176]
3    [1428, 1568, 590, 389, 1426, 1452, 1453, 888, ...
4    [822, 834, 804, 1363, 819, 848, 577, 803, 801,...
dtype: object

In [26]:
print_samples(df, df_samples[4], n_samples=10)

[Filename]: index.md
[Text]: 
### Local Cache Management

Once you have a pipeline, you may want to store and load the cache.

```python
# save
pipeline.persist("./pipeline_storage")

# load and restore state
new_pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=25, chunk_overlap=0),
        TitleExtractor(),
    ],
)
new_pipeline.load("./pipeline_storage")

# will run instantly due to the cache
nodes = pipeline.run(documents=[Document.example()])
```

If the cache becomes too large, you
--------------------------------------------------

[Filename]: index.md
[Text]: 
### Standalone Usage

Node parsers can be used on their own:

```python
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter

node_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=20)

nodes = node_parser.get_nodes_from_documents(
    [Document(text="long text")], show_progress=False
)
```
---------------------------------------------

In [52]:
df.iloc[df_samples.explode()].to_parquet(
    './data/eval_sampled.parquet'
)