# Analyze Elasticsearch ingest pipelines

This notebook gives insight into ingest pipelines:
- Obsolete/Deprecated ingest pipelines

## Prepare environment

### Install required Python packages

In [None]:
pip install pandas~=2.2 elasticsearch~=8.15

### Restart Jupyter kernel

In [None]:
# Shut the current kernel down; passing True asks for a restart rather than a plain shutdown.
# NOTE(review): presumably done so the freshly installed packages are importable - confirm.
get_ipython().kernel.do_shutdown(True)

### Import packages

In [None]:
import getpass
import time
from pathlib import Path
from elasticsearch import Elasticsearch
from IPython.display import display, FileLink
import pandas as pd

### Input Elasticsearch connection settings

To connect to an Elasticsearch instance, its hostname and a valid API key are required.

API key can be created via Kibana - [https://www.elastic.co/guide/en/kibana/current/api-keys.html](https://www.elastic.co/guide/en/kibana/current/api-keys.html)

In [None]:
# Prompt for the cluster address interactively; surrounding whitespace is stripped.
elasticsearch_host = input("Enter Elasticsearch hostname: ").strip()

In [None]:
# Read the API key via getpass so the secret is not echoed back; surrounding whitespace is stripped.
elasticsearch_api_key = getpass.getpass("Enter Elasticsearch API key: ").strip()

### Create Elasticsearch client and connect the cluster

In [None]:
# Build the client from the host and API key entered above.
# NOTE(review): verify_certs=False turns off TLS certificate validation entirely;
# consider supplying the private CA to the client instead where it is available.
client = Elasticsearch(
    hosts=elasticsearch_host,
    api_key=elasticsearch_api_key,
    verify_certs=False,             # The Elasticsearch certificate is signed by a non-public authority, so verification is skipped
    ssl_show_warn=False             # Unverified SSL/TLS connections cause a lot of warnings, so they are suppressed
)

In [None]:
# Quick connectivity check: print whatever the cat health API returns for the cluster.
print(client.cat.health())

## Analyze ingest pipelines in the cluster

### Find unused ingest pipelines

> **WARNING**: all of the steps below must be executed to ensure the results are correct

#### Determine ingest pipelines referenced by the index and component templates

The first places to look for ingest pipeline usage are index and component templates.

In [None]:
# This helper is used by several cells below, so this cell must be executed first.
# Get ingest pipelines from index settings
def extract_pipelines_from_settings(settings):
    """
    Extract pipeline names from settings dictionary.

    Reads the nested ``index.default_pipeline`` and ``index.final_pipeline``
    settings. Missing, empty and ``None`` values are skipped, as is the
    reserved value ``_none``, which Elasticsearch uses to mean "no pipeline"
    rather than to name a real one.

    Args:
    settings (dict): Settings dictionary from a template or an index.

    Returns:
    list: A list of pipeline names (default pipeline first, then final pipeline).
    """
    # Tolerate a missing/None settings dict or a missing/None "index" section
    index_settings = (settings or {}).get("index") or {}

    pipelines = []
    for setting_name in ("default_pipeline", "final_pipeline"):
        pipeline_name = index_settings.get(setting_name)
        # "_none" is a marker for "no pipeline configured", not a pipeline name
        if pipeline_name and pipeline_name != "_none":
            pipelines.append(pipeline_name)

    return pipelines

The code below extracts the names of ingest pipelines that are referenced in index and component templates.

In [None]:
# Get ingest pipelines referenced by index or component templates
def pipelines_from_templates(client, template_type):
    """
    Collect the ingest pipelines named in the settings of index or component templates.

    Args:
    client (Elasticsearch): Elasticsearch client instance.
    template_type (str): Which templates to inspect, either 'index' or 'component'.

    Returns:
    list: Pipeline names in template order; duplicates are not removed.

    Raises:
    ValueError: If the template_type is not one of 'index' or 'component'.
    """
    # Reject unknown template types before touching the cluster
    if template_type not in ("index", "component"):
        raise ValueError("template_type must be one of: 'index', 'component'.")

    # Component templates live under the cluster API, index templates under the indices API
    response = (
        client.cluster.get_component_template()
        if template_type == "component"
        else client.indices.get_index_template()
    )

    # Both responses share one shape, differing only by the type prefix of the keys
    list_key = f"{template_type}_templates"
    body_key = f"{template_type}_template"

    return [
        pipeline_name
        for entry in response[list_key]
        for pipeline_name in extract_pipelines_from_settings(
            entry.get(body_key, {}).get("template", {}).get("settings", {})
        )
    ]

# Gather references from index templates first, then from component templates
pipelines = [
    *pipelines_from_templates(client, "index"),
    *pipelines_from_templates(client, "component"),
]
# Round-trip through a set to drop repeated names
unique_pipelines_in_templates = list(set(pipelines))
print(f"There are {len(unique_pipelines_in_templates)} unique pipelines referenced by templates.")

#### Determine ingest pipelines that are referenced by existing indices.

> **WARNING**: executing the code below can take a long time, depending on the number of indices in the cluster. Note that the batch loop sleeps briefly after each batch; the batch size and the delay can be adjusted.

This can be achieved in at least two ways:
- By using the indices API to query the Elasticsearch cluster for all indices, e.g., GET /*.
- By using the cat API to query for index names from the cluster, and then retrieving the index configurations in batches.

The code below uses the second way to avoid heavy requests to Elasticsearch cluster

In [None]:
def get_indices_names(client):
    """
    List the name of every index reported by the cat indices API.

    The request asks for the ``index`` column only, in JSON format, with
    ``expand_wildcards="all"``.

    Args:
    client (Elasticsearch): The Elasticsearch client instance.

    Returns:
    list: A list of index names.
    """
    return [
        row["index"]
        for row in client.cat.indices(h="index", format="json", expand_wildcards="all")
    ]

def batch(sequence, batch_size=1):
    """
    Generate batches of a specified size from a sequence.

    The final batch is shorter when the sequence length is not a multiple of
    ``batch_size``. An empty sequence yields nothing.

    Args:
    sequence (list): The sequence to batch; must support ``len()`` and slicing.
    batch_size (int): The size of each batch; must be at least 1.

    Yields:
    list: A batch from the sequence.

    Raises:
    ValueError: If batch_size is smaller than 1 (raised when iteration starts).
    """
    # A negative step would make range() empty and silently drop every item
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    for start_index in range(0, len(sequence), batch_size):
        # Slicing clamps at the end of the sequence, so no explicit bound is needed
        yield sequence[start_index:start_index + batch_size]

indices_names = get_indices_names(client)
print(f"There are {len(indices_names)} indices in cluster")

# Request settings for ten indices per call instead of one large request
pipelines = []
for indices_batch in batch(indices_names, 10):
    response = client.indices.get(index=indices_batch, features="settings")
    for index_info in response.values():
        pipelines += extract_pipelines_from_settings(index_info["settings"])
    # Short pause between batches to keep the load on the cluster low
    time.sleep(0.05)

# Collapse repeated names into a unique list
unique_pipelines_in_indices = list(set(pipelines))
print(f"There are {len(unique_pipelines_in_indices)} unique pipelines referenced by indices.")

#### Determine ingest pipelines that are referenced in ingest pipelines

> NOTE: an ingest pipeline may refer to a non-existent pipeline

In [None]:
def _collect_pipeline_references(processors):
    """
    Recursively collect pipeline names referenced by a list of processors.

    Besides top-level ``pipeline`` processors, this also follows the processor
    wrapped by a ``foreach`` processor and the processors listed in any
    ``on_failure`` handler, so references made only from those places are
    not missed.

    Args:
    processors (list): Processor definitions, each a dict mapping the
        processor type to its options. ``None`` is treated as empty.

    Returns:
    list: Referenced pipeline names; duplicates are not removed.
    """
    names = []
    for processor in processors or []:
        for processor_type, options in processor.items():
            if not isinstance(options, dict):
                continue
            if processor_type == "pipeline":
                name = options.get("name")
                if name:
                    names.append(name)
            elif processor_type == "foreach":
                # foreach wraps exactly one processor, which may itself be a pipeline processor
                wrapped = options.get("processor")
                if isinstance(wrapped, dict):
                    names.extend(_collect_pipeline_references([wrapped]))
            # Any processor can carry its own failure handlers
            names.extend(_collect_pipeline_references(options.get("on_failure")))
    return names


pipelines = []
response = client.ingest.get_pipeline()
for pipeline_info in response.values():
    # Look at the main processor chain and at the pipeline-level failure handlers
    pipelines.extend(_collect_pipeline_references(pipeline_info.get("processors")))
    pipelines.extend(_collect_pipeline_references(pipeline_info.get("on_failure")))

# Deduplicate pipelines
unique_pipelines_in_pipelines = list(set(pipelines))
print(f"There are {len(unique_pipelines_in_pipelines)} unique pipelines referenced by other ingest pipelines.")

#### Analyze unused pipelines

The process compares the ingest pipelines installed in the cluster with the combined list of ingest pipelines referenced by:
- Index and component templates
- Existing indices
- Other ingest pipelines

Ingest pipelines that are installed in the cluster but not referenced by any of these entities are written to a CSV file.


In [None]:
# Merge the three reference sources into one duplicate-free list
unique_referenced_pipelines = list({
    *unique_pipelines_in_templates,
    *unique_pipelines_in_indices,
    *unique_pipelines_in_pipelines,
})
print(f"There are {len(unique_referenced_pipelines)} unique pipelines referenced in cluster.")

# The response is keyed by pipeline name, so its keys are the installed pipelines
response = client.ingest.get_pipeline()
pipelines_in_cluster = list(response.keys())
print(f"There are {len(pipelines_in_cluster)} total pipelines in the cluster.")

In [None]:
df1 = pd.DataFrame(pipelines_in_cluster, columns=["name"])
df2 = pd.DataFrame(unique_referenced_pipelines, columns=["name"])

# Left-join installed pipelines against referenced ones; the indicator column
# marks rows found only on the left side, i.e. installed but never referenced
diff = df1.merge(df2, how="left", on="name", indicator=True)
unused_pipelines = diff.loc[diff["_merge"] == "left_only"]

# Write the result to temp/obsolete_ingest_pipelines.csv, creating the folder if needed
output_dir = Path("temp")
output_dir.mkdir(parents=True, exist_ok=True)
csv_path = output_dir / "obsolete_ingest_pipelines.csv"
unused_pipelines.to_csv(csv_path, index=False)

# Show a clickable link to the generated file
display(FileLink(csv_path, result_html_prefix="Open CSV file: "))