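## Segments

This example profiles a pandas DataFrame with whylogs using segments: the data is split on the values of one column, a row filter is attached to the partition, and the resulting per-segment profiles are then inspected.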

[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/whylabs/whylogs/blob/mainline/python/examples/advanced/Segments.ipynb)

In [1]:
import os

# Name of the environment variable used to opt out of whylogs usage analytics.
ANALYTICS_OPT_OUT = "WHYLOGS_NO_ANALYTICS"
# Index os.environ with the constant itself, not the literal string
# "ANALYTICS_OPT_OUT"; otherwise WHYLOGS_NO_ANALYTICS is never set.
# This cell runs before whylogs is imported so the setting is visible at import time.
os.environ[ANALYTICS_OPT_OUT] = "1"

In [None]:
# Uncomment and run to install whylogs with its optional `whylabs` extra
# if it is not already available in this environment.
# %pip install whylogs[whylabs]

In [1]:
import numpy as np
import pandas as pd

import whylogs as why
from whylogs.api.logger.result_set import SegmentedResultSet
from whylogs.core.metrics.metrics import CardinalityMetric, DistributionMetric
from whylogs.core.schema import DatasetSchema
from whylogs.core.segment import Segment
from whylogs.core.segmentation_partition import (
    ColumnMapperFunction,
    SegmentationPartition,
    SegmentFilter,
    segment_on_column,
)

In [4]:
# --- Build a small synthetic frame -------------------------------------------
input_rows = 100
segment_column = "col3"
number_of_segments = 5
# Rows pass the segment filter only when col1 is strictly greater than this value.
col1_filter_threshold = 49
# Label of the segment inspected in detail below.
first_segment_label = "x0"
d = {
    "col1": list(range(input_rows)),
    "col2": [i * i * 1.1 for i in range(input_rows)],
    # Cycle through the labels x0 .. x{number_of_segments - 1}.
    segment_column: [f"x{i % number_of_segments}" for i in range(input_rows)],
}

df = pd.DataFrame(data=d)
print(df.describe())

# --- Segment on one column and attach a row filter ---------------------------
# Use `segment_column` rather than repeating the literal so the cell stays
# consistent if the column name above is changed.
segmentation_partition = segment_on_column(segment_column)
# The lambda's `df` is its own parameter, not the notebook-level `df`.
segmentation_partition.filter = SegmentFilter(
    filter_function=lambda df: df.col1 > col1_filter_threshold
)
test_segments = {segmentation_partition.name: segmentation_partition}
results: SegmentedResultSet = why.log(df, schema=DatasetSchema(segments=test_segments))
assert results.count == number_of_segments
print(f"After profiling the result set has: {results.count} segments")

# --- Inspect the partition and its segments ----------------------------------
partitions = results.partitions
assert len(partitions) == 1
partition = partitions[0]
segments = results.segments_in_partition(partition)
assert len(segments) == number_of_segments

# --- Check the first segment's profile ---------------------------------------
first_segment: Segment = next(iter(segments))
first_segment_profile = results.profile(first_segment)
assert first_segment.key == (first_segment_label,)
assert first_segment_profile is not None
# NOTE(review): `_columns` / `_schema` are private attributes; kept as in the
# original because no public accessor is visible in this notebook.
assert first_segment_profile._columns["col1"]._schema.dtype == np.int64
assert first_segment_profile._columns["col2"]._schema.dtype == np.float64
assert first_segment_profile._columns[segment_column]._schema.dtype.name == "object"
segment_distribution: DistributionMetric = (
    first_segment_profile.view().get_column("col1").get_metric("distribution")
)
count = segment_distribution.n
assert count is not None
# Expected row count for the first segment after filtering, derived from the
# frame instead of hard-coded (10 with the values above).
expected_first_segment_rows = int(
    ((df["col1"] > col1_filter_threshold) & (df[segment_column] == first_segment_label)).sum()
)
assert count == expected_first_segment_rows

             col1          col2
count  100.000000    100.000000
mean    49.500000   3611.850000
std     29.011492   3264.992284
min      0.000000      0.000000
25%     24.750000    674.025000
50%     49.500000   2695.550000
75%     74.250000   6064.575000
max     99.000000  10781.100000
After profiling the result set has: 5 segments
