# Categorical

In [1]:
import polars as pl

In [2]:
df = (
    pl.DataFrame(
        {
            "text": ["cat", "dog", "rabbit", "cat"]
        }
    )
)
df

text
str
"""cat"""
"""dog"""
"""rabbit"""
"""cat"""


In [3]:
(
    df
    .with_columns(
        pl.col("text").cast(pl.Categorical).alias("text_cat")
    )
)

text,text_cat
str,cat
"""cat""","""cat"""
"""dog""","""dog"""
"""rabbit""","""rabbit"""
"""cat""","""cat"""


In [4]:
df_physical = (
    pl.DataFrame(
        {"strings": ["c", "b", "a", "c"], "values": [0, 1, 2, 3]}
    )
    .with_columns(
        pl.col("strings").cast(pl.Categorical).alias("cats")
    )
)
df_physical

strings,values,cats
str,i64,cat
"""c""",0,"""c"""
"""b""",1,"""b"""
"""a""",2,"""a"""
"""c""",3,"""c"""


In [7]:
(
    df_physical
    .with_columns(
        pl.col("cats").to_physical().alias("cat_physical")
    )
)
df_physical.sort("cats")

strings,values,cats
str,i64,cat
"""c""",0,"""c"""
"""c""",3,"""c"""
"""b""",1,"""b"""
"""a""",2,"""a"""


In [11]:
# df_lexical = (
#     df_physical
#     .with_columns(
#         pl.col("cats").cat.set_ordering("lexical")
#     )
# )
# df_lexical.sort("cats")

In [13]:
df = (
    pl.DataFrame(
        {
            "strings": ["c", "b", "a", "c"],
            "values":[1, 2, 3, 4]
        }
    )
    .with_columns(
        pl.col("strings").cast(pl.Categorical).alias("cats")
    )
)
df

strings,values,cats
str,i64,cat
"""c""",1,"""c"""
"""b""",2,"""b"""
"""a""",3,"""a"""
"""c""",4,"""c"""


In [15]:
(
    df
    .filter(
        cats="b"
    )
)

strings,values,cats
str,i64,cat
"""b""",2,"""b"""


In [16]:
with pl.StringCache():
    df = (
        pl.DataFrame(
            {
                "strings": ["c", "b", "a", "c"],
                "values":[1, 2, 3, 4]
            }
        )
        .with_columns(
            pl.col("strings").cast(pl.Categorical).alias("cats")
        )
        .filter(
            pl.col("cats").is_in(["b", "c"])
        )
    )
df

strings,values,cats
str,i64,cat
"""c""",1,"""c"""
"""b""",2,"""b"""
"""c""",4,"""c"""


In [17]:
pl.enable_string_cache()

In [18]:
pl.using_string_cache()

True

In [20]:
pl.disable_string_cache()