# List Dtypes

In [1]:
import polars as pl

In [2]:
df_lists = pl.DataFrame({
    "ints": [[0, 1], [2, 3]],
    "floats": [[0.0, 1], [2, 3]],
    "strings": [["0", "1"], ["2", "3"]]
}, strict=False)
df_lists

ints,floats,strings
list[i64],list[f64],list[str]
"[0, 1]","[0.0, 1.0]","[""0"", ""1""]"
"[2, 3]","[2.0, 3.0]","[""2"", ""3""]"


In [3]:
df_lists[0, "ints"]

0
1


In [4]:
pl.Config.set_fmt_table_cell_list_len(20)

polars.config.Config

In [5]:
(
    df_lists
    .select(
        pl.col(pl.List(pl.Int64))
    )
)

ints
list[i64]
"[0, 1]"
"[2, 3]"


In [6]:
(
    pl.DataFrame(
        {
            "values": [
                [0, 1],
                [2, 3, 4],
                [4, 5, 6, 7, 8]
            ]
        }
    )
)

values
list[i64]
"[0, 1]"
"[2, 3, 4]"
"[4, 5, 6, 7, 8]"


In [7]:
(
    df_lists
    .with_columns(
        ints_array = pl.col("ints").cast(pl.Array(width=2, inner=pl.Int64))
    )
    .select("ints", "ints_array")
)

(Deprecated in version 0.20.31)
  ints_array = pl.col("ints").cast(pl.Array(width=2, inner=pl.Int64))


ints,ints_array
list[i64],"array[i64, 2]"
"[0, 1]","[0, 1]"
"[2, 3]","[2, 3]"


In [9]:
df_list = pl.DataFrame(
    {
        "id": ["a", "b"],
        "values": [
            [0, 1],
            [2, 3, 4]
        ]
    }
)
df_list

id,values
str,list[i64]
"""a""","[0, 1]"
"""b""","[2, 3, 4]"


In [10]:
(
    df_list
    .explode("values")
)

id,values
str,i64
"""a""",0
"""a""",1
"""b""",2
"""b""",3
"""b""",4


In [11]:
(
    df_list
    .explode("values")
    .with_columns(
        pl.col("values").rank().over("id").cast(pl.Int32).alias("rank")
    )
)

id,values,rank
str,i64,i32
"""a""",0,1
"""a""",1,2
"""b""",2,1
"""b""",3,2
"""b""",4,3


In [13]:
(
    df_list
    .explode("values")
    .with_columns(
        pl.col("values").rank().over("id").cast(pl.Int32).alias("rank")
    )
    .group_by("id")
    .agg(
        pl.col("values"),
        pl.col("rank")
    )
)

id,values,rank
str,list[i64],list[i32]
"""a""","[0, 1]","[1, 2]"
"""b""","[2, 3, 4]","[1, 2, 3]"


In [14]:
df_list_extra_column = pl.DataFrame(
    {
        "id": ["a", "b"],
        "id2": ["c", "d"],
        "values": [
            [0, 1],
            [2, 3, 4]
        ]
    }
)
df_list_extra_column

id,id2,values
str,str,list[i64]
"""a""","""c""","[0, 1]"
"""b""","""d""","[2, 3, 4]"


In [15]:
(
    df_list_extra_column
    .join(
        df_list_extra_column.pipe(
            lambda df: (
                df
                .select("id", "values")
                .explode("values")
                .with_columns(
                    pl.col("values").rank().over("id").cast(pl.Int32).alias("rank")
                )
                .group_by("id")
                .agg(
                    pl.col("rank")
                )
            )
        ),
        on="id",
        how="left"
    )
)

id,id2,values,rank
str,str,list[i64],list[i32]
"""a""","""c""","[0, 1]","[1, 2]"
"""b""","""d""","[2, 3, 4]","[1, 2, 3]"


In [16]:
df_list

id,values
str,list[i64]
"""a""","[0, 1]"
"""b""","[2, 3, 4]"


In [17]:
(
    df_list
    .with_columns(
        pl.col("values").list.to_struct().alias("value_struct")
    )
)

  pl.col("values").list.to_struct().alias("value_struct")


id,values,value_struct
str,list[i64],struct[2]
"""a""","[0, 1]","{0,1}"
"""b""","[2, 3, 4]","{2,3}"


In [18]:
(
    df_list
    .with_columns(
        pl.col("values").list.to_struct().alias("value_struct")
    )
    .unnest("value_struct")
)

  pl.col("values").list.to_struct().alias("value_struct")


id,values,field_0,field_1
str,list[i64],i64,i64
"""a""","[0, 1]",0,1
"""b""","[2, 3, 4]",2,3


In [20]:
df_embeddings = (
    pl.DataFrame(
        {
            "embeddings": [
                [0.0, 1],
                [2, 3],
                [4, 5]
            ]
        },
        strict=False
    )
)
df_embeddings

embeddings
list[f64]
"[0.0, 1.0]"
"[2.0, 3.0]"
"[4.0, 5.0]"


In [21]:
df_embeddings["embeddings"].to_numpy()

array([array([0., 1.]), array([2., 3.]), array([4., 5.])], dtype=object)

In [23]:
(
    df_embeddings["embeddings"]
    .explode()
    .to_numpy()
    .reshape(len(df_embeddings), -1)
)

array([[0., 1.],
       [2., 3.],
       [4., 5.]])

In [24]:
(
    df_embeddings
    .with_columns(
        pl.col("embeddings").cast(pl.Array(width=2, inner=pl.Float64))
    )
    ["embeddings"].to_numpy()
)

(Deprecated in version 0.20.31)
  pl.col("embeddings").cast(pl.Array(width=2, inner=pl.Float64))


array([[0., 1.],
       [2., 3.],
       [4., 5.]])