# List Dtypes

In [1]:
import polars as pl

In [2]:
df_lists = pl.DataFrame({
    "ints": [[0, 1], [2, 3]],
    "floats": [[0.0, 1], [2, 3]],
    "strings": [["0", "1"], ["2", "3"]]
}, strict=False)
df_lists

ints,floats,strings
list[i64],list[f64],list[str]
"[0, 1]","[0.0, 1.0]","[""0"", ""1""]"
"[2, 3]","[2.0, 3.0]","[""2"", ""3""]"


In [3]:
df_lists[0, "ints"]

0
1


In [4]:
pl.Config.set_fmt_table_cell_list_len(20)

polars.config.Config

In [5]:
(
    df_lists
    .select(
        pl.col(pl.List(pl.Int64))
    )
)

ints
list[i64]
"[0, 1]"
"[2, 3]"


In [6]:
(
    pl.DataFrame(
        {
            "values": [
                [0, 1],
                [2, 3, 4],
                [4, 5, 6, 7, 8]
            ]
        }
    )
)

values
list[i64]
"[0, 1]"
"[2, 3, 4]"
"[4, 5, 6, 7, 8]"


In [7]:
(
    df_lists
    .with_columns(
        ints_array = pl.col("ints").cast(pl.Array(width=2, inner=pl.Int64))
    )
    .select("ints", "ints_array")
)

(Deprecated in version 0.20.31)
  ints_array = pl.col("ints").cast(pl.Array(width=2, inner=pl.Int64))


ints,ints_array
list[i64],"array[i64, 2]"
"[0, 1]","[0, 1]"
"[2, 3]","[2, 3]"


In [8]:
df_list = pl.DataFrame(
    {
        "id": ["a", "b"],
        "values": [
            [0, 1],
            [2, 3, 4]
        ]
    }
)
df_list

id,values
str,list[i64]
"""a""","[0, 1]"
"""b""","[2, 3, 4]"


In [9]:
(
    df_list
    .explode("values")
)

id,values
str,i64
"""a""",0
"""a""",1
"""b""",2
"""b""",3
"""b""",4


In [10]:
(
    df_list
    .explode("values")
    .with_columns(
        pl.col("values").rank().over("id").cast(pl.Int32).alias("rank")
    )
)

id,values,rank
str,i64,i32
"""a""",0,1
"""a""",1,2
"""b""",2,1
"""b""",3,2
"""b""",4,3


In [11]:
(
    df_list
    .explode("values")
    .with_columns(
        pl.col("values").rank().over("id").cast(pl.Int32).alias("rank")
    )
    .group_by("id")
    .agg(
        pl.col("values"),
        pl.col("rank")
    )
)

id,values,rank
str,list[i64],list[i32]
"""b""","[2, 3, 4]","[1, 2, 3]"
"""a""","[0, 1]","[1, 2]"


In [12]:
df_list_extra_column = pl.DataFrame(
    {
        "id": ["a", "b"],
        "id2": ["c", "d"],
        "values": [
            [0, 1],
            [2, 3, 4]
        ]
    }
)
df_list_extra_column

id,id2,values
str,str,list[i64]
"""a""","""c""","[0, 1]"
"""b""","""d""","[2, 3, 4]"


In [13]:
(
    df_list_extra_column
    .join(
        df_list_extra_column.pipe(
            lambda df: (
                df
                .select("id", "values")
                .explode("values")
                .with_columns(
                    pl.col("values").rank().over("id").cast(pl.Int32).alias("rank")
                )
                .group_by("id")
                .agg(
                    pl.col("rank")
                )
            )
        ),
        on="id",
        how="left"
    )
)

id,id2,values,rank
str,str,list[i64],list[i32]
"""a""","""c""","[0, 1]","[1, 2]"
"""b""","""d""","[2, 3, 4]","[1, 2, 3]"


In [14]:
df_list

id,values
str,list[i64]
"""a""","[0, 1]"
"""b""","[2, 3, 4]"


In [15]:
(
    df_list
    .with_columns(
        pl.col("values").list.to_struct().alias("value_struct")
    )
)

  pl.col("values").list.to_struct().alias("value_struct")


id,values,value_struct
str,list[i64],struct[2]
"""a""","[0, 1]","{0,1}"
"""b""","[2, 3, 4]","{2,3}"


In [16]:
(
    df_list
    .with_columns(
        pl.col("values").list.to_struct().alias("value_struct")
    )
    .unnest("value_struct")
)

  pl.col("values").list.to_struct().alias("value_struct")


id,values,field_0,field_1
str,list[i64],i64,i64
"""a""","[0, 1]",0,1
"""b""","[2, 3, 4]",2,3


In [17]:
df_embeddings = (
    pl.DataFrame(
        {
            "embeddings": [
                [0.0, 1],
                [2, 3],
                [4, 5]
            ]
        },
        strict=False
    )
)
df_embeddings

embeddings
list[f64]
"[0.0, 1.0]"
"[2.0, 3.0]"
"[4.0, 5.0]"


In [18]:
df_embeddings["embeddings"].to_numpy()

array([array([0., 1.]), array([2., 3.]), array([4., 5.])], dtype=object)

In [19]:
(
    df_embeddings["embeddings"]
    .explode()
    .to_numpy()
    .reshape(len(df_embeddings), -1)
)

array([[0., 1.],
       [2., 3.],
       [4., 5.]])

In [20]:
(
    df_embeddings
    .with_columns(
        pl.col("embeddings").cast(pl.Array(width=2, inner=pl.Float64))
    )
    ["embeddings"].to_numpy()
)

(Deprecated in version 0.20.31)
  pl.col("embeddings").cast(pl.Array(width=2, inner=pl.Float64))


array([[0., 1.],
       [2., 3.],
       [4., 5.]])

In [21]:
df = (
    pl.DataFrame(
        {
            "values": [
                [0, 1],
                [2, 3, 4],
                [4, 5, 6, 7, 8]
            ]
        }
    )
)
df

values
list[i64]
"[0, 1]"
"[2, 3, 4]"
"[4, 5, 6, 7, 8]"


In [22]:
(
    df
    .with_columns(
        pl.col("values").list.first().alias("first"),
        pl.col("values").list.last().alias("last"),
        pl.col("values").list.head(2).alias("head"),
        pl.col("values").list.tail(2).alias("tail"),
        pl.col("values").list.slice(1, 2).alias("slice")
    )
)

values,first,last,head,tail,slice
list[i64],i64,i64,list[i64],list[i64],list[i64]
"[0, 1]",0,1,"[0, 1]","[0, 1]",[1]
"[2, 3, 4]",2,4,"[2, 3]","[3, 4]","[3, 4]"
"[4, 5, 6, 7, 8]",4,8,"[4, 5]","[7, 8]","[5, 6]"


In [23]:
(
    df
    .with_columns(
        pl.col("values").list.get(0).alias("first"),
        pl.col("values").list.get(-1).alias("last")
    )
)

values,first,last
list[i64],i64,i64
"[0, 1]",0,1
"[2, 3, 4]",2,4
"[4, 5, 6, 7, 8]",4,8


In [24]:
(
    df
    .with_columns(
        pl.col("values").list.reverse().alias("reverse"),
        pl.col("values").list.sort().alias("sort"),
        pl.col("values").list.shift(1).alias("shift")
    )
)

values,reverse,sort,shift
list[i64],list[i64],list[i64],list[i64]
"[0, 1]","[1, 0]","[0, 1]","[null, 0]"
"[2, 3, 4]","[4, 3, 2]","[2, 3, 4]","[null, 2, 3]"
"[4, 5, 6, 7, 8]","[8, 7, 6, 5, 4]","[4, 5, 6, 7, 8]","[null, 4, 5, 6, 7]"


In [25]:
(
    df
    .with_columns(
        [
            pl.col("values").list.contains(i).alias(str(i))
            for i
            in [0, 1, 2]
        ] 
    )
)

values,0,1,2
list[i64],bool,bool,bool
"[0, 1]",True,True,False
"[2, 3, 4]",False,False,True
"[4, 5, 6, 7, 8]",False,False,False


In [26]:
(
    df
    .with_columns(
        four = pl.lit(4)
    )
    .with_columns(
        pl.col("values").list.contains(pl.col("four")).alias("has_four")
    )
)

values,four,has_four
list[i64],i32,bool
"[0, 1]",4,False
"[2, 3, 4]",4,True
"[4, 5, 6, 7, 8]",4,True


In [27]:
df_set = (
    pl.DataFrame(
        {
            "values": [
                [0, 1, 0],
                [2, 3],
                [4, 5, 6, 7, 8]
            ],
            "values_2": [
                [0],
                [2, 3, 4],
                [4, 5, 9]
            ]
        }
    )
)
df_set

values,values_2
list[i64],list[i64]
"[0, 1, 0]",[0]
"[2, 3]","[2, 3, 4]"
"[4, 5, 6, 7, 8]","[4, 5, 9]"


In [28]:
(
    df_set
    .select(
        "values",
        pl.col("values").list.unique().alias("unique")
    )
)

values,unique
list[i64],list[i64]
"[0, 1, 0]","[0, 1]"
"[2, 3]","[2, 3]"
"[4, 5, 6, 7, 8]","[4, 5, 6, 7, 8]"


In [29]:
pl.Config.set_fmt_table_cell_list_len(6)

polars.config.Config

In [30]:
(
    df_set
    .with_columns(
        pl.col("values").list.set_intersection(pl.col("values_2")).alias("intersection")
    )
)

values,values_2,intersection
list[i64],list[i64],list[i64]
"[0, 1, 0]",[0],[0]
"[2, 3]","[2, 3, 4]","[2, 3]"
"[4, 5, 6, 7, 8]","[4, 5, 9]","[4, 5]"


In [31]:
(
    df_set
    .with_columns(
        pl.col("values").list.set_difference(pl.col("values_2")).alias("difference"),
        pl.col("values").list.set_symmetric_difference(pl.col("values_2")).alias("symmetric_difference")
    )
)

values,values_2,difference,symmetric_difference
list[i64],list[i64],list[i64],list[i64]
"[0, 1, 0]",[0],[1],[1]
"[2, 3]","[2, 3, 4]",[],[4]
"[4, 5, 6, 7, 8]","[4, 5, 9]","[8, 7, 6]","[6, 7, 8, 9]"


In [32]:
(
    df
    .with_columns(
        pl.col("values").list.len().alias("length"),
        pl.col("values").list.mean().alias("mean")
    )
)

values,length,mean
list[i64],u32,f64
"[0, 1]",2,0.5
"[2, 3, 4]",3,3.0
"[4, 5, 6, 7, 8]",5,6.0


In [33]:
df_eval = pl.DataFrame(
    {
        "values": [
            [0, 1],
            [4, 3, 2]
        ]
    }
)
df_eval

values
list[i64]
"[0, 1]"
"[4, 3, 2]"


In [34]:
(
    df_eval
    .with_columns(
        pl.col("values").list.eval(
            pl.element().rank()
        ).alias("eval")
    )
)

values,eval
list[i64],list[f64]
"[0, 1]","[1.0, 2.0]"
"[4, 3, 2]","[3.0, 2.0, 1.0]"
