In [2]:
import polars as pl
from datetime import datetime, timedelta
import pyarrow
import pyarrow.parquet as pq
import numpy as np

In [18]:
# Demo frame used by the cells that follow:
#   a - integers 0..7
#   b - uniform random floats (no seed is set here, so values change on every run)
#   c - eight consecutive days starting 2022-12-01
#   d - floats mixing NaN with a genuinely missing value (None becomes null)
df = pl.DataFrame({"a": np.arange(0, 8),
                   "b": np.random.rand(8),
                   "c": [datetime(2022, 12, 1) + timedelta(days=idx) for idx in range(8)],
                   # `np.nan`, not the `np.NaN` alias: NumPy 2.0 removed the alias.
                   "d": [1, 2.0, np.nan, np.nan, 0, -5, -42, None]
                  })
print(df)

shape: (8, 4)
┌─────┬──────────┬─────────────────────┬───────┐
│ a   ┆ b        ┆ c                   ┆ d     │
│ --- ┆ ---      ┆ ---                 ┆ ---   │
│ i32 ┆ f64      ┆ datetime[μs]        ┆ f64   │
╞═════╪══════════╪═════════════════════╪═══════╡
│ 0   ┆ 0.528318 ┆ 2022-12-01 00:00:00 ┆ 1.0   │
│ 1   ┆ 0.500866 ┆ 2022-12-02 00:00:00 ┆ 2.0   │
│ 2   ┆ 0.029333 ┆ 2022-12-03 00:00:00 ┆ NaN   │
│ 3   ┆ 0.699894 ┆ 2022-12-04 00:00:00 ┆ NaN   │
│ 4   ┆ 0.130199 ┆ 2022-12-05 00:00:00 ┆ 0.0   │
│ 5   ┆ 0.913011 ┆ 2022-12-06 00:00:00 ┆ -5.0  │
│ 6   ┆ 0.629037 ┆ 2022-12-07 00:00:00 ┆ -42.0 │
│ 7   ┆ 0.634939 ┆ 2022-12-08 00:00:00 ┆ null  │
└─────┴──────────┴─────────────────────┴───────┘


In [4]:
# Per-column summary statistics for `df` (count, null_count, mean, std, min, max, median).
df.describe()

describe,a,b,c,d
str,f64,f64,str,f64
"""count""",8.0,8.0,"""8""",8.0
"""null_count""",0.0,0.0,"""0""",1.0
"""mean""",3.5,0.405812,,
"""std""",2.44949,0.296809,,
"""min""",0.0,0.002587,"""2022-12-01 00:...",-42.0
"""max""",7.0,0.834992,"""2022-12-08 00:...",2.0
"""median""",3.5,0.369454,,1.0


In [26]:
# Build a lazy query: project columns `a` and `b`, then keep rows where a > 4.
# The query is materialised by `.collect()`.
slct_cols = ['a', 'b']
df2 = (
    df.lazy()
    .select(slct_cols)
    .filter(pl.col('a') > 4)
)
df2.collect()

a,b
i32,f64
5,0.913011
6,0.629037
7,0.634939


In [31]:
# Extend the lazy query with two derived columns, then materialise it:
#   e     - the sum of `b`, repeated on every row
#   b+42  - `b` shifted up by 42
b_total = pl.col('b').sum().alias('e')
b_plus_42 = (pl.col('b') + 42).alias('b+42')
df3 = df2.with_columns([b_total, b_plus_42])
df3.collect()

a,b,e,b+42
i32,f64,f64,f64
5,0.913011,2.176988,42.913011
6,0.629037,2.176988,42.629037
7,0.634939,2.176988,42.634939


##### Group By

In [34]:
# Small frame for the group-by demo: values 0..7 in `x`, a group label in `y`.
# Note: this rebinds `df2`, which the cells above used for the lazy query.
df2 = pl.DataFrame({"x": np.arange(8), "y": list("AAABBCXX")})

In [41]:
# Aggregate per `y` group, keeping groups in first-seen order.
# `pl.all()` selects every column other than the group key, which here is only `x`.
df2.groupby("y", maintain_order=True).agg(
    [
        pl.all().count().alias("count"),
        pl.all().sum().alias("sum"),
        pl.all().min().alias("min"),
        pl.all().max().alias("max"),
    ]
)

y,count,sum,min,max
str,u32,i32,i32,i32
"""A""",3,3,0,2
"""B""",2,7,3,4
"""C""",1,5,5,5
"""X""",2,13,6,7


##### Combining

In [43]:
# Step 1: add a column "a * b" holding the element-wise product of `a` and `b`.
# Step 2: keep every column except `c` and `d`.
product = (pl.col("a") * pl.col("b")).alias("a * b")
kept_cols = pl.all().exclude(['c', 'd'])

df_x = df.with_columns(product).select([kept_cols])

print(df_x)

shape: (8, 3)
┌─────┬──────────┬──────────┐
│ a   ┆ b        ┆ a * b    │
│ --- ┆ ---      ┆ ---      │
│ i32 ┆ f64      ┆ f64      │
╞═════╪══════════╪══════════╡
│ 0   ┆ 0.528318 ┆ 0.0      │
│ 1   ┆ 0.500866 ┆ 0.500866 │
│ 2   ┆ 0.029333 ┆ 0.058666 │
│ 3   ┆ 0.699894 ┆ 2.099682 │
│ 4   ┆ 0.130199 ┆ 0.520797 │
│ 5   ┆ 0.913011 ┆ 4.565056 │
│ 6   ┆ 0.629037 ┆ 3.774222 │
│ 7   ┆ 0.634939 ┆ 4.444576 │
└─────┴──────────┴──────────┘


##### JOIN and CONCAT

In [58]:
# Frames for the join/concat demos. `df` is rebuilt with 8 rows (fresh random `b`),
# while `df2` has only 7 rows, so the two frames differ in length.
df = pl.DataFrame({"a": np.arange(0, 8),
                   "b": np.random.rand(8),
                   "c": [datetime(2022, 12, 1) + timedelta(days=idx) for idx in range(8)],
                   # `np.nan`, not the `np.NaN` alias: NumPy 2.0 removed the alias.
                   "d": [1, 2.0, np.nan, np.nan, 0, -5, -42, None]
                  })

df2 = pl.DataFrame({
                    "x": np.arange(0, 7),
                    "y": ['A', 'A', 'A', 'B', 'B', 'C', 'X'],
})

In [59]:
# Inner join (the default, spelled out here): match `df.a` against `df2.x`.
# Rows of `df` with no partner in `df2` are dropped.
df.join(
    df2,
    left_on="a",
    right_on="x",
    how="inner",
)

a,b,c,d,y
i32,f64,datetime[μs],f64,str
0,0.749741,2022-12-01 00:00:00,1.0,"""A"""
1,0.564917,2022-12-02 00:00:00,2.0,"""A"""
2,0.729704,2022-12-03 00:00:00,,"""A"""
3,0.081248,2022-12-04 00:00:00,,"""B"""
4,0.313154,2022-12-05 00:00:00,0.0,"""B"""
5,0.678713,2022-12-06 00:00:00,-5.0,"""C"""
6,0.463344,2022-12-07 00:00:00,-42.0,"""X"""


In [60]:
# Place the columns of `df` and `df2` side by side. `df` has 8 rows and `df2` has 7;
# in the cell output the last row's `x` and `y` are empty.
pl.concat([df,df2], how="horizontal")
# Disabled alternative: stack the frames vertically after renaming/casting `df`'s
# columns to `x` and `y`.
# pl.concat([df.select([pl.col('a').alias('x'),pl.col('b').cast(str).alias('y')]),df2], how="vertical")

a,b,c,d,x,y
i32,f64,datetime[μs],f64,i32,str
0,0.749741,2022-12-01 00:00:00,1.0,0.0,"""A"""
1,0.564917,2022-12-02 00:00:00,2.0,1.0,"""A"""
2,0.729704,2022-12-03 00:00:00,,2.0,"""A"""
3,0.081248,2022-12-04 00:00:00,,3.0,"""B"""
4,0.313154,2022-12-05 00:00:00,0.0,4.0,"""B"""
5,0.678713,2022-12-06 00:00:00,-5.0,5.0,"""C"""
6,0.463344,2022-12-07 00:00:00,-42.0,6.0,"""X"""
7,0.677681,2022-12-08 00:00:00,,,
