## Setup
```
pip install polars
```

## Videos
- [Polars: Blazingly Fast DataFrames in Rust and Python by Ritchie Vink at Databricks Data+AI Summit 2022](https://youtu.be/kVy3-gMdViM)
- [Why Polars by Ritchie Vink at PyData Global 2021](https://youtu.be/iwGIuGk5nCE)  

## Blogs
- [Polars — A DataFrame library faster than pandas](https://medium.com/@pyzone.dev/polars-a-dataframe-library-faster-than-pandas-c1267315af0e) 2022-12-16
- [Using the Polars DataFrame Library](https://www.codemag.com/Article/2212051/Using-the-Polars-DataFrame-Library) 2022-11-10
- [Lightning-fast queries with Polars](https://www.orchest.io/blog/the-great-python-dataframe-showdown-part-3-lightning-fast-queries-with-polars) 2022-05-25
- [3x times faster Pandas with PyPolars](https://towardsdatascience.com/3x-times-faster-pandas-with-pypolars-7550e605805e) 2021-05-01

## References
- [Polars User Guide-Intro](https://pola-rs.github.io/polars-book/user-guide/notebooks/introduction_polars-py.html)

- [Polars quick exploration guide](https://pola-rs.github.io/polars-book/user-guide/quickstart/quick-exploration-guide.html#installation-and-import)



In [1]:
import polars as pl

# to enrich the examples in this quickstart with dates
from datetime import datetime, timedelta 
# to generate data for the examples
import numpy as np 

 ## create a Series 
 
 by providing a list or a tuple.

In [3]:
# with a list (a tuple such as (1, 2, 3, 4, 5) works the same way);
# the first argument, "a", is the name of the Series
series = pl.Series("a", [1, 2, 3, 4, 5])

series

a
i64
1
2
3
4
5


## Create DataFrame 
 from a dict or a collection of dicts.

In [6]:
# Build a small frame from a dict that maps each column name to its list of
# values; every column ends up with a single dtype (int, datetime, float, str).
df1 = pl.DataFrame(
    {
        "integer": [1, 2, 3],
        "date": [datetime(2022, 1, day) for day in (1, 2, 3)],
        "float": [4.0, 5.0, 6.0],
        "utf8": ["Hello", "Polars", "you are fast"],
    }
)

df1

integer,date,float,utf8
i64,datetime[μs],f64,str
1,2022-01-01 00:00:00,4.0,"""Hello"""
2,2022-01-02 00:00:00,5.0,"""Polars"""
3,2022-01-03 00:00:00,6.0,"""you are fast"""


In [7]:
# print() renders the frame as a plain-text table, preceded by its shape (rows, columns)
print(df1)

shape: (3, 4)
┌─────────┬─────────────────────┬───────┬──────────────┐
│ integer ┆ date                ┆ float ┆ utf8         │
│ ---     ┆ ---                 ┆ ---   ┆ ---          │
│ i64     ┆ datetime[μs]        ┆ f64   ┆ str          │
╞═════════╪═════════════════════╪═══════╪══════════════╡
│ 1       ┆ 2022-01-01 00:00:00 ┆ 4.0   ┆ Hello        │
│ 2       ┆ 2022-01-02 00:00:00 ┆ 5.0   ┆ Polars       │
│ 3       ┆ 2022-01-03 00:00:00 ┆ 6.0   ┆ you are fast │
└─────────┴─────────────────────┴───────┴──────────────┘


## IO read/write files: csv, json, parquet

In [8]:
# write the frame to a CSV file (relative path, so it lands in the current working directory)
df1.write_csv('df1.csv')

In [10]:
# Show the CSV that was just written. Reading it from Python works on every
# operating system, whereas the `type` shell command exists only on Windows.
with open("df1.csv", encoding="utf-8") as fh:
    print(fh.read())

integer,date,float,utf8
1,2022-01-01T00:00:00.000000,4.0,Hello
2,2022-01-02T00:00:00.000000,5.0,Polars
3,2022-01-03T00:00:00.000000,6.0,you are fast


In [12]:
# write the frame to a JSON file; the file stores each column's name, datatype and values
df1.write_json('df1.json')

In [13]:
# Show the JSON that was just written. Reading it from Python works on every
# operating system, whereas the `type` shell command exists only on Windows.
with open("df1.json", encoding="utf-8") as fh:
    print(fh.read())

{"columns":[{"name":"integer","datatype":"Int64","values":[1,2,3]},{"name":"date","datatype":{"Datetime":["Microseconds",null]},"values":[1640995200000000,1641081600000000,1641168000000000]},{"name":"float","datatype":"Float64","values":[4.0,5.0,6.0]},{"name":"utf8","datatype":"Utf8","values":["Hello","Polars","you are fast"]}]}


In [14]:
# write the frame to a Parquet file (a binary, typed, columnar format)
df1.write_parquet('df1.parquet')

In [15]:
# List the three files written above with their sizes.
# NOTE(review): `dir` is a Windows shell command; on macOS/Linux use `!ls -l df1.*` instead.
!dir df1.*

 Volume in drive C has no label.
 Volume Serial Number is 8A04-0EBD

 Directory of C:\Users\w_gon\projects\wgong\py4kids\lesson-17-polars

01/14/2023  05:40 PM               149 df1.csv
01/14/2023  05:42 PM               330 df1.json
01/14/2023  05:43 PM             1,138 df1.parquet
               3 File(s)          1,617 bytes
               0 Dir(s)  328,747,536,384 bytes free


In [16]:
# read the JSON file written earlier back into a new frame
df1_ = pl.read_json("df1.json")

In [17]:
# display the frame read back from JSON; columns, dtypes and values match df1
df1_

integer,date,float,utf8
i64,datetime[μs],f64,str
1,2022-01-01 00:00:00,4.0,"""Hello"""
2,2022-01-02 00:00:00,5.0,"""Polars"""
3,2022-01-03 00:00:00,6.0,"""you are fast"""


In [18]:
# Build an N-row demo frame used by the following examples:
#   a: consecutive integers 0 .. N-1
#   b: random floats in [0, 1) -- not seeded, so the values differ on every run
#   c: consecutive days starting at 2022-12-01
#   d: floats containing both NaN ("not a number") and None (a missing value,
#      shown as null); polars treats these as two different things
N = 8
df2 = pl.DataFrame({
    "a": np.arange(0, N),
    "b": np.random.rand(N),
    "c": [datetime(2022, 12, 1) + timedelta(days=idx) for idx in range(N)],
    # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
    "d": [1, 2.0, np.nan, np.nan, 0, -5, -42, None],
})

print(df2)

shape: (8, 4)
┌─────┬──────────┬─────────────────────┬───────┐
│ a   ┆ b        ┆ c                   ┆ d     │
│ --- ┆ ---      ┆ ---                 ┆ ---   │
│ i32 ┆ f64      ┆ datetime[μs]        ┆ f64   │
╞═════╪══════════╪═════════════════════╪═══════╡
│ 0   ┆ 0.766023 ┆ 2022-12-01 00:00:00 ┆ 1.0   │
│ 1   ┆ 0.502188 ┆ 2022-12-02 00:00:00 ┆ 2.0   │
│ 2   ┆ 0.363644 ┆ 2022-12-03 00:00:00 ┆ NaN   │
│ 3   ┆ 0.347956 ┆ 2022-12-04 00:00:00 ┆ NaN   │
│ 4   ┆ 0.814063 ┆ 2022-12-05 00:00:00 ┆ 0.0   │
│ 5   ┆ 0.833931 ┆ 2022-12-06 00:00:00 ┆ -5.0  │
│ 6   ┆ 0.046048 ┆ 2022-12-07 00:00:00 ┆ -42.0 │
│ 7   ┆ 0.207825 ┆ 2022-12-08 00:00:00 ┆ null  │
└─────┴──────────┴─────────────────────┴───────┘


In [21]:
df2.row(4)  # 5th row (the index is zero-based); the row comes back as a plain tuple

(4, 0.8140632812981042, datetime.datetime(2022, 12, 5, 0, 0), 0.0)

In [25]:
df2[:3]     # 1st 3 rows, using ordinary Python slice syntax; the result is again a DataFrame

a,b,c,d
i32,f64,datetime[μs],f64
0,0.766023,2022-12-01 00:00:00,1.0
1,0.502188,2022-12-02 00:00:00,2.0
2,0.363644,2022-12-03 00:00:00,


In [26]:
# head(3) = first 3 rows, tail(3) = last 3 rows, sample(3) = 3 sampled rows.
# Separating them with commas makes the cell return a tuple of three frames,
# which is why the output below is shown as plain text inside parentheses.
df2.head(3), df2.tail(3), df2.sample(3)

(shape: (3, 4)
 ┌─────┬──────────┬─────────────────────┬─────┐
 │ a   ┆ b        ┆ c                   ┆ d   │
 │ --- ┆ ---      ┆ ---                 ┆ --- │
 │ i32 ┆ f64      ┆ datetime[μs]        ┆ f64 │
 ╞═════╪══════════╪═════════════════════╪═════╡
 │ 0   ┆ 0.766023 ┆ 2022-12-01 00:00:00 ┆ 1.0 │
 │ 1   ┆ 0.502188 ┆ 2022-12-02 00:00:00 ┆ 2.0 │
 │ 2   ┆ 0.363644 ┆ 2022-12-03 00:00:00 ┆ NaN │
 └─────┴──────────┴─────────────────────┴─────┘,
 shape: (3, 4)
 ┌─────┬──────────┬─────────────────────┬───────┐
 │ a   ┆ b        ┆ c                   ┆ d     │
 │ --- ┆ ---      ┆ ---                 ┆ ---   │
 │ i32 ┆ f64      ┆ datetime[μs]        ┆ f64   │
 ╞═════╪══════════╪═════════════════════╪═══════╡
 │ 5   ┆ 0.833931 ┆ 2022-12-06 00:00:00 ┆ -5.0  │
 │ 6   ┆ 0.046048 ┆ 2022-12-07 00:00:00 ┆ -42.0 │
 │ 7   ┆ 0.207825 ┆ 2022-12-08 00:00:00 ┆ null  │
 └─────┴──────────┴─────────────────────┴───────┘,
 shape: (3, 4)
 ┌─────┬──────────┬─────────────────────┬─────┐
 │ a   ┆ b        ┆ c  

In [27]:
# summary statistics (count, null_count, mean, std, min, max, median) for every column
df2.describe()

describe,a,b,c,d
str,f64,f64,str,f64
"""count""",8.0,8.0,"""8""",8.0
"""null_count""",0.0,0.0,"""0""",1.0
"""mean""",3.5,0.48521,,
"""std""",2.44949,0.29576,,
"""min""",0.0,0.046048,"""2022-12-01 00:...",-42.0
"""max""",7.0,0.833931,"""2022-12-08 00:...",2.0
"""median""",3.5,0.432916,,1.0


## Expression

 Expressions are the core building block of Polars: they offer a versatile structure that solves easy queries, yet is easily extended to complex analyses. They are used in these contexts:
 
 - select  (column-wise)
 - filter  (row-wise)
 - with_columns (add column)
 - groupby

In [28]:
df2.select(pl.col("*"))   # "*" is a wildcard that matches all columns

a,b,c,d
i32,f64,datetime[μs],f64
0,0.766023,2022-12-01 00:00:00,1.0
1,0.502188,2022-12-02 00:00:00,2.0
2,0.363644,2022-12-03 00:00:00,
3,0.347956,2022-12-04 00:00:00,
4,0.814063,2022-12-05 00:00:00,0.0
5,0.833931,2022-12-06 00:00:00,-5.0
6,0.046048,2022-12-07 00:00:00,-42.0
7,0.207825,2022-12-08 00:00:00,


In [29]:
# pl.all() also selects every column -- same result as pl.col("*") above
df2.select(pl.all())

a,b,c,d
i32,f64,datetime[μs],f64
0,0.766023,2022-12-01 00:00:00,1.0
1,0.502188,2022-12-02 00:00:00,2.0
2,0.363644,2022-12-03 00:00:00,
3,0.347956,2022-12-04 00:00:00,
4,0.814063,2022-12-05 00:00:00,0.0
5,0.833931,2022-12-06 00:00:00,-5.0
6,0.046048,2022-12-07 00:00:00,-42.0
7,0.207825,2022-12-08 00:00:00,


In [31]:
# select a subset of columns by listing their names
df2.select(["a","b","c"])

a,b,c
i32,f64,datetime[μs]
0,0.766023,2022-12-01 00:00:00
1,0.502188,2022-12-02 00:00:00
2,0.363644,2022-12-03 00:00:00
3,0.347956,2022-12-04 00:00:00
4,0.814063,2022-12-05 00:00:00
5,0.833931,2022-12-06 00:00:00
6,0.046048,2022-12-07 00:00:00
7,0.207825,2022-12-08 00:00:00


In [35]:
# Steps can be chained: pick three columns, keep the rows where a < 4,
# then select everything except column b.
(
    df2.select(["a", "b", "c"])
    .filter(pl.col("a") < 4)
    .select(pl.exclude("b"))
)

a,c
i32,datetime[μs]
0,2022-12-01 00:00:00
1,2022-12-02 00:00:00
2,2022-12-03 00:00:00
3,2022-12-04 00:00:00


In [36]:
# Keep the rows whose date in column c lies between 2022-12-02 and 2022-12-05.
# NOTE(review): in the output recorded below both end dates are excluded (only
# 12-03 and 12-04 remain); whether the bounds are included by default appears
# to depend on the polars version -- confirm against the installed release.
df2.filter(
    pl.col("c").is_between(datetime(2022, 12, 2), datetime(2022, 12, 5)),
)



a,b,c,d
i32,f64,datetime[μs],f64
2,0.363644,2022-12-03 00:00:00,
3,0.347956,2022-12-04 00:00:00,


In [37]:
# Combine two conditions with & (logical AND): a <= 3 and d is not NaN.
# Each condition needs its own parentheses because & binds more tightly
# than comparison operators such as <=.
df2.filter(
    (pl.col('a') <= 3) & (pl.col('d').is_not_nan())
)

a,b,c,d
i32,f64,datetime[μs],f64
0,0.766023,2022-12-01 00:00:00,1.0
1,0.502188,2022-12-02 00:00:00,2.0


In [39]:
# Rows where d is null (a missing value). The NaN rows are not returned:
# NaN is a float value, not a missing one.
df2.filter(pl.col("d").is_null())

a,b,c,d
i32,f64,datetime[μs],f64
7,0.207825,2022-12-08 00:00:00,


In [48]:
# Create a new column "a * b" holding the row-wise product of columns `a` and `b`,
# then select all the columns except `c` and `d` for the final DataFrame.
# `with_columns` (plural) is used here, as in the other cells of this notebook:
# the singular `with_column` was deprecated and later removed from polars.
df_x = df2.with_columns([
    (pl.col("a") * pl.col("b")).alias("a * b"),
]).select([
    pl.all().exclude(["c", "d"]),
])

print(df_x)


shape: (8, 3)
┌─────┬──────────┬──────────┐
│ a   ┆ b        ┆ a * b    │
│ --- ┆ ---      ┆ ---      │
│ i32 ┆ f64      ┆ f64      │
╞═════╪══════════╪══════════╡
│ 0   ┆ 0.766023 ┆ 0.0      │
│ 1   ┆ 0.502188 ┆ 0.502188 │
│ 2   ┆ 0.363644 ┆ 0.727287 │
│ 3   ┆ 0.347956 ┆ 1.043869 │
│ 4   ┆ 0.814063 ┆ 3.256253 │
│ 5   ┆ 0.833931 ┆ 4.169655 │
│ 6   ┆ 0.046048 ┆ 0.276287 │
│ 7   ┆ 0.207825 ┆ 1.454774 │
└─────┴──────────┴──────────┘


In [43]:
# Add four columns derived from b in one call. Aggregates such as sum() and
# mean() produce a single value, which is repeated on every row; alias() gives
# each new column its name.
df2.with_columns([
    pl.col('b').sum().alias('b_sum'),
    (pl.col('b') + 2).alias('b+2'),
    pl.col('b').mean().alias('b_avg'),
    # deviation of each value from the column mean
    (pl.col('b') - pl.col("b").mean()).alias("b-avg")
])

a,b,c,d,b_sum,b+2,b_avg,b-avg
i32,f64,datetime[μs],f64,f64,f64,f64,f64
0,0.766023,2022-12-01 00:00:00,1.0,3.881678,2.766023,0.48521,0.280813
1,0.502188,2022-12-02 00:00:00,2.0,3.881678,2.502188,0.48521,0.016978
2,0.363644,2022-12-03 00:00:00,,3.881678,2.363644,0.48521,-0.121566
3,0.347956,2022-12-04 00:00:00,,3.881678,2.347956,0.48521,-0.137253
4,0.814063,2022-12-05 00:00:00,0.0,3.881678,2.814063,0.48521,0.328854
5,0.833931,2022-12-06 00:00:00,-5.0,3.881678,2.833931,0.48521,0.348721
6,0.046048,2022-12-07 00:00:00,-42.0,3.881678,2.046048,0.48521,-0.439162
7,0.207825,2022-12-08 00:00:00,,3.881678,2.207825,0.48521,-0.277385


In [44]:
# Small frame for the group-by examples: x counts 0..7, y is the group label.
df = pl.DataFrame(
    {
        "x": np.arange(0, 8),
        "y": ["A", "A", "A", "B", "B", "C", "X", "X"],
    }
)

print(df)

shape: (8, 2)
┌─────┬─────┐
│ x   ┆ y   │
│ --- ┆ --- │
│ i32 ┆ str │
╞═════╪═════╡
│ 0   ┆ A   │
│ 1   ┆ A   │
│ 2   ┆ A   │
│ 3   ┆ B   │
│ 4   ┆ B   │
│ 5   ┆ C   │
│ 6   ┆ X   │
│ 7   ┆ X   │
└─────┴─────┘


In [45]:
# Count the rows in each group of y.
# maintain_order=True keeps the groups in order of first appearance (A, B, C, X);
# without maintain_order you will get a random order back.
# NOTE(review): newer polars releases spell this method `group_by` -- confirm
# against the installed version before renaming.
df.groupby("y", maintain_order=True).count()

y,count
str,u32
"""A""",3
"""B""",2
"""C""",1
"""X""",2


In [46]:
# Several aggregations per group in one agg() call: the row count and the sum.
# Presumably pl.col("*") expands to the non-key columns, which is only x here
# (the recorded sums 3, 7, 5, 13 are the per-group sums of x).
df.groupby("y", maintain_order=True).agg([
    pl.col("*").count().alias("count"),
    pl.col("*").sum().alias("sum")
])


y,count,sum
str,u32,i32
"""A""",3,3
"""B""",2,7
"""C""",1,5
"""X""",2,13


### Combining Dataframes

In [54]:
# Two frames with the same number of rows, used to demonstrate join and concat.
# df_1: numeric/date columns, keyed by a (0..7); d mixes NaN and a missing value.
df_1 = pl.DataFrame({
    "a": np.arange(0, 8),
    "b": np.random.rand(8),  # not seeded, so the values differ on every run
    "c": [datetime(2022, 12, 1) + timedelta(days=idx) for idx in range(8)],
    # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
    "d": [1, 2.0, np.nan, np.nan, 0, -5, -42, None],
})
print(df_1)

# df_2: a label y for every key x (0..7)
df_2 = pl.DataFrame({
    "x": np.arange(0, 8),
    "y": ['A', 'A', 'A', 'B', 'B', 'C', 'X', 'X'],
})
print(df_2)


shape: (8, 4)
┌─────┬──────────┬─────────────────────┬───────┐
│ a   ┆ b        ┆ c                   ┆ d     │
│ --- ┆ ---      ┆ ---                 ┆ ---   │
│ i32 ┆ f64      ┆ datetime[μs]        ┆ f64   │
╞═════╪══════════╪═════════════════════╪═══════╡
│ 0   ┆ 0.842672 ┆ 2022-12-01 00:00:00 ┆ 1.0   │
│ 1   ┆ 0.602538 ┆ 2022-12-02 00:00:00 ┆ 2.0   │
│ 2   ┆ 0.23952  ┆ 2022-12-03 00:00:00 ┆ NaN   │
│ 3   ┆ 0.479529 ┆ 2022-12-04 00:00:00 ┆ NaN   │
│ 4   ┆ 0.030341 ┆ 2022-12-05 00:00:00 ┆ 0.0   │
│ 5   ┆ 0.372705 ┆ 2022-12-06 00:00:00 ┆ -5.0  │
│ 6   ┆ 0.61734  ┆ 2022-12-07 00:00:00 ┆ -42.0 │
│ 7   ┆ 0.783109 ┆ 2022-12-08 00:00:00 ┆ null  │
└─────┴──────────┴─────────────────────┴───────┘
shape: (8, 2)
┌─────┬─────┐
│ x   ┆ y   │
│ --- ┆ --- │
│ i32 ┆ str │
╞═════╪═════╡
│ 0   ┆ A   │
│ 1   ┆ A   │
│ 2   ┆ A   │
│ 3   ┆ B   │
│ 4   ┆ B   │
│ 5   ┆ C   │
│ 6   ┆ X   │
│ 7   ┆ X   │
└─────┴─────┘


In [55]:
# Join: match each row of df_1 with the row of df_2 where df_1.a equals df_2.x.
# The result keeps column a and adds y; the right-hand key column x is dropped.
df_1a = df_1.join(df_2, left_on="a", right_on="x")
df_1a

a,b,c,d,y
i32,f64,datetime[μs],f64,str
0,0.842672,2022-12-01 00:00:00,1.0,"""A"""
1,0.602538,2022-12-02 00:00:00,2.0,"""A"""
2,0.23952,2022-12-03 00:00:00,,"""A"""
3,0.479529,2022-12-04 00:00:00,,"""B"""
4,0.030341,2022-12-05 00:00:00,0.0,"""B"""
5,0.372705,2022-12-06 00:00:00,-5.0,"""C"""
6,0.61734,2022-12-07 00:00:00,-42.0,"""X"""
7,0.783109,2022-12-08 00:00:00,,"""X"""


In [58]:
# Horizontal concat: place the columns of df_2 to the right of df_1's columns,
# pairing rows by position (no key matching, so column x is kept).
df_12 = pl.concat([df_1,df_2], how="horizontal")
df_12

a,b,c,d,x,y
i32,f64,datetime[μs],f64,i32,str
0,0.842672,2022-12-01 00:00:00,1.0,0,"""A"""
1,0.602538,2022-12-02 00:00:00,2.0,1,"""A"""
2,0.23952,2022-12-03 00:00:00,,2,"""A"""
3,0.479529,2022-12-04 00:00:00,,3,"""B"""
4,0.030341,2022-12-05 00:00:00,0.0,4,"""B"""
5,0.372705,2022-12-06 00:00:00,-5.0,5,"""C"""
6,0.61734,2022-12-07 00:00:00,-42.0,6,"""X"""
7,0.783109,2022-12-08 00:00:00,,7,"""X"""


In [71]:
# first frame for the row-wise concat example: a = 10..17, b = 8 random floats
df_1_1 = pl.DataFrame({"a": [10 + i for i in np.arange(0, 8)], 
                   "b": np.random.rand(8) })

In [72]:
# second frame with the same column names: a = 100..103, b = 4 random floats
df_1_2 = pl.DataFrame({"a": [100 + i for i in np.arange(0, 4)], 
                   "b": np.random.rand(4)})

In [73]:
# show both frames; the comma makes the cell return a tuple, so they print as text
df_1_1, df_1_2      

(shape: (8, 2)
 ┌─────┬──────────┐
 │ a   ┆ b        │
 │ --- ┆ ---      │
 │ i64 ┆ f64      │
 ╞═════╪══════════╡
 │ 10  ┆ 0.132184 │
 │ 11  ┆ 0.594802 │
 │ 12  ┆ 0.401882 │
 │ 13  ┆ 0.226948 │
 │ 14  ┆ 0.231915 │
 │ 15  ┆ 0.641314 │
 │ 16  ┆ 0.796666 │
 │ 17  ┆ 0.461065 │
 └─────┴──────────┘,
 shape: (4, 2)
 ┌─────┬──────────┐
 │ a   ┆ b        │
 │ --- ┆ ---      │
 │ i64 ┆ f64      │
 ╞═════╪══════════╡
 │ 100 ┆ 0.845527 │
 │ 101 ┆ 0.444925 │
 │ 102 ┆ 0.566275 │
 │ 103 ┆ 0.796073 │
 └─────┴──────────┘)

In [74]:
# With no `how` argument, concat stacks the frames on top of each other:
# the rows of df_1_2 follow the rows of df_1_1 (the column names must match).
df_1_12 = pl.concat([df_1_1, df_1_2])
df_1_12

a,b
i64,f64
10,0.132184
11,0.594802
12,0.401882
13,0.226948
14,0.231915
15,0.641314
16,0.796666
17,0.461065
100,0.845527
101,0.444925


## Processing missing data

- https://pola-rs.github.io/polars-book/user-guide/howcani/missing_data.html

## [Numpy universal functions](https://pola-rs.github.io/polars-book/user-guide/dsl/numpy.html)

In [1]:
import polars as pl
import numpy as np

# NOTE: this rebinds `df`, which earlier in the notebook held the group-by example frame.
df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

# NumPy universal functions can be applied directly to polars expressions:
# np.log(pl.all()) takes the natural logarithm of every column, and
# .suffix("_log") appends "_log" to each resulting column name.
df.select(
    [
        pl.all(),
        np.log(pl.all()).suffix("_log"),
    ]
)


a,b,a_log,b_log
i64,i64,f64,f64
1,4,0.0,1.386294
2,5,0.693147,1.609438
3,6,1.098612,1.791759
