In [31]:
import datetime as dt
from pathlib import Path

import polars as pl

# Demo frame: one row per person with a string, a date, an integer and a float column.
df = pl.DataFrame(
    {
        "name": ["Vamshi", "Ajay", "Mahesh", "Yatish"],
        "birthdate": [
            dt.date(2003, 10, 14),
            dt.date(2003, 1, 1),
            dt.date(2003, 1, 1),
            dt.date(2003, 1, 1),
        ],
        "weight": [50, 60, 70, 80],
        "height": [6.1, 6.0, 5.9, 5.8],
    }
)

print(df)

# Round-trip the frame through CSV. The target directory is created up front so
# the write does not depend on "outputs/" already existing (e.g. on a fresh
# checkout or after Restart Kernel -> Run All).
csv_path = Path("outputs/output-basic.csv")
csv_path.parent.mkdir(parents=True, exist_ok=True)
df.write_csv(csv_path)

# try_parse_dates=True asks the CSV reader to try to infer date-like columns
# rather than leaving them as strings.
df_csv = pl.read_csv(csv_path, try_parse_dates=True)
print(df_csv)

shape: (4, 4)
┌────────┬────────────┬────────┬────────┐
│ name   ┆ birthdate  ┆ weight ┆ height │
│ ---    ┆ ---        ┆ ---    ┆ ---    │
│ str    ┆ date       ┆ i64    ┆ f64    │
╞════════╪════════════╪════════╪════════╡
│ Vamshi ┆ 2003-10-14 ┆ 50     ┆ 6.1    │
│ Ajay   ┆ 2003-01-01 ┆ 60     ┆ 6.0    │
│ Mahesh ┆ 2003-01-01 ┆ 70     ┆ 5.9    │
│ Yatish ┆ 2003-01-01 ┆ 80     ┆ 5.8    │
└────────┴────────────┴────────┴────────┘
shape: (4, 4)
┌────────┬────────────┬────────┬────────┐
│ name   ┆ birthdate  ┆ weight ┆ height │
│ ---    ┆ ---        ┆ ---    ┆ ---    │
│ str    ┆ date       ┆ i64    ┆ f64    │
╞════════╪════════════╪════════╪════════╡
│ Vamshi ┆ 2003-10-14 ┆ 50     ┆ 6.1    │
│ Ajay   ┆ 2003-01-01 ┆ 60     ┆ 6.0    │
│ Mahesh ┆ 2003-01-01 ┆ 70     ┆ 5.9    │
│ Yatish ┆ 2003-01-01 ┆ 80     ┆ 5.8    │
└────────┴────────────┴────────┴────────┘


In [32]:
# Project three columns: the name as-is, the year component of the birthdate,
# and weight divided by height squared (labelled "bmi").
# NOTE(review): heights around 6 yield bmi values below 3 in the output — the
# height unit looks inconsistent with a kg/m^2 BMI; confirm the intended units.
birth_year_expr = pl.col("birthdate").dt.year().alias("birth_year")
bmi_expr = (pl.col("weight") / (pl.col("height") ** 2)).alias("bmi")

result = df.select(pl.col("name"), birth_year_expr, bmi_expr)

print(result)

shape: (4, 3)
┌────────┬────────────┬──────────┐
│ name   ┆ birth_year ┆ bmi      │
│ ---    ┆ ---        ┆ ---      │
│ str    ┆ i32        ┆ f64      │
╞════════╪════════════╪══════════╡
│ Vamshi ┆ 2003       ┆ 1.343725 │
│ Ajay   ┆ 2003       ┆ 1.666667 │
│ Mahesh ┆ 2003       ┆ 2.010916 │
│ Yatish ┆ 2003       ┆ 2.378121 │
└────────┴────────────┴──────────┘


In [33]:
# Scale weight and height to 95% of their value, round to two decimals, and
# append "-5%" to each resulting column name.
reduced_expr = (pl.col("weight", "height") * 0.95).round(2).name.suffix("-5%")
result = df.select(pl.col("name"), reduced_expr)
print(result)

shape: (4, 3)
┌────────┬───────────┬───────────┐
│ name   ┆ weight-5% ┆ height-5% │
│ ---    ┆ ---       ┆ ---       │
│ str    ┆ f64       ┆ f64       │
╞════════╪═══════════╪═══════════╡
│ Vamshi ┆ 47.5      ┆ 5.79      │
│ Ajay   ┆ 57.0      ┆ 5.7       │
│ Mahesh ┆ 66.5      ┆ 5.61      │
│ Yatish ┆ 76.0      ┆ 5.51      │
└────────┴───────────┴───────────┘


In [34]:
# Keep every existing column and append the birth year extracted from the date.
result = df.with_columns(pl.col("birthdate").dt.year().alias("birth_year"))
print(result)

shape: (4, 5)
┌────────┬────────────┬────────┬────────┬────────────┐
│ name   ┆ birthdate  ┆ weight ┆ height ┆ birth_year │
│ ---    ┆ ---        ┆ ---    ┆ ---    ┆ ---        │
│ str    ┆ date       ┆ i64    ┆ f64    ┆ i32        │
╞════════╪════════════╪════════╪════════╪════════════╡
│ Vamshi ┆ 2003-10-14 ┆ 50     ┆ 6.1    ┆ 2003       │
│ Ajay   ┆ 2003-01-01 ┆ 60     ┆ 6.0    ┆ 2003       │
│ Mahesh ┆ 2003-01-01 ┆ 70     ┆ 5.9    ┆ 2003       │
│ Yatish ┆ 2003-01-01 ┆ 80     ┆ 5.8    ┆ 2003       │
└────────┴────────────┴────────┴────────┴────────────┘


In [35]:
# Keep only the rows whose birthdate falls in the year 2003.
born_in_2003 = pl.col("birthdate").dt.year() == 2003
result = df.filter(born_in_2003)
print(result)

shape: (4, 4)
┌────────┬────────────┬────────┬────────┐
│ name   ┆ birthdate  ┆ weight ┆ height │
│ ---    ┆ ---        ┆ ---    ┆ ---    │
│ str    ┆ date       ┆ i64    ┆ f64    │
╞════════╪════════════╪════════╪════════╡
│ Vamshi ┆ 2003-10-14 ┆ 50     ┆ 6.1    │
│ Ajay   ┆ 2003-01-01 ┆ 60     ┆ 6.0    │
│ Mahesh ┆ 2003-01-01 ┆ 70     ┆ 5.9    │
│ Yatish ┆ 2003-01-01 ┆ 80     ┆ 5.8    │
└────────┴────────────┴────────┴────────┘


In [36]:
# Bucket rows by decade (birth year floor-divided by 10, then multiplied by 10)
# and count the rows in each bucket, keeping first-seen group order.
decade_expr = (pl.col("birthdate").dt.year() // 10 * 10).alias("decade")
result = df.group_by(decade_expr, maintain_order=True).len()
print(result)

shape: (1, 2)
┌────────┬─────┐
│ decade ┆ len │
│ ---    ┆ --- │
│ i32    ┆ u32 │
╞════════╪═════╡
│ 2000   ┆ 4   │
└────────┴─────┘


In [37]:
# Per-decade summary: group size, mean weight (rounded to 2 decimals) and the
# maximum height.
decade_expr = (pl.col("birthdate").dt.year() // 10 * 10).alias("decade")
per_decade = df.group_by(decade_expr, maintain_order=True)

result = per_decade.agg(
    pl.len().alias("sample_size"),
    pl.col("weight").mean().round(2).alias("avg_weight"),
    pl.col("height").max().alias("tallest"),
)
print(result)

shape: (1, 4)
┌────────┬─────────────┬────────────┬─────────┐
│ decade ┆ sample_size ┆ avg_weight ┆ tallest │
│ ---    ┆ ---         ┆ ---        ┆ ---     │
│ i32    ┆ u32         ┆ f64        ┆ f64     │
╞════════╪═════════════╪════════════╪═════════╡
│ 2000   ┆ 4           ┆ 65.0       ┆ 6.1     │
└────────┴─────────────┴────────────┴─────────┘


In [38]:
# Step 1: add a "decade" column and replace "name" with the text before its
# first space (split on " ", take the first list element).
with_decade = df.with_columns(
    (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
    pl.col("name").str.split(by=" ").list.first(),
)

# Step 2: drop the birthdate column, keeping everything else.
without_birthdate = with_decade.select(pl.all().exclude("birthdate"))

# Step 3: per decade, collect the names into a list and average the numeric
# columns (rounded to 2 decimals, column names prefixed with "avg_").
result = without_birthdate.group_by(
    pl.col("decade"),
    maintain_order=True,
).agg(
    pl.col("name"),
    pl.col("weight", "height").mean().round(2).name.prefix("avg_"),
)
print(result)

shape: (1, 4)
┌────────┬────────────────────────────────┬────────────┬────────────┐
│ decade ┆ name                           ┆ avg_weight ┆ avg_height │
│ ---    ┆ ---                            ┆ ---        ┆ ---        │
│ i32    ┆ list[str]                      ┆ f64        ┆ f64        │
╞════════╪════════════════════════════════╪════════════╪════════════╡
│ 2000   ┆ ["Vamshi", "Ajay", … "Yatish"] ┆ 65.0       ┆ 5.95       │
└────────┴────────────────────────────────┴────────────┴────────────┘


In [39]:
# Lookup table keyed by name, carrying a sibling count per person.
df2 = pl.DataFrame(
    {
        "name": ["Vamshi", "Ajay", "Yatish", "Mahesh"],
        "siblings": [1, 2, 3, 4],
    }
)

# Left join on "name": df is the left-hand frame, df2 supplies "siblings".
left_joined = df.join(df2, on="name", how="left")
print(left_joined)

shape: (4, 5)
┌────────┬────────────┬────────┬────────┬──────────┐
│ name   ┆ birthdate  ┆ weight ┆ height ┆ siblings │
│ ---    ┆ ---        ┆ ---    ┆ ---    ┆ ---      │
│ str    ┆ date       ┆ i64    ┆ f64    ┆ i64      │
╞════════╪════════════╪════════╪════════╪══════════╡
│ Vamshi ┆ 2003-10-14 ┆ 50     ┆ 6.1    ┆ 1        │
│ Ajay   ┆ 2003-01-01 ┆ 60     ┆ 6.0    ┆ 2        │
│ Mahesh ┆ 2003-01-01 ┆ 70     ┆ 5.9    ┆ 4        │
│ Yatish ┆ 2003-01-01 ┆ 80     ┆ 5.8    ┆ 3        │
└────────┴────────────┴────────┴────────┴──────────┘


In [40]:
# Lookup table keyed by name, carrying a sibling count per person.
df2 = pl.DataFrame(
    {
        "name": ["Vamshi", "Ajay", "Yatish", "Mahesh"],
        "siblings": [1, 2, 3, 4],
    }
)

# Right join on "name": df2 is the right-hand frame in this call.
right_joined = df.join(df2, on="name", how="right")
print(right_joined)

shape: (4, 5)
┌────────────┬────────┬────────┬────────┬──────────┐
│ birthdate  ┆ weight ┆ height ┆ name   ┆ siblings │
│ ---        ┆ ---    ┆ ---    ┆ ---    ┆ ---      │
│ date       ┆ i64    ┆ f64    ┆ str    ┆ i64      │
╞════════════╪════════╪════════╪════════╪══════════╡
│ 2003-10-14 ┆ 50     ┆ 6.1    ┆ Vamshi ┆ 1        │
│ 2003-01-01 ┆ 60     ┆ 6.0    ┆ Ajay   ┆ 2        │
│ 2003-01-01 ┆ 80     ┆ 5.8    ┆ Yatish ┆ 3        │
│ 2003-01-01 ┆ 70     ┆ 5.9    ┆ Mahesh ┆ 4        │
└────────────┴────────┴────────┴────────┴──────────┘


In [41]:
# A second frame with the same four columns as df.
# NOTE(review): heights here are annotated as metres while df's heights are
# around 6 — the stacked "height" column may mix units; confirm.
df3 = pl.DataFrame(
    {
        "name": ["Ethan Edwards", "Fiona Foster", "Grace Gibson", "Henry Harris"],
        "birthdate": [
            dt.date(1977, 5, 10),
            dt.date(1975, 6, 23),
            dt.date(1973, 7, 22),
            dt.date(1971, 8, 3),
        ],
        "weight": [67, 72, 57, 93],  # (kg)
        "height": [1.76, 1.6, 1.66, 1.8],  # (m)
    }
)

# Stack df3's rows underneath df's rows.
stacked = pl.concat([df, df3], how="vertical")
print(stacked)

shape: (8, 4)
┌───────────────┬────────────┬────────┬────────┐
│ name          ┆ birthdate  ┆ weight ┆ height │
│ ---           ┆ ---        ┆ ---    ┆ ---    │
│ str           ┆ date       ┆ i64    ┆ f64    │
╞═══════════════╪════════════╪════════╪════════╡
│ Vamshi        ┆ 2003-10-14 ┆ 50     ┆ 6.1    │
│ Ajay          ┆ 2003-01-01 ┆ 60     ┆ 6.0    │
│ Mahesh        ┆ 2003-01-01 ┆ 70     ┆ 5.9    │
│ Yatish        ┆ 2003-01-01 ┆ 80     ┆ 5.8    │
│ Ethan Edwards ┆ 1977-05-10 ┆ 67     ┆ 1.76   │
│ Fiona Foster  ┆ 1975-06-23 ┆ 72     ┆ 1.6    │
│ Grace Gibson  ┆ 1973-07-22 ┆ 57     ┆ 1.66   │
│ Henry Harris  ┆ 1971-08-03 ┆ 93     ┆ 1.8    │
└───────────────┴────────────┴────────┴────────┘


In [42]:
# A named one-dimensional Series of integers.
s = pl.Series(name="ints", values=[1, 2, 3, 4, 5])
print(s)

shape: (5,)
Series: 'ints' [i64]
[
	1
	2
	3
	4
	5
]


In [43]:
# Show the first three rows of the frame.
print(df.head(n=3))

shape: (3, 4)
┌────────┬────────────┬────────┬────────┐
│ name   ┆ birthdate  ┆ weight ┆ height │
│ ---    ┆ ---        ┆ ---    ┆ ---    │
│ str    ┆ date       ┆ i64    ┆ f64    │
╞════════╪════════════╪════════╪════════╡
│ Vamshi ┆ 2003-10-14 ┆ 50     ┆ 6.1    │
│ Ajay   ┆ 2003-01-01 ┆ 60     ┆ 6.0    │
│ Mahesh ┆ 2003-01-01 ┆ 70     ┆ 5.9    │
└────────┴────────────┴────────┴────────┘


In [44]:
# Show the last three rows of the frame.
print(df.tail(n=3))

shape: (3, 4)
┌────────┬────────────┬────────┬────────┐
│ name   ┆ birthdate  ┆ weight ┆ height │
│ ---    ┆ ---        ┆ ---    ┆ ---    │
│ str    ┆ date       ┆ i64    ┆ f64    │
╞════════╪════════════╪════════╪════════╡
│ Ajay   ┆ 2003-01-01 ┆ 60     ┆ 6.0    │
│ Mahesh ┆ 2003-01-01 ┆ 70     ┆ 5.9    │
│ Yatish ┆ 2003-01-01 ┆ 80     ┆ 5.8    │
└────────┴────────────┴────────┴────────┘


In [45]:
# Request the glimpse summary as a string and print it explicitly.
glimpse_text = df.glimpse(return_as_string=True)
print(glimpse_text)

Rows: 4
Columns: 4
$ name       <str> 'Vamshi', 'Ajay', 'Mahesh', 'Yatish'
$ birthdate <date> 2003-10-14, 2003-01-01, 2003-01-01, 2003-01-01
$ weight     <i64> 50, 60, 70, 80
$ height     <f64> 6.1, 6.0, 5.9, 5.8



In [46]:
import random

# Seed Python's global random module, then draw three rows from the frame.
# NOTE(review): presumably intended to make the sample reproducible; passing
# `seed=` directly to `sample()` would make that explicit — confirm.
random.seed(42)
sampled_rows = df.sample(n=3)
print(sampled_rows)

shape: (3, 4)
┌────────┬────────────┬────────┬────────┐
│ name   ┆ birthdate  ┆ weight ┆ height │
│ ---    ┆ ---        ┆ ---    ┆ ---    │
│ str    ┆ date       ┆ i64    ┆ f64    │
╞════════╪════════════╪════════╪════════╡
│ Mahesh ┆ 2003-01-01 ┆ 70     ┆ 5.9    │
│ Ajay   ┆ 2003-01-01 ┆ 60     ┆ 6.0    │
│ Vamshi ┆ 2003-10-14 ┆ 50     ┆ 6.1    │
└────────┴────────────┴────────┴────────┘


In [47]:
# Summary statistics for every column of the frame.
summary_stats = df.describe()
print(summary_stats)

shape: (9, 5)
┌────────────┬────────┬─────────────────────┬───────────┬──────────┐
│ statistic  ┆ name   ┆ birthdate           ┆ weight    ┆ height   │
│ ---        ┆ ---    ┆ ---                 ┆ ---       ┆ ---      │
│ str        ┆ str    ┆ str                 ┆ f64       ┆ f64      │
╞════════════╪════════╪═════════════════════╪═══════════╪══════════╡
│ count      ┆ 4      ┆ 4                   ┆ 4.0       ┆ 4.0      │
│ null_count ┆ 0      ┆ 0                   ┆ 0.0       ┆ 0.0      │
│ mean       ┆ null   ┆ 2003-03-13 12:00:00 ┆ 65.0      ┆ 5.95     │
│ std        ┆ null   ┆ null                ┆ 12.909944 ┆ 0.129099 │
│ min        ┆ Ajay   ┆ 2003-01-01          ┆ 50.0      ┆ 5.8      │
│ 25%        ┆ null   ┆ 2003-01-01          ┆ 60.0      ┆ 5.9      │
│ 50%        ┆ null   ┆ 2003-01-01          ┆ 70.0      ┆ 6.0      │
│ 75%        ┆ null   ┆ 2003-01-01          ┆ 70.0      ┆ 6.0      │
│ max        ┆ Yatish ┆ 2003-10-14          ┆ 80.0      ┆ 6.1      │
└────────────┴────────┴─────────────────────┴───────────┴──────────┘

In [48]:
# Column-name -> dtype mapping of the frame.
frame_schema = df.schema
print(frame_schema)

Schema({'name': String, 'birthdate': Date, 'weight': Int64, 'height': Float64})


In [49]:
# Schema with two integer columns and three Float64 columns.
schema = pl.Schema(
    {
        "int_1": pl.Int16,
        "int_2": pl.Int32,
        "float_1": pl.Float64,
        "float_2": pl.Float64,
        "float_3": pl.Float64,
    }
)

# Select columns by dtype (Float64), multiply each by 1.1, and append "*1.1"
# to the column names.
scaled_floats = (pl.col(pl.Float64) * 1.1).name.suffix("*1.1")

# Build the query on an empty LazyFrame carrying only the schema, then print
# the query plan text instead of executing it.
query_plan = pl.LazyFrame(schema=schema).select(scaled_floats).explain()
print(query_plan)

 SELECT [[(col("float_1")) * (1.1)].alias("float_1*1.1"), [(col("float_2")) * (1.1)].alias("float_2*1.1"), [(col("float_3")) * (1.1)].alias("float_3*1.1")] FROM
  DF ["int_1", "int_2", "float_1", "float_2"]; PROJECT 3/5 COLUMNS


In [50]:
import polars as pl
import numpy as np

np.random.seed(42)  # Fix NumPy's global RNG so the "random" column is repeatable.

# Five draws from NumPy's global RNG, converted to a Python list.
random_values = list(np.random.rand(5))

# Rebinds `df` to a new frame: nullable integers, strings, random floats and a
# two-valued group label.
df = pl.DataFrame(
    {
        "nrs": [1, 2, 3, None, 5],
        "names": ["foo", "ham", "spam", "egg", "spam"],
        "random": random_values,
        "groups": ["A", "A", "B", "A", "B"],
    }
)
print(df)

shape: (5, 4)
┌──────┬───────┬──────────┬────────┐
│ nrs  ┆ names ┆ random   ┆ groups │
│ ---  ┆ ---   ┆ ---      ┆ ---    │
│ i64  ┆ str   ┆ f64      ┆ str    │
╞══════╪═══════╪══════════╪════════╡
│ 1    ┆ foo   ┆ 0.37454  ┆ A      │
│ 2    ┆ ham   ┆ 0.950714 ┆ A      │
│ 3    ┆ spam  ┆ 0.731994 ┆ B      │
│ null ┆ egg   ┆ 0.598658 ┆ A      │
│ 5    ┆ spam  ┆ 0.156019 ┆ B      │
└──────┴───────┴──────────┴────────┘


In [51]:
# Two derived columns: "random" shifted by 10, and nrs * random * 100.
shifted_expr = (pl.col("random") + 10).alias("random+10")
product_expr = (pl.col("nrs") * (pl.col("random")) * 100).alias("nrs*random*100")
result = df.select(shifted_expr, product_expr)

In [52]:
# Print whatever `result` currently holds (it is assigned in an earlier cell).
print(result)

shape: (5, 2)
┌───────────┬────────────────┐
│ random+10 ┆ nrs*random*100 │
│ ---       ┆ ---            │
│ f64       ┆ f64            │
╞═══════════╪════════════════╡
│ 10.37454  ┆ 37.454012      │
│ 10.950714 ┆ 190.142861     │
│ 10.731994 ┆ 219.598183     │
│ 10.598658 ┆ null           │
│ 10.156019 ┆ 78.00932       │
└───────────┴────────────────┘


In [53]:
# Element-wise comparisons written with the named methods; the trailing
# comments show the operator each one corresponds to.
nrs = pl.col("nrs")
rnd = pl.col("random")

result = df.select(
    nrs.gt(1).alias("nrs > 1"),  # >
    nrs.ge(3).alias("nrs >= 3"),  # >=
    rnd.lt(0.2).alias("random < .2"),  # <
    rnd.le(0.5).alias("random <= .5"),  # <=
    nrs.ne(1).alias("nrs != 1"),  # !=
    nrs.eq(1).alias("nrs == 1"),  # ==
)
print(result)

shape: (5, 6)
┌─────────┬──────────┬─────────────┬──────────────┬──────────┬──────────┐
│ nrs > 1 ┆ nrs >= 3 ┆ random < .2 ┆ random <= .5 ┆ nrs != 1 ┆ nrs == 1 │
│ ---     ┆ ---      ┆ ---         ┆ ---          ┆ ---      ┆ ---      │
│ bool    ┆ bool     ┆ bool        ┆ bool         ┆ bool     ┆ bool     │
╞═════════╪══════════╪═════════════╪══════════════╪══════════╪══════════╡
│ false   ┆ false    ┆ false       ┆ true         ┆ false    ┆ true     │
│ true    ┆ false    ┆ false       ┆ false        ┆ true     ┆ false    │
│ true    ┆ true     ┆ false       ┆ false        ┆ true     ┆ false    │
│ null    ┆ null     ┆ false       ┆ false        ┆ null     ┆ null     │
│ true    ┆ true     ┆ true        ┆ true         ┆ true     ┆ false    │
└─────────┴──────────┴─────────────┴──────────────┴──────────┴──────────┘


In [54]:
# True only where "nrs" has a value AND the row belongs to group "A".
has_number = pl.col("nrs").is_not_null()
in_group_a = pl.col("groups") == "A"

result = df.select(
    (has_number & in_group_a).alias("number not null and group A")
)
print(result)

shape: (5, 1)
┌─────────────────────────────┐
│ number not null and group A │
│ ---                         │
│ bool                        │
╞═════════════════════════════╡
│ true                        │
│ true                        │
│ false                       │
│ false                       │
│ false                       │
└─────────────────────────────┘


In [58]:
# Rebinds `df` to a single-column frame of the integers 1 through 5.
df = pl.DataFrame(
    {
        "num": [1, 2, 3, 4, 5],
    }
)
print(df)

shape: (5, 1)
┌─────┐
│ num │
│ --- │
│ i64 │
╞═════╡
│ 1   │
│ 2   │
│ 3   │
│ 4   │
│ 5   │
└─────┘


In [60]:
# List the frame's column names.
column_names = df.columns
print(column_names)

['num']
