## Install Dependencies

In [2]:
:dep polars = { version = "0.28.0", features = ["describe", "lazy", "ndarray", "object", "dtype-struct", "concat_str", "mode"] }

<hr />

## Import Libraries

In [4]:
use std::path::Path;
use polars::prelude::*;

<hr />

## DataFrame Object

### DataFrame Initialization

In [43]:
let df = DataFrame::default();
df

shape: (0, 0)
┌┐
╞╡
└┘

In [51]:
let s1 = Series::new("Name", &["Mahmoud", "Ali"]);
let s2 = Series::new("Age", &[23, 27]);
let s3 = Series::new("Height", &[1.84, 1.78]);

let df: DataFrame = DataFrame::new(vec![s1, s2, s3]).unwrap();
df

shape: (2, 3)
┌─────────┬─────┬────────┐
│ Name    ┆ Age ┆ Height │
│ ---     ┆ --- ┆ ---    │
│ str     ┆ i32 ┆ f64    │
╞═════════╪═════╪════════╡
│ Mahmoud ┆ 23  ┆ 1.84   │
│ Ali     ┆ 27  ┆ 1.78   │
└─────────┴─────┴────────┘

### Describe

In [45]:
let df1: DataFrame = df!("categorical" => &["d","e","f"],
                         "numeric" => &[1, 2, 3],
                         "object" => &["a", "b", "c"]).unwrap();
df1

shape: (3, 3)
┌─────────────┬─────────┬────────┐
│ categorical ┆ numeric ┆ object │
│ ---         ┆ ---     ┆ ---    │
│ str         ┆ i32     ┆ str    │
╞═════════════╪═════════╪════════╡
│ d           ┆ 1       ┆ a      │
│ e           ┆ 2       ┆ b      │
│ f           ┆ 3       ┆ c      │
└─────────────┴─────────┴────────┘

In [46]:
let df2: DataFrame = df1.describe(None).unwrap();
df2

shape: (9, 4)
┌────────────┬─────────────┬─────────┬────────┐
│ describe   ┆ categorical ┆ numeric ┆ object │
│ ---        ┆ ---         ┆ ---     ┆ ---    │
│ str        ┆ str         ┆ f64     ┆ str    │
╞════════════╪═════════════╪═════════╪════════╡
│ count      ┆ 3           ┆ 3.0     ┆ 3      │
│ null_count ┆ 0           ┆ 0.0     ┆ 0      │
│ mean       ┆ null        ┆ 2.0     ┆ null   │
│ std        ┆ null        ┆ 1.0     ┆ null   │
│ …          ┆ …           ┆ …       ┆ …      │
│ 25%        ┆ null        ┆ 1.5     ┆ null   │
│ 50%        ┆ null        ┆ 2.0     ┆ null   │
│ 75%        ┆ null        ┆ 2.5     ┆ null   │
│ max        ┆ f           ┆ 3.0     ┆ c      │
└────────────┴─────────────┴─────────┴────────┘

### Head

In [47]:
let df: DataFrame = df!("Name" => &["Mahmoud", "Bob"],
                                      "Age" => &[23, 27],
                                      "Height" => &[1.84, 1.78]).unwrap();
// First 10 rows
df.head(None)

shape: (2, 3)
┌─────────┬─────┬────────┐
│ Name    ┆ Age ┆ Height │
│ ---     ┆ --- ┆ ---    │
│ str     ┆ i32 ┆ f64    │
╞═════════╪═════╪════════╡
│ Mahmoud ┆ 23  ┆ 1.84   │
│ Bob     ┆ 27  ┆ 1.78   │
└─────────┴─────┴────────┘

In [48]:
let df: DataFrame = df!("Name" => &["Mahmoud", "Bob"],
                                      "Age" => &[23, 27],
                                      "Height" => &[1.84, 1.78]).unwrap();
// First row
df.head(Some(1))

shape: (1, 3)
┌─────────┬─────┬────────┐
│ Name    ┆ Age ┆ Height │
│ ---     ┆ --- ┆ ---    │
│ str     ┆ i32 ┆ f64    │
╞═════════╪═════╪════════╡
│ Mahmoud ┆ 23  ┆ 1.84   │
└─────────┴─────┴────────┘

### Tail

In [49]:
let df: DataFrame = df!("Name" => &["Mahmoud", "Bob"],
                                      "Age" => &[23, 27],
                                      "Height" => &[1.84, 1.78]).unwrap();
// Last 10 rows
df.tail(None)

shape: (2, 3)
┌─────────┬─────┬────────┐
│ Name    ┆ Age ┆ Height │
│ ---     ┆ --- ┆ ---    │
│ str     ┆ i32 ┆ f64    │
╞═════════╪═════╪════════╡
│ Mahmoud ┆ 23  ┆ 1.84   │
│ Bob     ┆ 27  ┆ 1.78   │
└─────────┴─────┴────────┘

In [50]:
let df: DataFrame = df!("Name" => &["Mahmoud", "Bob"],
                                      "Age" => &[23, 27],
                                      "Height" => &[1.84, 1.78]).unwrap();
// Last row
df.tail(Some(1))

shape: (1, 3)
┌──────┬─────┬────────┐
│ Name ┆ Age ┆ Height │
│ ---  ┆ --- ┆ ---    │
│ str  ┆ i32 ┆ f64    │
╞══════╪═════╪════════╡
│ Bob  ┆ 27  ┆ 1.78   │
└──────┴─────┴────────┘

### Indexing & Slicing

In [56]:
let df = df!("Name" => &["Mahmoud", "Ali", "ThePrimeagen"],
             "Age" => &[22, 25, 29],
             "Gender" => &["M", "M", "M"],
             "Salary" => &[50000, 60000, 250000]).unwrap();
{
    let name_col = &df["Name"];
    println!("{}", name_col);
}
// or
// let name_col1 = &df[0];

shape: (3,)
Series: 'Name' [str]
[
	"Mahmoud"
	"Ali"
	"ThePrimeagen"
]


In [69]:
let df = df!("Name" => &["Mahmoud", "Ali", "ThePrimeagen"],
             "Age" => &[22, 25, 29],
             "Gender" => &["M", "M", "M"],
             "Salary" => &[50000, 60000, 250000]).unwrap();

let name_age_cols = df.select(["Name", "Age"]).unwrap();
name_age_cols

shape: (3, 2)
┌──────────────┬─────┐
│ Name         ┆ Age │
│ ---          ┆ --- │
│ str          ┆ i32 │
╞══════════════╪═════╡
│ Mahmoud      ┆ 22  │
│ Ali          ┆ 25  │
│ ThePrimeagen ┆ 29  │
└──────────────┴─────┘

In [73]:
let df = df!("Name" => &["Mahmoud", "Ali", "ThePrimeagen"],
             "Age" => &[22, 25, 29],
             "Gender" => &["M", "M", "M"],
             "Salary" => &[50000, 60000, 250000]).unwrap();

{
    let name_col = df.column("Name");
    println!("{:?}", name_col);
}

Ok(shape: (3,)
Series: 'Name' [str]
[
	"Mahmoud"
	"Ali"
	"ThePrimeagen"
])


()

In [72]:
let df = df!("Name" => &["Mahmoud", "Ali", "ThePrimeagen"],
             "Age" => &[22, 25, 36],
             "Gender" => &["M", "M", "M"],
             "Salary" => &[50000, 60000, 250000]).unwrap();


// Build a boolean mask that is true where Age > 25, then keep only the rows
// for which the mask is true.
let mask = df.column("Age").expect("Age must exist!").gt(25).unwrap();
let filtered_data = df.filter(&mask).unwrap();
println!("{:?}", filtered_data);

shape: (1, 4)
┌──────────────┬─────┬────────┬────────┐
│ Name         ┆ Age ┆ Gender ┆ Salary │
│ ---          ┆ --- ┆ ---    ┆ ---    │
│ str          ┆ i32 ┆ str    ┆ i32    │
╞══════════════╪═════╪════════╪════════╡
│ ThePrimeagen ┆ 36  ┆ M      ┆ 250000 │
└──────────────┴─────┴────────┴────────┘


In [74]:
let df = df!("Name" => &["Mahmoud", "Ali", "ThePrimeagen"],
             "Age" => &[22, 25, 36],
             "Gender" => &["M", "M", "M"],
             "Salary" => &[50000, 60000, 250000]).unwrap();

df.slice(2, 3)

shape: (1, 4)
┌──────────────┬─────┬────────┬────────┐
│ Name         ┆ Age ┆ Gender ┆ Salary │
│ ---          ┆ --- ┆ ---    ┆ ---    │
│ str          ┆ i32 ┆ str    ┆ i32    │
╞══════════════╪═════╪════════╪════════╡
│ ThePrimeagen ┆ 36  ┆ M      ┆ 250000 │
└──────────────┴─────┴────────┴────────┘

In [75]:
let df = df!("Name" => &["Mahmoud", "Ali", "ThePrimeagen"],
             "Age" => &[22, 25, 36],
             "Gender" => &["M", "M", "M"],
             "Salary" => &[50000, 60000, 250000]).unwrap();

df.transpose().unwrap()[0]

shape: (4,)
Series: 'column_0' [str]
[
	"Mahmoud"
	"22"
	"M"
	"50000"
]

## Data Cleaning

### Nulls Count

In [76]:
let df = df!("Name" => &[Some("Mahmoud"),  None, None],
             "Age" => &[22, 25, 36],
             "Gender" => &["M", "M", "M"],
             "Salary" => &[50000, 60000, 250000]).unwrap();

 df.null_count()

shape: (1, 4)
┌──────┬─────┬────────┬────────┐
│ Name ┆ Age ┆ Gender ┆ Salary │
│ ---  ┆ --- ┆ ---    ┆ ---    │
│ u32  ┆ u32 ┆ u32    ┆ u32    │
╞══════╪═════╪════════╪════════╡
│ 2    ┆ 0   ┆ 0      ┆ 0      │
└──────┴─────┴────────┴────────┘

### Duplicates

In [77]:
let df = df!("Name" => &["Mahmoud",  "Mahmoud", "ThePrimeagen"],
             "Age" => &[22, 22, 36],
             "Gender" => &["M", "M", "M"],
             "Salary" => &[50000, 50000, 250000]).unwrap();
let mask = df.is_duplicated().unwrap();
let filtered_data = df.filter(&mask).unwrap();
filtered_data

shape: (2, 4)
┌─────────┬─────┬────────┬────────┐
│ Name    ┆ Age ┆ Gender ┆ Salary │
│ ---     ┆ --- ┆ ---    ┆ ---    │
│ str     ┆ i32 ┆ str    ┆ i32    │
╞═════════╪═════╪════════╪════════╡
│ Mahmoud ┆ 22  ┆ M      ┆ 50000  │
│ Mahmoud ┆ 22  ┆ M      ┆ 50000  │
└─────────┴─────┴────────┴────────┘

### Unique Values

In [78]:
let df = df!("Name" => &["Mahmoud",  "Mahmoud", "ThePrimeagen"],
             "Age" => &[22, 22, 36],
             "Gender" => &["M", "M", "M"],
             "Salary" => &[50000, 50000, 250000]).unwrap();
let mask = df.is_unique().unwrap();
let filtered_data = df.filter(&mask).unwrap();
filtered_data

shape: (1, 4)
┌──────────────┬─────┬────────┬────────┐
│ Name         ┆ Age ┆ Gender ┆ Salary │
│ ---          ┆ --- ┆ ---    ┆ ---    │
│ str          ┆ i32 ┆ str    ┆ i32    │
╞══════════════╪═════╪════════╪════════╡
│ ThePrimeagen ┆ 36  ┆ M      ┆ 250000 │
└──────────────┴─────┴────────┴────────┘

### Drop

In [80]:
let df: DataFrame = df!("Fruit" => &["Apple", "Apple", "Pear"],
                        "Color" => &["Red", "Yellow", "Green"]).unwrap();
let df_remain = df.drop("Color").unwrap(); 
println!("{}", df_remain);
println!("{}", df); // the original DataFrame

shape: (3, 1)
┌───────┐
│ Fruit │
│ ---   │
│ str   │
╞═══════╡
│ Apple │
│ Apple │
│ Pear  │
└───────┘
shape: (3, 2)
┌───────┬────────┐
│ Fruit ┆ Color  │
│ ---   ┆ ---    │
│ str   ┆ str    │
╞═══════╪════════╡
│ Apple ┆ Red    │
│ Apple ┆ Yellow │
│ Pear  ┆ Green  │
└───────┴────────┘


In [81]:
let mut df: DataFrame = df!("Fruit" => &["Apple", "Apple", "Pear"],
                            "Color" => &["Red", "Yellow", "Green"]).unwrap();
// Remove the "Color" column from `df` itself (no new DataFrame is created).
// The call returns a Result, so unwrap it instead of silently discarding it:
// a misspelled column name should fail loudly here.
df.drop_in_place("Color").unwrap();
df

shape: (3, 1)
┌───────┐
│ Fruit │
│ ---   │
│ str   │
╞═══════╡
│ Apple │
│ Apple │
│ Pear  │
└───────┘

In [84]:
let df: DataFrame = df!("Fruit" => &["Apple", "Apple", "Pear"],
                                      "Color" => &["Red", "Yellow", "Green"]).unwrap();
let df_dropped_col = df.drop_many(&["Color", ""]);
df_dropped_col

shape: (3, 1)
┌───────┐
│ Fruit │
│ ---   │
│ str   │
╞═══════╡
│ Apple │
│ Apple │
│ Pear  │
└───────┘

In [85]:
let df: DataFrame = df!("Fruit" => &["Apple", "Apple", "Pear"],
                                      "Color" => &[Some("Red"), None, None]).unwrap();
let df_clean = df.drop_nulls::<String>(None).unwrap();
df_clean

shape: (1, 2)
┌───────┬───────┐
│ Fruit ┆ Color │
│ ---   ┆ ---   │
│ str   ┆ str   │
╞═══════╪═══════╡
│ Apple ┆ Red   │
└───────┴───────┘

In [86]:
let df = df!("Name" => &["Mahmoud", "Ali", "ThePrimeagen"],
             "Age" => &[22, 25, 36],
             "Gender" => &["M", "M", "M"],
             "Salary" => &[Some(50000), Some(60000), None]).unwrap();
let mask = df.column("Salary").expect("Salary must exist!").is_not_null();
mask.head(None)

shape: (3,)
ChunkedArray: 'Salary' [bool]
[
	true
	true
	false
]

In [87]:
let df = df!("Name" => &["Mahmoud", "Ali", "ThePrimeagen"],
             "Age" => &[22, 25, 36],
             "Gender" => &["M", "M", "M"],
             "Salary" => &[Some(50000), Some(60000), None]).unwrap();
let filtered_data = df.filter(&mask).unwrap();
filtered_data

shape: (2, 4)
┌─────────┬─────┬────────┬────────┐
│ Name    ┆ Age ┆ Gender ┆ Salary │
│ ---     ┆ --- ┆ ---    ┆ ---    │
│ str     ┆ i32 ┆ str    ┆ i32    │
╞═════════╪═════╪════════╪════════╡
│ Mahmoud ┆ 22  ┆ M      ┆ 50000  │
│ Ali     ┆ 25  ┆ M      ┆ 60000  │
└─────────┴─────┴────────┴────────┘

### Fill

In [88]:
// `fill_null` returns a new DataFrame and leaves `df` untouched, so the
// binding does not need to be mutable.
let df = df!("Name" => &["Mahmoud", "Ali", "ThePrimeagen"],
             "Age" => &[22, 25, 36],
             "Gender" => &["M", "M", "M"],
             "Salary" => &[Some(50000), Some(60000), None]).unwrap();

// Forward fill: each null is replaced by the closest non-null value above it
// in the same column (here the missing Salary becomes 60000).
let filtered_nulls = df.fill_null(FillNullStrategy::Forward(None)).unwrap();
filtered_nulls

shape: (3, 4)
┌──────────────┬─────┬────────┬────────┐
│ Name         ┆ Age ┆ Gender ┆ Salary │
│ ---          ┆ --- ┆ ---    ┆ ---    │
│ str          ┆ i32 ┆ str    ┆ i32    │
╞══════════════╪═════╪════════╪════════╡
│ Mahmoud      ┆ 22  ┆ M      ┆ 50000  │
│ Ali          ┆ 25  ┆ M      ┆ 60000  │
│ ThePrimeagen ┆ 36  ┆ M      ┆ 60000  │
└──────────────┴─────┴────────┴────────┘

## Measures of central tendency

### Mean

In [89]:
let df = df!("Name" => &["Mahmoud", "Ali", "ThePrimeagen"],
             "Age" => &[22, 25, 36],
             "Gender" => &["M", "M", "M"],
             "Salary" => &[Some(50000), Some(60000), None]).unwrap();

df.mean()

shape: (1, 4)
┌──────┬───────────┬────────┬─────────┐
│ Name ┆ Age       ┆ Gender ┆ Salary  │
│ ---  ┆ ---       ┆ ---    ┆ ---     │
│ str  ┆ f64       ┆ str    ┆ f64     │
╞══════╪═══════════╪════════╪═════════╡
│ null ┆ 27.666667 ┆ null   ┆ 55000.0 │
└──────┴───────────┴────────┴─────────┘

### Median

In [90]:
let df = df!("Name" => &["Mahmoud", "Ali", "ThePrimeagen"],
             "Age" => &[22, 25, 36],
             "Gender" => &["M", "M", "M"],
             "Salary" => &[Some(50000), Some(60000), None]).unwrap();

df.median()

shape: (1, 4)
┌──────┬──────┬────────┬─────────┐
│ Name ┆ Age  ┆ Gender ┆ Salary  │
│ ---  ┆ ---  ┆ ---    ┆ ---     │
│ str  ┆ f64  ┆ str    ┆ f64     │
╞══════╪══════╪════════╪═════════╡
│ null ┆ 25.0 ┆ null   ┆ 55000.0 │
└──────┴──────┴────────┴─────────┘

## Measures of spread

### std

In [91]:
let df = df!("Name" => &["Mahmoud", "Ali", "ThePrimeagen"],
             "Age" => &[22, 25, 36],
             "Gender" => &["M", "M", "M"],
             "Salary" => &[Some(50000), Some(60000), None]).unwrap();

df.std(1)

shape: (1, 4)
┌──────┬──────────┬────────┬─────────────┐
│ Name ┆ Age      ┆ Gender ┆ Salary      │
│ ---  ┆ ---      ┆ ---    ┆ ---         │
│ str  ┆ f64      ┆ str    ┆ f64         │
╞══════╪══════════╪════════╪═════════════╡
│ null ┆ 7.371115 ┆ null   ┆ 7071.067812 │
└──────┴──────────┴────────┴─────────────┘

### var

In [93]:
let df = df!("Name" => &["Mahmoud", "Ali", "ThePrimeagen"],
             "Age" => &[22, 25, 36],
             "Gender" => &["M", "M", "M"],
             "Salary" => &[Some(50000), Some(60000), None]).unwrap();

df.var(1)

shape: (1, 4)
┌──────┬───────────┬────────┬────────┐
│ Name ┆ Age       ┆ Gender ┆ Salary │
│ ---  ┆ ---       ┆ ---    ┆ ---    │
│ str  ┆ f64       ┆ str    ┆ f64    │
╞══════╪═══════════╪════════╪════════╡
│ null ┆ 54.333333 ┆ null   ┆ 5e7    │
└──────┴───────────┴────────┴────────┘

## Ndarray

In [94]:
let df = df!("Name" => &["Mahmoud", "Ali", "ThePrimeagen"],
             "Age" => &[22, 25, 36],
             "Gender" => &["M", "M", "M"],
             "Salary" => &[Some(50000), Some(60000), None]).unwrap();

df.to_ndarray::<Float64Type>().unwrap()

[[NaN, 22.0, NaN, 50000.0],
 [NaN, 25.0, NaN, 60000.0],
 [NaN, 36.0, NaN, NaN]], shape=[3, 4], strides=[1, 3], layout=Ff (0xa), const ndim=2

<hr />

## Aggregation

Download [the flights dataset](https://www.kaggle.com/datasets/tylerx/flights-and-airports-data?select=flights.csv) and put it under the `dataset` folder.

In [6]:
/// Loads the CSV file at `csv_file_path` into a `DataFrame`, treating the
/// first line of the file as the header row.
///
/// # Panics
///
/// Panics with "Cannot open file." if the reader cannot be created for the
/// given path, and panics if reading the file contents fails.
fn read_data_frame_from_csv(csv_file_path: &Path) -> DataFrame {
    let reader = CsvReader::from_path(csv_file_path).expect("Cannot open file.");
    let parsed = reader.has_header(true).finish();
    parsed.unwrap()
}


let flights_file_path: &Path = Path::new("dataset/flights.csv");
let flights_df: DataFrame = read_data_frame_from_csv(flights_file_path);
flights_df.head(Some(5))

shape: (5, 7)
┌────────────┬───────────┬─────────┬─────────────────┬───────────────┬──────────┬──────────┐
│ DayofMonth ┆ DayOfWeek ┆ Carrier ┆ OriginAirportID ┆ DestAirportID ┆ DepDelay ┆ ArrDelay │
│ ---        ┆ ---       ┆ ---     ┆ ---             ┆ ---           ┆ ---      ┆ ---      │
│ i64        ┆ i64       ┆ str     ┆ i64             ┆ i64           ┆ i64      ┆ i64      │
╞════════════╪═══════════╪═════════╪═════════════════╪═══════════════╪══════════╪══════════╡
│ 19         ┆ 5         ┆ DL      ┆ 11433           ┆ 13303         ┆ -3       ┆ 1        │
│ 19         ┆ 5         ┆ DL      ┆ 14869           ┆ 12478         ┆ 0        ┆ -8       │
│ 19         ┆ 5         ┆ DL      ┆ 14057           ┆ 14869         ┆ -4       ┆ -15      │
│ 19         ┆ 5         ┆ DL      ┆ 15016           ┆ 11433         ┆ 28       ┆ 24       │
│ 19         ┆ 5         ┆ DL      ┆ 11193           ┆ 12892         ┆ -6       ┆ -11      │
└────────────┴───────────┴─────────┴─────────────────┴───────────────┴──────────┴──────────┘

In [7]:
// Mean arrival delay per carrier.
let flights_carrier_df: DataFrame = flights_df
    .groupby(["Carrier"])
    .expect("Carrier Column must exist!")
    .select(["ArrDelay"])
    .mean()
    .unwrap();
flights_carrier_df.head(Some(5))

shape: (5, 2)
┌─────────┬───────────────┐
│ Carrier ┆ ArrDelay_mean │
│ ---     ┆ ---           │
│ str     ┆ f64           │
╞═════════╪═══════════════╡
│ HA      ┆ 1.532125      │
│ F9      ┆ 12.848704     │
│ FL      ┆ 7.228765      │
│ YV      ┆ 8.547585      │
│ AA      ┆ 7.136775      │
└─────────┴───────────────┘

In [10]:
// Mean departure delay for every (carrier, day-of-week) pair.
let flights_df_arr: DataFrame = flights_df
    .groupby(["Carrier", "DayOfWeek"])
    .expect("Carrier and DayOfWeek Columns must exist!")
    .select(["DepDelay"])
    .mean()
    .unwrap();
flights_df_arr.head(Some(5))

shape: (5, 3)
┌─────────┬───────────┬───────────────┐
│ Carrier ┆ DayOfWeek ┆ DepDelay_mean │
│ ---     ┆ ---       ┆ ---           │
│ str     ┆ i64       ┆ f64           │
╞═════════╪═══════════╪═══════════════╡
│ UA      ┆ 4         ┆ 16.67069      │
│ B6      ┆ 3         ┆ 11.556713     │
│ WN      ┆ 5         ┆ 15.258676     │
│ 9E      ┆ 5         ┆ 11.122865     │
│ HA      ┆ 4         ┆ 2.071226      │
└─────────┴───────────┴───────────────┘

<hr />

## Merge

In [11]:
let df1: DataFrame = df!("Carrier" => &["HA", "EV", "VX", "DL"],
                         "ArrDelay" => &[-3, 28, 0, 1]).unwrap();
let df2: DataFrame = df!("Airline" => &["HA", "EV", "OO", "VX"],
                         "DepDelay" => &[21, -8, 11, -4]).unwrap();

// Inner join: match df1's "Carrier" against df2's "Airline". Only keys present
// in both frames (HA, EV, VX) are kept.
let df3: DataFrame = df1.join(&df2, ["Carrier"], ["Airline"], JoinType::Inner, None).unwrap();
df3.head(Some(5))

shape: (3, 3)
┌─────────┬──────────┬──────────┐
│ Carrier ┆ ArrDelay ┆ DepDelay │
│ ---     ┆ ---      ┆ ---      │
│ str     ┆ i32      ┆ i32      │
╞═════════╪══════════╪══════════╡
│ HA      ┆ -3       ┆ 21       │
│ EV      ┆ 28       ┆ -8       │
│ VX      ┆ 0        ┆ -4       │
└─────────┴──────────┴──────────┘

<hr />