## Information about the use of EVCXR
https://github.com/evcxr/evcxr/blob/main/COMMON.md


### Activar la caché.
Con ello evitamos que cada vez que ejecutemos el notebook se tengan que volver a compilar las dependencias.

In [None]:
// Ask the evcxr kernel to list its built-in `:` commands.
:help

Según podemos ver en [Rust for DS and DE](https://rust.marcoinacio.com/data/jupyter/), para poder usar la caché hay que instalar antes `sccache`:
```shell
cargo install sccache --locked
```

# Configuración básica.
Indica que muestre:
- El tiempo que se tarda en la ejecución de cada celda (`:timing`; en la celda siguiente está comentado, por lo que ahora mismo no se activa).
- Activa la caché
- Compila con el [nivel de optimización indicado](https://docs.rust-embedded.org/book/unsorted/speed-vs-size.html)

In [None]:
// Per-cell timing is currently switched off: the command below is commented out.
// NOTE(review): `:timing` appears to be a toggle in evcxr, so re-running this cell with it
// enabled would flip the setting back — confirm before uncommenting.
//:timing
// Use sccache so compiled dependencies are reused instead of being rebuilt on every run.
:sccache 1
// Compile the cells with optimization level 0.
:opt 0

## Instalar los crates necesarios

In [None]:
// `dirs`: in the visible cells it is only referenced by the commented-out config-path lookup.
:dep dirs
// Polars 0.33 together with the optional features this notebook enables.
:dep polars = {version = "0.33", features = ["lazy", "temporal", "describe", "json", "parquet", "dtype-datetime", "mode", "ndarray", "object", "dtype-struct", "concat_str", "round_series"]}
// `chrono`: provides the `TimeZone`, `DateTime` and `Utc` items imported in the `use` cell.
:dep chrono = {version = "0.4"}
// `rand`: imported in the `use` cell.
// NOTE(review): no visible cell calls into `rand` — confirm it is still needed.
:dep rand = {version = "0.8.5"}

In [None]:
//println!("Ruta del config file de evcxr: {:?}", dirs::config_dir().unwrap().join("evcxr").join("init.evcxr"));


In [None]:
use polars::prelude::*;
use chrono::{TimeZone, DateTime, Utc};
use std::path::Path;
use rand::{thread_rng, Rng};

### Empezamos a trabajar
Basado en los artículos sobre [analítica de datos con rust y polars](https://towardsdatascience.com/rust-polars-unlocking-high-performance-data-analysis-part-1-ce42af370ece)

In [None]:
// Build a Series by collecting an iterator over integer references.
let series = [1, 2, 3].iter().collect::<Series>();
println!("{series:?}");

In [None]:
// Build a Series called "numbers" from a slice of integers.
let series = Series::new("numbers", &[1, 2, 3]);
println!("{series:?}");

In [None]:
// A Series of string values.
let seasons_ser = Series::new("seasons", &["Winter", "Spring", "Summer", "Fall"]);
println!("{seasons_ser:?}");

In [None]:
// `Option` values let a Series carry a missing entry (`None`).
let s = Series::new("seasons", &[None, Some(1), Some(2)]);
println!("{s:?}");

In [None]:
// A `Some` wrapping NaN is still a `Some`, so it does not compare equal to `None`.
Some(f64::NAN)==None

In [None]:
// NaN does not compare equal to anything, itself included.
f64::NAN==f64::NAN

## Type conversion
It is crucial to keep in mind that converting a series from one data type to another can lead to the loss or modification of certain values.

In [None]:
// Float Series whose first element is NaN; print how many of its entries count as null.
let s = Series::new("numbers", &[Some(f64::NAN), Some(1.0), Some(2.0)]);
println!("Number of null values: {:?}", s.null_count());

// No trailing semicolon: the notebook renders the value of this last expression.
s.drop_nulls()


In [None]:
// Cast a float Series that contains NaN to 64-bit integers and print the result.
let s = Series::new("numbers", &[Some(f64::NAN), Some(1.0), Some(2.0)]);
{
    let as_int64 = s.cast(&DataType::Int64).unwrap();
    println!("{as_int64:?}");
}

## Series creation

In [None]:
// An empty Series only needs a name and a data type.
let s: Series = Series::new_empty("Height", &DataType::Float32);
println!("{s:?}");

In [None]:
// Series built from an array of string slices.
let s = Series::new("employees", &["Mahmoud", "Ferris"]);
println!("{s:?}");

In [None]:
// Display the name of the Series currently bound to `s`.
s.name()

In [None]:
// The values may also come from a `Vec` rather than an array literal.
let s = Series::new("employees", &vec!["Mahmoud", "Ferris"]);
println!("{s:?}");

In [None]:
// Start from a typed chunked array and convert it into a Series.
let s: Series = Float64Chunked::new("b", &[1.0, 2.0, 3.0]).into_series();
println!("{s:?}");

## Slice

In [None]:
let s = Series::new(
    "Measurements",
    &[-1.01, 0.86, -4.60, 3.98, 0.53, -7.04, 3.98, 0.53, -7.04],
);
// Slice starting at offset 0 with length 3.
let sub_s = s.slice(0, 3);
println!("{s:?}, {sub_s:?}");

## Datetime

In [None]:
// Midnight UTC on 2020-01-01; only the calendar date is stored in the Series.
let date = Utc.with_ymd_and_hms(2020, 1, 1, 0, 0, 0).unwrap();
let s: Series = Series::new("b", &[date.date_naive()]);
println!("{s:?}");

In [None]:
// `s1` has to be mutable because `append` extends it in place; `s2` is only read.
let mut s1 = Series::new("Age", &[23., 27.]);
let s2 = Series::new("Height", &[1.84, 1.78]);
// `append` hands back a `Result`, which is printed as-is.
println!("{:?}", s1.append(&s2));

In [None]:
// The Series is never modified (`cast` returns a new one), so the binding needs no `mut`.
let s = Series::new("Measurements", &[-1.01,  0.86, -4.60, 3.98,  0.53, -7.04, 3.98,  0.53, -7.04]);
println!("{:?}", s.cast(&DataType::Int32)?);

### Missing values in Series

In [None]:
// Series with gaps, used to compare the available null-filling strategies.
let s = Series::new("some_missing", &[Some(1), None, Some(3), Some(4), None, Some(6)]);

// Print each strategy's label followed by the Series it produces.
for (label, strategy) in [
    ("Forward", FillNullStrategy::Forward(None)),
    ("Backward", FillNullStrategy::Backward(None)),
    ("Mean", FillNullStrategy::Mean),
    ("Min", FillNullStrategy::Min),
    ("Max", FillNullStrategy::Max),
] {
    println!("{}: {:?}", label, s.fill_null(strategy)?);
}

// Replace every null with a fixed value through the typed i32 view of the Series.
let filled = s.i32()?.fill_null_with_values(42)?.into_series();
println!("{:?}", filled);

In [None]:
let s = Series::new(
    "Measurements",
    &[-1.01, 0.86, -4.60, 3.98, 0.53, -7.04, 3.98, 0.53, -7.04],
);
// Draw 4 samples with replacement and shuffling (terms from probability); 9999 is the seed.
let sampled = s.sample_n(4, true, true, Some(9999));
println!("{:?}", sampled);

## Estadística descriptiva

In [None]:
// Arithmetic mean of the measurements.
let s = Series::new(
    "Measurements",
    &[-1.01, 0.86, -4.60, 3.98, 0.53, -7.04, 3.98, 0.53, -7.04],
);
println!("{:?}", s.mean().unwrap());

In [None]:
// `mean` leaves null entries out of the computation.
let s = Series::new(
    "Measures",
    &vec![Some(3), Some(4), None, Some(8), Some(6)],
);
println!("{:?}", s.mean().unwrap());

In [None]:
// Median of the measurements.
let s = Series::new(
    "Measurements",
    &[-1.01, 0.86, -4.60, 3.98, 0.53, -7.04, 3.98, 0.53, -7.04],
);
println!("{:?}", s.median().unwrap());

In [None]:
// Mode of the measurements; several values appear more than once in the input.
let s = Series::new(
    "Measurements",
    &[-1.01, 0.86, -4.60, 3.98, 0.53, -7.04, 3.98, 0.53, -7.04],
);
println!("{:?}", s.mode().unwrap());

## Measures of Spread

In [None]:
// 0.75 quantile of the measurements, using nearest-value interpolation.
let s = Series::new(
    "Measurements",
    &[-1.01, 0.86, -4.60, 3.98, 0.53, -7.04, 3.98, 0.53, -7.04],
);
println!(
    "{:?}",
    s.quantile_as_series(0.75, QuantileInterpolOptions::Nearest).unwrap()
);


# Parte 2
[Parte 2](https://towardsdatascience.com/rust-polars-unlocking-high-performance-data-analysis-part-2-7c58a3cb7a1f)

In [None]:
// `DataFrame::default()` gives a DataFrame to start from without supplying any columns.
let df: DataFrame = DataFrame::default();
println!("{df:?}");

In [None]:
// One Series per column.
let s1 = Series::new("Name", &["Mahmoud", "Arthur"]);
let s2 = Series::new("Age", &[23, 27]);
let s3 = Series::new("Height", &[1.84, 1.78]);

// Assemble the columns into a DataFrame; `.unwrap()` would also work here in place of `?`.
let df: DataFrame = DataFrame::new(vec![s1, s2, s3])?;
println!("{df:?}");

In [None]:
// The `df!` macro yields a `PolarsResult`, so the frame is unwrapped with `?` when printing.
let df: PolarsResult<DataFrame> = df!(
    "Name" => &["Mahmoud", "Ali"],
    "Age" => &[23, 27],
    "Height" => &[1.84, 1.78]
);
println!("{:?}", df?);

In [None]:
// Small frame mixing string and integer columns.
let df1: DataFrame = df!(
    "categorical" => &["d", "e", "f"],
    "numeric" => &[1, 2, 3],
    "object" => &["a", "b", "c"]
)
.unwrap();
println!("{df1}");

// `describe(None)` builds a second DataFrame from `df1`, printed below.
let df2: DataFrame = df1.describe(None).unwrap();
println!("{df2}");

In [None]:
let df: DataFrame = df!(
    "Name" => &["Mahmoud", "Bob"],
    "Age" => &[23, 27],
    "Height" => &[1.84, 1.78]
)
.unwrap();
// `head(None)` falls back to the library's default number of rows.
println!("{}", df.head(None));

In [None]:
{
    // Sample DataFrame that only exists inside this block, so the borrows below stay local.
    let people = df!(
        "Name" => &["Mahmoud", "Ali", "ThePrimeagen"],
        "Age" => &[22, 25, 29],
        "Gender" => &["M", "M", "M"],
        "Salary" => &[50000, 60000, 250000]
    )
    .unwrap();

    // The same column, borrowed once by name and once by position.
    let by_name = &people["Name"];
    let by_position = &people[0];
    println!("1 {:?}", by_name);
    println!("2 {:?}", by_position);

    // Indexing with a range is supported as well.
    let first_two = &people[..2];
    println!("3 {:?}", first_two);
}

In [None]:
// Keep only the "Name" and "Age" columns of the current `df`.
let name_age_cols: DataFrame = df.select(["Name", "Age"]).unwrap();
println!("{name_age_cols:?}");

In [None]:
{
    // `column` returns a `Result`; it is printed without unwrapping.
    let name_lookup = df.column("Name");
    println!("{:?}", &name_lookup);
}

In [None]:
// Sample DataFrame used for row filtering.
let df = df!(
    "Name" => &["Mahmoud", "Ali", "ThePrimeagen"],
    "Age" => &[22, 25, 36],
    "Gender" => &["M", "M", "M"],
    "Salary" => &[50000, 60000, 250000]
)
.unwrap();

// Boolean mask selecting the rows whose Age is greater than 25.
let mask = df
    .column("Age")
    .expect("Age must exist!")
    .gt(25)?;
let filtered_data = df.filter(&mask)?;

println!("{filtered_data:?}");

// Output:

// shape: (1, 4)
// ┌──────────────┬─────┬────────┬────────┐
// │ Name         ┆ Age ┆ Gender ┆ Salary │
// │ ---          ┆ --- ┆ ---    ┆ ---    │
// │ str          ┆ i32 ┆ str    ┆ i32    │
// ╞══════════════╪═════╪════════╪════════╡
// │ ThePrimeagen ┆ 36  ┆ M      ┆ 250000 │
// └──────────────┴─────┴────────┴────────┘

In [None]:
// Create a sample DataFrame
let df = df!("Name" => &["Mahmoud", "Ali", "ThePrimeagen"],
             "Age" => &[22, 25, 36],
             "Gender" => &["M", "M", "M"],
             "Salary" => &[50000, 60000, 250000]).unwrap();

// First the original frame, then its transpose (rows and columns swapped).
println!("1 {:?}", df);
println!("2 {:?}", df.transpose(None, None)?);

// NOTE(review): the listing below is NOT the full output of this cell. It shows a single
// Series named `column_0` whose values are the first row of `df` ("Mahmoud", 22, "M", 50000)
// rendered as strings, whereas the cell prints the original DataFrame followed by a
// transposed DataFrame. Re-run the cell to capture the complete output.

// shape: (4,)
// Series: 'column_0' [str]
// [
//     "Mahmoud"
//     "22"
//     "M"
//     "50000"
// ]

## Cleaning Data

In [None]:
// "Name" holds two missing values; the other columns are complete.
let df = df!(
    "Name" => &[Some("Mahmoud"), None, None],
    "Age" => &[22, 25, 36],
    "Gender" => &["M", "M", "M"],
    "Salary" => &[50000, 60000, 250000]
)
.unwrap();

// Null counts for the whole frame, then for the "Name" column on its own.
println!("{:?}", df.null_count());
println!("{:?}", df.select(["Name"])?.null_count());

In [None]:
// The first two rows are identical, so they count as duplicates of each other.
let df = df!(
    "Name" => &["Mahmoud", "Mahmoud", "ThePrimeagen"],
    "Age" => &[22, 22, 36],
    "Gender" => &["M", "M", "M"],
    "Salary" => &[50000, 50000, 250000]
)
.unwrap();

// Step by step: build the duplicate mask, then keep the rows it marks.
let mask = df.is_duplicated().unwrap();
let filtered_data = df.filter(&mask).unwrap();
println!("{filtered_data:?}");

// The same filter written as a single expression with `?`.
println!("{:?}", df.filter(&df.is_duplicated()?)?);

In [None]:
let df = df!(
    "Name" => &["Mahmoud", "Mahmoud", "ThePrimeagen"],
    "Age" => &[22, 22, 36],
    "Gender" => &["M", "M", "M"],
    "Salary" => &[50000, 50000, 250000]
)
.unwrap();

// Keep only the rows flagged by `is_unique`; left unterminated so the notebook shows the result.
df.filter(&df.is_unique()?)?

In [None]:
// Two-column frame reused by the column-dropping cell that follows.
let df: DataFrame = df!(
    "Fruit" => &["Apple", "Apple", "Pear"],
    "Color" => &["Red", "Yellow", "Green"]
)?;

In [None]:
// `drop` produces a new frame without "Color"; `df` is printed afterwards for comparison.
let df_remain: DataFrame = df.drop("Color")?;
println!("{df_remain:?}");
println!("{df:?}");


In [None]:
let mut df: DataFrame = df!("Fruit" => &["Apple", "Apple", "Pear"],
                            "Color" => &["Red", "Yellow", "Green"]).unwrap();
// Remove the "Color" column (a column, not a row) from `df` itself. The call returns a
// `Result` holding the removed column; propagate a failure with `?` instead of ignoring it,
// and discard the returned column because only the modified frame is of interest here.
let _ = df.drop_in_place("Color")?;
println!("{:?}", df);

In [None]:
// Drop several columns by name in one call.
// NOTE(review): the empty name "" matches no column defined in this notebook — presumably a
// placeholder; confirm.
let df_dropped_col = df.drop_many(&["Color", ""]);
println!("{df_dropped_col:?}");

In [None]:
// "Color" is missing in two of the three rows.
let df: DataFrame = df!(
    "Fruit" => &["Apple", "Apple", "Pear"],
    "Color" => &[Some("Red"), None, None]
)
.unwrap();

// `None` as the subset argument: no particular columns are singled out.
let df_clean = df.drop_nulls::<String>(None).unwrap();
println!("{df_clean:?}");

In [None]:
// The last row has no Salary.
let df = df!(
    "Name" => &["Mahmoud", "Ali", "ThePrimeagen"],
    "Age" => &[22, 25, 36],
    "Gender" => &["M", "M", "M"],
    "Salary" => &[Some(50000), Some(60000), None]
)
.unwrap();

// Boolean mask marking the rows where Salary holds a value.
let mask = df
    .column("Salary")
    .expect("Salary must exist!")
    .is_not_null();
println!("{:?}", mask.head(None));

// Filter with the stored mask ...
let filtered_data = df.filter(&mask)?;
println!("{filtered_data:?}");

// ... or build the mask inline in a single expression.
println!("{:?}", df.filter(&df.column("Salary")?.is_not_null())?);



In [None]:
// `df` is only read in this cell (`fill_null` returns a new frame), so it needs no `mut`.
let df = df!("Name" => &["Mahmoud", "Ali", "ThePrimeagen"],
             "Age" => &[22, 25, 36],
             "Gender" => &["M", "M", "M"],
             "Salary" => &[Some(50000), Some(60000), None]).unwrap();

// Forward-fill the missing Salary into a separate frame.
let filtered_nulls = df.fill_null(FillNullStrategy::Forward(None)).unwrap();

println!("{:?}", filtered_nulls);
// The aggregations below run on the original `df`, not on the filled copy.
println!("{:?}", df.mean());
println!("{:?}", df.median());
// The argument 1 is passed as-is to `std` and `var`.
// NOTE(review): presumably the delta degrees of freedom — confirm.
println!("{:?}", df.std(1));
println!("{:?}", df.var(1));


## Ndarray

In [None]:
let df = df!(
    "Name" => &["Mahmoud", "Ali", "ThePrimeagen"],
    "Age" => &[22, 25, 36],
    "Gender" => &["M", "M", "M"],
    "Salary" => &[Some(50000), Some(60000), None]
)
.unwrap();

// Convert the frame to an f64 ndarray once per index order and print each result.
for order in [IndexOrder::Fortran, IndexOrder::C] {
    println!("{:?}", df.to_ndarray::<Float64Type>(order).unwrap());
}




In [None]:
// One unsigned-integer column and one float column.
let a: Series = UInt32Chunked::new("a", &[1, 2, 3]).into_series();
let b: Series = Float64Chunked::new("b", &[10.0, 8.0, 6.0]).into_series();

let df = DataFrame::new(vec![a, b]).unwrap();
// Both columns end up in a single f64 array.
let ndarray = df
    .to_ndarray::<Float64Type>(IndexOrder::Fortran)
    .unwrap();
println!("{ndarray:?}");

In [None]:


    // Same two columns as in the previous cell.
    let a: Series = UInt32Chunked::new("a", &[1, 2, 3]).into_series();
    let b: Series = Float64Chunked::new("b", &[10.0, 8.0, 6.0]).into_series();

    let df = DataFrame::new(vec![a, b]).unwrap();
    // The element type of the ndarray is chosen through the type parameter (i32 here).
    let ndarray = df
        .to_ndarray::<Int32Type>(IndexOrder::Fortran)
        .unwrap();
    println!("{ndarray:?}");



## Aggregation Functions

Download [the flights dataset](https://www.kaggle.com/datasets/deepak007chaubey/flight-on-time-dataset/) and move it to the `../dataset` directory (the code below reads `../dataset/Flight_on_time_HIX.csv`).

In [None]:
/// Reads the CSV file at `csv_file_path` into a `DataFrame`, treating the first row as the
/// header.
///
/// # Panics
///
/// Panics if the file cannot be opened or if reading it into a `DataFrame` fails. The panic
/// message names the offending path and includes the underlying error, so a missing dataset
/// is easy to diagnose from the notebook output.
fn read_data_frame_from_csv(
    csv_file_path: &Path,
) -> DataFrame {
    CsvReader::from_path(csv_file_path)
        .unwrap_or_else(|err| panic!("Cannot open file {}: {err}", csv_file_path.display()))
        .has_header(true)
        .finish()
        .unwrap_or_else(|err| {
            panic!("Cannot read CSV file {}: {err}", csv_file_path.display())
        })
}


// Location of the downloaded flights dataset, relative to the notebook.
let flights_file_path: &Path = Path::new("../dataset/Flight_on_time_HIX.csv");

// Only these columns are kept from the CSV.
let columns = [
    "Airline",
    "Origin_Airport",
    "Destination_Airport",
    "Departure_Delay_Minutes",
    "Arrival_Delay_Minutes",
];
let flights_df: DataFrame = read_data_frame_from_csv(flights_file_path)
    .select(columns)
    .unwrap();

// Preview the first five rows.
flights_df.head(Some(5))

In [None]:
// Display the shape tuple of the flights frame.
flights_df.shape()

In [None]:
// Lift Polars' table-formatting limits so printed frames are not truncated.
// NOTE(review): -1 is understood to mean "no limit" for these variables — confirm against
// the Polars configuration docs.
// `set_var` accepts string slices directly, so no `String` has to be allocated.
std::env::set_var("POLARS_FMT_MAX_ROWS", "-1");
std::env::set_var("POLARS_FMT_MAX_COLS", "-1");

In [None]:
// Mean arrival delay per airline, sorted by the "Delay" column in descending order.
let arr_delay_mean_df: DataFrame = flights_df
    .clone()
    .lazy()
    .group_by(["Airline"])
    .agg([col("Arrival_Delay_Minutes").mean().alias("Delay")])
    .sort(
        "Delay",
        SortOptions {
            descending: true,
            nulls_last: false,
            maintain_order: false,
            multithreaded: false,
        },
    )
    .collect()?;

// Preview the first five rows of the sorted result.
arr_delay_mean_df.head(Some(5))

In [None]:
// Mean arrival and departure delay per airline, each rounded to 3 decimals and sorted on
// both delay columns.
let arr_delay_mean_df: DataFrame = flights_df
    .clone()
    .lazy()
    .group_by(["Airline"])
    .agg([
        col("Arrival_Delay_Minutes").mean().round(3).alias("Arrival_Delay"),
        col("Departure_Delay_Minutes").mean().round(3).alias("Departure_Delay"),
    ])
    .sort_by_exprs(
        vec![col("Arrival_Delay"), col("Departure_Delay")],
        vec![true, true],
        false,
        false,
    )
    .collect()
    .unwrap();

// Print the row count, then every row (the row count is passed to `head`).
println!("{:?}", arr_delay_mean_df.shape().0);
println!("{}", arr_delay_mean_df.head(Some(arr_delay_mean_df.shape().0)));

In [None]:
// Mean departure delay for every (airline, origin airport) pair, ordered by airline.
let dep_delay_mean_def: DataFrame = flights_df
    .clone()
    .lazy()
    .group_by(["Airline", "Origin_Airport"])
    .agg([col("Departure_Delay_Minutes").mean()])
    .sort_by_exprs(
        vec![col("Airline")],
        vec![false],
        false,
        false,
    )
    .collect()
    .unwrap();

// Preview the first five rows.
dep_delay_mean_def.head(Some(5))


## Merging DataFrames

In [None]:
// Two small frames that share carrier codes under different column names.
let df1: DataFrame = df!(
    "Carrier" => &["HA", "EV", "VX", "DL"],
    "ArrDelay" => &[-3, 28, 0, 1]
)
.unwrap();
let df2: DataFrame = df!(
    "Airline" => &["HA", "EV", "OO", "VX"],
    "DepDelay" => &[21, -8, 11, -4]
)
.unwrap();

// Inner join on Carrier == Airline using the generic lazy `join` with explicit `JoinArgs`.
// or: let df3: DataFrame = df1.inner_join(&df2, ["Carrier"], ["Airline"]).unwrap();
let df3: DataFrame = df1
    .clone()
    .lazy()
    .join(
        df2.clone().lazy(),
        [col("Carrier")],
        [col("Airline")],
        JoinArgs::new(JoinType::Inner),
    )
    .collect()
    .unwrap();

df3.head(Some(5))

In [None]:
// Ask the evcxr kernel to list the variables currently defined in the session.
:vars

In [None]:
// Same input frames as in the previous join example.
let df1: DataFrame = df!(
    "Carrier" => &["HA", "EV", "VX", "DL"],
    "ArrDelay" => &[-3, 28, 0, 1]
)
.unwrap();
let df2: DataFrame = df!(
    "Airline" => &["HA", "EV", "OO", "VX"],
    "DepDelay" => &[21, -8, 11, -4]
)
.unwrap();

// This time the join goes through the lazy `inner_join` shorthand; errors propagate via `?`.
// or: let df3: DataFrame = df1.inner_join(&df2, ["Carrier"], ["Airline"]).unwrap();
let df3: DataFrame = df1
    .clone()
    .lazy()
    .inner_join(df2.clone().lazy(), col("Carrier"), col("Airline"))
    .collect()?;

df3.head(Some(5))