# Polars Evaluation

In [None]:
import os
import polars as pl

In [None]:
# Report which Polars release this evaluation runs against.
polars_version = pl.__version__
print("Polars Version:", polars_version)

# Data Operations

## Loading

In [None]:
# The data folder is a sibling of the directory the notebook is run from:
# step up one level from the current working directory, then into "data".
main_dir = os.path.dirname(os.getcwd())
data_dir = os.path.join(main_dir, "data")
# Folder holding the internet-session CSV files.
internet_session_data_path = os.path.join(data_dir, "internet_sessions")

In [None]:
%%time
dfs = [pl.read_csv(os.path.join(internet_session_data_path, f)) for f in os.listdir(internet_session_data_path)]
polars_df = pl.concat(dfs)
polars_df.head(5)

## Column Selection

In [None]:
%%time
polars_df[["username", "private_ip", "terminatecause", "statustype"]].head(5)

## Row Selection

In [None]:
%%time
polars_df.row(51235)

## Filtering

In [None]:
%%time
polars_df.filter((polars_df['port_begin'] > 27224) & (polars_df['port_end'] < 60363)).head(5)

## Grouping & Aggregating

In [None]:
%%time
polars_df.group_by('username').agg(pl.col('download').sum(), pl.col('upload').sum()).head(5)

## Joining

In [None]:
# Lookup table read from data/ids/username_id.csv; it is joined on "username" in the next cell.
ids_csv_path = os.path.join(data_dir, "ids", "username_id.csv")
polars_ids_df = pl.read_csv(ids_csv_path)

In [None]:
%%time
polars_df.join(polars_ids_df, on='username', how="inner").head(5)

## Pivoting

In [None]:
%%time
polars_df.fill_nan("Empty").pivot(index='statustype', columns='terminatecause', values="username", aggregate_function='count').fill_nan("").head(100)

## Sorting

In [None]:
%%time
polars_df.sort('username').head(5)

## Applying External Functions

In [None]:
def bytes_to_gb(value):
    """Convert a byte count to gigabytes, using 1 GB = 1024**3 bytes.

    Args:
        value: Number of bytes.

    Returns:
        ``value`` divided by 1024**3 (true division).
    """
    bytes_per_gb = 1 << 30  # identical to 1024 ** 3
    return value / bytes_per_gb

In [None]:
%%time
polars_df.with_columns(pl.col("download").map_elements(bytes_to_gb)).head(5)

## String Operation

In [None]:
%%time
polars_df['username'].str.contains('10|20').head(5)

## Datetime Operation

In [None]:
%%time
polars_df.with_columns(pl.col("start_time").cast(pl.String).str.strptime(pl.Datetime, format="%Y%m%d%H%M%S").alias("start_time")).head(5)

## Writing

In [None]:
%%time
polars_df.write_csv(os.path.join(data_dir, "output", "polars_data.csv"), include_bom=False)

# Done