## EDA: Exploratory Data Analysis

## Training & Test Data

In [None]:
import pandas as pd
import polars as pl
from pathlib import Path

# Project root: two directory levels above the current working directory
working_dir = Path().resolve()
BASE_DIR = working_dir.parent.parent
print(BASE_DIR)

In [None]:
# Read the processed test/train splits stored under <BASE_DIR>/data/processed
processed_dir = BASE_DIR / "data" / "processed"
df_test = pl.read_parquet(processed_dir / "test.parquet")
df_train = pl.read_parquet(processed_dir / "train.parquet")

In [None]:
# Preview the first rows of the test set (displayed as the cell output)
df_test.head()

In [None]:
# Preview the first rows of the training set (displayed as the cell output)
df_train.head()

In [None]:
# Show the (rows, columns) shape of both splits
test_shape = df_test.shape
train_shape = df_train.shape
print("Size of test set: ", test_shape, "\nSize of train set: ", train_shape)


In [None]:
# Report the estimated in-memory size of each frame, converted from bytes
# to megabytes (1 MB = 1024**2 bytes here)
bytes_per_mb = 1024**2
test_size_mb = df_test.estimated_size() / bytes_per_mb
train_size_mb = df_train.estimated_size() / bytes_per_mb
print(
    "Memory usage of test set: ", test_size_mb, "MB",
    "\nMemory usage of train set: ", train_size_mb, "MB",
)

## Training Results

In [None]:
import pandas as pd

# Load the table of training runs and preview its first rows
training_results_csv = BASE_DIR / "data/training_results_har.csv"
training_runs_df = pd.read_csv(training_results_csv)

training_runs_df.head()

In [None]:
# Inspect the data type pandas assigned to each column of the training-results table
training_runs_df.dtypes

In [None]:
# Count the distinct model types and sensor configurations across all runs.
# dropna=False keeps a missing value as its own category, so each count
# equals the length of the column's unique() array.
model_num = training_runs_df['model'].nunique(dropna=False)
print("Number of unique models trained: ", model_num)

sensor_comb_num = training_runs_df['sensor_config'].nunique(dropna=False)
print("Number of unique sensor combinations used: ", sensor_comb_num)

In [None]:
# For each model type, report how many rows (training runs) carry that type
unique_models = training_runs_df["model"].unique()
model_column = training_runs_df["model"]
for model in unique_models:
    # Summing the boolean mask counts the rows whose type matches
    run_count = int((model_column == model).sum())
    print(f"For model type --> {model}, {run_count} unique models were trained")


In [None]:
# For every model type, pick the run with the smallest generalization gap
# and report that run's name
for model in unique_models:
    runs_for_model = training_runs_df.loc[training_runs_df["model"] == model]
    gap_values = runs_for_model["generalization_gap"].values
    # Positional index (within this model's runs) of the minimum gap
    best_model_index = gap_values.argmin()
    best_model_name = runs_for_model["Name"].iloc[best_model_index]

    print(f"For model type {model} ---> Best model name: {best_model_name}")