In [146]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path

from get_to_know_data import load_data

In [147]:
# Directory that holds the per-recording CSV exports.
raw_outputs = Path("../data_sync/outputs")
# glob() yields entries in an arbitrary, filesystem-dependent order. Sorting
# fixes the file order so that everything derived from it (merged row order,
# and therefore the seeded row-wise 80/20 split) is reproducible across machines.
files = sorted(raw_outputs.glob("*.csv"))
print(len(files))

24


In [148]:
# Columns that are kept in the output files but are never normalized.
info_columns = [
    'tt_s',
    'Lap',
    'Gear',
    'ns1:LatitudeDegrees',
    'ns1:LongitudeDegrees',
    'Sport',
    'Variant',
]

# Feature columns, "pole data" group.
feature_columns_pole_data = [
    'speed_kmph',
    'power_w',
    'frequency_ppm',
    'thrust_left_ms',
    'thrust_right_ms',
    'impulse_left_ns',
    'impulse_right_ns',
    'force_meanl_n',
    'force_meanr_n',
    'f_tot_mean_n',
]

# Feature columns, "gnss" group.
feature_columns_gnss = [
    'ns1:AltitudeMeters',
    'ns2:Speed',
    'ns2:RunCadence',
    'ns2:Watts',
]

In [149]:
# Every column that gets z-score normalized, as one flat list.
feature_columns = [
    # pole-data features
    'speed_kmph',
    'power_w',
    'frequency_ppm',
    'thrust_left_ms',
    'thrust_right_ms',
    'impulse_left_ns',
    'impulse_right_ns',
    'force_meanl_n',
    'force_meanr_n',
    'f_tot_mean_n',
    # GNSS features
    'ns2:Speed',
    'ns2:RunCadence',
    'ns2:Watts',
    'ns1:AltitudeMeters',
]

In [150]:
# Columns written to every output file: info columns first, then both feature groups.
selected_columns = [*info_columns, *feature_columns_pole_data, *feature_columns_gnss]

In [151]:
def extract_mean_and_std(train_val_files):
    """
    Compute per-column mean and standard deviation over a set of files.

    The statistics cover every column in ``feature_columns_pole_data`` and
    ``feature_columns_gnss``. For each column, the non-NaN values of all files
    that contain the column are pooled before the statistics are taken.

    Parameters
    ----------
    train_val_files : iterable
        Files to read with ``load_data``.

    Returns
    -------
    means : dict
        Column name -> ``np.mean`` of the pooled values.
    stds : dict
        Column name -> ``np.std`` of the pooled values (NumPy's default
        ``ddof=0``, i.e. the population standard deviation).

    Notes
    -----
    A column with no values in any file yields NaN for both statistics
    (NumPy emits a RuntimeWarning in that case).
    """
    # One bucket of value arrays per column; dict keys also de-duplicate columns.
    chunks_by_column = {
        col: [] for col in feature_columns_pole_data + feature_columns_gnss
    }

    # Read every file exactly once and distribute its columns into the
    # buckets, instead of re-loading all files for each feature column.
    for file in train_val_files:
        data = load_data(file)
        for col, chunks in chunks_by_column.items():
            if col in data.columns:
                chunks.append(data[col].dropna().to_numpy())

    means = {}
    stds = {}
    for col, chunks in chunks_by_column.items():
        # Files are appended in input order, so the pooled values match a
        # column-by-column pass over the same files.
        pooled = np.concatenate(chunks) if chunks else np.array([])
        means[col] = np.mean(pooled)
        stds[col] = np.std(pooled)
    return means, stds

In [152]:
def normalize_files(
    files, mean, std,
    selected_columns,
    feature_columns,
    output_dir
):
    """
    Normalize selected columns in multiple CSV files, return DataFrames,
    and save normalized versions to another directory.

    Parameters
    ----------
    files : iterable
        Files to read with ``load_data``.
    mean, std : mapping
        Column name -> statistic, indexed as ``mean[col]`` / ``std[col]``.
    selected_columns : list
        Columns to keep in the output. A file missing any of them raises
        ``KeyError``.
    feature_columns : iterable
        Columns to z-score as ``(x - mean) / std``. Columns not present in the
        reduced frame are skipped.
    output_dir : str or Path
        Directory for the normalized CSVs. It is created if missing, and each
        output keeps the base name of its input file.

    Returns
    -------
    dict
        Maps each entry of ``files`` to its normalized DataFrame.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    normalized_dfs = {}  # store results and return them

    for file in files:
        df = load_data(file)

        # Keep only selected columns; copy so the assignments below never
        # write into a view of the loaded frame.
        df = df[selected_columns].copy()

        # Normalize
        for col in feature_columns:
            if col in df.columns:
                scale = std[col]
                # A constant column has std == 0; dividing by it would turn
                # the column into NaN/inf. Center only in that case.
                if scale == 0:
                    scale = 1.0
                df[col] = (df[col] - mean[col]) / scale

        # Save to new location
        out_path = output_dir / Path(file).name
        df.to_csv(out_path, index=False)

        normalized_dfs[file] = df  # store for return

    return normalized_dfs

In [153]:
def find_files_containing_variant(files, variant='NR'):
    """
    Return the files in which ``variant`` occurs at least once.

    Each file is read with ``load_data``. A file is kept when it has a
    'Variant' column and at least one row of that column equals ``variant``.
    Input order is preserved.
    """
    def has_variant(path):
        table = load_data(path)
        if 'Variant' not in table.columns:
            return False
        return (table['Variant'] == variant).any()

    return [path for path in files if has_variant(path)]

def extract_test_files_on_skier_ids(files, test_skier_ids):
    """
    Split ``files`` into train/val and test lists by skier id in the file name.

    A file goes to the test list when its name contains any entry of
    ``test_skier_ids``; every other file goes to the train/val list.

    Ids are matched on digit boundaries: where an id starts or ends with a
    digit, that digit may not be adjacent to another digit in the file name.
    So '-2' matches 'BIA24-2_NR...' but not 'BIA24-20_NR...' or
    'BIA24-22_NR...', which a plain substring test would also capture.

    Parameters
    ----------
    files : iterable of str or Path
        Candidate files; only the base name is inspected.
    test_skier_ids : iterable
        Id fragments (e.g. '-5', '-22'). Non-string entries are converted
        with ``str``.

    Returns
    -------
    (train_val_files, test_files) : tuple of list
        Both lists preserve the order of ``files``.
    """
    import re  # local import: the notebook's import cell does not provide it

    patterns = []
    for skier_id in test_skier_ids:
        token = str(skier_id)
        regex = re.escape(token)
        if token[:1].isdigit():
            regex = r"(?<!\d)" + regex
        if token[-1:].isdigit():
            regex = regex + r"(?!\d)"
        patterns.append(re.compile(regex))

    test_files = []
    train_val_files = []
    for file in files:
        name = Path(file).name
        if any(pattern.search(name) for pattern in patterns):
            test_files.append(file)
        else:
            train_val_files.append(file)
    return train_val_files, test_files

## Dataset split on skiers

NR

In [154]:
# Output folder name under ../datasets for the skier-wise split of the NR files.
dataset_name = "NR_split_on_skiers"

In [155]:
# Keep only the files whose 'Variant' column contains 'NR' at least once.
NR_files = find_files_containing_variant(files, variant='NR')
print(len(NR_files), "files with NR variant found.")
print([file.name for file in NR_files])

12 files with NR variant found.
['BIA24-8_NR_merged_with_gear.csv', 'BIA24-7_NR_merged_with_gear.csv', 'BIA24-18_NR_merged_with_gear.csv', 'BIA24-3_NR_merged_with_gear.csv', 'BIA24-22_NR_merged_with_gear.csv', 'BIA24-4_NR_merged_with_gear.csv', 'BIA24-16_NR_merged_with_gear.csv', 'BIA24-19_NR_merged_with_gear.csv', 'BIA24-9_NR_merged_with_gear.csv', 'BIA24-5_NR_merged_with_gear.csv', 'BIA24-15_NR_merged_with_gear.csv', 'BIA24-20_NR_merged_with_gear.csv']


In [156]:
# Hold out every NR file whose name contains '-5' or '-22' as the test set;
# all remaining NR files form the train/val set.
train_val_files, test_files = extract_test_files_on_skier_ids(NR_files, test_skier_ids=['-5', '-22'])
print(f"Train/Val files: {len(train_val_files)}, Test files: {len(test_files)}")

Train/Val files: 10, Test files: 2


In [157]:
# Normalization statistics are computed from the train/val files only, so the
# held-out test files do not influence them.
mean, std = extract_mean_and_std(train_val_files)

In [158]:
# Scale both splits with the train/val statistics and write them to
# ../datasets/<dataset_name>/train_val and ../datasets/<dataset_name>/test.
normalize_files(train_val_files, mean, std, selected_columns, feature_columns, Path("../datasets") / dataset_name / "train_val")
normalize_files(test_files, mean, std, selected_columns, feature_columns, Path("../datasets") / dataset_name / "test")

print("Normalization complete.")

Normalization complete.


WR

In [159]:
# Output folder name under ../datasets for the skier-wise split of the WR files.
dataset_name = "WR_split_on_skiers"

In [160]:
# Keep only the files whose 'Variant' column contains 'WR' at least once.
WR_files = find_files_containing_variant(files, variant='WR')
print(len(WR_files), "files with WR variant found.")
print([file.name for file in WR_files])

12 files with WR variant found.
['BIA24-17_WR_merged_with_gear.csv', 'BIA24-18_WR_merged_with_gear.csv', 'BIA24-7_WR_merged_with_gear.csv', 'BIA24-8_WR_merged_with_gear.csv', 'BIA24-22_WR_merged_with_gear.csv', 'BIA24-3_WR_merged_with_gear.csv', 'BIA24-9_WR_merged_with_gear.csv', 'BIA24-19_WR_merged_with_gear.csv', 'BIA24-16_WR_merged_with_gear.csv', 'BIA24-15_WR_merged_with_gear.csv', 'BIA24-5_WR_merged_with_gear.csv', 'BIA24-20_WR_merged_with_gear.csv']


In [161]:
# Hold out every WR file whose name contains '-5' or '-22' as the test set;
# all remaining WR files form the train/val set. This rebinds the
# train_val_files / test_files names used by the NR cells above.
train_val_files, test_files = extract_test_files_on_skier_ids(WR_files, test_skier_ids=['-5', '-22'])
print(f"Train/Val files: {len(train_val_files)}, Test files: {len(test_files)}")

Train/Val files: 10, Test files: 2


In [162]:
# Recompute the normalization statistics from the WR train/val files only.
mean, std = extract_mean_and_std(train_val_files)

In [163]:
# Scale both splits with the train/val statistics and write them to
# ../datasets/<dataset_name>/train_val and ../datasets/<dataset_name>/test.
normalize_files(train_val_files, mean, std, selected_columns, feature_columns, Path("../datasets") / dataset_name / "train_val")
normalize_files(test_files, mean, std, selected_columns, feature_columns, Path("../datasets") / dataset_name / "test")

print("Normalization complete.")

Normalization complete.


All

In [164]:
# Output folder name under ../datasets for the skier-wise split of all files (NR and WR together).
dataset_name = "NR_and_WR_split_on_skiers"

In [165]:
# No variant filter here: the split runs over every discovered file, holding
# out those whose name contains '-5' or '-22'.
train_val_files, test_files = extract_test_files_on_skier_ids(files, test_skier_ids=['-5', '-22'])
print(f"Train/Val files: {len(train_val_files)}, Test files: {len(test_files)}")

Train/Val files: 20, Test files: 4


In [166]:
# Recompute the normalization statistics from the combined train/val files only.
mean, std = extract_mean_and_std(train_val_files)

In [167]:
# Scale both splits with the train/val statistics and write them to
# ../datasets/<dataset_name>/train_val and ../datasets/<dataset_name>/test.
normalize_files(train_val_files, mean, std, selected_columns, feature_columns, Path("../datasets") / dataset_name / "train_val")
normalize_files(test_files, mean, std, selected_columns, feature_columns, Path("../datasets") / dataset_name / "test")

print("Normalization complete.")

Normalization complete.


# Split 80-20 randomly

In [168]:
def load_and_merge(files, selected_columns):
    """
    Load many CSV files and merge them row-wise into a single DataFrame.

    Parameters
    ----------
    files : iterable of str or Path
        CSV files to read with ``pd.read_csv``.
    selected_columns : list
        Columns to keep from every file. A file missing any of them raises
        ``KeyError``.

    Returns
    -------
    pd.DataFrame
        All rows of all files in input order, with a fresh 0..n-1 index and an
        extra ``source_file`` column holding the base name of the originating
        file.

    Raises
    ------
    ValueError
        If ``files`` is empty.
    """
    frames = []
    for f in files:
        # Copy the column selection so the new column below is added to an
        # independent frame rather than to a slice of the loaded one
        # (avoids pandas' SettingWithCopyWarning).
        frame = pd.read_csv(f)[selected_columns].copy()
        frame["source_file"] = Path(f).name  # track origin; works for str and Path
        frames.append(frame)

    if not frames:
        # pd.concat would raise a ValueError here too; say what is missing.
        raise ValueError("load_and_merge: no input files were provided")

    return pd.concat(frames, ignore_index=True)

In [169]:
from sklearn.model_selection import train_test_split

def train_test_split_rows(df, train_ratio=0.8, random_state=42):
    """
    Shuffle the rows of ``df`` and split them into a train and a test part.

    ``train_ratio`` is forwarded to ``train_test_split`` as ``train_size`` and
    ``random_state`` seeds the shuffle. Returns ``(train_df, test_df)``.
    """
    split_options = {
        "train_size": train_ratio,
        "shuffle": True,
        "random_state": random_state,
    }
    train_part, test_part = train_test_split(df, **split_options)
    return train_part, test_part


In [170]:
def compute_mean_std(train_df, feature_columns):
    """
    Return the column-wise mean and standard deviation of the feature columns.

    Both results come from the pandas ``DataFrame.mean`` / ``DataFrame.std``
    methods applied to ``train_df[feature_columns]`` with default arguments.
    """
    features = train_df[feature_columns]
    return features.mean(), features.std()


In [171]:
def normalize_df(df, mean, std, feature_columns):
    """
    Return a copy of ``df`` with the feature columns z-score normalized.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    mean, std : pd.Series
        Per-column statistics, aligned to ``feature_columns`` by label.
    feature_columns : list
        Columns to transform as ``(x - mean) / std``. All other columns are
        carried over unchanged.

    Returns
    -------
    pd.DataFrame
        Normalized copy of ``df``.

    Notes
    -----
    A column with ``std == 0`` (constant in the data the statistics came from)
    is only centered: its divisor is replaced by 1 so the result is not NaN/inf.
    """
    # Only a Series is patched; any other kind of `std` is used as passed.
    scale = std.mask(std == 0, 1.0) if isinstance(std, pd.Series) else std

    normalized = df.copy()
    normalized[feature_columns] = (normalized[feature_columns] - mean) / scale
    return normalized


In [172]:
def save_dataset(df, path):
    """
    Write ``df`` to ``path`` as CSV without the index column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to save.
    path : str or Path
        Destination file. Missing parent directories are created.
    """
    path = Path(path)  # accept plain strings as well as Path objects
    path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(path, index=False)


NR

In [173]:
# Build the row-wise 80/20 dataset for the NR files and write it to ../datasets/NR_80_20.
# NOTE(review): the split shuffles individual rows of the merged frame, so rows
# from the same source file can end up in both train and test — confirm this is intended.
dataset_path = Path("../datasets/NR_80_20")

# 1. Collect all NR files
NR_files = find_files_containing_variant(files, variant="NR")

# 2. Merge all files into one big DataFrame
big_df = load_and_merge(NR_files, selected_columns)

# 3. Split 80/20 row-wise
train_df, test_df = train_test_split_rows(big_df, train_ratio=0.8)

# 4. Compute normalization parameters from training only
mean, std = compute_mean_std(train_df, feature_columns)

# 5. Normalize both datasets
train_df_norm = normalize_df(train_df, mean, std, feature_columns)
test_df_norm  = normalize_df(test_df, mean, std, feature_columns)

# 6. Save
save_dataset(train_df_norm, dataset_path / "train_val" /"train_val.csv")
save_dataset(test_df_norm,  dataset_path / "test" / "test.csv")

print("Dataset created and saved.")


Dataset created and saved.


WR

In [174]:
# Build the row-wise 80/20 dataset for the WR files and write it to ../datasets/WR_80_20.
# NOTE(review): the split shuffles individual rows of the merged frame, so rows
# from the same source file can end up in both train and test — confirm this is intended.
dataset_path = Path("../datasets/WR_80_20")

# 1. Collect all WR files
WR_files = find_files_containing_variant(files, variant="WR")

# 2. Merge all files into one big DataFrame
big_df = load_and_merge(WR_files, selected_columns)

# 3. Split 80/20 row-wise
train_df, test_df = train_test_split_rows(big_df, train_ratio=0.8)

# 4. Compute normalization parameters from training only
mean, std = compute_mean_std(train_df, feature_columns)

# 5. Normalize both datasets
train_df_norm = normalize_df(train_df, mean, std, feature_columns)
test_df_norm  = normalize_df(test_df, mean, std, feature_columns)

# 6. Save
save_dataset(train_df_norm, dataset_path / "train_val" /"train_val.csv")
save_dataset(test_df_norm,  dataset_path / "test" / "test.csv")

print("Dataset created and saved.")


Dataset created and saved.


All

In [175]:
# Build the row-wise 80/20 dataset over every file and write it to ../datasets/All_80_20.
# Step 1 (variant filtering) is skipped here: all discovered files are used.
# NOTE(review): the split shuffles individual rows of the merged frame, so rows
# from the same source file can end up in both train and test — confirm this is intended.
dataset_path = Path("../datasets/All_80_20")

# 2. Merge all files into one big DataFrame
big_df = load_and_merge(files, selected_columns)

# 3. Split 80/20 row-wise
train_df, test_df = train_test_split_rows(big_df, train_ratio=0.8)

# 4. Compute normalization parameters from training only
mean, std = compute_mean_std(train_df, feature_columns)

# 5. Normalize both datasets
train_df_norm = normalize_df(train_df, mean, std, feature_columns)
test_df_norm  = normalize_df(test_df, mean, std, feature_columns)

# 6. Save
save_dataset(train_df_norm, dataset_path / "train_val" /"train_val.csv")
save_dataset(test_df_norm,  dataset_path / "test" / "test.csv")

print("Dataset created and saved.")


Dataset created and saved.
