Import libraries

In [None]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Seed both random number generators used in this notebook (NumPy and
# PyTorch) with the same fixed value so repeated runs are reproducible.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# ==========================================
#              USER CONFIGURATION
# ==========================================

# 1. File Paths & Selection
# Directory that is scanned for input CSV files.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
FOLDER_PATH = "/home/zebborjesson/Documents/school/tra300_digitalization_in_sports/tra300-ski-technique-classification/data_sync/outputs"
# Only CSV filenames containing this substring are loaded; a falsy value
# (e.g. "" or None) disables the filter and loads every CSV in the folder.
FILE_FILTER = "NR" 

# Filenames held out as the TEST set. Every other loaded file is split at
# file level into training and validation sets. Add more filenames to the
# list to enlarge the test set; names that were not loaded are dropped with
# a warning.
TEST_FILES = [
    "BIA24-3_NR_merged_with_gear.csv"
]

# 2. Input Features
# Column names read from the CSVs (presumably pole-sensor measurements -
# TODO confirm against the data source).
FEATURE_COLS_POLE = [
    'speed_kmph', 'power_w', 'frequency_ppm', 'thrust_left_ms', 'thrust_right_ms', 
    'impulse_left_ns', 'impulse_right_ns', 'force_meanl_n', 'force_meanr_n', 'f_tot_mean_n'
]
# Column names read from the CSVs (presumably from a GNSS/sports-watch
# export - TODO confirm against the data source).
FEATURE_COLS_GNSS = [
    'ns1:AltitudeMeters', 'ns2:Speed', 'ns2:RunCadence', 'ns2:Watts'
]
# Derived left/(left + right) columns created by add_ratios(); they do not
# exist in the raw CSV files.
NEW_RATIO_COLS = ["force_ratio", "thrust_ratio", "impulse_ratio"]

# Full, ordered list of model input columns; its length is the MLP input size.
INPUT_COLS = FEATURE_COLS_POLE + FEATURE_COLS_GNSS + NEW_RATIO_COLS
# Column holding the classification target.
LABEL_COL = "Gear"

# 3. Training Hyperparameters
HIDDEN_DIM = 128       # width of every hidden layer of the MLP
DROPOUT = 0.2          # dropout probability after each hidden layer
BATCH_SIZE = 256       # batch size of the train/val/test DataLoaders
LEARNING_RATE = 5e-4   # learning rate of the Adam optimizer
EPOCHS = 200           # number of passes over the training set
# Window size handed to filter_unstable_gears(): rows this close to a label
# change are removed from each file.
TRANSITION_SAMPLES_TO_REMOVE = 3

In [None]:
# --- HELPER FUNCTIONS ---
def normalize_label(val: object) -> str:
    """Return a canonical string form of a label value.

    The value is converted to ``str`` and stripped of surrounding
    whitespace. If the result starts with a number (optional minus sign,
    digits, optional decimal part), only that number is kept and rendered
    with exactly one decimal place, so ``2``, ``"2.0"`` and ``"2 (x)"`` all
    become ``"2.0"``. Any other text is returned stripped but unchanged.
    """
    text = str(val).strip()
    leading_number = re.search(r'^-?\d+(?:\.\d+)?', text)
    if leading_number is None:
        # No numeric prefix: keep the textual label as it is.
        return text
    return format(float(leading_number.group(0)), ".1f")

def add_ratios(df):
    """Return a copy of ``df`` extended with left-share ratio columns.

    For each (left, right) column pair below, a ratio column
    ``left / (left + right + eps)`` is added, but only when both source
    columns are present:

    * ``force_ratio``   from ``force_meanl_n``   / ``force_meanr_n``
    * ``thrust_ratio``  from ``thrust_left_ms``  / ``thrust_right_ms``
    * ``impulse_ratio`` from ``impulse_left_ns`` / ``impulse_right_ns``

    The input frame itself is not modified.
    """
    out = df.copy()
    eps = 1e-6  # avoids a division by zero when left + right is exactly 0
    ratio_sources = (
        ("force_ratio", "force_meanl_n", "force_meanr_n"),
        ("thrust_ratio", "thrust_left_ms", "thrust_right_ms"),
        ("impulse_ratio", "impulse_left_ns", "impulse_right_ns"),
    )
    for ratio_col, left_col, right_col in ratio_sources:
        if left_col not in out.columns or right_col not in out.columns:
            continue
        out[ratio_col] = out[left_col] / (out[left_col] + out[right_col] + eps)
    return out

def filter_unstable_gears(df, label_col, window_size):
    """Drop the rows surrounding every label change in ``df``.

    A change point is a row whose ``label_col`` value differs from the
    previous row's value. For each change point at position ``p`` the rows
    at positions ``p - window_size`` up to and including
    ``p + window_size - 1`` are removed (clipped to the frame bounds).

    The first row is never treated as a change point: it has no predecessor,
    so the start of a recording does not count as a transition. Rows are
    addressed by position, so the function works for any index (non-integer,
    non-contiguous or duplicated labels).

    Args:
        df: Input frame; it is not modified.
        label_col: Name of the column holding the labels.
        window_size: Number of rows to remove on each side of a change.

    Returns:
        ``df`` itself when it is empty, otherwise a new frame containing the
        surviving rows with their original index labels.
    """
    if df.empty:
        return df

    labels = df[label_col]
    # copy=True: the mask is edited below and must not alias pandas internals.
    changed = labels.ne(labels.shift(1)).to_numpy(copy=True)
    # shift(1) puts NaN in front of the first row, which would always compare
    # as "changed"; that is an artifact, not a real transition.
    changed[0] = False

    n_rows = len(df)
    keep = np.ones(n_rows, dtype=bool)
    for pos in np.flatnonzero(changed):
        start = max(pos - window_size, 0)
        stop = min(pos + window_size, n_rows)
        keep[start:stop] = False
    return df.loc[keep]

def make_sure_numeric(df, columns):
    """Coerce the listed columns of ``df`` to numeric dtype, in place.

    Names in ``columns`` that are not present in ``df`` are skipped. Values
    that cannot be parsed as numbers become NaN. The same (mutated) frame is
    returned for convenient chaining.
    """
    present = [name for name in columns if name in df.columns]
    for name in present:
        df[name] = pd.to_numeric(df[name], errors='coerce')
    return df

# --- LOAD DATA ---
# List every CSV in FOLDER_PATH; when FILE_FILTER is set, keep only the
# filenames that contain it.
all_csvs = [name for name in os.listdir(FOLDER_PATH) if name.endswith('.csv')]
if FILE_FILTER:
    csv_files = [name for name in all_csvs if FILE_FILTER in name]
else:
    csv_files = all_csvs
print(f"Found {len(csv_files)} files matching filter '{FILE_FILTER}'.")

# Map filename -> preprocessed DataFrame.
dataframes = {}
for file in csv_files:
    full_path = os.path.join(FOLDER_PATH, file)

    # Read, coerce the input columns to numbers, then derive the ratio columns.
    df = add_ratios(make_sure_numeric(pd.read_csv(full_path), INPUT_COLS))

    # Files without a label column are stored with features only.
    if LABEL_COL in df.columns:
        df[LABEL_COL] = df[LABEL_COL].apply(normalize_label)
        df = filter_unstable_gears(df, LABEL_COL, TRANSITION_SAMPLES_TO_REMOVE)

    dataframes[file] = df

# Verify Test Files exist: keep only the requested test files that were
# actually loaded, and warn when some are missing.
valid_test_files = [tf for tf in TEST_FILES if tf in dataframes]
if len(valid_test_files) < len(TEST_FILES):
    print(f"WARNING: Some test files were not found. Using: {valid_test_files}")
TEST_FILES = valid_test_files

In [None]:
# 1. Files available for training/validation: everything that was loaded
#    except the held-out test files (sorted so the split is deterministic).
train_pool_keys = sorted(k for k in dataframes if k not in TEST_FILES)

# 2. Split at file level (80/20) so rows of one file never end up in both
#    the training and the validation set.
train_keys, val_keys = train_test_split(train_pool_keys, test_size=0.2, random_state=42)

print(f"Train files: {len(train_keys)} | Val files: {len(val_keys)} | Test files: {len(TEST_FILES)}")

# 3. Concatenate the per-file frames of each split into one frame.
train_df = pd.concat([dataframes[k] for k in train_keys], ignore_index=True)
val_df = pd.concat([dataframes[k] for k in val_keys], ignore_index=True)

test_dfs_list = [dataframes[k] for k in TEST_FILES]
if not test_dfs_list:
    raise ValueError("No valid test files found! Check your configuration.")
test_df = pd.concat(test_dfs_list, ignore_index=True)

# Drop rows that miss any input feature or the label.
required_cols = INPUT_COLS + [LABEL_COL]
train_df, val_df, test_df = (
    frame.dropna(subset=required_cols) for frame in (train_df, val_df, test_df)
)

# 4. Fit the label encoder on the labels of all three splits so every class
#    that occurs anywhere gets an integer id.
all_labels = pd.concat(
    [frame[LABEL_COL] for frame in (train_df, val_df, test_df)]
).unique()
encoder = LabelEncoder().fit(all_labels)
print(f"Classes found: {encoder.classes_}")

# 5. Fit the scaler on the TRAINING rows only, so no validation/test
#    statistics leak into the preprocessing.
scaler = StandardScaler().fit(train_df[INPUT_COLS])
print("Scaler fitted on training data.")

Loading 12 files...


NameError: name 'make_sure_numeric' is not defined

In [None]:
def prepare_dataset(df):
    """Turn a cleaned split DataFrame into model-ready tensors.

    Uses the module-level ``encoder`` (fitted label encoder) and ``scaler``
    (feature scaler fitted on the training split).

    Args:
        df: Frame containing every column in ``INPUT_COLS`` plus
            ``LABEL_COL``, without missing values in those columns.

    Returns:
        Tuple ``(X_t, y_t)``: a ``float32`` tensor with the scaled features
        (one column per entry of ``INPUT_COLS``) and a ``long`` tensor with
        the encoded class ids.
    """
    y_vals = encoder.transform(df[LABEL_COL].values)

    # Scale features. The scaler was fitted on a DataFrame, so it is given a
    # DataFrame with the same columns here; passing a bare ndarray makes
    # scikit-learn emit a "X does not have valid feature names" warning on
    # every call.
    X_vals = np.asarray(scaler.transform(df[INPUT_COLS]))

    # Convert to Tensors
    X_t = torch.tensor(X_vals, dtype=torch.float32)
    y_t = torch.tensor(y_vals, dtype=torch.long)

    return X_t, y_t

print("Building tensors...")
# Convert each split (in train, val, test order) to a (features, labels) pair.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    prepare_dataset(split_df) for split_df in (train_df, val_df, test_df)
)

print(f"Train shape: {X_train.shape}")
print(f"Val shape:   {X_val.shape}")
print(f"Test shape:  {X_test.shape}")

# Create DataLoaders: only the training data is shuffled; validation and
# test batches keep the row order of their frames.
train_set = TensorDataset(X_train, y_train)
val_set = TensorDataset(X_val, y_val)
test_set = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False)

NameError: name 'dataframes' is not defined

In [None]:
class MLP(nn.Module):
    """Fully connected classifier with three hidden layers.

    Each hidden stage is ``Linear -> ReLU -> Dropout``; a final ``Linear``
    layer maps to ``out_dim`` raw scores (no softmax is applied).

    Args:
        in_dim: Number of input features.
        out_dim: Number of output scores (classes).
        hidden_dim: Width shared by all hidden layers.
        dropout: Dropout probability used after every hidden layer.
    """

    def __init__(self, in_dim, out_dim, hidden_dim, dropout=0.2):
        super().__init__()

        # Layer widths of the three hidden stages: in -> h -> h -> h.
        widths = (in_dim, hidden_dim, hidden_dim, hidden_dim)
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.extend((nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout)))

        # Output projection to one score per class.
        layers.append(nn.Linear(hidden_dim, out_dim))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the class scores for the batch ``x``."""
        return self.net(x)

# Compute class weights for imbalance.
# minlength guarantees one count per encoder class. Without it bincount only
# covers ids up to max(y_train), so when the highest classes are missing from
# the training split the weight vector is shorter than the model output and
# CrossEntropyLoss rejects it.
num_classes = len(encoder.classes_)
class_counts = torch.bincount(y_train, minlength=num_classes)
# Inverse-frequency weights; classes with no training rows are treated as
# count 1 to avoid a division by zero.
class_weights = 1.0 / class_counts.float().clamp_min(1.0)
# Rescale so the weights sum to the number of classes (mean weight of 1).
class_weights = class_weights * (len(class_counts) / class_weights.sum())
class_weights = class_weights.to(device)

print("Class Weights:", class_weights)

# Initialize Model: one input per feature column, one output per class.
model = MLP(
    in_dim=len(INPUT_COLS),
    out_dim=num_classes,
    hidden_dim=HIDDEN_DIM,
    dropout=DROPOUT
).to(device)

# Class-weighted cross-entropy and Adam with a small L2 weight decay.
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-5)

NameError: name 'train_df' is not defined

In [None]:
import copy

# Per-epoch average losses, used for the training-history plot.
train_losses = []
val_losses = []
# Lowest validation loss seen so far and a snapshot of the weights that
# produced it.
best_val_loss = float('inf')
best_model_state = None

print("Starting training...")

for epoch in range(EPOCHS):
    # --- Training pass ---
    model.train()
    running_loss = 0.0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean of the per-batch losses.
    avg_train_loss = running_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # --- Validation pass (dropout disabled, no gradients) ---
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            val_loss += loss.item()

            # Predicted class = index of the highest score.
            _, predicted = torch.max(outputs, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()

    avg_val_loss = val_loss / len(val_loader)
    val_losses.append(avg_val_loss)
    val_acc = 100 * correct / total

    # Save Best
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        # state_dict() returns references to the live parameter tensors, which
        # later optimizer steps keep overwriting. Deep-copy them so the
        # snapshot really holds the weights of the best epoch.
        best_model_state = copy.deepcopy(model.state_dict())

    # Progress report every 10 epochs.
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{EPOCHS}] Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.2f}%")

# Load Best Model (skipped only when no epoch ever improved on infinity,
# e.g. EPOCHS == 0 or a NaN validation loss throughout).
if best_model_state is not None:
    print(f"\nRestoring best model (Val Loss: {best_val_loss:.4f})")
    model.load_state_dict(best_model_state)

# Plot the per-epoch training and validation loss curves on one axis.
fig, ax = plt.subplots(figsize=(10, 5))
for curve, curve_label in ((train_losses, 'Train Loss'), (val_losses, 'Val Loss')):
    ax.plot(curve, label=curve_label)
ax.set_title("Training History")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.legend()
plt.show()

In [None]:
# --- Test-set inference ---
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch)
        # Predicted class = index of the highest score.
        _, predicted = torch.max(outputs, 1)

        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(y_batch.numpy())

# Integer ids of every class known to the encoder. Passing them explicitly
# keeps the report and the confusion matrix aligned with encoder.classes_
# even when the test set (and the predictions) do not contain every class.
# Without this, classification_report raises on the target_names length
# mismatch and the confusion-matrix ticks are mislabelled.
class_ids = np.arange(len(encoder.classes_))

# Evaluation Metrics
print("\n--- Test Set Classification Report ---")
print(classification_report(
    all_labels,
    all_preds,
    labels=class_ids,
    target_names=encoder.classes_,
    zero_division=0,
))

# Confusion Matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(all_labels, all_preds, labels=class_ids)

plt.figure(figsize=(8, 6))
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()

tick_marks = class_ids
plt.xticks(tick_marks, encoder.classes_, rotation=45)
plt.yticks(tick_marks, encoder.classes_)

# Label the squares; use white text on dark cells so the counts stay readable.
thresh = cm.max() / 2.
for i, j in np.ndindex(cm.shape):
    plt.text(j, i, format(cm[i, j], 'd'),
             horizontalalignment="center",
             color="white" if cm[i, j] > thresh else "black")

plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.tight_layout()
plt.show()

Loading 12 files...
Data loading complete.
