# Train the Dynamic-Letter Classifier on Cleaned Keypoint Data

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
import csv

## Load and validate data

In [2]:
# Load the dynamic-gesture keypoint dataset and the letter lookup table, and
# fail fast on malformed input so a bad file cannot silently reach training.

# Paths
ngt_dynamic_keypoint = Path('../data/dataset/ngt_dynamic_keypoint.csv')
keypoint_classifier_label = Path('../data/dataset/keypoint_classifier_label.csv')

# Width of one feature row: 42 current values + 42 deltas
EXPECTED_FEATURES = 84

# Load data - no header, so pandas numbers the columns 0..N
df = pd.read_csv(ngt_dynamic_keypoint, header=None)

# Reject missing cells up front: they would otherwise turn the label column
# into floats and could flow unnoticed into the model.
missing_cells = int(df.isna().sum().sum())
assert missing_cells == 0, (
    f"{ngt_dynamic_keypoint} contains {missing_cells} missing value(s)"
)

# Separate features and labels
y = df[0]  # Label indices (should be 7, 9, 20, 23, 25 for H, J, U, X, Z)
X = df.iloc[:, 1:]  # 84 features: 42 current + 42 delta

assert X.shape[1] == EXPECTED_FEATURES, (
    f"Expected {EXPECTED_FEATURES} feature columns, found {X.shape[1]}"
)
assert pd.api.types.is_integer_dtype(y), (
    f"Label column must hold integer indices, found dtype {y.dtype}"
)

# Computed once and reused for the summary, the range check and the lookup
dynamic_label_ids = sorted(y.unique())

print(f"Total samples: {len(df)}")
print(f"Features: {X.shape[1]}")
print(f"Unique labels: {dynamic_label_ids}")
print("\nSamples per label:")
print(y.value_counts().sort_index())

# Load alphabet for letter names (utf-8-sig strips a leading BOM if present)
with open(keypoint_classifier_label, 'r', encoding='utf-8-sig') as f:
    ALL_LETTERS = [row[0] for row in csv.reader(f)]

# Every label must be a valid position in ALL_LETTERS; a negative value would
# otherwise wrap around and silently name the wrong letter.
assert 0 <= dynamic_label_ids[0] and dynamic_label_ids[-1] < len(ALL_LETTERS), (
    f"Label indices {dynamic_label_ids} fall outside 0..{len(ALL_LETTERS) - 1}"
)

# Show which letters these are
print(f"\nDynamic letters: {[ALL_LETTERS[i] for i in dynamic_label_ids]}")

Total samples: 16128
Features: 84
Unique labels: [7, 9, 20, 23, 25]

Samples per label:
0
7     4024
9     3015
20    2581
23    3466
25    3042
Name: count, dtype: int64

Dynamic letters: ['H', 'J', 'U', 'X', 'Z']


## Split & Train

In [3]:
# Hold out 20% of the rows for evaluation; stratifying on y keeps each
# label's share the same in the training and test partitions.
# NOTE(review): if neighbouring rows come from the same recording, a random
# row-level split may put near-duplicate frames on both sides and inflate the
# score - confirm how the CSV was produced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"\nStarting training...")

# Build and fit the forest in one expression (fit() returns the estimator).
# The seed is fixed so a re-run reproduces the same model.
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42).fit(
    X_train, y_train
)

# Score the held-out rows
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"\nTraining complete!")
print(f"Accuracy: {accuracy:.4f}")

Training samples: 12902
Test samples: 3226

Starting training...

Training complete!
Accuracy: 0.9972


## Results

In [4]:
# Detailed report
# Pass the label ids explicitly so every report row is tied to its letter.
# Without `labels`, scikit-learn infers the class list from y_test/y_pred, and
# the five names only line up if every class happens to occur there.
label_ids = sorted(y.unique())
label_names = [ALL_LETTERS[i] for i in label_ids]
print("\nClassification Report:")
print(
    classification_report(
        y_test, y_pred, labels=label_ids, target_names=label_names
    )
)

# Save model (create the target directory on first run)
# NOTE: joblib files are pickles - only load this artifact from a trusted source.
output_path = Path('../models/ngt_dynamic_classifier.pkl')
output_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(clf, output_path)

print(f"\nModel saved to: {output_path}")


Classification Report:
              precision    recall  f1-score   support

           H       1.00      1.00      1.00       805
           J       1.00      1.00      1.00       603
           U       1.00      1.00      1.00       516
           X       1.00      1.00      1.00       693
           Z       0.99      0.99      0.99       609

    accuracy                           1.00      3226
   macro avg       1.00      1.00      1.00      3226
weighted avg       1.00      1.00      1.00      3226


Model saved to: ../models/ngt_dynamic_classifier.pkl
