# Train Cleaned Static Data

In [1]:
import csv
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split


## Load and validate data

In [2]:
# Location of the static-keypoint dataset (relative to this notebook)
ngt_static_keypoint = Path('../data/dataset/ngt_static_keypoint.csv')

# The CSV has no header row; column 0 is the label, the rest are features
df = pd.read_csv(ngt_static_keypoint, header=None)

# Labels: alphabet indices in 0-25; H, J, U, X, Z are not in this dataset
y = df.iloc[:, 0]
# Features: every remaining column (pre-normalized landmark coordinates)
X = df.iloc[:, 1:]

# Quick summary of what was loaded
n_samples = len(df)
n_features = X.shape[1]
labels_present = sorted(y.unique())
per_label_counts = y.value_counts().sort_index()

print(f"Total samples: {n_samples}")
print(f"Features: {n_features}")
print(f"Unique labels: {labels_present}")
print("\nSamples per label:")
print(per_label_counts)

Total samples: 65408
Features: 42
Unique labels: [0, 1, 2, 3, 4, 5, 6, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 24]

Samples per label:
0
0     1898
1     3320
2     3097
3     2275
4     3840
5     2024
6     5348
8     3592
10    3467
11    2434
12    5654
13    3333
14    1918
15    5603
16    2012
17    2960
18    4026
19    3236
21    1227
22    1965
24    2179
Name: count, dtype: int64


## Split & Train

In [3]:
# Hold out 20% for testing; stratify on y so each label keeps its share
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print("\nStarting training...")

# Random Forest with 100 trees, fixed seed, parallelised over all CPU cores.
# fit() returns the estimator itself, so construction and training chain.
clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1).fit(
    X_train, y_train
)

# Score the held-out split
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("\nTraining complete!")
print(f"Accuracy: {accuracy:.4f}")

Training samples: 52326
Test samples: 13082

Starting training...

Training complete!
Accuracy: 0.9968


## Results

In [4]:
# Load the alphabet used to turn label indices into letters: the first field
# of each row is a letter, and the list is indexed by label index below.
# newline='' is the file mode the csv module expects for objects it reads.
with open('../data/dataset/keypoint_classifier_label.csv', 'r',
          encoding='utf-8-sig', newline='') as f:
    ALL_LETTERS = [row[0] for row in csv.reader(f)]

# Label indices that actually occur in the dataset, and their letter names
present_labels = sorted(y.unique())
label_names = [ALL_LETTERS[i] for i in present_labels]

print("\nClassification Report:")
# Pass labels= explicitly so each target name is bound to its own label index.
# Without it, sklearn infers the class list from y_test/y_pred only, and the
# names (derived from the full y) would mismatch if any class were missing
# from the test split and the predictions.
print(classification_report(
    y_test,
    y_pred,
    labels=present_labels,
    target_names=label_names,
))

# Save model (creates ../models/ if it does not exist yet).
# NOTE: joblib files are pickles - only load them from trusted sources.
output_path = Path('../models/ngt_static_classifier.pkl')
output_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(clf, output_path)

print(f"\nModel saved to: {output_path}")


Classification Report:
              precision    recall  f1-score   support

           A       1.00      1.00      1.00       380
           B       1.00      1.00      1.00       664
           C       1.00      1.00      1.00       619
           D       1.00      1.00      1.00       455
           E       1.00      1.00      1.00       768
           F       0.99      0.99      0.99       405
           G       1.00      1.00      1.00      1070
           I       1.00      0.99      1.00       718
           K       1.00      1.00      1.00       693
           L       0.99      0.99      0.99       487
           M       0.99      1.00      0.99      1131
           N       0.99      1.00      0.99       667
           O       1.00      1.00      1.00       384
           P       1.00      1.00      1.00      1121
           Q       1.00      1.00      1.00       402
           R       1.00      1.00      1.00       592
           S       1.00      1.00      1.00       805
   