# Train a Random Forest Classifier on the Cleaned NGT Static Keypoint Data

In [4]:
import csv
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split


## Load and validate data

In [3]:
# Paths
ngt_static_keypoint = Path('../data/dataset/ngt_static_keypoint.csv')

# Every row is expected to be one label followed by this many landmark values
N_FEATURES = 42

# Load data - no header, first column is label
df = pd.read_csv(ngt_static_keypoint, header=None)

# Separate features and labels
y = df[0]  # Label indices into the alphabet (0-25); H, J, U, X, Z do not occur
X = df.iloc[:, 1:]  # 42 pre-normalized landmark features

# Validate up front so a malformed CSV fails here with a clear message
# instead of surfacing later as a confusing training or reporting error.
if df.empty:
    raise ValueError(f"No samples found in {ngt_static_keypoint}")
if X.shape[1] != N_FEATURES:
    raise ValueError(
        f"Expected {N_FEATURES} feature columns, found {X.shape[1]}"
    )
if not pd.api.types.is_integer_dtype(y):
    # Labels are used later as list indices, so they must be integers
    # (a float dtype here usually means a missing label in some row).
    raise ValueError(f"Label column must be integer-typed, got {y.dtype}")
if X.isna().any().any():
    # Ragged rows are padded with NaN by read_csv; reject them explicitly
    raise ValueError("Feature columns contain missing values")

print(f"Total samples: {len(df)}")
print(f"Features: {X.shape[1]}")
print(f"Unique labels: {sorted(y.unique())}")
print("\nSamples per label:")
print(y.value_counts().sort_index())

Total samples: 42909
Features: 42
Unique labels: [0, 1, 2, 3, 4, 5, 6, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 24]

Samples per label:
0
0     1898
1     1281
2     3097
3     2275
4     2340
5     1024
6     2281
8     3592
10    1870
11    1417
12    1912
13    1962
14    1918
15    1902
16    1891
17    3018
18    1965
19    1895
21    1227
22    1965
24    2179
Name: count, dtype: int64


## Split & Train

In [5]:
# Hold out 20% of the samples for testing; stratifying on y keeps the
# class balance the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("Training samples:", len(X_train))
print("Test samples:", len(X_test))
print("\nStarting training...")

# Fit a 100-tree random forest. fit() returns the estimator itself, so the
# construction and training can be chained. n_jobs=-1 uses all CPU cores.
clf = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out partition
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("\nTraining complete!")
print("Accuracy: {:.4f}".format(accuracy))

Training samples: 34327
Test samples: 8582

Starting training...

Training complete!
Accuracy: 0.9950


## Results

In [6]:
# Load alphabet for detailed report.
# utf-8-sig drops a leading BOM if the file has one; newline='' lets the csv
# module handle line endings itself, as the csv documentation recommends.
with open('../data/dataset/keypoint_classifier_label.csv', 'r', encoding='utf-8-sig', newline='') as f:
    ALL_LETTERS = [row[0] for row in csv.reader(f)]

# Label ids that occur in the dataset, in the sorted order used for report rows
present_labels = sorted(y.unique())

# Get letter names for the labels we have
label_names = [ALL_LETTERS[i] for i in present_labels]

print("\nClassification Report:")
# Pass labels= explicitly so every entry of target_names is tied to its label
# id. Without it, names are matched positionally to whichever classes appear
# in y_test / y_pred, which breaks if any class is missing from both.
print(classification_report(y_test, y_pred, labels=present_labels, target_names=label_names))

# Save model
output_path = Path('../models/ngt_static_classifier.pkl')
output_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(clf, output_path)

print(f"\nModel saved to: {output_path}")


Classification Report:
              precision    recall  f1-score   support

           A       1.00      1.00      1.00       380
           B       0.97      0.98      0.98       256
           C       1.00      1.00      1.00       620
           D       1.00      1.00      1.00       455
           E       1.00      1.00      1.00       468
           F       0.96      0.98      0.97       205
           G       1.00      1.00      1.00       456
           I       1.00      0.99      1.00       719
           K       1.00      1.00      1.00       374
           L       0.97      0.96      0.97       283
           M       1.00      1.00      1.00       382
           N       1.00      1.00      1.00       392
           O       1.00      1.00      1.00       384
           P       1.00      1.00      1.00       380
           Q       1.00      1.00      1.00       378
           R       0.99      1.00      1.00       604
           S       1.00      1.00      1.00       393
   