In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib
from pathlib import Path

In [2]:
# Location of the collected landmark CSV, relative to the notebook's working directory.
csv_path = Path('./data/dataset/ngt_static_landmarks_1807.csv')

# Raise instead of print + exit(1): an exception stops execution at this cell
# with a traceback, whereas exit() is an interactive-shell helper that tries to
# terminate the interpreter rather than report an error.
if not csv_path.exists():
    raise FileNotFoundError(
        f"Error: Dataset not found at {csv_path}. "
        "Please run simple_static_collector.py first!"
    )

In [3]:
# Read every collected sample; the 'letter' column holds the class label and
# the remaining columns are used as landmark features further down.
df = pd.read_csv(csv_path)

# Quick sanity summary of what was loaded. The check mark (U+2713) is written
# as an escape sequence so the source stays ASCII and cannot be mangled when the
# file is read or saved with the wrong text encoding.
print(f"\n\u2713 Loaded {len(df)} samples")
print(f"\u2713 Letters: {sorted(df['letter'].unique())}")

# Per-class sample counts, ordered alphabetically by letter.
print("\nSamples per letter:")
print(df['letter'].value_counts().sort_index())


✓ Loaded 1807 samples
✓ Letters: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

Samples per letter:
letter
A     80
B     80
C     80
D     80
E     80
F     80
G    110
I     80
K    109
L    110
M    100
N    100
O     80
P     80
Q     80
R     80
S     80
T     80
V     79
W     79
Y     80
Name: count, dtype: int64


In [4]:
def normalize_landmarks(row):
    """
    Normalize one sample of hand landmarks to be invariant to:
    1. Position (every landmark is expressed relative to the wrist)
    2. Scale (coordinates are divided by the hand size)

    Parameters
    ----------
    row : pandas.Series or array-like of 63 numbers
        Flat landmark vector, interpreted as 21 landmarks x 3 coordinates.
        Plain lists and NumPy arrays are accepted as well as DataFrame rows,
        so the same function can be reused outside of ``DataFrame.apply``.

    Returns
    -------
    numpy.ndarray
        Flat float array of 63 values with landmark 0 (wrist) at the origin.
        When the wrist-to-landmark-12 distance is positive, the values are
        divided by it; otherwise the centered values are returned unscaled.

    Raises
    ------
    ValueError
        If ``row`` does not contain exactly 63 values.
    """
    # np.asarray handles Series, lists and arrays alike (the original relied on
    # the pandas-only ``.values`` attribute).
    coords = np.asarray(row, dtype=float)
    if coords.size != 63:
        raise ValueError(
            f"Expected 63 landmark values (21 x 3), got {coords.size}"
        )

    # Reshape to 21 landmarks x 3 coords
    coords = coords.reshape(21, 3)

    # Step 1: Center on wrist (landmark 0) to remove position bias;
    # afterwards the wrist sits at the origin (0, 0, 0).
    coords = coords - coords[0]

    # Step 2: Hand size = distance from wrist to middle finger tip
    # (landmark 12). The wrist is now the origin, so this is just the norm.
    hand_size = np.linalg.norm(coords[12])

    # Step 3: Scale by hand size to remove size/distance bias. A zero hand
    # size (degenerate sample) is left unscaled to avoid dividing by zero.
    if hand_size > 0:
        coords = coords / hand_size

    # Return flattened (back to 63 values)
    return coords.flatten()

In [5]:
# Label = the signed letter; features = every remaining (landmark) column.
y = df['letter']
X = df.drop(columns='letter')

# Run the wrist-centering / hand-size scaling on each sample; 'expand' spreads
# the 63 returned values back out into one column each.
X_normalized = X.apply(normalize_landmarks, axis=1, result_type='expand')

print(f"Normalized feature shape: {X_normalized.shape}")

# Hold out 20% of the samples for testing. Stratifying on the label keeps the
# per-letter proportions the same in both splits; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X_normalized,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"\nTraining samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")

# Fit a 100-tree random forest (depth capped at 20) using all CPU cores.
# fit() returns the estimator itself, so it can be chained onto construction.
print("\nTraining Random Forest...")
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)
print("Training complete!")

Normalized feature shape: (1807, 63)

Training samples: 1445
Testing samples: 362

Training Random Forest...
Training complete!


In [6]:
# Mean accuracy of the fitted forest on the samples it was trained on and on
# the held-out split, computed in that order.
train_accuracy, test_accuracy = (
    clf.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

print("\nResults:")
print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Testing accuracy: {test_accuracy*100:.2f}%")


Results:
Training accuracy: 100.00%
Testing accuracy: 99.45%


In [7]:
# Predicted letter for every held-out sample; kept in y_pred for later cells.
y_pred = clf.predict(X_test)

# Per-letter precision / recall / F1 of those predictions against the truth.
print("\nClassification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))


Classification Report:
              precision    recall  f1-score   support

           A       1.00      1.00      1.00        16
           B       1.00      1.00      1.00        16
           C       1.00      1.00      1.00        16
           D       1.00      1.00      1.00        16
           E       1.00      0.94      0.97        16
           F       1.00      0.94      0.97        16
           G       1.00      1.00      1.00        22
           I       1.00      1.00      1.00        16
           K       1.00      1.00      1.00        22
           L       1.00      1.00      1.00        22
           M       1.00      1.00      1.00        20
           N       1.00      1.00      1.00        20
           O       1.00      1.00      1.00        16
           P       1.00      1.00      1.00        16
           Q       0.94      1.00      0.97        16
           R       1.00      1.00      1.00        16
           S       1.00      1.00      1.00        16
   

In [8]:
# Persist the fitted classifier, creating the target folder if it is missing.
# NOTE: only the classifier is stored. It was fitted on the output of
# normalize_landmarks, so prediction-time inputs need the same preprocessing.
model_path = Path('./models/static_model_clf.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(value=clf, filename=model_path)

['models\\static_model_clf.pkl']

In [9]:
# Show the ten features the forest relied on most, largest importance first.
# The magnifier emoji (U+1F50D) is written as an escape sequence so the source
# stays ASCII and cannot be mangled by a wrong file encoding.
print("\n\U0001F50D Top 10 Most Important Features:")
feature_importance = clf.feature_importances_

# argsort is ascending: take the last ten indices, then reverse them.
top_features = np.argsort(feature_importance)[-10:][::-1]
for idx in top_features:
    # Feature i maps to landmark i // 3 and coordinate i % 3, matching the
    # (21, 3) reshape / flatten layout used by normalize_landmarks.
    landmark_num, coord_pos = divmod(idx, 3)
    coord = ['x', 'y', 'z'][coord_pos]
    print(f"   Landmark {landmark_num} ({coord}): {feature_importance[idx]:.4f}")


🔍 Top 10 Most Important Features:
   Landmark 8 (y): 0.0519
   Landmark 4 (x): 0.0496
   Landmark 3 (x): 0.0421
   Landmark 19 (y): 0.0399
   Landmark 20 (y): 0.0314
   Landmark 2 (x): 0.0295
   Landmark 7 (y): 0.0285
   Landmark 15 (y): 0.0277
   Landmark 16 (y): 0.0272
   Landmark 8 (z): 0.0271
