In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import csv
from pathlib import Path

In [12]:
# Load the class-index -> letter mapping: the first column of each CSV row is
# taken as one label, in file order.
# 'utf-8-sig' drops a leading byte-order mark if the file has one; newline=''
# is what the csv module documentation recommends for files passed to csv.reader.
with open('../static_collection/keypoint_classifier_label.csv', 'r', encoding='utf-8-sig', newline='') as f:
    # csv.reader yields [] for an empty line (e.g. a trailing blank line);
    # skip those instead of crashing with IndexError on row[0].
    LETTER_LABELS = [row[0] for row in csv.reader(f) if row]

print(f"Letter mapping: {LETTER_LABELS}")

Letter mapping: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']


In [13]:
# Relative location of the samples CSV that the next cell reads.
STATIC_SAMPLES_CSV = '../static_collection/ngt_keypoint_static.csv'
csv_path = Path(STATIC_SAMPLES_CSV)

In [14]:
# Read the samples file. header=None: no row is treated as column names, so the
# columns are simply numbered 0..N-1.
df = pd.read_csv(csv_path, header=None)

# One column is reserved for the label, hence the "- 1" in the feature count.
n_samples, n_columns = df.shape
print(f"\n✓ Loaded {n_samples} samples")
print(f"✓ Feature columns: {n_columns - 1}")


✓ Loaded 48041 samples
✓ Feature columns: 42


In [15]:
# Split the raw frame into the label column (position 0) and the feature columns.
y_indices = df.iloc[:, 0].astype(int)  # First column: label indices
X = df.iloc[:, 1:]  # Rest: landmark features

# Validate before mapping: plain list indexing would silently wrap a negative
# index to a letter counted from the end of LETTER_LABELS, and an index past the
# end would only surface as a bare IndexError raised from inside .map().
invalid_mask = (y_indices < 0) | (y_indices >= len(LETTER_LABELS))
if invalid_mask.any():
    bad_values = sorted(y_indices[invalid_mask].unique().tolist())
    raise ValueError(
        f"Label indices outside 0..{len(LETTER_LABELS) - 1} found in column 0: {bad_values}"
    )

# Map indices to letter names for readable output
y = y_indices.map(lambda idx: LETTER_LABELS[idx])

# Summary: which letters occur in the data, and how many samples each one has.
print(f"\n✓ Letters present: {sorted(y.unique())}")
print("\nSamples per letter:")
print(y.value_counts().sort_index())


✓ Letters present: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

Samples per letter:
0
A    1898
B    1281
C    3097
D    2275
E    2340
F    1024
G    2281
I    3592
K    1870
L    1417
M    2956
N    3021
O    1918
P    2902
Q    2907
R    4031
S    1965
T    1895
V    1227
W    1965
Y    2179
Name: count, dtype: int64


In [16]:
# Hold out 20% of the samples for evaluation. Stratifying on y keeps each
# letter's proportion the same in both partitions; the fixed random_state makes
# the split repeatable.
# NOTE(review): if samples were recorded as consecutive frames, a random split
# may put near-identical samples on both sides — confirm how the data was collected.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"\nTraining samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")


Training samples: 38432
Testing samples: 9609


In [17]:
# Random-forest hyperparameters, gathered in one place so they are easy to adjust.
rf_params = {
    "n_estimators": 100,  # 100 trees
    "max_depth": 20,      # Max tree depth
    "random_state": 42,   # Reproducible
    "n_jobs": -1,         # Use all CPU cores
}
clf = RandomForestClassifier(**rf_params)

# Fit on the training partition only; the test partition stays unseen until evaluation.
clf.fit(X_train, y_train)
print("✓ Training complete!")

✓ Training complete!


In [18]:
# Score the fitted classifier on both partitions; a large gap between the two
# numbers would point to overfitting.
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)

# Build the whole summary first, then emit it with a single print.
summary_lines = [
    f"\nResults:",
    f"Training accuracy: {train_accuracy*100:.2f}%",
    f"Testing accuracy: {test_accuracy*100:.2f}%",
]
print("\n".join(summary_lines))


Results:
Training accuracy: 100.00%
Testing accuracy: 99.63%


In [19]:
# Predict the held-out samples, then print the per-letter metrics table
# produced by classification_report.
y_pred = clf.predict(X_test)
report_text = classification_report(y_test, y_pred)

print("\nClassification Report:")
print(report_text)


Classification Report:
              precision    recall  f1-score   support

           A       1.00      1.00      1.00       380
           B       0.99      0.98      0.99       256
           C       1.00      1.00      1.00       620
           D       1.00      1.00      1.00       455
           E       1.00      1.00      1.00       468
           F       0.97      0.98      0.98       205
           G       1.00      1.00      1.00       456
           I       1.00      0.99      1.00       719
           K       1.00      1.00      1.00       374
           L       0.99      0.96      0.97       283
           M       1.00      1.00      1.00       591
           N       1.00      1.00      1.00       604
           O       0.99      1.00      1.00       384
           P       1.00      1.00      1.00       580
           Q       1.00      1.00      1.00       582
           R       1.00      1.00      1.00       806
           S       1.00      1.00      1.00       393
   

In [22]:
# Persist the fitted classifier with joblib, creating the target folder
# (and any missing parents) first so the dump cannot fail on a missing directory.
model_path = Path('./models/static_letters_rf.pkl')
output_dir = model_path.parent
output_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(clf, model_path)
print(f"\n✓ Model saved to: {model_path}")


✓ Model saved to: models\static_letters_rf.pkl
