# Train Cleaned Static Data

In [1]:
import csv
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split


## Load and validate data

In [26]:
# Paths
ngt_static_keypoint = Path('../data/dataset/ngt_static_keypoint.csv')

# Load data - the first CSV row is the header; the first column is the label
# and every remaining column is a feature.
df = pd.read_csv(ngt_static_keypoint, header=0)

label_col = df.columns[0]
feature_cols = df.columns[1:]

# Enforce dtypes: label column -> int, feature columns (1-42) -> float.
# astype() returns a new frame that really carries the requested dtypes;
# assigning through df.iloc[:, ...] writes into the existing columns in place
# (pandas >= 2.0) and can leave the stored dtype unchanged.
df = df.astype({label_col: int, **{col: float for col in feature_cols}})

# Separate features and labels
y = df[label_col]  # Label indices (0-25, excluding H, J, U, X, Z)
X = df.drop(columns=label_col)  # 42 pre-normalized landmark features

print(f"Total samples: {len(df)}")
print(f"Features: {X.shape[1]}")
print(f"Unique labels: {sorted(y.unique())}")
print(f"\nSamples per label:")
print(y.value_counts().sort_index())

Total samples: 52225
Features: 42
Unique labels: [0, 1, 2, 3, 4, 5, 6, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 24]

Samples per label:
sign_index
0     1898
1     3320
2     3097
3     2275
4     2340
5     1024
6     3328
8     3592
10    3467
11    2434
12    3325
13    3333
14    1918
15    3270
16    2012
17    1020
18    1965
19    3236
21    1227
22    1965
24    2179
Name: count, dtype: int64


In [27]:
# Show each column's dtype and non-null count to check the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52225 entries, 0 to 52224
Data columns (total 43 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   sign_index  52225 non-null  int64  
 1   feature_1   52225 non-null  float64
 2   feature_2   52225 non-null  float64
 3   feature_3   52225 non-null  float64
 4   feature_4   52225 non-null  float64
 5   feature_5   52225 non-null  float64
 6   feature_6   52225 non-null  float64
 7   feature_7   52225 non-null  float64
 8   feature_8   52225 non-null  float64
 9   feature_9   52225 non-null  float64
 10  feature_10  52225 non-null  float64
 11  feature_11  52225 non-null  float64
 12  feature_12  52225 non-null  float64
 13  feature_13  52225 non-null  float64
 14  feature_14  52225 non-null  float64
 15  feature_15  52225 non-null  float64
 16  feature_16  52225 non-null  float64
 17  feature_17  52225 non-null  float64
 18  feature_18  52225 non-null  float64
 19  feature_19  52225 non-nul

## Split & Train

In [28]:
# Hold out 20% of the samples for evaluation. Stratifying on y keeps the
# per-class proportions the same in the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the partition sizes before the (comparatively slow) fit starts.
print(
    f"Training samples: {len(X_train)}",
    f"Test samples: {len(X_test)}",
    f"\nStarting training...",
    sep="\n",
)

# Random Forest with 100 trees, a fixed seed for reproducibility, and
# n_jobs=-1 to use all CPU cores. fit() returns the estimator itself, so the
# construction and the fit can be chained.
clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1).fit(
    X_train, y_train
)

# Score the fitted model on the held-out partition.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"\nTraining complete!", f"Accuracy: {accuracy:.4f}", sep="\n")

Training samples: 41780
Test samples: 10445

Starting training...

Training complete!
Accuracy: 0.9963


## Results

In [30]:
# Load alphabet for detailed report: one letter per row, row position = label index.
# newline='' is the file mode the csv module documents for reader input;
# utf-8-sig strips a leading BOM if the file has one.
with open('../data/dataset/keypoint_classifier_label.csv', 'r', encoding='utf-8-sig', newline='') as f:
    # Skip completely empty rows (e.g. a stray blank line at the end of the
    # file), which would otherwise raise IndexError on row[0].
    ALL_LETTERS = [row[0] for row in csv.reader(f) if row]

# Get letter names for the labels we have
present_labels = sorted(y.unique())
label_names = [ALL_LETTERS[i] for i in present_labels]

print("\nClassification Report:")
# Pass labels= explicitly so every name in target_names is tied to its own
# label index. Without it the names are matched positionally to whichever
# classes happen to occur in y_test / y_pred.
print(classification_report(y_test, y_pred, labels=present_labels, target_names=label_names))

# Save model
output_path = Path('../models/ngt_static_classifier.pkl')
output_path.parent.mkdir(parents=True, exist_ok=True)  # create ../models on first run
joblib.dump(clf, output_path)

print(f"\nModel saved to: {output_path}")


Classification Report:
              precision    recall  f1-score   support

           A       1.00      1.00      1.00       380
           B       1.00      1.00      1.00       664
           C       1.00      1.00      1.00       619
           D       1.00      1.00      1.00       455
           E       1.00      1.00      1.00       468
           F       0.99      0.99      0.99       205
           G       1.00      1.00      1.00       666
           I       1.00      0.99      1.00       718
           K       1.00      1.00      1.00       693
           L       0.99      0.99      0.99       487
           M       1.00      1.00      1.00       665
           N       0.99      1.00      0.99       667
           O       1.00      1.00      1.00       384
           P       0.98      1.00      0.99       654
           Q       1.00      0.99      1.00       402
           R       1.00      1.00      1.00       204
           S       1.00      1.00      1.00       393
   