In [None]:
import kagglehub

# Fetch the latest published version of the dataset through kagglehub.
DATASET_HANDLE = "trnhhunhthnhkhang/vi-air-writing"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/trnhhunhthnhkhang/vi-air-writing?dataset_version_number=8...


100%|██████████| 555M/555M [00:29<00:00, 19.9MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/trnhhunhthnhkhang/vi-air-writing/versions/8


In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler  # Changed from StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout, Bidirectional, Conv1D, Input, MaxPooling1D
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [None]:
class AirWritingTrainer:
    def __init__(self, data_path):
        self.data_path = data_path
        self.model = None
        self.labels = {}
        self.scaler = MinMaxScaler(feature_range=(-1, 1))
        self.max_length = 100
        os.makedirs("training_samples", exist_ok=True)

    def load_data(self, folder_types=["1_gram", "2_grams", "3_grams", "n_grams"], max_samples_per_label=1000):
        """
        Load data from different n-gram directories
        """
        all_data = []
        all_labels = []

        for folder_type in folder_types:
            folder_path = os.path.join(self.data_path, folder_type)

            if not os.path.exists(folder_path):
                print(f"Warning: Path {folder_path} does not exist, skipping.")
                continue

            try:
                labels = [f for f in os.listdir(folder_path) if os.path.isdir(os.path.join(folder_path, f))]
            except Exception as e:
                print(f"Error accessing {folder_path}: {e}")
                continue

            for label in labels:
                label_path = os.path.join(folder_path, label)

                try:
                    label_files = [f for f in os.listdir(label_path) if f.endswith('.csv')]
                    if max_samples_per_label > 0:
                        label_files = label_files[:max_samples_per_label]
                except Exception as e:
                    print(f"Error accessing {label_path}: {e}")
                    continue

                for file in label_files:
                    file_path = os.path.join(label_path, file)
                    try:
                        df = pd.read_csv(file_path)

                        if 'x' in df.columns and 'y' in df.columns:
                            coords = df[['x', 'y']].values
                        else:
                            coords = df.iloc[:, 0:2].values

                        if len(coords) > 0:
                            coords = self.normalize_coordinates(coords)
                            all_data.append(coords)
                            all_labels.append(label)

                    except Exception as e:
                        print(f"Error processing {file_path}: {e}")

        print(f"Loaded {len(all_data)} samples across {len(set(all_labels))} unique labels")

        unique_labels = sorted(set(all_labels))
        self.labels = {label: idx for idx, label in enumerate(unique_labels)}

        return all_data, all_labels

    def normalize_coordinates(self, coords):
        """Normalize coordinates to [0,1] range while preserving aspect ratio"""
        min_x, min_y = np.min(coords, axis=0)
        max_x, max_y = np.max(coords, axis=0)

        # Calculate range, avoiding division by zero
        width = max(max_x - min_x, 1e-5)
        height = max(max_y - min_y, 1e-5)

        # Normalize to [0,1] range
        normalized = coords.copy()
        normalized[:, 0] = (coords[:, 0] - min_x) / width
        normalized[:, 1] = (coords[:, 1] - min_y) / height

        return normalized

    def preprocess_data(self, data, labels):
        """
        Preprocess the air writing coordinate data
        """
        # Check if we have data to process
        if not data or len(data) == 0:
            raise ValueError("No data to preprocess")

        # Pad or truncate sequences to fixed length
        padded_data = []
        for seq in data:
            if len(seq) > self.max_length:
                indices = np.linspace(0, len(seq)-1, self.max_length).astype(int)
                seq = seq[indices]
            else:
                # Pad shorter sequences
                seq = np.pad(seq, ((0, self.max_length - len(seq)), (0, 0)), mode='constant')
            padded_data.append(seq)

        padded_data = np.array(padded_data)

        # Add additional features
        processed_data = self.add_dynamic_features(padded_data)

        # Fit and transform the data using the scaler
        original_shape = processed_data.shape
        processed_data = self.scaler.fit_transform(
            processed_data.reshape(-1, processed_data.shape[-1])
        ).reshape(original_shape)

        # Save some sample data for debugging
        self.save_sample_data(processed_data, labels)

        # Convert labels to categorical
        label_indices = [self.labels[label] for label in labels]
        categorical_labels = to_categorical(label_indices, num_classes=len(self.labels))

        return processed_data, categorical_labels

    def add_dynamic_features(self, data):
        """
        Add velocity features to improve model performance
        """
        # Original data shape: (samples, time_steps, 2)
        samples, time_steps, coords = data.shape

        # Initialize the new feature array (coordinates + velocity)
        new_features = np.zeros((samples, time_steps, coords * 2))

        for i in range(samples):
            # Copy original coordinates
            new_features[i, :, 0:coords] = data[i]

            # Calculate velocity (difference between consecutive points)
            velocity = np.zeros((time_steps, coords))
            # Only compute velocity for points 1 to end (point 0 has no velocity)
            velocity[1:] = np.diff(data[i], axis=0)
            new_features[i, :, coords:2*coords] = velocity

        return new_features

    def save_sample_data(self, processed_data, labels, num_samples=5):
        """Save sample processed data for debugging"""
        if len(processed_data) == 0:
            return

        indices = np.random.choice(len(processed_data),
                                  min(num_samples, len(processed_data)),
                                  replace=False)

        for i, idx in enumerate(indices):
            sample = processed_data[idx]
            label = labels[idx]

            # Save visualization
            plt.figure(figsize=(12, 6))

            # Plot coordinates
            plt.subplot(1, 2, 1)
            plt.title(f"Sample {i}: {label}")
            plt.plot(sample[:, 0], sample[:, 1], 'b-')
            plt.scatter(sample[0, 0], sample[0, 1], c='g', s=50, label='Start')

            # Find last non-zero point
            non_zero = np.where((sample[:, 0] != 0) | (sample[:, 1] != 0))[0]
            if len(non_zero) > 0:
                last_idx = non_zero[-1]
                plt.scatter(sample[last_idx, 0], sample[last_idx, 1],
                           c='r', s=50, label='End')

            plt.legend()

            # Plot features
            plt.subplot(1, 2, 2)
            plt.title("Feature Values")
            plt.plot(range(len(sample)), sample[:, 0], 'r-', label='x')
            plt.plot(range(len(sample)), sample[:, 1], 'g-', label='y')
            plt.plot(range(len(sample)), sample[:, 2], 'b-', label='vx')
            plt.plot(range(len(sample)), sample[:, 3], 'y-', label='vy')
            plt.legend()

            plt.tight_layout()
            plt.savefig(f"training_samples/sample_{i}_{label}.png")
            plt.close()

            # Save raw data
            np.save(f"training_samples/sample_{i}_{label}.npy", sample)

        print(f"Saved {len(indices)} sample visualizations to training_samples/")

    def create_model(self, input_shape, num_classes):
        """
        Create an improved model for Vietnamese air writing recognition
        """
        model = Sequential([
            Conv1D(64, kernel_size=3, activation='relu', padding='same', input_shape=input_shape),
            MaxPooling1D(pool_size=2),

            Bidirectional(GRU(128, return_sequences=True)),
            Dropout(0.3),
            Bidirectional(GRU(64, return_sequences=False)),
            Dropout(0.3),

            Dense(128, activation='relu'),
            Dropout(0.3),
            Dense(num_classes, activation='softmax')
        ])

        model.compile(
            optimizer='adam',
            loss='categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

    def train_and_save_model(self, test_size=0.2, random_state=42, batch_size=32, epochs=100):
        """
        Complete training pipeline with detailed logging
        """
        print("Starting the training process...")

        # Load data
        data, labels = self.load_data()
        if not data:
            raise ValueError("No data was loaded. Check your dataset path.")

        # Preprocess data
        print(f"Preprocessing {len(data)} samples...")
        X, y = self.preprocess_data(data, labels)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=y
        )

        print(f"Training set: {X_train.shape[0]} samples")
        print(f"Testing set: {X_test.shape[0]} samples")
        print(f"Number of classes: {y.shape[1]}")

        # Prepare model input shape
        input_shape = (X_train.shape[1], X_train.shape[2])
        num_classes = y_train.shape[1]

        # Create and compile model
        self.model = self.create_model(input_shape, num_classes)
        print(self.model.summary())

        # Callbacks for better training
        early_stopping = EarlyStopping(
            monitor='val_loss',
            patience=15,
            restore_best_weights=True,
            verbose=1
        )

        reduce_lr = ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=5,
            min_lr=1e-6,
            verbose=1
        )

        # Training with progress updates
        print("\nTraining model...")
        history = self.model.fit(
            X_train, y_train,
            validation_data=(X_test, y_test),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=[early_stopping, reduce_lr],
            verbose=1
        )

        # Evaluate the model
        print("\nEvaluating model...")
        test_loss, test_accuracy = self.model.evaluate(X_test, y_test)
        print(f"Test Accuracy: {test_accuracy*100:.2f}%")

        # Save model and metadata
        self.save_model()

        # Plot training history
        self._plot_training_history(history)

        # Save preprocessing info
        self.save_preprocessing_info()

        return test_accuracy

    def save_preprocessing_info(self):
        """Save preprocessing details for reference"""
        info = {
            'max_length': self.max_length,
            'scaler_type': type(self.scaler).__name__,
            'feature_range': self.scaler.feature_range if hasattr(self.scaler, 'feature_range') else None,
            'normalization': 'per_sample',
            'features': ['x', 'y', 'vx', 'vy']
        }

        with open('preprocessing_info.txt', 'w') as f:
            for key, value in info.items():
                f.write(f"{key}: {value}\n")

        print("Saved preprocessing info to preprocessing_info.txt")

    def save_model(self, model_path='air_writing_model.h5', metadata_path='model_metadata.joblib'):
        """
        Save trained model and labels
        """
        if self.model:
            # Save model
            self.model.save(model_path)

            # Save metadata
            joblib.dump({
                'labels': self.labels,
                'scaler': self.scaler,
                'max_length': self.max_length,
                'scaler_params': {
                    'type': type(self.scaler).__name__,
                    'feature_range': self.scaler.feature_range if hasattr(self.scaler, 'feature_range') else None
                }
            }, metadata_path)

            print(f"Model saved to {model_path}")
            print(f"Metadata saved to {metadata_path}")

            # Print out label mapping
            print("\nLabel Mapping:")
            for label, index in self.labels.items():
                print(f"{label}: {index}")
        else:
            print("No model to save. Train a model first.")

    def _plot_training_history(self, history):
        """
        Plot and save training metrics
        """
        plt.figure(figsize=(12, 5))

        # Accuracy plot
        plt.subplot(1, 2, 1)
        plt.plot(history.history['accuracy'], label='Training Accuracy')
        plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
        plt.title('Model Accuracy')
        plt.ylabel('Accuracy')
        plt.xlabel('Epoch')
        plt.legend()

        # Loss plot
        plt.subplot(1, 2, 2)
        plt.plot(history.history['loss'], label='Training Loss')
        plt.plot(history.history['val_loss'], label='Validation Loss')
        plt.title('Model Loss')
        plt.ylabel('Loss')
        plt.xlabel('Epoch')
        plt.legend()

        plt.tight_layout()
        plt.savefig('training_history.png')
        print("Training history plot saved to 'training_history.png'")
        plt.close()

In [None]:
# The n-gram folders read by AirWritingTrainer live below this sub-directory
# of the downloaded dataset. os.path.join keeps the path portable.
data_path = os.path.join(path, "VNI_airwriting")
if not os.path.isdir(data_path):
    # Fail early with a clear message instead of one warning per n-gram folder.
    raise FileNotFoundError(f"Dataset directory not found: {data_path}")

trainer = AirWritingTrainer(data_path)
# Keep the returned metric so later cells can use it.
test_accuracy = trainer.train_and_save_model(
    test_size=0.2,
    random_state=42,
    batch_size=32,
    epochs=100
)

print("Training complete!")

Starting the training process...
Loaded 22760 samples across 660 unique labels
Preprocessing 22760 samples...
Saved 5 sample visualizations to training_samples/
Training set: 18208 samples
Testing set: 4552 samples
Number of classes: 660


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


None

Training model...
Epoch 1/100
[1m569/569[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 17ms/step - accuracy: 0.0304 - loss: 5.6262 - val_accuracy: 0.3214 - val_loss: 2.9013 - learning_rate: 0.0010
Epoch 2/100
[1m569/569[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 15ms/step - accuracy: 0.2809 - loss: 2.9188 - val_accuracy: 0.6322 - val_loss: 1.4194 - learning_rate: 0.0010
Epoch 3/100
[1m569/569[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 16ms/step - accuracy: 0.5070 - loss: 1.7679 - val_accuracy: 0.7821 - val_loss: 0.8113 - learning_rate: 0.0010
Epoch 4/100
[1m569/569[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - accuracy: 0.6457 - loss: 1.2109 - val_accuracy: 0.8363 - val_loss: 0.5914 - learning_rate: 0.0010
Epoch 5/100
[1m569/569[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 16ms/step - accuracy: 0.7204 - loss: 0.9145 - val_accuracy: 0.8937 - val_loss: 0.3763 - learning_rate: 0.0010
Epoch 6/100
[1m569/569[0m [32m



Test Accuracy: 99.63%
Model saved to air_writing_model.h5
Metadata saved to model_metadata.joblib

Label Mapping:
anh_w: 0
ba_w: 1
bao giờ_w: 2
bia_w: 3
biển rộng_w: 4
biển xanh rộng_w: 5
biển xanh sóng vỗ_w: 6
biển đêm lung linh_w: 7
biển_w: 8
buồn bã_w: 9
buồn_w: 10
buổi chiều yên ả_w: 11
buổi sáng se lạnh_w: 12
buổi sáng trong lành_w: 13
buổi sáng xanh_w: 14
buổi tối mát mẻ_w: 15
bà_w: 16
bài tập_w: 17
bàn ghế_w: 18
bàn phím_w: 19
bàn ăn_w: 20
bàn_w: 21
bán_w: 22
bánh mì_w: 23
bánh_w: 24
bãi biển rộng mênh mông_w: 25
bãi cát trắng tinh_w: 26
bãi cỏ xanh mướt_w: 27
bé_w: 28
béo_w: 29
bên cạnh_w: 30
bí_w: 31
bò_w: 32
bóng cây mát_w: 33
bóng_w: 34
bông hoa đẹp xinh_w: 35
bún_w: 36
bút chì_w: 37
bút_w: 38
bơ_w: 39
bưởi_w: 40
bạn bè_w: 41
bản chất_w: 42
bản tin_w: 43
bảo vệ_w: 44
bảy_w: 45
bất tiện_w: 46
bầu trời trong xanh_w: 47
bầu trời trong_w: 48
bầu trời đầy sao_w: 49
bầu_w: 50
bẩn_w: 51
bằng chứng_w: 52
bằng cách nào_w: 53
bền_w: 54
bệnh viện_w: 55
bốn_w: 56
bộ lạc_w: 57
bộ máy_w: 