# Гусев Яромир ПМ22-1
## Разработка системы распознавания жестов на основе методов компьютерного зрения

In [1]:
import pandas as pd
import numpy as np

# Hand-landmark dataset; the `label` column holds the gesture name.
dataset_path = 'data/gesture_dataset.csv'
df = pd.read_csv(dataset_path)

## Данные

Собирались при помощи MediaPipe - библиотеки, позволяющей получать 21 ключевую точку кисти

![alt text](assets/image.png)

![alt text](assets/image-1.png)

Общая статистика

In [2]:
# Gather the summary numbers first, then print them in one place.
n_records = len(df)
class_counts = df['label'].value_counts().sort_index()
n_missing = df.isnull().sum().sum()

print(f"Всего записей: {n_records}")
print("\nРаспределение по классам:")
print(class_counts)
print(f"\nПропущенные значения: {n_missing}")

Всего записей: 510

Распределение по классам:
label
down         85
left         85
neutral      85
open_palm    85
right        85
up           85
Name: count, dtype: int64

Пропущенные значения: 0


По 3 примера с каждого класса

In [3]:
# Print the first three rows of every gesture class with a few summary values.
# NOTE(review): the positional indexing below assumes the feature columns are laid
# out as 21 x-values, then 21 y-values, then 21 z-values — TODO confirm against the
# CSV header.
for gesture in df['label'].unique():
    print(f"\nЖест: {gesture}")
    first_rows = df.loc[df['label'] == gesture].iloc[:3]

    for idx, row in first_rows.iterrows():
        x_block = row.iloc[:21]
        y_block = row.iloc[21:42]

        # First value of each assumed coordinate block.
        x0, y0, z0 = row.iloc[0], row.iloc[21], row.iloc[42]
        x_mean, y_mean = x_block.mean(), y_block.mean()

        print(f"Пример {idx}: x0={x0:.3f}, y0={y0:.3f}, z0={z0:.3f} | x_mean={x_mean:.3f}, y_mean={y_mean:.3f}")



Жест: left
Пример 0: x0=0.270, y0=0.102, z0=0.085 | x_mean=0.204, y_mean=0.162
Пример 1: x0=0.282, y0=0.114, z0=0.092 | x_mean=0.193, y_mean=0.146
Пример 2: x0=0.305, y0=0.134, z0=0.111 | x_mean=0.170, y_mean=0.122

Жест: right
Пример 85: x0=0.675, y0=0.833, z0=0.877 | x_mean=0.427, y_mean=0.455
Пример 86: x0=0.686, y0=0.839, z0=0.882 | x_mean=0.431, y_mean=0.460
Пример 87: x0=0.723, y0=0.893, z0=0.927 | x_mean=0.484, y_mean=0.518

Жест: up
Пример 170: x0=0.473, y0=0.398, z0=0.513 | x_mean=0.232, y_mean=0.185
Пример 171: x0=0.463, y0=0.392, z0=0.505 | x_mean=0.229, y_mean=0.182
Пример 172: x0=0.474, y0=0.392, z0=0.510 | x_mean=0.229, y_mean=0.179

Жест: down
Пример 255: x0=0.492, y0=0.373, z0=0.482 | x_mean=0.375, y_mean=0.419
Пример 256: x0=0.491, y0=0.372, z0=0.481 | x_mean=0.378, y_mean=0.423
Пример 257: x0=0.478, y0=0.350, z0=0.461 | x_mean=0.374, y_mean=0.420

Жест: open_palm
Пример 340: x0=0.432, y0=0.327, z0=0.468 | x_mean=0.298, y_mean=0.243
Пример 341: x0=0.446, y0=0.349, z0=

In [82]:
from sklearn.decomposition import PCA
import plotly.express as px

# Build the feature matrix and labels from `df` inside this cell. The original cell
# read `X` and `y`, which are only created further down the notebook, so it raised
# NameError on "Restart Kernel -> Run All".
pca_features = df.iloc[:, :-1].values  # every column except the last
pca_labels = df.iloc[:, -1].values     # last column is the gesture label

pca = PCA(n_components=2)
X_pca = pca.fit_transform(pca_features)

vis_df = pd.DataFrame({
    'PC1': X_pca[:, 0],
    'PC2': X_pca[:, 1],
    'Gesture': pca_labels
})

# PC2 is drawn on the horizontal axis and PC1 on the vertical one; the `labels`
# mapping is keyed by column name, so each axis title still shows the variance share
# of the component actually plotted on it.
fig = px.scatter(vis_df, x='PC2', y='PC1', color='Gesture',
                 title='Визуализация датасета через PCA (2 компоненты)',
                 labels={'PC1': f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)',
                         'PC2': f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)'},
                 color_discrete_sequence=px.colors.qualitative.Set2)

fig.update_traces(marker=dict(size=8, opacity=0.7))
fig.update_layout(
    plot_bgcolor='white',
    xaxis=dict(showgrid=True, gridcolor='lightgray', zeroline=True),
    yaxis=dict(showgrid=True, gridcolor='lightgray', zeroline=True),
    legend=dict(title="Жест")
)
fig.show()

Из графика видно, что жесты хорошо разделяются в пространстве главных компонент. 
Жесты up и down разделены по первой компоненте, left и right - по второй. 
Центральные жесты (open_palm, neutral) находятся между ними, что объясняет их 
промежуточное положение в координатном пространстве.

## Обучение моделей

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [3]:
# Reload the dataset and split it positionally: every column but the last is a
# feature, the last column is the gesture label.
df = pd.read_csv('data/gesture_dataset.csv')
feature_frame, label_column = df.iloc[:, :-1], df.iloc[:, -1]
X = feature_frame.to_numpy()
y = label_column.to_numpy()

In [4]:
# Map gesture names to integer class ids; `le.classes_` is used later to turn the ids
# back into names for reports and confusion matrices.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

In [5]:
# Two-stage stratified split: 70% train first, then the remaining 30% is halved into
# validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y_encoded,
    test_size=0.3,
    stratify=y_encoded,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

split_sizes = (len(X_train), len(X_val), len(X_test))
print(f"Train: {split_sizes[0]}, Val: {split_sizes[1]}, Test: {split_sizes[2]}")

Train: 357, Val: 76, Test: 77


## Classic ML

In [7]:
# Classical baselines, registered in the order they are trained and reported.
models = {}
models['Random Forest'] = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
models['SVM (RBF)'] = SVC(kernel='rbf', C=10, gamma='scale', random_state=42)
models['SVM (Linear)'] = SVC(kernel='linear', C=1, random_state=42)
# Two neighbourhood sizes of the same KNN classifier.
models['KNN (k=5)'] = KNeighborsClassifier(n_neighbors=5)
models['KNN (k=3)'] = KNeighborsClassifier(n_neighbors=3)

In [8]:
# Fit every classical model on the train split and record weighted metrics for the
# train / validation / test splits, one dict per model.
results = []

for name, model in models.items():
    print(f"\nОбучение {name}...")

    model.fit(X_train, y_train)

    y_train_pred, y_val_pred, y_test_pred = (
        model.predict(part) for part in (X_train, X_val, X_test)
    )
    split_pairs = (
        (y_train, y_train_pred),
        (y_val, y_val_pred),
        (y_test, y_test_pred),
    )

    train_acc, val_acc, test_acc = (
        accuracy_score(truth, pred) for truth, pred in split_pairs
    )
    train_f1, val_f1, test_f1 = (
        f1_score(truth, pred, average='weighted') for truth, pred in split_pairs
    )

    # Precision and recall are reported for the test split only.
    test_precision = precision_score(y_test, y_test_pred, average='weighted')
    test_recall = recall_score(y_test, y_test_pred, average='weighted')

    row = {
        'Model': name,
        'Train Acc': train_acc,
        'Train F1': train_f1,
        'Val Acc': val_acc,
        'Val F1': val_f1,
        'Test Acc': test_acc,
        'Test F1': test_f1,
        'Test Precision': test_precision,
        'Test Recall': test_recall,
        # Train-minus-test F1 gap, used below as the overfitting indicator.
        'Overfitting': train_f1 - test_f1,
    }
    results.append(row)

    print(f"{name} - Test F1: {test_f1:.4f}, Test Acc: {test_acc:.4f}")


Обучение Random Forest...
Random Forest - Test F1: 0.9870, Test Acc: 0.9870

Обучение SVM (RBF)...
SVM (RBF) - Test F1: 1.0000, Test Acc: 1.0000

Обучение SVM (Linear)...
SVM (Linear) - Test F1: 1.0000, Test Acc: 1.0000

Обучение KNN (k=5)...
KNN (k=5) - Test F1: 0.9870, Test Acc: 0.9870

Обучение KNN (k=3)...
KNN (k=3) - Test F1: 1.0000, Test Acc: 1.0000


In [9]:
# One row per classical model, columns in the order the metrics were recorded.
results_df = pd.DataFrame(data=results)
results_df

Unnamed: 0,Model,Train Acc,Train F1,Val Acc,Val F1,Test Acc,Test F1,Test Precision,Test Recall,Overfitting
0,Random Forest,1.0,1.0,1.0,1.0,0.987013,0.986994,0.987941,0.987013,0.013006
1,SVM (RBF),1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,SVM (Linear),0.991597,0.991591,0.973684,0.973528,1.0,1.0,1.0,1.0,-0.008409
3,KNN (k=5),1.0,1.0,1.0,1.0,0.987013,0.986994,0.987941,0.987013,0.013006
4,KNN (k=3),1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [15]:
# Grouped bars: one bar group per model, one bar per test metric.
metrics = ['Test Acc', 'Test F1', 'Test Precision', 'Test Recall']

fig = go.Figure(data=[
    go.Bar(
        name=metric,
        x=results_df['Model'],
        y=results_df[metric],
        text=results_df[metric].round(3),
        textposition='auto',
    )
    for metric in metrics
])

fig.update_layout(
    title='Сравнение моделей по метрикам',
    xaxis_title='Модель',
    yaxis_title='Значение метрики',
    barmode='group',
    plot_bgcolor='white',
    yaxis=dict(range=[0, 1], showgrid=True, gridcolor='lightgray'),
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
)
fig.show()

# Train-minus-test F1 gap per model; gaps above 0.1 are drawn in red.
gap_values = results_df['Overfitting']
gap_colors = ['red' if gap > 0.1 else 'green' for gap in gap_values]

fig2 = go.Figure(go.Bar(
    x=results_df['Model'],
    y=gap_values,
    marker_color=gap_colors,
    text=gap_values.round(3),
    textposition='auto'
))
fig2.update_layout(
    title='Анализ переобучения (Train F1 - Test F1)',
    xaxis_title='Модель',
    yaxis_title='Overfitting Gap',
    plot_bgcolor='white',
    yaxis=dict(showgrid=True, gridcolor='lightgray')
)
fig2.show()


In [None]:
# Pick the classical model with the highest test F1 (idxmax keeps the first row on
# ties) and draw its confusion matrix on the test split.
# NOTE(review): selecting on 'Test F1' uses the test split for model selection;
# consider ranking by 'Val F1' and keeping the test split for the final report only.
best_row = results_df['Test F1'].idxmax()
best_model_name = results_df.loc[best_row, 'Model']
best_model = models[best_model_name]

y_test_pred = best_model.predict(X_test)
cm = confusion_matrix(y_test, y_test_pred)

# Rows are actual classes, columns are predicted classes, both in `le.classes_` order.
class_names = le.classes_
fig = px.imshow(
    cm,
    labels=dict(x="Predicted", y="Actual", color="Count"),
    x=class_names,
    y=class_names,
    text_auto=True,
    color_continuous_scale='Blues',
    title=f'Confusion Matrix - {best_model_name}',
)
fig.update_xaxes(side="bottom")
fig.show()

In [17]:
# Per-class precision / recall / F1 of the selected classical model on the test split.
report_text = classification_report(y_test, y_test_pred, target_names=le.classes_)
print(f"{best_model_name} - Classification Report:")
print(report_text)

SVM (RBF) - Classification Report:
              precision    recall  f1-score   support

        down       1.00      1.00      1.00        13
        left       1.00      1.00      1.00        12
     neutral       1.00      1.00      1.00        13
   open_palm       1.00      1.00      1.00        13
       right       1.00      1.00      1.00        13
          up       1.00      1.00      1.00        13

    accuracy                           1.00        77
   macro avg       1.00      1.00      1.00        77
weighted avg       1.00      1.00      1.00        77



### Выводы по Classical ML моделям
Обучены 5 классических моделей. Лучшие результаты на тесте показали SVM Linear (F1=1.0, время=0.41ms), 
SVM RBF (F1=1.0, время=0.73ms) и KNN (k=3) (F1=1.0, время=6.64ms). Random Forest и KNN (k=5) показали немного более низкий F1=0.987, 
что связано с небольшим количеством ошибок на сложных граничных случаях...

## DL models

In [44]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

In [45]:
class GestureDataset(Dataset):
    """In-memory dataset pairing float32 feature rows with int64 class ids."""

    def __init__(self, X, y):
        # Convert once up front so __getitem__ is a plain tensor index.
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.int64)

    def __len__(self):
        # Number of samples = size of the first tensor dimension.
        return self.X.shape[0]

    def __getitem__(self, idx):
        features, target = self.X[idx], self.y[idx]
        return features, target

# Wrap each split in a dataset; only the training loader shuffles.
train_dataset, val_dataset, test_dataset = (
    GestureDataset(features, targets)
    for features, targets in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

loader_batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=loader_batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=loader_batch_size)
test_loader = DataLoader(test_dataset, batch_size=loader_batch_size)

In [46]:
class GestureMLP(nn.Module):
    """Fully connected classifier: input_dim -> 128 -> 64 -> 32 -> num_classes.

    Dropout (p=0.3, then p=0.2) follows the first two hidden activations; the
    output layer returns raw logits.
    """

    def __init__(self, input_dim=63, num_classes=6):
        super().__init__()
        # (in_features, out_features, dropout probability or None) per hidden layer.
        hidden_spec = [(input_dim, 128, 0.3), (128, 64, 0.2), (64, 32, None)]

        layers = []
        for in_features, out_features, drop_p in hidden_spec:
            layers += [nn.Linear(in_features, out_features), nn.ReLU()]
            if drop_p is not None:
                layers.append(nn.Dropout(drop_p))
        layers.append(nn.Linear(32, num_classes))

        # Stored as `self.network` so state_dict keys stay `network.<index>.*`.
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        logits = self.network(x)
        return logits

In [None]:
# Seed torch before building the model: weight initialisation (and the shuffling done
# by the training DataLoader) draw from torch's global RNG, so without a seed every
# "Restart & Run All" produced different MLP metrics. 42 matches the random_state
# used for the data split and the sklearn models.
torch.manual_seed(42)

mlp_model = GestureMLP()
mlp_criterion = nn.CrossEntropyLoss()
mlp_optimizer = torch.optim.AdamW(mlp_model.parameters(), lr=0.001)
# Halve the learning rate every 10 scheduler steps.
mlp_scheduler = torch.optim.lr_scheduler.StepLR(mlp_optimizer, step_size=10, gamma=0.5)

# Class id -> gesture name; read by train_model() when printing the test report.
label_to_language = {i: name for i, name in enumerate(le.classes_)}

In [None]:
from tqdm.notebook import tqdm
import copy

def train_model(train_loader, val_loader, test_loader, model, criterion, optimizer, scheduler,
               epochs=50, log_every=5, early_stopping_patience=5, draw_met=True, draw_tqdm=True):
    """Train `model` with early stopping on validation F1, then evaluate on the test loader.

    Args:
        train_loader: iterable of (inputs, labels) batches used for optimisation.
        val_loader: iterable of (inputs, labels) batches used for early stopping.
        test_loader: iterable of (inputs, labels) batches evaluated once after training.
        model: torch module; it is moved to CUDA when available, otherwise CPU.
        criterion: loss called as criterion(outputs, labels).
        optimizer: optimizer stepped once per training batch.
        scheduler: LR scheduler stepped once per epoch that does not trigger early stopping.
        epochs: maximum number of epochs.
        log_every: print a metrics line every `log_every` epochs (only when draw_tqdm is True).
        early_stopping_patience: number of consecutive epochs without a strict
            improvement of validation F1 after which training stops.
        draw_met: when True, show a plotly figure with train/val F1 per epoch.
        draw_tqdm: when True, show a tqdm progress bar; the same flag also gates the
            periodic log lines and the final test metrics / classification report.

    Returns:
        The same `model` object, with the weights of the best validation-F1 epoch
        loaded (if at least one epoch reached a validation F1 above 0.0).

    Side effects:
        Overwrites 'best_model.pth' in the working directory on every improvement,
        prints to stdout and may display a figure. Uses the module-level names
        `f1_score`, `classification_report`, `go`, `tqdm`, `copy` and
        `label_to_language`, which must be defined before the call.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Per-epoch metric histories (loss histories are collected but not plotted below).
    train_loss_history = []
    train_f1_history = []
    val_loss_history = []
    val_f1_history = []

    # Early-stopping state: best validation F1 so far and the weights that produced it.
    best_val_f1 = 0.0
    patience_counter = 0
    best_model_state = None

    if draw_tqdm:
        epoch_bar = tqdm(range(epochs), desc='Training', leave=True)
    else:
        epoch_bar = range(epochs)

    for epoch in epoch_bar:
        # ---- training pass ----
        model.train()
        epoch_train_loss = 0.0
        all_train_preds = []
        all_train_labels = []

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            epoch_train_loss += loss.item()
            # Predicted class = index of the largest output along dim 1.
            _, preds = torch.max(outputs, 1)
            all_train_preds.extend(preds.cpu().numpy())
            all_train_labels.extend(labels.cpu().numpy())

        # Loss is averaged over batches (not samples); F1 is computed over the whole epoch.
        train_loss = epoch_train_loss / len(train_loader)
        train_f1 = f1_score(all_train_labels, all_train_preds, average='weighted')
        train_loss_history.append(train_loss)
        train_f1_history.append(train_f1)

        # ---- validation pass ----
        model.eval()
        epoch_val_loss = 0.0
        all_val_preds = []
        all_val_labels = []

        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)

                epoch_val_loss += loss.item()
                _, preds = torch.max(outputs, 1)
                all_val_preds.extend(preds.cpu().numpy())
                all_val_labels.extend(labels.cpu().numpy())

        val_loss = epoch_val_loss / len(val_loader)
        val_f1 = f1_score(all_val_labels, all_val_preds, average='weighted')
        val_loss_history.append(val_loss)
        val_f1_history.append(val_f1)


        if draw_tqdm:
            epoch_bar.set_postfix({
                'train_loss': f'{train_loss:.4f}',
                'val_loss': f'{val_loss:.4f}',
                'val_f1': f'{val_f1:.4f}'
            })

        # Strict '>' comparison: an epoch that only ties the best F1 counts as "no
        # improvement", so once F1 reaches 1.0 the patience counter can only grow.
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            # deepcopy so later optimizer steps do not modify the saved snapshot.
            best_model_state = copy.deepcopy(model.state_dict())
            patience_counter = 0
            # NOTE(review): fixed file name — every call to train_model overwrites it.
            torch.save(model.state_dict(), 'best_model.pth')
        else:
            patience_counter += 1
            if patience_counter >= early_stopping_patience:
                print(f'\nEarly stopping at epoch {epoch + 1}')
                break

        scheduler.step()
        if draw_tqdm:
            if (epoch + 1) % log_every == 0:
                print(f'Epoch: {epoch + 1}/{epochs} |  Val Loss: {val_loss:.4f} | Train F1: {train_f1:.4f} | Val F1: {val_f1:.4f}')

    # Restore the best-validation snapshot before the final evaluation.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    if draw_met:
        # Train vs validation F1 per epoch.
        fig = go.Figure()
        fig.add_trace(go.Scatter(
            y=train_f1_history,
            name='Train F1',
            line=dict(color="#00aa88"),
            mode='lines+markers'
        ))
        fig.add_trace(go.Scatter(
            y=val_f1_history,
            name='Val F1',
            line=dict(color="#fbb725"),
            mode='lines+markers'
        ))
        fig.update_layout(
            xaxis_title='Epoch',
            yaxis_title='F1 Score',
            plot_bgcolor='white',
            xaxis=dict(showgrid=True, gridcolor='lightgray'),
            yaxis=dict(showgrid=True, gridcolor='lightgray'),
            legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
        )
        fig.show()

    # ---- final evaluation on the test loader with the restored weights ----
    model.eval()
    all_test_preds = []
    all_test_labels = []
    test_loss = 0.0

    with torch.no_grad():
        for inputs, labels in test_loader:  
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            test_loss += loss.item()
            _, preds = torch.max(outputs, 1)
            all_test_preds.extend(preds.cpu().numpy())
            all_test_labels.extend(labels.cpu().numpy())

    test_loss = test_loss / len(test_loader)
    test_f1 = f1_score(all_test_labels, all_test_preds, average='weighted')
    if draw_tqdm:
        print(f'\nFinal Test Metrics:')
        print(f'Test Loss: {test_loss:.4f} | Test F1: {test_f1:.4f}')

        print(f'\nClassification report on test')
        # Class names come from the module-level `label_to_language` mapping, ordered by class id.
        print(classification_report(all_test_labels, all_test_preds, target_names=[label_to_language[i] for i in sorted(label_to_language.keys())]))

    # Test metrics are only printed, not returned.
    return model

In [49]:
# Train the MLP for up to 50 epochs, stopping after 10 epochs without a better validation F1.
trained_mlp = train_model(
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    model=mlp_model,
    criterion=mlp_criterion,
    optimizer=mlp_optimizer,
    scheduler=mlp_scheduler,
    epochs=50,
    log_every=5,
    early_stopping_patience=10,
)

Training:   0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 5/50 |  Val Loss: 1.1763 | Train F1: 0.3614 | Val F1: 0.4720
Epoch: 10/50 |  Val Loss: 0.5479 | Train F1: 0.6839 | Val F1: 0.7465
Epoch: 15/50 |  Val Loss: 0.4381 | Train F1: 0.8030 | Val F1: 0.7991
Epoch: 20/50 |  Val Loss: 0.3679 | Train F1: 0.8266 | Val F1: 0.8125
Epoch: 25/50 |  Val Loss: 0.3431 | Train F1: 0.7977 | Val F1: 0.8025
Epoch: 30/50 |  Val Loss: 0.3226 | Train F1: 0.8492 | Val F1: 0.8470
Epoch: 35/50 |  Val Loss: 0.3216 | Train F1: 0.8483 | Val F1: 0.8756
Epoch: 40/50 |  Val Loss: 0.3051 | Train F1: 0.8617 | Val F1: 0.8710

Early stopping at epoch 44



Final Test Metrics:
Test Loss: 0.2581 | Test F1: 0.9220

Classification report on test
              precision    recall  f1-score   support

        down       1.00      1.00      1.00        13
        left       1.00      1.00      1.00        12
     neutral       0.83      0.77      0.80        13
   open_palm       0.79      0.85      0.81        13
       right       0.93      1.00      0.96        13
          up       1.00      0.92      0.96        13

    accuracy                           0.92        77
   macro avg       0.92      0.92      0.92        77
weighted avg       0.92      0.92      0.92        77



In [39]:
class GestureConv1D(nn.Module):
    """1D convolutional classifier over hand landmarks.

    The flat input vector is viewed as 3 channels of `input_dim // 3` points each,
    passed through three Conv1d blocks and a small fully connected head that
    returns raw logits.

    Args:
        input_dim: length of the flat feature vector; must be a positive multiple
            of 3 (default 63 = 3 channels x 21 points).
        num_classes: number of output logits.

    Raises:
        ValueError: if `input_dim` is not a positive multiple of 3.
    """

    def __init__(self, input_dim=63, num_classes=6):
        super().__init__()
        # The original ignored `input_dim` and hard-coded 21 points in forward(),
        # so any other value failed at the first forward pass.
        if input_dim <= 0 or input_dim % 3 != 0:
            raise ValueError(
                f"input_dim must be a positive multiple of 3, got {input_dim}"
            )
        self.num_points = input_dim // 3

        # Shape comments below are for the default 21 points.
        self.conv_net = nn.Sequential(
            nn.Conv1d(in_channels=3, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm1d(32),
            nn.MaxPool1d(2),  # -> (batch, 32, 10)

            nn.Conv1d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.MaxPool1d(2),  # -> (batch, 64, 5)

            nn.Conv1d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.AdaptiveAvgPool1d(1)  # -> (batch, 128, 1)
        )

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, num_classes)
        )

    def forward(self, x):
        # (batch, input_dim) -> (batch, 3, num_points).
        # NOTE(review): this treats the vector as three consecutive blocks (all values
        # of channel 0, then channel 1, then channel 2). If the CSV stores x, y, z
        # interleaved per landmark, the channels would be mixed — TODO confirm the
        # column order of the dataset.
        batch_size = x.size(0)
        x = x.view(batch_size, 3, self.num_points)
        x = self.conv_net(x)
        x = self.classifier(x)
        return x

In [None]:
# Seed torch before building the model so weight initialisation and the training
# shuffle order are reproducible on "Restart & Run All"; 42 matches the random_state
# used for the data split and the sklearn models.
torch.manual_seed(42)

conv_model = GestureConv1D()
conv_criterion = nn.CrossEntropyLoss()
conv_optimizer = torch.optim.AdamW(conv_model.parameters(), lr=0.001)
# Halve the learning rate every 10 scheduler steps.
conv_scheduler = torch.optim.lr_scheduler.StepLR(conv_optimizer, step_size=10, gamma=0.5)

In [51]:
# Train the Conv1D model for up to 50 epochs, stopping after 10 epochs without a better validation F1.
trained_conv = train_model(
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    model=conv_model,
    criterion=conv_criterion,
    optimizer=conv_optimizer,
    scheduler=conv_scheduler,
    epochs=50,
    log_every=5,
    early_stopping_patience=10,
)

Training:   0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 5/50 |  Val Loss: 0.1321 | Train F1: 0.9575 | Val F1: 1.0000
Epoch: 10/50 |  Val Loss: 0.0115 | Train F1: 0.9972 | Val F1: 1.0000

Early stopping at epoch 15



Final Test Metrics:
Test Loss: 0.1105 | Test F1: 1.0000

Classification report on test
              precision    recall  f1-score   support

        down       1.00      1.00      1.00        13
        left       1.00      1.00      1.00        12
     neutral       1.00      1.00      1.00        13
   open_palm       1.00      1.00      1.00        13
       right       1.00      1.00      1.00        13
          up       1.00      1.00      1.00        13

    accuracy                           1.00        77
   macro avg       1.00      1.00      1.00        77
weighted avg       1.00      1.00      1.00        77



Сравнение моделей

In [57]:
def get_predictions(model, loader, device):
    """Run `model` over `loader` without gradients and collect labels and predictions.

    Args:
        model: callable mapping a batch of inputs to per-class scores.
        loader: iterable of (inputs, labels) tensor batches.
        device: torch device the inputs are moved to before the forward pass.

    Returns:
        Tuple (labels, predictions) of numpy arrays — true labels first. The
        prediction is the index of the largest score along dim 1. The model's
        train/eval mode is not changed here.
    """
    collected_labels, collected_preds = [], []
    with torch.no_grad():
        for batch_inputs, batch_labels in loader:
            scores = model(batch_inputs.to(device))
            batch_preds = scores.max(dim=1).indices
            collected_preds.extend(batch_preds.cpu().numpy())
            # Labels are never moved to `device`, so they are read as-is.
            collected_labels.extend(batch_labels.numpy())
    return np.array(collected_labels), np.array(collected_preds)


def evaluate_pytorch_model(model, train_loader, val_loader, test_loader, model_name):
    """Compute the same metric row for a PyTorch model as is recorded for sklearn models.

    The model is moved to CUDA when available (otherwise CPU) and switched to eval
    mode. Accuracy and weighted F1 are computed on all three loaders; weighted
    precision and recall on the test loader only.

    Returns:
        Dict with keys 'Model', 'Train Acc', 'Train F1', 'Val Acc', 'Val F1',
        'Test Acc', 'Test F1', 'Test Precision', 'Test Recall', 'Overfitting'
        (train F1 minus test F1).
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    def _acc_and_f1(loader):
        # Returns (y_true, y_pred, accuracy, weighted F1) for one loader.
        y_true, y_pred = get_predictions(model, loader, device)
        acc = accuracy_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred, average='weighted')
        return y_true, y_pred, acc, f1

    _, _, train_acc, train_f1 = _acc_and_f1(train_loader)
    _, _, val_acc, val_f1 = _acc_and_f1(val_loader)
    y_test_true, y_test_pred, test_acc, test_f1 = _acc_and_f1(test_loader)

    test_precision = precision_score(y_test_true, y_test_pred, average='weighted')
    test_recall = recall_score(y_test_true, y_test_pred, average='weighted')

    summary = {
        'Model': model_name,
        'Train Acc': train_acc,
        'Train F1': train_f1,
        'Val Acc': val_acc,
        'Val F1': val_f1,
        'Test Acc': test_acc,
        'Test F1': test_f1,
        'Test Precision': test_precision,
        'Test Recall': test_recall,
        'Overfitting': train_f1 - test_f1,
    }
    return summary


In [58]:
# Evaluate both trained networks on all three splits, in a fixed order (MLP, Conv1D).
dl_results = [
    evaluate_pytorch_model(net, train_loader, val_loader, test_loader, label)
    for net, label in ((trained_mlp, 'MLP'), (trained_conv, 'Conv1D'))
]

dl_results_df = pd.DataFrame(dl_results)
dl_results_df

Unnamed: 0,Model,Train Acc,Train F1,Val Acc,Val F1,Test Acc,Test F1,Test Precision,Test Recall,Overfitting
0,MLP,0.929972,0.929399,0.894737,0.8923,0.922078,0.921962,0.923624,0.922078,0.007436
1,Conv1D,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [59]:
# Grouped bars: one bar group per DL model, one bar per test metric.
metrics = ['Test Acc', 'Test F1', 'Test Precision', 'Test Recall']

fig = go.Figure(data=[
    go.Bar(
        name=metric,
        x=dl_results_df['Model'],
        y=dl_results_df[metric],
        text=dl_results_df[metric].round(3),
        textposition='auto',
    )
    for metric in metrics
])

fig.update_layout(
    title='Сравнение DL моделей по метрикам',
    xaxis_title='Модель',
    yaxis_title='Значение метрики',
    barmode='group',
    plot_bgcolor='white',
    yaxis=dict(range=[0, 1], showgrid=True, gridcolor='lightgray'),
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
)
fig.show()

# Train-minus-test F1 gap per DL model; gaps above 0.1 are drawn in red.
dl_gap_values = dl_results_df['Overfitting']
dl_gap_colors = ['red' if gap > 0.1 else 'green' for gap in dl_gap_values]

fig2 = go.Figure(go.Bar(
    x=dl_results_df['Model'],
    y=dl_gap_values,
    marker_color=dl_gap_colors,
    text=dl_gap_values.round(3),
    textposition='auto'
))
fig2.update_layout(
    title='Анализ переобучения DL моделей (Train F1 - Test F1)',
    xaxis_title='Модель',
    yaxis_title='Overfitting Gap',
    plot_bgcolor='white',
    yaxis=dict(showgrid=True, gridcolor='lightgray')
)
fig2.show()

In [61]:
# Pick the DL model with the highest test F1 (idxmax keeps the first row on ties).
# NOTE(review): as with the classical models, this selects on the test split.
best_dl_idx = dl_results_df['Test F1'].idxmax()
best_dl_name = dl_results_df.loc[best_dl_idx, 'Model']
best_dl_model = trained_mlp if best_dl_name == 'MLP' else trained_conv

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
best_dl_model.to(device)
best_dl_model.eval()

# Reuse get_predictions() instead of repeating its batch loop inline; it returns
# (true labels, predictions) as numpy arrays, which the report cell below also reads.
all_labels, all_preds = get_predictions(best_dl_model, test_loader, device)

cm = confusion_matrix(all_labels, all_preds)

# Rows are actual classes, columns are predicted classes, both in `le.classes_` order.
fig = px.imshow(cm,
                labels=dict(x="Predicted", y="Actual", color="Count"),
                x=le.classes_,
                y=le.classes_,
                text_auto=True,
                color_continuous_scale='Blues',
                title=f'Confusion Matrix - {best_dl_name} (Best DL Model)')
fig.update_xaxes(side="bottom")
fig.show()


In [62]:
# Per-class precision / recall / F1 of the selected DL model on the test split.
dl_report_text = classification_report(all_labels, all_preds, target_names=le.classes_)
print(f"\n{best_dl_name} - Classification Report:")
print(dl_report_text)


Conv1D - Classification Report:
              precision    recall  f1-score   support

        down       1.00      1.00      1.00        13
        left       1.00      1.00      1.00        12
     neutral       1.00      1.00      1.00        13
   open_palm       1.00      1.00      1.00        13
       right       1.00      1.00      1.00        13
          up       1.00      1.00      1.00        13

    accuracy                           1.00        77
   macro avg       1.00      1.00      1.00        77
weighted avg       1.00      1.00      1.00        77



### Выводы по Deep Learning моделям

**MLP (Multilayer Perceptron):**
- Test F1 = 0.922 - неплохой результат, но заметно хуже классических моделей
- Overfitting = 0.007 - минимальное переобучение
- Inference Time = 0.38 ms - самая быстрая модель из всех
- Показывает, что для данной задачи простая архитектура недостаточна

**Conv1D:**
- Test F1 = 1.0 - идеальная точность на тесте
- Overfitting = 0.0 - нет признаков переобучения
- Inference Time = 0.88 ms - быстрее многих sklearn моделей
- Сверточная архитектура хорошо извлекает пространственные паттерны из последовательности landmarks

**Общий вывод:** Conv1D показала результаты на уровне лучших классических моделей (SVM, KNN), 
что подтверждает качество датасета. Однако для дальнейшего использования предпочтительнее SVM Linear из-за 
меньшего времени инференса (0.41 vs 0.88 ms) при идентичной точности.


## Сравнение ВСЕХ моделей, в том числе по времени инференса

In [63]:
import time

# Device used by the PyTorch timing cell below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One {'Model', 'Inference Time (ms)'} record per model, filled by the timing cells.
inference_times = []

# Classical and DL metric tables stacked into one frame with a fresh 0..n-1 index.
all_results_df = pd.concat((results_df, dl_results_df), ignore_index=True)

Замер времени инференса sklearn моделей

In [64]:
# Average wall-clock time of one predict() call on the whole test split, in ms.
# perf_counter() is the monotonic, high-resolution clock intended for short
# intervals; time.time() can jump and has coarser resolution on some platforms.
n_timing_runs = 100

for name, model in models.items():
    # One untimed call first, matching the warm-up the PyTorch timing cell performs.
    model.predict(X_test)

    start = time.perf_counter()
    for _ in range(n_timing_runs):
        _ = model.predict(X_test)
    elapsed = (time.perf_counter() - start) / n_timing_runs * 1000

    inference_times.append({'Model': name, 'Inference Time (ms)': elapsed})
    print(f"{name}: {elapsed:.2f} ms")

Random Forest: 4.16 ms
SVM (RBF): 0.73 ms
SVM (Linear): 0.41 ms
KNN (k=5): 6.42 ms
KNN (k=3): 6.64 ms


Замер времени инференса PyTorch моделей

In [65]:
pytorch_models = {
    'MLP': trained_mlp,
    'Conv1D': trained_conv
}

# The whole test split is sent through the network as a single batch.
test_batch = torch.FloatTensor(X_test).to(device)

for name, model in pytorch_models.items():
    model.to(device)
    model.eval()

    # Warm-up passes (not timed).
    with torch.no_grad():
        for _ in range(10):
            _ = model(test_batch)

    # CUDA kernels run asynchronously: without a synchronize() before reading the
    # clock, the timer would only measure kernel launch. On CPU this is a no-op branch.
    if device.type == 'cuda':
        torch.cuda.synchronize()

    # perf_counter() is the monotonic high-resolution clock for short intervals.
    start = time.perf_counter()
    with torch.no_grad():
        for _ in range(100):
            _ = model(test_batch)
    if device.type == 'cuda':
        torch.cuda.synchronize()
    elapsed = (time.perf_counter() - start) / 100 * 1000

    inference_times.append({'Model': name, 'Inference Time (ms)': elapsed})
    print(f"{name}: {elapsed:.2f} ms")

MLP: 0.38 ms
Conv1D: 0.88 ms


Итоговое сравнение

In [66]:
# The timing cells append to `inference_times`, so re-running them leaves several
# records per model and the merge would then duplicate rows. Keep only the most
# recent measurement per model; on a clean top-to-bottom run this changes nothing.
inference_df = (
    pd.DataFrame(inference_times)
    .drop_duplicates(subset='Model', keep='last')
    .reset_index(drop=True)
)
final_results = all_results_df.merge(inference_df, on='Model')

final_results

Unnamed: 0,Model,Train Acc,Train F1,Val Acc,Val F1,Test Acc,Test F1,Test Precision,Test Recall,Overfitting,Inference Time (ms)
0,Random Forest,1.0,1.0,1.0,1.0,0.987013,0.986994,0.987941,0.987013,0.013006,4.158442
1,SVM (RBF),1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.728817
2,SVM (Linear),0.991597,0.991591,0.973684,0.973528,1.0,1.0,1.0,1.0,-0.008409,0.412803
3,KNN (k=5),1.0,1.0,1.0,1.0,0.987013,0.986994,0.987941,0.987013,0.013006,6.423197
4,KNN (k=3),1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,6.638033
5,MLP,0.929972,0.929399,0.894737,0.8923,0.922078,0.921962,0.923624,0.922078,0.007436,0.380311
6,Conv1D,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.879846


Отсортированный рейтинг моделей

In [74]:
# Rank by test F1 (best first); ties are broken by the shorter inference time.
ranking_keys = ['Test F1', 'Inference Time (ms)']
ranking_columns = ['Model', 'Test F1', 'Test Acc', 'Inference Time (ms)', 'Overfitting']

final_results_sorted = final_results.sort_values(by=ranking_keys, ascending=[False, True])

final_results_sorted[ranking_columns]

Unnamed: 0,Model,Test F1,Test Acc,Inference Time (ms),Overfitting
2,SVM (Linear),1.0,1.0,0.412803,-0.008409
1,SVM (RBF),1.0,1.0,0.728817,0.0
6,Conv1D,1.0,1.0,0.879846,0.0
4,KNN (k=3),1.0,1.0,6.638033,0.0
0,Random Forest,0.986994,0.987013,4.158442,0.013006
3,KNN (k=5),0.986994,0.987013,6.423197,0.013006
5,MLP,0.921962,0.922078,0.380311,0.007436


Финальная визуализация метрик

In [83]:
# Figure 1: grouped bars of the four test metrics for all seven models.
metrics = ['Test Acc', 'Test F1', 'Test Precision', 'Test Recall']

fig = go.Figure(data=[
    go.Bar(
        name=metric,
        x=final_results['Model'],
        y=final_results[metric],
        text=final_results[metric].round(3),
        textposition='auto',
    )
    for metric in metrics
])

fig.update_layout(
    title='Сравнение всех моделей по метрикам',
    xaxis_title='Модель',
    yaxis_title='Значение метрики',
    barmode='group',
    plot_bgcolor='white',
    height=500,
    # The axis starts at 0.9 so that small differences between models are visible.
    yaxis=dict(range=[0.9, 1.01], showgrid=True, gridcolor='lightgray'),
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
)
fig.show()

# Figure 2: quality vs speed scatter, one trace per model family.
ml_models_list = list(models.keys())  # not read anywhere below in this cell

# Adds a 'Type' column to `final_results` in place.
is_deep_learning = final_results['Model'].isin(['MLP', 'Conv1D'])
final_results['Type'] = np.where(is_deep_learning, 'Deep Learning', 'Classical ML')

family_traces = []
for model_type in ['Classical ML', 'Deep Learning']:
    data = final_results[final_results['Type'] == model_type]
    family_traces.append(go.Scatter(
        x=data['Inference Time (ms)'],
        y=data['Test F1'],
        mode='markers+text',
        name=model_type,
        text=data['Model'],
        textposition='top center',
        marker=dict(size=15),
        hovertemplate='<b>%{text}</b><br>F1: %{y:.4f}<br>Time: %{x:.2f} ms<extra></extra>'
    ))
fig2 = go.Figure(data=family_traces)

fig2.update_layout(
    title='Trade-off: Качество (F1) vs Скорость инференса',
    xaxis_title='Inference Time (ms)',
    yaxis_title='Test F1 Score',
    plot_bgcolor='white',
    height=600,
    # Log-scaled time axis.
    xaxis=dict(showgrid=True, gridcolor='lightgray', type='log'),
    yaxis=dict(showgrid=True, gridcolor='lightgray', range=[0.92, 1.01]),
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
)
fig2.show()

# Figure 3: inference time per model, bar colour scaled by the time itself.
timing_ms = final_results['Inference Time (ms)']
fig3 = go.Figure(go.Bar(
    x=final_results['Model'],
    y=timing_ms,
    text=timing_ms.round(2),
    textposition='auto',
    marker_color=timing_ms,
    marker_colorscale='Reds'
))
fig3.update_layout(
    title='Время инференса моделей',
    xaxis_title='Модель',
    yaxis_title='Время (ms)',
    plot_bgcolor='white',
    yaxis=dict(showgrid=True, gridcolor='lightgray')
)
fig3.show()

Выбор лучшей модели

In [77]:
# Keep models with test F1 of at least 0.98, then take the one with the shortest inference time.
meets_quality_bar = final_results['Test F1'].ge(0.98)
prod_candidates = final_results.loc[meets_quality_bar]

fastest_row = prod_candidates['Inference Time (ms)'].idxmin()
best_prod_model = prod_candidates.loc[fastest_row]

summary_lines = [
    f"Модель: {best_prod_model['Model']}",
    f"Test F1: {best_prod_model['Test F1']:.4f}",
    f"Test Accuracy: {best_prod_model['Test Acc']:.4f}",
    f"Inference Time: {best_prod_model['Inference Time (ms)']:.2f} ms",
    f"Overfitting Gap: {best_prod_model['Overfitting']:.4f}",
]
for line in summary_lines:
    print(line)

Модель: SVM (Linear)
Test F1: 1.0000
Test Accuracy: 1.0000
Inference Time: 0.41 ms
Overfitting Gap: -0.0084


Сохраняю лучшую модель

In [78]:
# Persist the selected model: PyTorch networks as a state_dict (.pth), sklearn
# estimators through joblib (.pkl).
# NOTE(review): only the model is written; the fitted LabelEncoder `le` is not saved,
# so whoever loads the file needs the class order from elsewhere.
chosen_name = best_prod_model['Model']

if chosen_name in ['MLP', 'Conv1D']:
    model_to_save = trained_mlp if chosen_name == 'MLP' else trained_conv
    output_path = f"best_prod_model_{chosen_name}.pth"
    torch.save(model_to_save.state_dict(), output_path)
else:
    import joblib
    model_to_save = models[chosen_name]
    output_path = f"best_prod_model_{chosen_name}.pkl"
    joblib.dump(model_to_save, output_path)

print(f"\nМодель сохранена в {output_path}")


Модель сохранена в best_prod_model_SVM (Linear).pkl


## Итоговые выводы

1. Собран качественный датасет из 510 примеров (6 классов по 85 примеров)
2. Обучено и сравнено 7 моделей разных типов
3. Лучшая модель - SVM Linear: F1=1.0, время=0.41ms
4. Высокие метрики объясняются качественной подготовкой данных
5. Для дальнейшего использования выбран SVM Linear как оптимальный по скорости и точности

### Перспективы развития:
- Тестирование на других пользователях
- Расширение набора жестов
- Добавление динамических жестов через LSTM
