# Tennis Action Recognition - Data Exploration

This notebook explores the tennis action recognition dataset with COCO format annotations.

In [None]:
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import cv2
from PIL import Image
import json

from src.data.dataset_processor import TennisDatasetProcessor

# Set style
# 'seaborn-v0_8' is the name matplotlib >= 3.6 gives its bundled seaborn style
# sheet; older matplotlib releases do not ship a style under this name.
plt.style.use('seaborn-v0_8')
# Use seaborn's "husl" palette as the default colour cycle for later plots.
sns.set_palette("husl")

# IPython magic (not plain Python): render matplotlib figures inline in the notebook.
%matplotlib inline

## 1. Dataset Overview

In [None]:
# Create the processor pointed at the dataset directory; later cells reuse it.
processor = TennisDatasetProcessor("../data/tennis_dataset")

# Annotation files to ingest -- replace with your actual annotation files.
annotation_files = ["tennis_actions.json"]
df = processor.process_dataset(annotation_files)

# Structural summary of the processed frame, followed by a preview of its rows.
column_names = df.columns.tolist()
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {column_names}")
print("\nFirst few rows:")
df.head()

## 2. Action Distribution

In [None]:
# Count how many samples carry each action label (most frequent first).
action_counts = df['action_name'].value_counts()
print("Action distribution:")
print(action_counts)

# Bar chart of the counts, drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(10, 6))
action_counts.plot(
    kind='bar',
    ax=ax,
    color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'],
)
ax.set_title('Distribution of Tennis Actions', fontsize=16, fontweight='bold')
ax.set_xlabel('Action Type', fontsize=12)
ax.set_ylabel('Number of Images', fontsize=12)
ax.tick_params(axis='x', rotation=45)
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

## 3. Keypoint Analysis

In [None]:
# Summary statistics of the per-sample keypoint count column.
keypoint_stats = df['num_keypoints'].describe()
print("Keypoint statistics:")
print(keypoint_stats)

# Histogram of the same column, drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(8, 5))
df['num_keypoints'].hist(
    ax=ax,
    bins=20,
    alpha=0.7,
    color='skyblue',
    edgecolor='black',
)
ax.set_title('Distribution of Number of Keypoints', fontsize=14, fontweight='bold')
ax.set_xlabel('Number of Keypoints', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.grid(axis='y', alpha=0.3)
plt.show()

## 4. Keypoint Visualization

In [None]:
def visualize_keypoints(image_path, keypoints, action_name, keypoint_names=None):
    """Overlay labelled keypoints on an image and show the figure.

    Args:
        image_path: Path (str or path-like) of the image to display.
        keypoints: Flat array-like of alternating x, y values. It is reshaped
            to (-1, 2) and scaled by the image width/height, i.e. the values
            are treated as normalized coordinates. The caller's array is
            never modified.
        action_name: Label shown in the figure title.
        keypoint_names: Optional sequence of labels, one per keypoint.
            Defaults to ``processor.keypoint_names`` from the notebook
            namespace. Keypoints without a matching name are labelled with
            their index.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
    """
    # cv2.imread signals failure by returning None instead of raising.
    image = cv2.imread(str(image_path))
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Work on a float copy: reshape() alone returns a view, so scaling it in
    # place would overwrite the normalized keypoints stored in the DataFrame.
    kpts = np.array(keypoints, dtype=float).reshape(-1, 2)

    # Denormalize keypoints to pixel coordinates
    h, w = image.shape[:2]
    kpts[:, 0] *= w
    kpts[:, 1] *= h

    if keypoint_names is None:
        keypoint_names = processor.keypoint_names

    # Plot
    plt.figure(figsize=(10, 8))
    plt.imshow(image)

    # Draw keypoints
    for i, (x, y) in enumerate(kpts):
        if x > 0 and y > 0:  # Only draw visible keypoints
            label = keypoint_names[i] if i < len(keypoint_names) else str(i)
            plt.plot(x, y, 'ro', markersize=8)
            plt.text(x+5, y-5, label, fontsize=8, color='white',
                    bbox=dict(boxstyle='round,pad=0.2', facecolor='red', alpha=0.7))

    plt.title(f'Tennis Action: {action_name}', fontsize=16, fontweight='bold')
    plt.axis('off')
    plt.tight_layout()
    plt.show()

# Show the first sample of every action; report (and skip) images not on disk.
for action in df['action_name'].unique():
    first_row = df.loc[df['action_name'] == action].iloc[0]
    img_path = first_row['image_path']
    if not Path(img_path).exists():
        print(f"Image not found for {action}: {img_path}")
        continue
    visualize_keypoints(img_path, first_row['keypoints'], action)

## 5. Feature Engineering Analysis

In [None]:
# Build the feature matrix and label vector from the processed frame.
X, y = processor.extract_features_for_classical_ml(df)

print(f"Feature matrix shape: {X.shape}")
print(f"Labels shape: {y.shape}")
print(f"Feature names: Keypoints (36) + Engineered features (6) = {X.shape[1]} total")

# One integer-labelled column per feature, plus the label column.
feature_df = pd.DataFrame(X)
feature_df['action'] = y

# (feature column, panel title) for each engineered feature plotted below.
panels = [
    (36, 'Hand Distance by Action'),
    (37, 'Shoulder Distance by Action'),
    (38, 'Left Arm Angle by Action'),
    (39, 'Right Arm Angle by Action'),
]

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# axes.flat walks the grid row by row, matching the order of `panels`.
for ax, (column, title) in zip(axes.flat, panels):
    per_action_values = [
        feature_df.loc[feature_df['action'] == label, column].values
        for label in range(4)
    ]
    ax.boxplot(per_action_values)
    ax.set_title(title)
    ax.set_xticklabels(processor.action_categories.values())

plt.tight_layout()
plt.show()

## 6. Data Quality Assessment

In [None]:
# Check for missing or invalid data.
# This cell must not crash on the very rows it reports: missing keypoints
# (NaN/None) have no .shape, and a missing image path cannot be turned into a Path.
EXPECTED_KEYPOINT_SHAPE = (36,)  # shape each keypoint entry is checked against

total_samples = len(df)
missing_keypoints = df['keypoints'].isna()

# Shape check only on entries that are present; missing ones are counted on
# their own line. Entries without a .shape attribute count as invalid.
invalid_shapes = sum(
    getattr(kp, 'shape', None) != EXPECTED_KEYPOINT_SHAPE
    for kp in df.loc[~missing_keypoints, 'keypoints']
)

print("Data Quality Assessment:")
print(f"Total samples: {total_samples}")
print(f"Missing image paths: {df['image_path'].isna().sum()}")
print(f"Missing keypoints: {missing_keypoints.sum()}")
print(f"Invalid keypoint shapes: {invalid_shapes}")

# Check image accessibility: a row counts as missing when it has no path at
# all or when the path does not exist on disk.
missing_images = sum(
    not (pd.notna(img_path) and Path(img_path).exists())
    for img_path in df['image_path']
)

print(f"Missing image files: {missing_images}")
if total_samples:
    print(f"Data completeness: {((total_samples - missing_images) / total_samples) * 100:.1f}%")
else:
    # Avoid dividing by zero when the processed dataset is empty.
    print("Data completeness: n/a (dataset is empty)")

## 7. Train/Validation/Test Split Analysis

In [None]:
# Create data splits
train_df, val_df, test_df = processor.create_train_val_test_split(df)

# Analyze splits
splits_info = {
    'Train': train_df,
    'Validation': val_df,
    'Test': test_df
}

# value_counts() orders classes by frequency, which can differ between splits.
# Fix one class order and one colour per class so that bar position and colour
# mean the same action in every panel.
action_order = sorted(df['action_name'].dropna().unique(), key=str)
palette = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']
bar_colors = [palette[i % len(palette)] for i in range(len(action_order))]

# Plot split distributions
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for ax, (split_name, split_df) in zip(axes, splits_info.items()):
    # Separate name: do not overwrite `action_counts` from the Action
    # Distribution section, which holds the whole-dataset counts.
    split_counts = (
        split_df['action_name']
        .value_counts()
        .reindex(action_order, fill_value=0)
    )
    split_counts.plot(kind='bar', ax=ax, color=bar_colors)
    ax.set_title(f'{split_name} Set\n({len(split_df)} samples)', fontweight='bold')
    ax.set_xlabel('Action Type')
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Print detailed split information
print("\nDetailed Split Information:")
for split_name, split_df in splits_info.items():
    print(f"\n{split_name} Set:")
    print(f"  Total samples: {len(split_df)}")
    print(f"  Action distribution:")
    # The loop body only runs when the split has at least one labelled row,
    # so len(split_df) is non-zero here.
    for action, count in split_df['action_name'].value_counts().items():
        percentage = (count / len(split_df)) * 100
        print(f"    {action}: {count} ({percentage:.1f}%)")

## 8. Save Processed Data

In [None]:
# Hand the three splits to the processor for saving under the output directory.
output_dir = "../data/processed"
processor.save_processed_data(train_df, val_df, test_df, output_dir)

print(f"Processed data saved to {output_dir}")
print("Files created:")
# File names are listed as literals; this cell does not check they exist on disk.
for created_line in ("  - train_data.pkl", "  - val_data.pkl", "  - test_data.pkl"):
    print(created_line)

## Summary

This notebook provided a comprehensive exploration of the tennis action recognition dataset:

1. **Dataset Overview**: Loaded and processed COCO format annotations
2. **Action Distribution**: Analyzed the balance of different tennis actions
3. **Keypoint Analysis**: Examined keypoint quality and completeness
4. **Keypoint Visualization**: Overlaid the annotated keypoints on a sample image for each action
5. **Feature Engineering**: Created additional features from keypoint data
6. **Data Quality**: Assessed data completeness and validity
7. **Data Splits**: Created stratified train/validation/test splits
8. **Data Export**: Saved processed data for model training

The dataset is expected to be well balanced, with about 500 images per action class (confirm this against the counts printed in Section 2), and the keypoint annotations provide rich information for action recognition.