# YOLO Person Detection Model Training

This notebook trains a YOLO model for person detection and position tracking for crowd analysis.

## Dataset Requirements
- Images with people in various crowd scenarios
- YOLO format annotations (class_id x_center y_center width height)
- Class 0: person

## Output
- Trained YOLO model weights
- Position data for each detected person
- Bounding box coordinates normalized to image dimensions


In [None]:
# Install required packages
%pip install ultralytics torch torchvision opencv-python numpy matplotlib pillow

# Import libraries
import torch
import cv2
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import yaml
from ultralytics import YOLO
import os
import time
from google.colab import drive


In [None]:
# Expose Google Drive under /content/drive so the dataset stored there is readable
drive.mount('/content/drive')

# Root folder of the dataset on Drive (edit this to match your upload location)
DATASET_PATH = '/content/drive/MyDrive/CrowdProject/dataset'

# Print an indented tree of the dataset, listing at most five files per folder
if not os.path.exists(DATASET_PATH):
    print(f"Dataset not found at: {DATASET_PATH}")
    print("Please upload your dataset to Google Drive and update DATASET_PATH")
else:
    print(f"Dataset found at: {DATASET_PATH}")
    print("\nDataset structure:")
    for current_dir, _subdirs, filenames in os.walk(DATASET_PATH):
        # Depth below DATASET_PATH = number of separators left after stripping the root
        depth = current_dir.replace(DATASET_PATH, '').count(os.sep)
        dir_pad = '  ' * depth
        file_pad = '  ' * (depth + 1)
        print(f"{dir_pad}{os.path.basename(current_dir)}/")
        for shown in filenames[:5]:
            print(f"{file_pad}{shown}")
        hidden = len(filenames) - 5
        if hidden > 0:
            print(f"{file_pad}... and {hidden} more files")


In [None]:
# Build the dataset description that Ultralytics reads at training time.
# Split folders are given relative to `path`; there is a single class, "person".
config_content = f"""
# YOLO Dataset Configuration for Person Detection
path: {DATASET_PATH}
train: images/train
val: images/val
test: images/test

# Classes
nc: 1  # number of classes
names: ['person']  # class names
"""

# Persist the configuration on the Colab VM's local disk
config_path = '/content/dataset.yaml'
Path(config_path).write_text(config_content)

# Echo what was written so it can be checked by eye
print(f"Dataset configuration created at: {config_path}")
print("\nConfiguration content:", config_content, sep="\n")


In [None]:
# Start from the pre-trained YOLOv8 nano checkpoint (smallest variant, fastest to train)
model = YOLO('yolov8n.pt')

# Pick the training device: GPU when CUDA is usable, otherwise CPU
cuda_ready = torch.cuda.is_available()
device = 'cuda' if cuda_ready else 'cpu'
print(f"Using device: {device}")
print(f"CUDA available: {cuda_ready}")

# Report the name and total memory (in GiB) of GPU 0 when one is present
if cuda_ready:
    gpu_name = torch.cuda.get_device_name(0)
    gpu_total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU: {gpu_name}")
    print(f"GPU Memory: {gpu_total_bytes / 1024**3:.1f} GB")


In [None]:
# Train the model
# All settings are gathered in one mapping so they can be inspected or tweaked
# before the run is launched.
train_args = {
    # Run setup
    'data': config_path,
    'epochs': 100,
    'imgsz': 640,
    'batch': 16,
    'device': device,
    'project': '/content/runs/detect',
    'name': 'person_detection',
    'save_period': 10,        # Save a checkpoint every 10 epochs
    'patience': 20,           # Early-stopping patience (epochs)
    # Optimizer schedule
    'lr0': 0.01,              # Initial learning rate
    'lrf': 0.01,              # Final learning rate setting (Ultralytics docs: fraction of lr0)
    'momentum': 0.937,        # SGD momentum
    'weight_decay': 0.0005,   # Weight decay
    'warmup_epochs': 3,       # Warmup epochs
    'warmup_momentum': 0.8,   # Warmup momentum
    'warmup_bias_lr': 0.1,    # Warmup bias learning rate
    # Loss gains
    'box': 7.5,               # Box loss gain
    'cls': 0.5,               # Class loss gain
    'dfl': 1.5,               # DFL loss gain
    # Augmentation
    # NOTE(review): Ultralytics documents `augment` as a predict-time option;
    # confirm it has any effect when passed to train().
    'augment': True,
    'hsv_h': 0.015,           # HSV hue jitter
    'hsv_s': 0.7,             # HSV saturation jitter
    'hsv_v': 0.4,             # HSV value jitter
    'degrees': 0.0,           # Rotation (+/- deg)
    'translate': 0.1,         # Translation (+/- fraction)
    'scale': 0.5,             # Scale (+/- gain)
    'shear': 0.0,             # Shear (+/- deg)
    'perspective': 0.0,       # Perspective (+/- fraction)
    'flipud': 0.0,            # Up-down flip probability
    'fliplr': 0.5,            # Left-right flip probability
    'mosaic': 1.0,            # Mosaic probability
    'mixup': 0.0,             # Mixup probability
    'copy_paste': 0.0,        # Segment copy-paste probability
}

results = model.train(**train_args)

print("Training completed!")


In [None]:
# Run validation with the trained model and report the headline box metrics
metrics = model.val()
box_metrics = metrics.box

print("\nValidation Results:")
for label, value in (
    ("mAP50", box_metrics.map50),
    ("mAP50-95", box_metrics.map),
    ("Precision", box_metrics.mp),
    ("Recall", box_metrics.mr),
):
    print(f"{label}: {value:.4f}")


In [None]:
# Test inference on sample images
def test_inference(model, image_path, conf_threshold=0.5):
    """
    Run inference on one image and collect the detected person positions.

    Args:
        model: Callable detector (e.g. an Ultralytics ``YOLO`` instance); it is
            invoked as ``model(image_path, conf=conf_threshold)`` and must
            return an iterable of result objects.
        image_path: Image source forwarded unchanged to ``model``.
        conf_threshold: Minimum confidence; only detections strictly above it
            are kept. Defaults to 0.5.

    Returns:
        Tuple ``(annotated_image, person_positions)``:
        - annotated_image: ``result.plot()`` of the last result, or ``None``
          when the model returned no results.
        - person_positions: list of dicts with keys ``x_center``, ``y_center``,
          ``width``, ``height`` (all in pixels of the original image) and
          ``confidence``.
    """
    results = model(image_path, conf=conf_threshold)

    person_positions = []
    annotated_image = None

    for result in results:
        # Keep the rendered image of the most recent result
        annotated_image = result.plot()

        boxes = result.boxes
        if boxes is None:
            continue

        for box in boxes:
            cls = int(box.cls[0])
            conf = float(box.conf[0])

            # Keep only class 0 (person) above the confidence threshold
            if cls != 0 or conf <= conf_threshold:
                continue

            # `xywh` is already expressed in pixels of the original image
            # (the normalized variant is `xywhn`), so it must NOT be scaled
            # by the image width/height again.
            x_center, y_center, width, height = (float(v) for v in box.xywh[0])

            person_positions.append({
                'x_center': x_center,
                'y_center': y_center,
                'width': width,
                'height': height,
                'confidence': conf,
            })

    return annotated_image, person_positions


# The name starts with "test_", so tell pytest not to collect this helper as a test.
test_inference.__test__ = False

# Try the detector on one image (point this at your own test image)
sample_image_path = '/content/sample_test.jpg'

if not os.path.exists(sample_image_path):
    print("Sample image not found. Please upload a test image to test inference.")
else:
    annotated_img, positions = test_inference(model, sample_image_path)
    person_count = len(positions)

    # One summary line per detection, numbered from 1
    print(f"Detected {person_count} persons:")
    for number, det in enumerate(positions, start=1):
        center_text = f"({det['x_center']:.1f}, {det['y_center']:.1f})"
        size_text = f"({det['width']:.1f}, {det['height']:.1f})"
        print(f"Person {number}: Center={center_text}, "
              f"Size={size_text}, "
              f"Confidence={det['confidence']:.3f}")

    # Show the annotated frame; it is converted BGR -> RGB for matplotlib
    if annotated_img is not None:
        plt.figure(figsize=(12, 8))
        rgb_view = cv2.cvtColor(annotated_img, cv2.COLOR_BGR2RGB)
        plt.imshow(rgb_view)
        plt.title(f"Person Detection Results - {person_count} persons detected")
        plt.axis('off')
        plt.show()


In [None]:
# Persist the trained weights (and their ONNX export) to Google Drive
import shutil

model_path = '/content/drive/MyDrive/CrowdProject/models/best_person_detection.pt'

# Make sure the destination folder on Drive exists
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Export the model: ONNX first, then TorchScript
for export_format in ('onnx', 'torchscript'):
    model.export(format=export_format)

# Copy the best checkpoint of the training run to Drive
best_model_path = '/content/runs/detect/person_detection/weights/best.pt'
if not os.path.exists(best_model_path):
    print("Best model not found. Training may have failed.")
else:
    shutil.copy2(best_model_path, model_path)
    print(f"Best model saved to: {model_path}")

    # Copy the ONNX file as well, if the export left one next to the weights
    onnx_path = '/content/drive/MyDrive/CrowdProject/models/best_person_detection.onnx'
    local_onnx = '/content/runs/detect/person_detection/weights/best.onnx'
    if os.path.exists(local_onnx):
        shutil.copy2(local_onnx, onnx_path)
        print(f"ONNX model saved to: {onnx_path}")


In [None]:
# Create inference script for the backend.
# The text below is written verbatim to Drive as a standalone module, so it
# must import everything it uses itself (it cannot rely on this notebook).
inference_script = '''
import cv2
import numpy as np
from ultralytics import YOLO
import json
import time
from typing import List, Dict, Tuple

class PersonDetector:
    def __init__(self, model_path: str, conf_threshold: float = 0.5):
        """
        Initialize the person detector

        Args:
            model_path: Path to the trained YOLO model
            conf_threshold: Confidence threshold for detections
        """
        self.model = YOLO(model_path)
        self.conf_threshold = conf_threshold

    def detect_persons(self, image: np.ndarray) -> List[Dict]:
        """
        Detect persons in an image and return their positions

        Args:
            image: Input image as numpy array

        Returns:
            List of dictionaries, one per detected person, holding the box
            center/size in pixels, the same values normalized to 0-1, the
            confidence and a sequential id
        """
        results = self.model(image, conf=self.conf_threshold)

        person_positions = []

        for result in results:
            boxes = result.boxes
            if boxes is None:
                continue

            for box in boxes:
                cls = int(box.cls[0])
                conf = float(box.conf[0])

                # Keep only class 0 (person) above the confidence threshold
                if cls != 0 or conf <= self.conf_threshold:
                    continue

                # xywh is already in pixels of the original image and xywhn
                # is the same box normalized to 0-1; neither needs rescaling.
                x_px, y_px, w_px, h_px = (float(v) for v in box.xywh[0])
                x_n, y_n, w_n, h_n = (float(v) for v in box.xywhn[0])

                person_positions.append({
                    'id': len(person_positions),
                    'x_center': x_px,
                    'y_center': y_px,
                    'width': w_px,
                    'height': h_px,
                    'confidence': conf,
                    'normalized_x': x_n,
                    'normalized_y': y_n,
                    'normalized_width': w_n,
                    'normalized_height': h_n
                })

        return person_positions

    def get_detection_summary(self, person_positions: List[Dict]) -> Dict:
        """
        Get a summary of detections

        Args:
            person_positions: List of person positions

        Returns:
            Dictionary with the person count, average confidence, the
            positions themselves and a millisecond timestamp
        """
        timestamp_ms = int(time.time() * 1000)  # milliseconds

        if not person_positions:
            return {
                'total_persons': 0,
                'average_confidence': 0.0,
                'positions': [],
                'timestamp': timestamp_ms
            }

        avg_confidence = sum(pos['confidence'] for pos in person_positions) / len(person_positions)

        return {
            'total_persons': len(person_positions),
            'average_confidence': avg_confidence,
            'positions': person_positions,
            'timestamp': timestamp_ms
        }

# Example usage
if __name__ == "__main__":
    # Initialize detector
    detector = PersonDetector('best_person_detection.pt')

    # Load test image (cv2.imread returns None instead of raising on failure)
    image = cv2.imread('test_image.jpg')
    if image is None:
        raise FileNotFoundError("Could not read 'test_image.jpg'")

    # Detect persons
    positions = detector.detect_persons(image)

    # Get summary
    summary = detector.get_detection_summary(positions)

    # Print results
    print(json.dumps(summary, indent=2))
'''

# Save inference script (create the target folder first so this cell also
# works when the model-saving cell has not been run)
inference_path = '/content/drive/MyDrive/CrowdProject/models/person_detector.py'
os.makedirs(os.path.dirname(inference_path), exist_ok=True)
with open(inference_path, 'w') as f:
    f.write(inference_script)

print(f"Inference script saved to: {inference_path}")


## Dataset Structure Requirements

Your dataset should be organized as follows:

```
dataset/
├── images/
│   ├── train/
│   │   ├── img001.jpg
│   │   ├── img002.jpg
│   │   └── ...
│   ├── val/
│   │   ├── img101.jpg
│   │   ├── img102.jpg
│   │   └── ...
│   └── test/
│       ├── img201.jpg
│       ├── img202.jpg
│       └── ...
└── labels/
    ├── train/
    │   ├── img001.txt
    │   ├── img002.txt
    │   └── ...
    ├── val/
    │   ├── img101.txt
    │   ├── img102.txt
    │   └── ...
    └── test/
        ├── img201.txt
        ├── img202.txt
        └── ...
```

## Annotation Format

Each `.txt` file should contain one line per object:
```
class_id x_center y_center width height
```

Where:
- `class_id`: 0 (for person)
- All coordinates are normalized (0-1)
- `x_center, y_center`: center of bounding box
- `width, height`: width and height of bounding box

### Example annotation file (img001.txt):
```
0 0.5 0.3 0.2 0.4
0 0.8 0.7 0.15 0.3
```

This represents two persons in the image.

## Training Process

1. Upload your dataset to Google Drive
2. Update the `DATASET_PATH` in the Colab notebook
3. Run the training cells
4. The trained model will be saved to Google Drive

## Model Output

The trained model will output:
- Person detection with confidence scores
- Bounding box coordinates (both pixel and normalized)
- Position data for each detected person

## Inference Format

`PersonDetector.get_detection_summary()` in the generated `person_detector.py` script returns position data in this format:
```json
{
  "total_persons": 2,
  "average_confidence": 0.85,
  "positions": [
    {
      "id": 0,
      "x_center": 320.5,
      "y_center": 240.3,
      "width": 64.2,
      "height": 128.4,
      "confidence": 0.92,
      "normalized_x": 0.5,
      "normalized_y": 0.3,
      "normalized_width": 0.2,
      "normalized_height": 0.4
    }
  ],
  "timestamp": 1640995200000
}
```


# YOLO Person Detection Model Training

This notebook trains a YOLO model for person detection and position tracking for crowd analysis.

## Dataset Requirements
- Images with people in various crowd scenarios
- YOLO format annotations (class_id x_center y_center width height)
- Class 0: person

## Output
- Trained YOLO model weights
- Position data for each detected person
- Bounding box coordinates normalized to image dimensions


In [None]:

%pip install ultralytics torch torchvision opencv-python numpy matplotlib pillow

import torch
import cv2
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import yaml
from ultralytics import YOLO
import os
import time
from google.colab import drive


In [None]:
# Mount Google Drive so the dataset stored there can be read from this VM
drive.mount('/content/drive')

# Dataset root on Drive (change this if your dataset lives elsewhere)
DATASET_PATH = '/content/drive/MyDrive/CrowdProject/dataset'

if os.path.exists(DATASET_PATH):
    print(f"Dataset found at: {DATASET_PATH}")
    print("\nDataset structure:")
    for folder, _, names in os.walk(DATASET_PATH):
        # Nesting level relative to the dataset root drives the indentation
        nesting = folder.replace(DATASET_PATH, '').count(os.sep)
        child_pad = '  ' * (nesting + 1)
        listing = [f"{'  ' * nesting}{os.path.basename(folder)}/"]
        # List at most five files per folder, then summarize the remainder
        listing.extend(f"{child_pad}{name}" for name in names[:5])
        if len(names) > 5:
            listing.append(f"{child_pad}... and {len(names) - 5} more files")
        print("\n".join(listing))
else:
    print(f"Dataset not found at: {DATASET_PATH}")
    print("Please upload your dataset to Google Drive and update DATASET_PATH")


In [None]:
# Dataset configuration consumed by Ultralytics: dataset root, the three image
# splits (relative to the root) and the single "person" class.
config_content = f"""
# YOLO Dataset Configuration for Person Detection
path: {DATASET_PATH}
train: images/train
val: images/val
test: images/test

# Classes
nc: 1  # number of classes
names: ['person']  # class names
"""

# Store the configuration on the local Colab disk
config_path = '/content/dataset.yaml'
with open(config_path, 'w') as config_file:
    config_file.write(config_content)

# Show where it went and what it contains
for message in (
    f"Dataset configuration created at: {config_path}",
    "\nConfiguration content:",
    config_content,
):
    print(message)


In [None]:
# Load the pre-trained YOLOv8 nano weights as the starting point
model = YOLO('yolov8n.pt')

# Train on the GPU when CUDA is available, otherwise fall back to the CPU
gpu_available = torch.cuda.is_available()
if gpu_available:
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")
print(f"CUDA available: {gpu_available}")

# Describe GPU 0 (name and total memory in GiB) when present
if gpu_available:
    total_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {total_gib:.1f} GB")


In [None]:
# Training run, with the settings grouped by purpose and merged at the call.

# Where the data comes from, how long to train, and where outputs are written
run_settings = dict(
    data=config_path,
    epochs=100,
    imgsz=640,
    batch=16,
    device=device,
    project='/content/runs/detect',
    name='person_detection',
    save_period=10,  # checkpoint every 10 epochs
    patience=20,     # early-stopping patience
)

# Learning-rate schedule and optimizer regularization
optimizer_settings = dict(
    lr0=0.01,             # initial learning rate
    lrf=0.01,             # final learning rate setting
    momentum=0.937,       # SGD momentum
    weight_decay=0.0005,  # weight decay
    warmup_epochs=3,      # warmup epochs
    warmup_momentum=0.8,  # warmup momentum
    warmup_bias_lr=0.1,   # warmup bias learning rate
)

# Relative weights of the loss terms
loss_gains = dict(
    box=7.5,  # box loss gain
    cls=0.5,  # class loss gain
    dfl=1.5,  # DFL loss gain
)

# Data augmentation: color jitter, geometric transforms, flips and mixing
augmentation_settings = dict(
    augment=True,     # enable augmentation
    hsv_h=0.015,      # HSV hue
    hsv_s=0.7,        # HSV saturation
    hsv_v=0.4,        # HSV value
    degrees=0.0,      # rotation (+/- deg)
    translate=0.1,    # translation (+/- fraction)
    scale=0.5,        # scale (+/- gain)
    shear=0.0,        # shear (+/- deg)
    perspective=0.0,  # perspective (+/- fraction)
    flipud=0.0,       # up-down flip probability
    fliplr=0.5,       # left-right flip probability
    mosaic=1.0,       # mosaic probability
    mixup=0.0,        # mixup probability
    copy_paste=0.0,   # segment copy-paste probability
)

results = model.train(
    **run_settings,
    **optimizer_settings,
    **loss_gains,
    **augmentation_settings,
)

print("Training completed!")
