# Импорт библиотек

In [1]:
import os
import numpy as np
import cv2
import time
from glob import glob
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, UpSampling2D, Input, Concatenate, Dropout, BatchNormalization, Activation
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
from tensorflow.keras.metrics import MeanIoU, Recall, Precision
import kagglehub

In [2]:
# Download latest version
path = kagglehub.dataset_download("nikhilroxtomar/person-segmentation")

Downloading from https://www.kaggle.com/api/v1/datasets/download/nikhilroxtomar/person-segmentation?dataset_version_number=1...


100%|██████████| 461M/461M [00:23<00:00, 20.9MB/s]

Extracting files...





# Получение данных из датасета

In [3]:
images_path = sorted(glob(os.path.join(path, "people_segmentation/images/*.jpg")))
masks_path = sorted(glob(os.path.join(path, "people_segmentation/masks/*.png")))

# Разделение на тренировочную и тестовые выборки изображений и масок

In [4]:
train_images, test_images, train_masks, test_masks = train_test_split(images_path, masks_path, test_size=0.2, random_state=42)

# Вспомогательные функции

In [5]:
def read_image(path):
    x = cv2.imread(path, cv2.IMREAD_COLOR)
    x = cv2.resize(x, (256, 256))
    x = x / 255.0  # Нормализация
    x = x.astype(np.float32)
    return x

In [6]:
def read_mask(path):
    x = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    x = cv2.resize(x, (256, 256))
    x = x.astype(np.float32)
    x = np.expand_dims(x, axis=-1)  # Добавляем размерность
    return x

In [7]:
def preprocess(image_path, mask_path):
    def f(image_path, mask_path):
        image_path = image_path.decode()  # Декодируем путь к изображению
        mask_path = mask_path.decode()  # Декодируем путь к маске
        x = read_image(image_path)  # Читаем и обрабатываем изображение
        y = read_mask(mask_path)  # Читаем и обрабатываем маску
        return x, y

    # Используем tf.numpy_function для обработки
    image, mask = tf.numpy_function(f, [image_path, mask_path], [tf.float32, tf.float32])
    image.set_shape([256, 256, 3])  # Для изображений RGB
    mask.set_shape([256, 256, 1])  # Для масок (1 канал)
    return image, mask

In [8]:
def tf_dataset(images, masks, batch_size=8):
    dataset = tf.data.Dataset.from_tensor_slices((images, masks))
    dataset = dataset.shuffle(buffer_size=5000)
    dataset = dataset.map(preprocess)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset

# U-Net

In [9]:
def build_unet(input_shape=(256, 256, 3)):
    inputs = Input(input_shape)

    # Encoder
    c1 = Conv2D(64, (3, 3), activation='relu', padding='same')(inputs)
    c1 = Conv2D(64, (3, 3), activation='relu', padding='same')(c1)
    p1 = MaxPooling2D((2, 2))(c1)

    c2 = Conv2D(128, (3, 3), activation='relu', padding='same')(p1)
    c2 = Conv2D(128, (3, 3), activation='relu', padding='same')(c2)
    p2 = MaxPooling2D((2, 2))(c2)

    c3 = Conv2D(256, (3, 3), activation='relu', padding='same')(p2)
    c3 = Conv2D(256, (3, 3), activation='relu', padding='same')(c3)
    p3 = MaxPooling2D((2, 2))(c3)

    c4 = Conv2D(512, (3, 3), activation='relu', padding='same')(p3)
    c4 = Conv2D(512, (3, 3), activation='relu', padding='same')(c4)
    p4 = MaxPooling2D(pool_size=(2, 2))(c4)

    # Bottleneck
    c5 = Conv2D(1024, (3, 3), activation='relu', padding='same')(p4)
    c5 = Conv2D(1024, (3, 3), activation='relu', padding='same')(c5)

    # Decoder
    u6 = UpSampling2D((2, 2))(c5)
    u6 = Concatenate()([u6, c4])
    c6 = Conv2D(512, (3, 3), activation='relu', padding='same')(u6)
    c6 = Conv2D(512, (3, 3), activation='relu', padding='same')(c6)

    u7 = UpSampling2D((2, 2))(c6)
    u7 = Concatenate()([u7, c3])
    c7 = Conv2D(256, (3, 3), activation='relu', padding='same')(u7)
    c7 = Conv2D(256, (3, 3), activation='relu', padding='same')(c7)

    u8 = UpSampling2D((2, 2))(c7)
    u8 = Concatenate()([u8, c2])
    c8 = Conv2D(128, (3, 3), activation='relu', padding='same')(u8)
    c8 = Conv2D(128, (3, 3), activation='relu', padding='same')(c8)

    u9 = UpSampling2D((2, 2))(c8)
    u9 = Concatenate()([u9, c1])
    c9 = Conv2D(64, (3, 3), activation='relu', padding='same')(u9)
    c9 = Conv2D(64, (3, 3), activation='relu', padding='same')(c9)

    outputs = Conv2D(1, (1, 1), activation='sigmoid')(c9)

    model = Model(inputs, outputs, name="U-Net")
    return model

# DeepLab

In [10]:
def build_deeplab(input_shape=(256, 256, 3), num_classes=1):
    base_model = DenseNet121(weights='imagenet', include_top=False, input_shape=input_shape)
    x = base_model.output

    # Поэтапное увеличение разрешения
    x = Conv2D(512, (3, 3), activation='relu', padding='same')(x)
    x = UpSampling2D((2, 2))(x)  # (16, 16)

    x = Conv2D(256, (3, 3), activation='relu', padding='same')(x)
    x = UpSampling2D((2, 2))(x)  # (32, 32)

    x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
    x = UpSampling2D((2, 2))(x)  # (64, 64)

    x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
    x = UpSampling2D((2, 2))(x)  # (128, 128)

    x = Conv2D(32, (3, 3), activation='relu', padding='same')(x)
    x = UpSampling2D((2, 2))(x)  # (256, 256)

    # Финальный слой для маски
    outputs = Conv2D(num_classes, (1, 1), activation='sigmoid', padding='same')(x)

    model = Model(inputs=base_model.input, outputs=outputs)
    return model


# SegNet

In [11]:
def build_segnet(input_shape=(256, 256, 3), num_classes=1):
    inputs = Input(shape=input_shape)

    # Encoder
    c1 = Conv2D(64, (3, 3), activation='relu', padding='same')(inputs)
    c1 = Conv2D(64, (3, 3), activation='relu', padding='same')(c1)
    p1 = MaxPooling2D((2, 2))(c1)

    c2 = Conv2D(128, (3, 3), activation='relu', padding='same')(p1)
    c2 = Conv2D(128, (3, 3), activation='relu', padding='same')(c2)
    p2 = MaxPooling2D((2, 2))(c2)

    c3 = Conv2D(256, (3, 3), activation='relu', padding='same')(p2)
    c3 = Conv2D(256, (3, 3), activation='relu', padding='same')(c3)
    p3 = MaxPooling2D((2, 2))(c3)

    # Decoder
    u3 = UpSampling2D((2, 2))(p3)
    d3 = Conv2D(256, (3, 3), activation='relu', padding='same')(u3)
    d3 = Conv2D(256, (3, 3), activation='relu', padding='same')(d3)

    u2 = UpSampling2D((2, 2))(d3)
    d2 = Conv2D(128, (3, 3), activation='relu', padding='same')(u2)
    d2 = Conv2D(128, (3, 3), activation='relu', padding='same')(d2)

    u1 = UpSampling2D((2, 2))(d2)
    d1 = Conv2D(64, (3, 3), activation='relu', padding='same')(u1)
    d1 = Conv2D(64, (3, 3), activation='relu', padding='same')(d1)

    outputs = Conv2D(num_classes, (1, 1), activation='sigmoid')(d1)

    model = Model(inputs, outputs, name="SegNet")
    return model

# Обучение

In [12]:
# Функция для обучения модели
def train_model(model, train_dataset, val_dataset, epochs=10, model_name="model"):
    # Определяем метрики
    metrics = [
        'accuracy',
        MeanIoU(num_classes=2),
        Recall(),
        Precision()
    ]

    # Компилируем модель
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=metrics)

    # Директория для сохранения модели и метрик
    if not os.path.exists("models"):
        os.makedirs("models")

    # Определяем колбэки для сохранения модели и метрик
    model_checkpoint = ModelCheckpoint(
        os.path.join("models", f"{model_name}.keras"),
        monitor='val_loss',
        save_best_only=True,
        mode='min',
        verbose=1
    )

    csv_logger = CSVLogger(os.path.join("models", f"{model_name}_metrics.csv"))

    # Обучаем модель
    model.fit(
        train_dataset,
        validation_data=val_dataset,
        epochs=epochs,
        callbacks=[model_checkpoint, csv_logger]
    )

In [13]:
# Создание датасетов
train_dataset = tf_dataset(train_images, train_masks, batch_size=8)
val_dataset = tf_dataset(test_images, test_masks, batch_size=8)

In [14]:
# Обучение U-Net
unet_model = build_unet()
train_model(unet_model, train_dataset, val_dataset, epochs=10, model_name="unet_model")

Epoch 1/10
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 524ms/step - accuracy: 0.7404 - loss: 2.6076 - mean_io_u: 0.2666 - precision: 0.4381 - recall: 0.0695
Epoch 1: val_loss improved from inf to 0.46934, saving model to models/unet_model.keras
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m394s[0m 598ms/step - accuracy: 0.7404 - loss: 2.6046 - mean_io_u: 0.2665 - precision: 0.4382 - recall: 0.0696 - val_accuracy: 0.7816 - val_loss: 0.4693 - val_mean_io_u: 0.2500 - val_precision: 0.6871 - val_recall: 0.2679
Epoch 2/10
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 440ms/step - accuracy: 0.7972 - loss: 0.4437 - mean_io_u: 0.2694 - precision: 0.6352 - recall: 0.4081
Epoch 2: val_loss improved from 0.46934 to 0.44073, saving model to models/unet_model.keras
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m351s[0m 525ms/step - accuracy: 0.7972 - loss: 0.4437 - mean_io_u: 0.2694 - precision: 0.6352 - recall: 0.4081 - val_accur

In [15]:
# Обучение DeepLab
deeplab_model = build_deeplab()
train_model(deeplab_model, train_dataset, val_dataset, epochs=10, model_name="deeplab_model")

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/densenet/densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m29084464/29084464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step
Epoch 1/10
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 353ms/step - accuracy: 0.8226 - loss: 0.4074 - mean_io_u_1: 0.2667 - precision_1: 0.6966 - recall_1: 0.4968
Epoch 1: val_loss improved from inf to 0.29130, saving model to models/deeplab_model.keras
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m392s[0m 389ms/step - accuracy: 0.8226 - loss: 0.4072 - mean_io_u_1: 0.2667 - precision_1: 0.6967 - recall_1: 0.4971 - val_accuracy: 0.8708 - val_loss: 0.2913 - val_mean_io_u_1: 0.2500 - val_precision_1: 0.6905 - val_recall_1: 0.8966
Epoch 2/10
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step - accuracy: 0.9062 - loss: 0.2283 - mean_io_u_1: 0.2676 - precision_1: 0.8173 - recall_1: 0.8121
Epoch 2: val_

In [16]:
# Обучение SegNet
segnet_model = build_segnet()
train_model(segnet_model, train_dataset, val_dataset, epochs=10, model_name="segnet_model")

Epoch 1/10
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 283ms/step - accuracy: 0.7347 - loss: 0.5804 - mean_io_u_2: 0.2660 - precision_2: 0.2107 - recall_2: 0.0101
Epoch 1: val_loss improved from inf to 0.50539, saving model to models/segnet_model.keras
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 311ms/step - accuracy: 0.7347 - loss: 0.5804 - mean_io_u_2: 0.2659 - precision_2: 0.2108 - recall_2: 0.0101 - val_accuracy: 0.7444 - val_loss: 0.5054 - val_mean_io_u_2: 0.2500 - val_precision_2: 0.7590 - val_recall_2: 6.2466e-04
Epoch 2/10
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 265ms/step - accuracy: 0.7572 - loss: 0.4853 - mean_io_u_2: 0.2681 - precision_2: 0.5522 - recall_2: 0.1579
Epoch 2: val_loss improved from 0.50539 to 0.46272, saving model to models/segnet_model.keras
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 302ms/step - accuracy: 0.7572 - loss: 0.4853 - mean_io_u_2: 0.2681 - precision_2:

# Сегментация

In [26]:
deeplab_model = load_model('models/deeplab_model.keras')
segnet_model = load_model('models/segnet_model.keras')
unet_model = load_model('models/unet_model.keras')

In [17]:
# Функция сегментации кадра и наложения на фон
def segment_frame(model, frame, background_color, img_size=(256, 256)):
    # Предобработка кадра
    input_img = cv2.resize(frame, img_size) / 255.0
    input_img = np.expand_dims(input_img, axis=0)

    # Предсказание маски
    mask = model.predict(input_img)[0]
    mask = cv2.resize(mask, (frame.shape[1], frame.shape[0]))
    mask = (mask > 0.5).astype(np.uint8)  # бинаризация маски

    # Наложение фона
    mask_rgb = np.stack([mask] * 3, axis=-1)  # преобразование маски в RGB
    background = np.full_like(frame, background_color)  # создаём фон
    segmented_frame = np.where(mask_rgb == 1, frame, background)
    return segmented_frame

In [18]:
def process_video(input_path, output_name, model, background_color=(0, 0, 0), img_size=(256, 256)):
    if not os.path.exists("videos"):
        os.makedirs("videos")

    # Открываем входное видео
    cap = cv2.VideoCapture(input_path)

    # Получаем параметры видео
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)

    # Инициализируем объект записи видео
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(os.path.join("videos", f"{output_name}.mp4"), fourcc, fps, (frame_width, frame_height))

    # Измерение времени обработки видео
    start_time = time.time()

    # Обработка каждого кадра
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Преобразуем и сегментируем кадр
        segmented_frame = segment_frame(model, frame, background_color, img_size)

        # Записываем обработанный кадр в выходное видео
        out.write(segmented_frame)

    end_time = time.time()
    processing_time = end_time - start_time
    cap.release()
    out.release()

    # Выводим время обработки
    print(f"Время обработки видео {output_name} заняло: {processing_time:.2f} seconds")

In [27]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
# Задаём путь и запускаем обработку
input_video_path = '/content/drive/MyDrive/Colab Notebooks/CV/LR5/input.mp4'
output_paths = {
    "deeplab": "deeplab_output",
    "segnet": "segnet_output",
    "unet": "unet_output"
}
background_color = (0, 0, 0)  # Чёрный фон

In [22]:
process_video(input_video_path, output_paths["deeplab"], deeplab_model, background_color)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26

In [23]:
process_video(input_video_path, output_paths["segnet"], segnet_model, background_color)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms

In [24]:
process_video(input_video_path, output_paths["unet"], unet_model, background_color)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms

# Выводы
Быстрее всех отработала модель SegNet


Чище всего отработала модель DeepLab, на результатах от U-Net и SegNet видны значительные артефакты