Голдобин Денис ФИТ-2-21

In [23]:
import os
import numpy as np
import cv2
from glob import glob
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, UpSampling2D, Input, Concatenate
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dropout, BatchNormalization, Activation
from tensorflow.keras.models import Model

In [None]:
# Путь к данным
data_path = "../input/person-segmentation/people_segmentation"
images_path = sorted(glob(os.path.join(data_path, 'images/*.jpg')))
masks_path = sorted(glob(os.path.join(data_path, 'masks/*.png')))

In [None]:
train_images, test_images, train_masks, test_masks = train_test_split(images_path, masks_path, test_size=0.2, random_state=42)

In [None]:
# Функции для чтения изображений и масок
def read_image(path):
    x = cv2.imread(path, cv2.IMREAD_COLOR)
    x = cv2.resize(x, (256, 256))
    x = x / 255.0  # Нормализация
    x = x.astype(np.float32)  
    return x

In [None]:
def read_mask(path):
    x = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    x = cv2.resize(x, (256, 256))
    x = x.astype(np.float32)  
    x = np.expand_dims(x, axis=-1)  # Добавляем размерность
    return x

In [None]:
def preprocess(image_path, mask_path):
    def f(image_path, mask_path):
        image_path = image_path.decode()  # Декодируем путь к изображению
        mask_path = mask_path.decode()  # Декодируем путь к маске
        x = read_image(image_path)  # Читаем и обрабатываем изображение
        y = read_mask(mask_path)  # Читаем и обрабатываем маску
        return x, y

    # Используем tf.numpy_function для обработки
    image, mask = tf.numpy_function(f, [image_path, mask_path], [tf.float32, tf.float32])
    image.set_shape([256, 256, 3])  # Для изображений RGB
    mask.set_shape([256, 256, 1])  # Для масок (1 канал)
    return image, mask

In [None]:
def tf_dataset(images, masks, batch_size=8):
    dataset = tf.data.Dataset.from_tensor_slices((images, masks))
    dataset = dataset.shuffle(buffer_size=5000)
    dataset = dataset.map(preprocess)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset

In [None]:
def build_unet(input_shape=(256, 256, 3)):
    inputs = Input(input_shape)

    # Encoder
    c1 = Conv2D(64, (3, 3), activation='relu', padding='same')(inputs)
    c1 = Conv2D(64, (3, 3), activation='relu', padding='same')(c1)
    p1 = MaxPooling2D((2, 2))(c1)

    c2 = Conv2D(128, (3, 3), activation='relu', padding='same')(p1)
    c2 = Conv2D(128, (3, 3), activation='relu', padding='same')(c2)
    p2 = MaxPooling2D((2, 2))(c2)

    c3 = Conv2D(256, (3, 3), activation='relu', padding='same')(p2)
    c3 = Conv2D(256, (3, 3), activation='relu', padding='same')(c3)
    p3 = MaxPooling2D((2, 2))(c3)

    c4 = Conv2D(512, (3, 3), activation='relu', padding='same')(p3)
    c4 = Conv2D(512, (3, 3), activation='relu', padding='same')(c4)
    p4 = MaxPooling2D(pool_size=(2, 2))(c4)

    # Bottleneck
    c5 = Conv2D(1024, (3, 3), activation='relu', padding='same')(p4)
    c5 = Conv2D(1024, (3, 3), activation='relu', padding='same')(c5)

    # Decoder
    u6 = UpSampling2D((2, 2))(c5)
    u6 = Concatenate()([u6, c4])
    c6 = Conv2D(512, (3, 3), activation='relu', padding='same')(u6)
    c6 = Conv2D(512, (3, 3), activation='relu', padding='same')(c6)

    u7 = UpSampling2D((2, 2))(c6)
    u7 = Concatenate()([u7, c3])
    c7 = Conv2D(256, (3, 3), activation='relu', padding='same')(u7)
    c7 = Conv2D(256, (3, 3), activation='relu', padding='same')(c7)

    u8 = UpSampling2D((2, 2))(c7)
    u8 = Concatenate()([u8, c2])
    c8 = Conv2D(128, (3, 3), activation='relu', padding='same')(u8)
    c8 = Conv2D(128, (3, 3), activation='relu', padding='same')(c8)

    u9 = UpSampling2D((2, 2))(c8)
    u9 = Concatenate()([u9, c1])
    c9 = Conv2D(64, (3, 3), activation='relu', padding='same')(u9)
    c9 = Conv2D(64, (3, 3), activation='relu', padding='same')(c9)

    outputs = Conv2D(1, (1, 1), activation='sigmoid')(c9)

    model = Model(inputs, outputs, name="U-Net")
    return model

In [None]:
from tensorflow.keras.layers import Conv2D, UpSampling2D
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras import Model

def build_deeplab(input_shape=(256, 256, 3), num_classes=1):
    base_model = DenseNet121(weights='imagenet', include_top=False, input_shape=input_shape)
    x = base_model.output
    
    # Поэтапное увеличение разрешения
    x = Conv2D(512, (3, 3), activation='relu', padding='same')(x)
    x = UpSampling2D((2, 2))(x)  # (16, 16)

    x = Conv2D(256, (3, 3), activation='relu', padding='same')(x)
    x = UpSampling2D((2, 2))(x)  # (32, 32)

    x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
    x = UpSampling2D((2, 2))(x)  # (64, 64)

    x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
    x = UpSampling2D((2, 2))(x)  # (128, 128)

    x = Conv2D(32, (3, 3), activation='relu', padding='same')(x)
    x = UpSampling2D((2, 2))(x)  # (256, 256)

    # Финальный слой для маски
    outputs = Conv2D(num_classes, (1, 1), activation='sigmoid', padding='same')(x)

    model = Model(inputs=base_model.input, outputs=outputs)
    return model


In [None]:
# SegNet model
def build_segnet(input_shape=(256, 256, 3), num_classes=1):
    inputs = Input(shape=input_shape)

    # Encoder
    c1 = Conv2D(64, (3, 3), activation='relu', padding='same')(inputs)
    c1 = Conv2D(64, (3, 3), activation='relu', padding='same')(c1)
    p1 = MaxPooling2D((2, 2))(c1)

    c2 = Conv2D(128, (3, 3), activation='relu', padding='same')(p1)
    c2 = Conv2D(128, (3, 3), activation='relu', padding='same')(c2)
    p2 = MaxPooling2D((2, 2))(c2)

    c3 = Conv2D(256, (3, 3), activation='relu', padding='same')(p2)
    c3 = Conv2D(256, (3, 3), activation='relu', padding='same')(c3)
    p3 = MaxPooling2D((2, 2))(c3)

    # Decoder
    u3 = UpSampling2D((2, 2))(p3)
    d3 = Conv2D(256, (3, 3), activation='relu', padding='same')(u3)
    d3 = Conv2D(256, (3, 3), activation='relu', padding='same')(d3)

    u2 = UpSampling2D((2, 2))(d3)
    d2 = Conv2D(128, (3, 3), activation='relu', padding='same')(u2)
    d2 = Conv2D(128, (3, 3), activation='relu', padding='same')(d2)

    u1 = UpSampling2D((2, 2))(d2)
    d1 = Conv2D(64, (3, 3), activation='relu', padding='same')(u1)
    d1 = Conv2D(64, (3, 3), activation='relu', padding='same')(d1)

    outputs = Conv2D(num_classes, (1, 1), activation='sigmoid')(d1)

    model = Model(inputs, outputs, name="SegNet")
    return model

In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv2D, UpSampling2D, Input
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
from tensorflow.keras.metrics import MeanIoU, Recall, Precision
# Функция для обучения модели
def train_model(model, train_dataset, val_dataset, epochs=10, model_name="model"):

    # Компилируем модель
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=metrics)

    # Директория для сохранения модели и метрик
    if not os.path.exists("models"):
        os.makedirs("models")

    # Определяем колбэки для сохранения модели и метрик
    model_checkpoint = ModelCheckpoint(
        os.path.join("models", f"{model_name}.keras"), 
        monitor='val_loss', 
        save_best_only=True, 
        mode='min',
        verbose=1
    )
    
    csv_logger = CSVLogger(os.path.join("models", f"{model_name}_metrics.csv"))

    # Обучаем модель
    model.fit(
        train_dataset, 
        validation_data=val_dataset, 
        epochs=epochs, 
        callbacks=[model_checkpoint, csv_logger]
    )

# Создание датасетов
train_dataset = tf_dataset(train_images, train_masks, batch_size=8)
val_dataset = tf_dataset(test_images, test_masks, batch_size=8)







In [None]:
# Обучение моделей
unet_model = build_unet()


print("Training U-Net model...")
train_model(unet_model, train_dataset, val_dataset, epochs=10, model_name="unet_model")

In [None]:
deeplab_model = build_deeplab()

print("Training deeplab_model...")
train_model(deeplab_model, train_dataset, val_dataset, epochs=10, model_name="deeplab_model")

In [None]:
segnet_model = build_segnet()
print("Training segnet_model...")
train_model(segnet_model, train_dataset, val_dataset, epochs=10, model_name="segnet_model")

Обработка видео

In [32]:
import cv2
import numpy as np
import tensorflow as tf
import time
from tensorflow.keras.models import load_model
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import os

In [33]:
deeplab_model_path = "../input/deeplab/keras/default/1/deeplab_model.keras"
segnet_model_path = "../input/segnet/keras/default/1/segnet_model.keras"
unet_model_path = "../input/unet/keras/default/1/unet_model.keras"

In [34]:
deeplab_model = load_model(deeplab_model_path)
segnet_model = load_model(segnet_model_path)
unet_model = load_model(unet_model_path)

In [35]:
# Функция сегментации кадра и наложения на фон
def segment_frame(model, frame, background_color, img_size=(256, 256)):
    # Предобработка кадра
    input_img = cv2.resize(frame, img_size) / 255.0
    input_img = np.expand_dims(input_img, axis=0)
    
    # Предсказание маски
    mask = model.predict(input_img)[0]
    mask = cv2.resize(mask, (frame.shape[1], frame.shape[0]))
    mask = (mask > 0.5).astype(np.uint8)  # бинаризация маски
    
    # Наложение фона
    mask_rgb = np.stack([mask] * 3, axis=-1)  # преобразование маски в RGB
    background = np.full_like(frame, background_color)  # создаём чёрный фон
    segmented_frame = np.where(mask_rgb == 1, frame, background)
    return segmented_frame

In [41]:
def process_video(input_path, output_name, model, background_color=(0, 0, 0), img_size=(256, 256)):
    if not os.path.exists("videos"):
        os.makedirs("videos")
        
    cap = cv2.VideoCapture(input_path)
    
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    
    # инициализация
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(os.path.join("videos", f"{output_name}.mp4"), fourcc, fps, (frame_width, frame_height))
    
    start_time = time.time()

    # Обработка каждого кадра
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        # сегментация
        segmented_frame = segment_frame(model, frame, background_color, img_size)
        
        # обработка кадра
        out.write(segmented_frame)
    
    end_time = time.time()
    processing_time = end_time - start_time
    cap.release()
    out.release()
    
    print(f"Время обработки видео {output_name} заняло: {processing_time:.2f} seconds")

In [42]:
input_video_path = '../input/videosample1/140828-776043783_small.mp4' #путь исходного видео
output_paths = {
    "deeplab": "deeplab_output",
    "segnet": "segnet_output",
    "unet": "unet_output"
}
background_color = (0, 128, 0)

In [43]:
process_video(input_video_path, output_paths["deeplab"], deeplab_model, background_color)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26

In [44]:
process_video(input_video_path, output_paths["segnet"], segnet_model, background_color)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18

In [45]:
process_video(input_video_path, output_paths["unet"], unet_model, background_color)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18

**Вывод**
Наилучший результат выдала модель deepLab, на обработанных кадров нет шумов и человек выделяется на каждом кадре без обрезки. Segnet и Unet выдали схожие результаты, но по скорости работы unet был быстрее.