In [1]:
# Show the current working directory; the shell commands further down use paths
# relative to it (detect.py, segment/predict.py, data/images).
%pwd

'/home/tamerlan/Masters/thesis/yolov5'

In [12]:
import os
import torch
import utils
import cv2
import numpy as np
import matplotlib
import sys
from math import ceil

display = utils.notebook_init()  # checks

YOLOv5 🚀 2024-4-24 Python-3.9.19 torch-2.2.2+cu121 CUDA:0 (NVIDIA GeForce GTX 1050, 4039MiB)


Setup complete ✅ (8 CPUs, 15.5 GB RAM, 227.3/383.9 GB disk)


In [3]:
def colorize(
    value: np.ndarray, vmin: float = None, vmax: float = None, cmap: str = "afmhot"
):
    """Map a scalar array (at most 2-D) to an RGB image using a matplotlib colormap.

    Args:
        value: Array to colorize. Arrays with more than two dimensions are
            returned unchanged. Entries equal to -1 are treated as invalid and
            rendered white.
        vmin: Value mapped to the low end of the colormap; defaults to
            ``value.min()``.
        vmax: Value mapped to the high end of the colormap; defaults to
            ``value.max()``.
        cmap: Name of a registered matplotlib colormap.

    Returns:
        ``value`` itself when ``value.ndim > 2``; otherwise the first three
        (RGB) channels of the colormap output for every element.
    """
    if value.ndim > 2:
        return value
    invalid_mask = value == -1

    # Normalize to the vmin..vmax range.
    vmin = value.min() if vmin is None else vmin
    vmax = value.max() if vmax is None else vmax
    if vmax != vmin:
        value = (value - vmin) / (vmax - vmin)
    else:
        # Degenerate range: dividing by zero would produce NaN/inf, so map
        # everything to the low end of the colormap instead.
        value = np.zeros(value.shape, dtype=np.float64)

    # `matplotlib.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in
    # 3.9; prefer the colormap registry and fall back for old installations.
    registry = getattr(matplotlib, "colormaps", None)
    cmapper = registry[cmap] if registry is not None else matplotlib.cm.get_cmap(cmap)

    value = cmapper(value, bytes=True)  # RGBA, shape (..., 4)
    value[invalid_mask] = 255  # invalid entries become white
    img = value[..., :3]
    return img

In [None]:
# Run YOLOv5 object detection with the yolov5s weights on data/images
# (inference size 640, confidence threshold 0.50).
!python detect.py --weights yolov5s.pt --img 640 --conf 0.50 --source data/images
# display.Image(filename='runs/detect/exp/zidane.jpg', width=600)

In [46]:
# Run YOLOv5 instance segmentation (yolov5m-seg weights, confidence threshold 0.3)
# on data/images; --save-txt additionally stores the results as text label files.
!python segment/predict.py --weights yolov5m-seg.pt --img 640 --conf 0.3 --source data/images --save-txt

[34m[1msegment/predict: [0mweights=['yolov5m-seg.pt'], source=data/images, data=data/coco128.yaml, imgsz=[640, 640], conf_thres=0.3, iou_thres=0.45, max_det=1000, device=, view_img=False, save_txt=True, save_conf=False, save_crop=False, nosave=False, classes=None, agnostic_nms=False, augment=False, visualize=False, update=False, project=runs/predict-seg, name=exp, exist_ok=False, line_thickness=3, hide_labels=False, hide_conf=False, half=False, dnn=False, vid_stride=1, retina_masks=False
YOLOv5 🚀 v7.0-304-g22361691 Python-3.9.19 torch-2.2.2+cu121 CUDA:0 (NVIDIA GeForce GTX 1050, 4039MiB)

Fusing layers... 
YOLOv5m-seg summary: 301 layers, 21971597 parameters, 0 gradients, 70.8 GFLOPs
image 1/36 /home/tamerlan/Masters/thesis/yolov5/data/images/cam_01_00001.jpg: 384x640 3 persons, 5 cars, 1 motorcycle, 3 trucks, 225.7ms
image 2/36 /home/tamerlan/Masters/thesis/yolov5/data/images/cam_01_00060.jpg: 384x640 5 persons, 1 bicycle, 3 cars, 1 motorcycle, 2 buss, 3 trucks, 154.1ms
image 3/36 

In [51]:
def distance_between_points(point1, point2):
    """Return the Euclidean distance between two points given as (x, y) sequences."""
    delta_x = point1[0] - point2[0]
    delta_y = point1[1] - point2[1]
    return np.sqrt(delta_x**2 + delta_y**2)

In [82]:
def to_pixel_coords(relative_coords, im_shape_wh):
    """Scale relative coordinates by the matching image dimensions and round them.

    Coordinates and dimensions are paired positionally; the result is a list
    with one rounded value per pair.
    """
    pixel_coords = []
    for coord, dimension in zip(relative_coords, im_shape_wh):
        pixel_coords.append(round(coord * dimension))
    return pixel_coords


def get_binary_mask_of_object(segment_line, im_shape_wh, im_shape):
    """Rasterize one segmentation label line into a binary (0 / 255) float mask.

    Args:
        segment_line: Whitespace-separated numbers. The first one is discarded;
            the rest are read as consecutive relative (x, y) polygon vertices.
        im_shape_wh: Image size as (width, height), used to scale the vertices.
        im_shape: Shape of the mask to allocate.

    Returns:
        The thresholded mask produced by ``cv2.threshold`` (values 0 or 255).
    """
    tokens = [float(token) for token in segment_line.split()]
    del tokens[0]  # leading value is not a coordinate
    relative_points = np.array(tokens).reshape(-1, 2)

    polygon = np.array(
        [[to_pixel_coords(point, im_shape_wh) for point in relative_points]],
        dtype=np.int32,
    )

    canvas = np.zeros(im_shape, dtype=np.int32)
    filled = np.float32(cv2.fillPoly(canvas, pts=polygon, color=255))

    # A 31x31 box blur followed by a threshold just below 255 keeps only pixels
    # whose neighbourhood is (almost) entirely filled, i.e. it shrinks the
    # polygon away from its border.
    blurred = cv2.blur(filled, (31, 31))
    _, binary_mask = cv2.threshold(blurred, 254, 255, cv2.THRESH_BINARY)
    return binary_mask

def draw_object_vectors(image_path, object_segments_path, depth_pt_path, output_file):
    """Draw, for each segmented object, an arrow from its max-depth to its min-depth pixel.

    Args:
        image_path: Image the segmentation labels belong to.
        object_segments_path: Text file with one segmentation polygon per line,
            in the format consumed by ``get_binary_mask_of_object``.
        depth_pt_path: File written with ``torch.save`` holding the depth
            prediction. A list/tuple or batched tensor is reduced to its first
            2-D map; a plain 2-D tensor is used as is.
        output_file: Path the annotated image is written to.

    Raises:
        FileNotFoundError: If ``image_path`` cannot be read by OpenCV.
    """
    im = cv2.imread(image_path)
    if im is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    im_shape = im.shape[0:2]  # h, w
    im_shape_wh = [im_shape[1], im_shape[0]]  # w, h

    # The depth map does not depend on the object, so load and resize it once.
    # NOTE: torch.load unpickles the file -- only use it on trusted files.
    depth = torch.load(depth_pt_path, map_location=torch.device('cpu'))
    if isinstance(depth, (list, tuple)):
        depth = depth[0]
    depth = depth.squeeze()
    while depth.ndim > 2:  # batched prediction: keep the first map
        depth = depth[0]
    depth = np.float32(depth.cpu().numpy())

    # Bring the depth map to the image resolution (was hard-coded to 1920x1080).
    if depth.shape != tuple(im_shape):
        depth = cv2.resize(depth, tuple(im_shape_wh), interpolation=cv2.INTER_CUBIC)

    with open(object_segments_path) as f:
        lines = f.readlines()

    for line in lines:
        if not line.strip():
            continue
        object_mask = get_binary_mask_of_object(line, im_shape_wh, im_shape) > 0

        # Keep the depth inside the object and zero everything else.
        # cv2.bitwise_and on two float arrays ANDs their raw bit patterns, so
        # the masking is done with NumPy instead.
        depth_of_object = np.where(object_mask, depth, np.float32(0))
        if not np.any(depth_of_object != 0):
            continue  # nothing left of this object (e.g. too small)

        # Remove outliers (they are set to zero in place).
        outliers_to_median(depth_of_object)

        # Only pixels that belong to the object AND survived outlier removal
        # may be chosen; otherwise the minimum would always be a zeroed outlier.
        valid_mask = object_mask & (depth_of_object > 0)
        if not valid_mask.any():
            continue
        valid_indices = np.transpose(valid_mask.nonzero())  # rows of (y, x)
        valid_depths = depth_of_object[valid_mask]

        max_point = valid_indices[np.argmax(valid_depths)]
        min_point = valid_indices[np.argmin(valid_depths)]

        # OpenCV expects (x, y) points made of plain Python ints.
        im = cv2.arrowedLine(
            im,
            (int(max_point[1]), int(max_point[0])),
            (int(min_point[1]), int(min_point[0])),
            color=(0, 0, 255),
            thickness=3,
        )

    cv2.imwrite(output_file, im)


def find_close_vals(depth_of_object, val, eps=0.3):
    """Return the indices of non-zero elements lying strictly within ``eps`` of ``val``.

    The result has one row per matching element, each row holding that
    element's index along every axis of ``depth_of_object``.
    """
    below_upper = depth_of_object < val + eps
    above_lower = depth_of_object > val - eps
    is_nonzero = depth_of_object != 0
    return np.argwhere(below_upper & above_lower & is_nonzero)

def outliers_to_median(depth_of_object, percentile=0.05):
    """Zero out, in place, the extreme non-zero values of ``depth_of_object``.

    Despite the name, outliers are not replaced by the median: every non-zero
    element that is <= the lower percentile value or >= the upper percentile
    value is set to 0. Zero elements are ignored when computing the
    percentiles. The smallest and largest non-zero values are printed as a
    diagnostic.

    Args:
        depth_of_object: Array modified in place; zeros mark "no data".
        percentile: Fraction cut from each end of the sorted non-zero values.

    Returns:
        None. An input without non-zero elements is left untouched.
    """
    nonzero_points_mask = depth_of_object != 0
    # Boolean indexing copies, so sorting does not disturb the input array.
    nonzero_points = np.sort(depth_of_object[nonzero_points_mask])
    if nonzero_points.size == 0:
        # Nothing to filter; indexing below would raise IndexError.
        return

    print(nonzero_points[0], nonzero_points[-1])

    count = len(nonzero_points)
    min_percentile = nonzero_points[ceil(count * percentile) - 1]
    max_percentile = nonzero_points[ceil(count * (1 - percentile)) - 1]

    outliers_mask = nonzero_points_mask & (
        (depth_of_object <= min_percentile) | (depth_of_object >= max_percentile)
    )
    depth_of_object[outliers_mask] = 0


def draw_max_points(image_path, object_segments_path, depth_pt_path, output_file):
    """Highlight, per segmented object, the pixels near its max and min depth value.

    For every object polygon in ``object_segments_path`` the depth map is
    masked to the object, outliers are removed, and pixels whose depth lies
    within 0.5 of the object's maximum are tinted with BGR (0, 0, 255) while
    those within 0.5 of its minimum are tinted with BGR (255, 0, 0). Per the
    original author's notes the maximum is the brightest value, taken to be
    the point closest to the camera -- this assumes an inverse-depth style
    map; confirm for other depth sources.

    Args:
        image_path: Image the segmentation labels belong to.
        object_segments_path: Text file with one segmentation polygon per line.
        depth_pt_path: Tensor saved with ``torch.save``; it is squeezed and is
            expected to yield a 2-D depth map.
        output_file: Path the annotated image is written to.

    Raises:
        FileNotFoundError: If ``image_path`` cannot be read by OpenCV.
    """
    im = cv2.imread(image_path)
    if im is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    im_shape = im.shape[0:2]  # h, w
    im_shape_wh = [im_shape[1], im_shape[0]]  # w, h

    # NOTE: torch.load unpickles the file -- only use it on trusted files.
    depth = torch.load(depth_pt_path, map_location=torch.device('cpu'))
    depth = depth.squeeze().cpu().numpy()

    # Debug dump of the raw depth matrix (written before any conversion so the
    # numbers keep their original formatting).
    with open("/home/tamerlan/Masters/thesis/yolov5/tmp/matrix_values.txt", "w") as file:
        for row in depth:
            formatted_row = ''.join('{:<5}'.format(num) for num in row)
            file.write(formatted_row.strip() + '\n')

    # Loop-invariant preparation: convert once and, if the depth map has a
    # different resolution than the image, interpolate it to the image size.
    depth = np.float32(depth)
    if depth.ndim == 2 and depth.shape != tuple(im_shape):
        depth = cv2.resize(depth, tuple(im_shape_wh), interpolation=cv2.INTER_CUBIC)

    with open(object_segments_path) as f:
        lines = f.readlines()

    eps = 0.5    # tolerance around the extreme depth values
    alpha = 0.3  # opacity of the highlight overlay

    for line in lines:
        if not line.strip():
            continue

        # The mask handed to cv2.bitwise_and must be 8-bit.
        binary_mask = np.uint8(get_binary_mask_of_object(line, im_shape_wh, im_shape))

        # Ignore small objects.
        if np.count_nonzero(binary_mask) <= 100:
            continue

        depth_of_object = np.float32(cv2.bitwise_and(depth, depth, mask=binary_mask))

        # Remove outliers (they are set to zero in place).
        outliers_to_median(depth_of_object)

        # Pixels of the segmented area that still carry a depth value.
        valid_mask = (binary_mask > 0) & (depth_of_object > 0)
        if not valid_mask.any():
            # Everything was classified as an outlier (e.g. uniform depth);
            # argmax/argmin on an empty selection would raise.
            continue
        valid_depths = depth_of_object[valid_mask]

        max_val = valid_depths.max()
        min_val = valid_depths.min()

        max_indices = np.int32(find_close_vals(depth_of_object, val=max_val, eps=eps))
        min_indices = np.int32(find_close_vals(depth_of_object, val=min_val, eps=eps))

        overlay = im.copy()
        overlay[max_indices[:, 0], max_indices[:, 1]] = (0, 0, 255)
        overlay[min_indices[:, 0], min_indices[:, 1]] = (255, 0, 0)

        im = cv2.addWeighted(overlay, alpha, im, 1 - alpha, 0)

    cv2.imwrite(output_file, im)


            


# binary_mask = cv2.threshold(predicted_mask, 0.5, 1, cv2.THRESH_BINARY)[1]

In [79]:
# Convert the grayscale depth image into a tensor file that
# draw_max_points / draw_object_vectors can load with torch.load.
da1_png_path = '/home/tamerlan/Masters/thesis/Depth-Anything/frame_copy_depth.png'
da1 = cv2.imread(da1_png_path, cv2.IMREAD_GRAYSCALE)
if da1 is None:
    # cv2.imread signals failure by returning None instead of raising.
    raise FileNotFoundError(f"Could not read depth image: {da1_png_path}")
da1_tensor = torch.from_numpy(da1)
print(da1_tensor.shape)
torch.save(da1_tensor, '/home/tamerlan/Masters/thesis/yolov5/tmp/da1_frame_copy.pt')


# type(da1)

# cv2.imwrite('check_da.png', da1) 


torch.Size([1080, 1920])


In [17]:
# Scratch directory for intermediate and result files. The trailing slash is
# required because other cells build paths by plain string concatenation.
tmp_dir = '/home/tamerlan/Masters/thesis/yolov5/tmp/'

# Input frame and the segmentation label file that belongs to it.
image_path = '/home/tamerlan/Masters/thesis/yolov5/data/images/frame_75.jpg'
object_segments_path = '/home/tamerlan/Masters/thesis/yolov5/runs/predict-seg/exp5/labels/frame_75.txt'

# Saved depth tensor for the same frame.
# depth_pt_path = '/home/tamerlan/Masters/thesis/yolov5/tmp/depth.pt'
da1_pt_path = f'{tmp_dir}da1_frame75.pt'


In [None]:
# Annotate the frame with one max-depth -> min-depth arrow per object.
draw_object_vectors(
    image_path=image_path,
    object_segments_path=object_segments_path,
    depth_pt_path=da1_pt_path,
    output_file='/home/tamerlan/Masters/thesis/yolov5/tmp/check_da.png',
)

In [18]:
# Highlight the near-max / near-min depth pixels of every object in the frame.
draw_max_points(
    image_path=image_path,
    object_segments_path=object_segments_path,
    depth_pt_path=da1_pt_path,
    output_file=tmp_dir + 'check_da_maxmin.png',
)

61.0 75.0
18.0 22.0
131.0 136.0
135.0 140.0
8.0 17.0
188.0 248.0
20.0 37.0
58.0 92.0


In [83]:
name = "cars1"
w, h = 1920, 1080  # every processed frame is resized to this (width, height)

# Car centroids are compared between exactly these two frames (1-based index).
first_frame, second_frame = 275, 300
used_frames = {first_frame, second_frame}
last_needed_frame = max(used_frames)
labels_dir = '/home/tamerlan/Masters/thesis/yolov5/runs/predict-seg/exp6/labels'

video = cv2.VideoCapture(f"/home/tamerlan/Masters/thesis/datasets/{name}.mp4")
if not video.isOpened():
    # Check before reading; raising is friendlier than sys.exit() in a notebook.
    raise RuntimeError(f"Could not open video: {name}.mp4")

center_frames = []  # one list of (x, y) car centroids per used frame
result_im = None    # copy of the first used frame; the motion arrow is drawn on it
frame_count = 0

try:
    # Stop decoding once the last frame of interest has been handled.
    while frame_count < last_needed_frame:
        ok, frame = video.read()
        if not ok:
            break
        frame_count += 1

        if frame_count not in used_frames:
            continue

        frame = cv2.resize(frame, (w, h))
        video_objects_path = f'{labels_dir}/{name}_{frame_count}.txt'
        masked_img = frame.copy()

        if frame_count == first_frame:
            result_im = frame.copy()
            cv2.imwrite(tmp_dir + "frame_copy.png", result_im)

        points_on_frame = []

        with open(video_objects_path) as f:
            lines = f.readlines()

        for line in lines:
            fields = line.split()
            # Ignore everything that is not a car (class id 2).
            if not fields or int(fields[0]) != 2:
                continue

            binary_mask = np.uint8(get_binary_mask_of_object(line, (w, h), (h, w)))
            mass_y, mass_x = np.where(binary_mask > 0)

            # Ignore small objects.
            if mass_x.shape[0] <= 100:
                continue

            cent_x = int(np.average(mass_x))
            cent_y = int(np.average(mass_y))
            points_on_frame.append((cent_x, cent_y))

            # Build the overlay from the image annotated so far; starting from
            # the raw frame would fade the highlights of earlier objects every
            # time another object is blended in.
            overlay = masked_img.copy()
            overlay[binary_mask > 0] = (255, 0, 255)

            alpha = 0.3
            masked_img = cv2.addWeighted(overlay, alpha, masked_img, 1 - alpha, 0)
            masked_img = cv2.circle(masked_img, (cent_x, cent_y), radius=2,
                                    color=(0, 0, 255), thickness=-1)

        center_frames.append(points_on_frame)
        cv2.imwrite(tmp_dir + f"video_frame_{frame_count}.png", masked_img)
finally:
    video.release()

if len(center_frames) < 2:
    raise RuntimeError(
        f"Video ended before frames {sorted(used_frames)} were reached "
        f"(read {frame_count} frames)"
    )

distance_eps = 10  # centroids closer than this (in pixels) count as the same, unmoved car

# Pair up cars that did not move: such a centroid is dropped from the start
# candidates and its partner in the second frame is blanked out.
begs = []
for center1 in center_frames[0]:
    for i, center2 in enumerate(center_frames[1]):
        if center2 is not None and distance_between_points(center1, center2) < distance_eps:
            center_frames[1][i] = None
            break
    else:
        begs.append(center1)

moved_ends = [center for center in center_frames[1] if center is not None]
if not begs or not moved_ends:
    raise RuntimeError("No moving car found between the two frames")

beg = begs[0]
end = moved_ends[0]
print(beg, end)

draw_max_points(tmp_dir + "frame_copy.png",
                f'{labels_dir}/{name}_{first_frame}.txt',
                tmp_dir + 'da1_frame_copy.pt',
                tmp_dir + 'check_da_maxmin_frame.png')

cv2.arrowedLine(result_im, beg, end, color=(0, 0, 255), thickness=3)
cv2.imwrite(tmp_dir + "check_vector.png", result_im)
    

# video_out.release()


(1730, 197) (1556, 362)
100.0 103.0
46.0 60.0
86.0 120.0
107.0 150.0
57.0 85.0


True

In [16]:
# Run YOLOv5 instance segmentation (yolov5s-seg weights, confidence threshold 0.25)
# on every frame of the video; --save-txt stores the results as text label files.
!python segment/predict.py --weights yolov5s-seg.pt --conf 0.25 --source /home/tamerlan/Masters/thesis/datasets/cars1.mp4 --save-txt

[34m[1msegment/predict: [0mweights=['yolov5s-seg.pt'], source=/home/tamerlan/Masters/thesis/datasets/cars1.mp4, data=data/coco128.yaml, imgsz=[640, 640], conf_thres=0.25, iou_thres=0.45, max_det=1000, device=, view_img=False, save_txt=True, save_conf=False, save_crop=False, nosave=False, classes=None, agnostic_nms=False, augment=False, visualize=False, update=False, project=runs/predict-seg, name=exp, exist_ok=False, line_thickness=3, hide_labels=False, hide_conf=False, half=False, dnn=False, vid_stride=1, retina_masks=False
YOLOv5 🚀 2024-4-24 Python-3.9.19 torch-2.2.2+cu121 CUDA:0 (NVIDIA GeForce GTX 1050, 4039MiB)

Downloading https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5s-seg.pt to yolov5s-seg.pt...
100%|██████████████████████████████████████| 14.9M/14.9M [00:02<00:00, 7.35MB/s]

Fusing layers... 
YOLOv5s-seg summary: 224 layers, 7611485 parameters, 0 gradients, 26.4 GFLOPs
video 1/1 (1/750) /home/tamerlan/Masters/thesis/datasets/cars1.mp4: 384x640 2 persons