In [15]:
import os

import pandas as pd

from prepare_data.statistics import calculate_stats
from prepare_data.utils import create_folder

from sklearn.model_selection import train_test_split
import shutil

In [2]:
# Input/output locations, relative to the notebook's working directory.
data_dir = '../data/cleared_images_max'    # walked below to enumerate the image files
labels_dir = '../data/prepare_data/labels' # directory used to build each path_to_label
save_dir = '../data/yolov_data'            # NOTE(review): not referenced by any cell visible here

## Calculate statistics for the train / test / val split

In [3]:
# Collect one record per file found under data_dir and build the frame once.
# Growing a DataFrame with pd.concat inside the loop copies the whole frame on
# every iteration (quadratic) and concatenates onto an empty frame, which newer
# pandas versions warn about.
records = []
for root, dirs, files in os.walk(data_dir):
    for file in files:
        # Drop the '_label' marker that precedes the extension, e.g.
        # '8 (2)_label.jpg' -> '8 (2).jpg'.
        image_name = file.replace('_label.', '.')
        records.append({
            # Everything before the first dot, so the '_label' suffix is kept here.
            'file_name': file.split('.')[0],
            'path_to_image': os.path.join(root, image_name),
            # NOTE(review): replaces every 'jpg' substring, not only the extension.
            'path_to_label': os.path.join(labels_dir, image_name.replace('jpg', 'txt')),
        })

# Passing `columns` keeps the three expected columns even when no file was found.
df = pd.DataFrame(records, columns=['file_name', 'path_to_image', 'path_to_label'])

In [4]:
# Show the collected file table (the rendered output is truncated by pandas).
df

Unnamed: 0,file_name,path_to_image,path_to_label
0,8 (2)_label,../data/cleared_images_max/8 (2).jpg,../data/prepare_data/labels/8 (2).txt
1,12 (3)_label,../data/cleared_images_max/12 (3).jpg,../data/prepare_data/labels/12 (3).txt
2,4 (1)_label,../data/cleared_images_max/4 (1).jpg,../data/prepare_data/labels/4 (1).txt
3,4 (137)_label,../data/cleared_images_max/4 (137).jpg,../data/prepare_data/labels/4 (137).txt
4,5 (163)_label,../data/cleared_images_max/5 (163).jpg,../data/prepare_data/labels/5 (163).txt
...,...,...,...
92,7 (194)_label,../data/cleared_images_max/7 (194).jpg,../data/prepare_data/labels/7 (194).txt
93,5 (124)_label,../data/cleared_images_max/5 (124).jpg,../data/prepare_data/labels/5 (124).txt
94,1 (8)_label,../data/cleared_images_max/1 (8).jpg,../data/prepare_data/labels/1 (8).txt
95,11 (60)_label,../data/cleared_images_max/11 (60).jpg,../data/prepare_data/labels/11 (60).txt


In [5]:
# Per-image statistics from the project helper. The frame displayed further down
# carries name / video_id / frame_id and pat0..pat4 columns next to the two path columns.
stat = calculate_stats(df)

In [6]:
# Per-video totals of the pat0..pat4 columns. The columns are selected before
# aggregating so the string path/name columns are not summed (concatenated) and
# then thrown away.
stat.groupby('video_id')[['pat0', 'pat1', 'pat2', 'pat3', 'pat4']].sum()

Unnamed: 0_level_0,pat0,pat1,pat2,pat3,pat4
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,31,2,3,0,0
3,2,0,1,1,1
4,23,9,14,4,4
5,58,4,17,8,5
6,0,0,5,0,0
7,71,7,26,16,20
8,0,0,8,4,0
9,0,1,4,5,0
11,0,0,4,0,0
12,0,27,0,0,2


Tube (Труба) videos: 12, 11, 9, 8, 6. The remaining videos (1, 3, 4, 5, 7) are treated as plane.

In [7]:
# Video ids split into two groups so that each group is divided into train/test on its own.
# `tube` holds the ids listed in the note above; `plane` holds the other video ids
# that appear in the per-video totals table.
tube = [12, 11, 9, 8, 6]
plane = [1, 3, 4, 5, 7]

In [8]:
# Split each group of video ids separately with identical settings, then merge the
# per-group results so both groups are represented in train and in test.
split_options = dict(test_size=0.2, random_state=43, shuffle=True)

train_plane, test_plane = train_test_split(plane, **split_options)
train_tube, test_tube = train_test_split(tube, **split_options)

# Plane ids come first, followed by tube ids.
test = test_plane + test_tube
train = train_plane + train_tube


In [14]:
# Video ids held out for testing (plane ids first, then tube ids).
test

[5, 8]

In [9]:
# Totals of pat0..pat4 over all rows whose video is in the test split.
# Summing the selected columns directly gives the same totals as grouping by video
# and summing the group sums, without aggregating the string columns along the way.
stat.loc[stat['video_id'].isin(test), ['pat0', 'pat1', 'pat2', 'pat3', 'pat4']].sum()


pat0    58
pat1     4
pat2    25
pat3    12
pat4     5
dtype: object

In [10]:
# Totals of pat0..pat4 over all rows whose video is in the train split.
# Summing the selected columns directly gives the same totals as grouping by video
# and summing the group sums, without aggregating the string columns along the way.
stat.loc[stat['video_id'].isin(train), ['pat0', 'pat1', 'pat2', 'pat3', 'pat4']].sum()
    

pat0    127
pat1     46
pat2     57
pat3     26
pat4     27
dtype: object

In [12]:
# Tag every row with its split: videos listed in `train` become 'train',
# every other video becomes 'test'.
in_train = stat['video_id'].isin(train)
stat['train_test'] = in_train.map({True: 'train', False: 'test'})

In [13]:
# Inspect the statistics frame, now including the train_test column.
stat

Unnamed: 0,path_to_label,path_to_image,name,video_id,frame_id,pat0,pat1,pat2,pat3,pat4,train_test
0,../data/prepare_data/labels/8 (2).txt,../data/cleared_images_max/8 (2).jpg,8 (2)_label,8,2,0,0,2,1,0,test
1,../data/prepare_data/labels/12 (3).txt,../data/cleared_images_max/12 (3).jpg,12 (3)_label,12,3,0,0,0,0,0,train
2,../data/prepare_data/labels/4 (1).txt,../data/cleared_images_max/4 (1).jpg,4 (1)_label,4,1,0,0,1,0,1,train
3,../data/prepare_data/labels/4 (137).txt,../data/cleared_images_max/4 (137).jpg,4 (137)_label,4,137,1,0,1,0,0,train
4,../data/prepare_data/labels/5 (163).txt,../data/cleared_images_max/5 (163).jpg,5 (163)_label,5,163,10,2,2,0,1,test
...,...,...,...,...,...,...,...,...,...,...,...
92,../data/prepare_data/labels/7 (194).txt,../data/cleared_images_max/7 (194).jpg,7 (194)_label,7,194,0,0,1,0,0,train
93,../data/prepare_data/labels/5 (124).txt,../data/cleared_images_max/5 (124).jpg,5 (124)_label,5,124,0,0,1,0,0,test
94,../data/prepare_data/labels/1 (8).txt,../data/cleared_images_max/1 (8).jpg,1 (8)_label,1,8,7,0,0,0,0,train
95,../data/prepare_data/labels/11 (60).txt,../data/cleared_images_max/11 (60).jpg,11 (60)_label,11,60,0,0,1,0,0,train


In [18]:
# Root of the exported dataset.
path_to_save_yolov = r'../data/yolov_data/wersek_yolov'

# Create the root first, then for each of labels/ and images/ the directory itself
# followed by its train/test/val sub-directories (parents before children).
create_folder(path_to_save_yolov)
for content_kind in ("labels", "images"):
    create_folder(os.path.join(path_to_save_yolov, content_kind))
    for split_name in ("train", "test", "val"):
        create_folder(os.path.join(path_to_save_yolov, content_kind, split_name))


In [19]:
def create_yolov_dataset(data: pd.DataFrame, path_to_save: str):
    """Copy every image/label pair into an images/<split> + labels/<split> tree.

    Args:
        data: Frame with the columns 'train_test' (name of the split directory),
            'name' (file stem used for the copies) and 'path_to_label'
            (source label file).
        path_to_save: Root directory of the exported dataset.

    Rows whose split is 'test' are additionally copied into the 'val' split.
    Destination directories are created when missing, so the function does not
    depend on another cell having prepared the folder tree beforehand.

    Raises:
        FileNotFoundError: if a source image or label file does not exist
            (raised by shutil.copy).
    """
    for _, row in data.iterrows():
        split = row['train_test']
        name = row['name']

        path_to_label = row['path_to_label']
        # The image path is derived from the label path, not read from a column.
        # NOTE(review): this swaps every 'labels' substring of the path for 'images';
        # confirm the images really live in that sibling directory.
        path_to_image = path_to_label.replace('labels', 'images').replace('.txt', '.jpg')

        # Test rows are mirrored into 'val' as well.
        targets = [split, 'val'] if split == 'test' else [split]
        for target in targets:
            image_dir = os.path.join(path_to_save, 'images', target)
            label_dir = os.path.join(path_to_save, 'labels', target)
            os.makedirs(image_dir, exist_ok=True)
            os.makedirs(label_dir, exist_ok=True)

            shutil.copy(path_to_image, os.path.join(image_dir, name + '.jpg'))
            shutil.copy(path_to_label, os.path.join(label_dir, name + '.txt'))



In [20]:
# Export the dataset: copy every row of `stat` into the tree rooted at path_to_save.
# The path literal is the same one used for the folder-creation cell.
path_to_save = r'../data/yolov_data/wersek_yolov'
create_yolov_dataset(stat, path_to_save)