#### This notebook creates the dataset in the format expected by YOLO

In [None]:
from configs import config

import os
import json
import shutil
from tqdm import tqdm
from pathlib import Path
import numpy as np
import pandas as pd

In [None]:
# Load the polygon annotation file as a list of raw text lines
# (each line is decoded with json.loads further down).
with open(config.POLYGON_PATH, 'r') as polygon_file:
    polygon_list = polygon_file.readlines()

In [None]:
# Flatten the JSON-lines records into three parallel lists holding one
# entry per annotation: owning image id, annotation type, coordinates.
img_id, classes, coords = [], [], []
for raw_line in polygon_list:
    result = json.loads(raw_line)
    image_id = result['id']
    annotations = result['annotations']
    img_id.extend([image_id] * len(annotations))
    classes.extend(annotation['type'] for annotation in annotations)
    coords.extend(annotation['coordinates'] for annotation in annotations)

# The three lists are the rows of a wide frame; naming them through the
# index and transposing yields the columns 'id', 'type' and 'coords'.
df = pd.DataFrame([img_id, classes, coords], index=['id', 'type', 'coords']).T
# One output row per element of each 'coords' entry.
df = df.explode(column='coords').reset_index(drop=True)

In [None]:
# Keep only the rows whose annotation type is 'blood_vessel'.
is_blood_vessel = df['type'] == 'blood_vessel'
df_blood = df.loc[is_blood_vessel].reset_index(drop=True)

In [None]:
# Train/validation split: the first 1600 unique image ids (in order of
# appearance) go to training, every remaining id goes to validation.
img_id = df_blood['id'].unique()
train_ids, valid_ids = img_id[:1600], img_id[1600:]

in_train = df_blood['id'].isin(train_ids)
in_valid = df_blood['id'].isin(valid_ids)
train_df = df_blood[in_train]
val_df = df_blood[in_valid]

In [None]:
# Root folder of the generated dataset. parents=True creates missing parent
# folders too, matching how the per-split folders are created later on.
DEST_PATH = config.DEST_DIR
Path(DEST_PATH).mkdir(parents=True, exist_ok=True)

# The images are exported to <DEST_PATH>/<split>/images, so the YAML must
# point there rather than at a fixed /data/yolo location. Absolute paths are
# written so the entries do not depend on how a relative `path:` is resolved.
# With DEST_DIR == '/data/yolo' the file content is the same as before.
dest_root = Path(DEST_PATH).resolve()

_ = Path(f"{DEST_PATH}/dataset.yaml").write_text(f"""path: {DEST_PATH}
train: {dest_root / 'train' / 'images'}
val: {dest_root / 'val' / 'images'}

nc: 1
names: ['0']
                                                
"""
)

In [None]:
def create_label_file(path: Path, coordinates, image_size):
    """Write one label file containing a line per polygon.

    Every polygon becomes a line made of the class index ``0`` followed by
    the polygon's flattened coordinate values, each divided by
    ``image_size`` and separated by single spaces.

    Args:
        path: Destination text file; its content is replaced.
        coordinates: Iterable of polygons. Each polygon is either a JSON
            string or an already-parsed nested sequence of numbers.
        image_size: Divisor applied to every coordinate value.
    """
    label_lines = []
    for coordinate in coordinates:
        # Only textual input needs decoding: json.loads raises TypeError when
        # handed a list, which is what annotations decoded from JSON contain.
        if isinstance(coordinate, (str, bytes, bytearray)):
            coordinate = json.loads(coordinate)
        coor_array = np.asarray(coordinate, dtype=float) / float(image_size)
        coor_str = ' '.join(coor_array.flatten().astype(str))
        label_lines.append(f'0 {coor_str}\n')
    # Single join instead of repeated string concatenation.
    path.write_text(''.join(label_lines))

In [None]:
# Explicit split -> dataframe mapping. Looking the frames up through
# locals().get(f"{mode}_df") silently produced None for a missing name, and
# rebinding `df` inside the loop overwrote the full annotations frame.
split_frames = {'train': train_df, 'val': val_df}

for mode, split_df in split_frames.items():
    image_folder = Path(DEST_PATH) / f"{mode}" / "images"
    image_folder.mkdir(parents=True, exist_ok=True)

    label_folder = Path(DEST_PATH) / f"{mode}" / "labels"
    label_folder.mkdir(parents=True, exist_ok=True)

    grouped = split_df.groupby('id')

    # One iteration per image id; group_df holds that image's polygon rows.
    for image_id, group_df in tqdm(grouped, total=len(grouped)):
        # Source image inside the original training image folder.
        image_path = os.path.join(config.TRAIN_PATH, f"{image_id}.tif")
        # All polygons of this image, read straight from the column
        # instead of iterating row by row.
        polygons = group_df['coords'].tolist()

        # Destination paths for the copied image and its label file.
        img_path = image_folder / f"{image_id}.tif"
        label_path = label_folder / f"{image_id}.txt"
        shutil.copy(image_path, img_path)
        create_label_file(label_path, polygons, image_size=512)