In [66]:
import pandas as pd
import os
from PIL import Image
from sklearn.model_selection import train_test_split
import shutil
from tqdm import tqdm
import yaml

In [87]:
# Input data: annotation CSVs and the folders holding the raw images.
TRAIN_INFO_PATH = "data/train.csv"
TRAIN_IMAGES_PATH = "data/train_images/"
TEST_INFO_PATH = "data/test_phase1.csv"
TEST_IMAGES_PATH = "data/test_images_phase1/"

# Outputs produced by this notebook: the dataset directory tree and the
# dataset description YAML.
DATA_PATH = "yolov7/data/mosquito_alert/"
DATA_YAML_PATH = "yolov7/data/mosquito_alert.yaml"

# Image file suffix and its character count (used to strip the suffix from
# file names).
IMAGE_EXTENSION = ".jpeg"
IMAGE_EXTENSION_LEN = len(IMAGE_EXTENSION)

In [22]:
# Load the training annotations CSV (one row per bounding box) into a DataFrame.
df = pd.read_csv(TRAIN_INFO_PATH)

In [41]:
# Display the loaded annotations; pandas truncates long frames in the rendered output.
df

Unnamed: 0,img_fName,img_w,img_h,bbx_xtl,bbx_ytl,bbx_xbr,bbx_ybr,class_label
0,92715872-3287-4bff-aa61-7047973e5c02.jpeg,2448,3264,1301,1546,1641,2096,albopictus
1,b0f7cc74-2272-4756-a387-38bcaf6965c8.jpeg,3024,4032,900,1897,1950,2990,albopictus
2,82df4b68-0f45-4afe-9215-48488bf3720e.jpeg,768,1024,220,58,659,808,albopictus
3,331ad30a-7564-4478-b863-7bc760adf5a1.jpeg,3456,4608,1169,2364,1586,2826,albopictus
4,1a46dbfb-104e-466b-88d7-98958d7b1fe9.jpeg,1024,1365,129,231,697,1007,culex
...,...,...,...,...,...,...,...,...
8020,ca1468e3-1337-48b3-b378-135758317454.jpeg,4160,2080,2055,704,2494,998,albopictus
8021,c2e77768-61ba-4e27-907d-fe3198e8f5ce.jpeg,3024,4032,1073,875,1830,1503,culiseta
8022,d7521d94-92f7-40a1-9cd5-74a0b67ca98b.jpeg,1024,1365,341,485,761,1123,albopictus
8023,03098cc0-ee82-4fa0-b045-579951b7855c.jpeg,2128,4608,869,1235,2128,2631,culex


In [42]:
# Replace the column labels positionally; the list must have exactly one
# entry per existing column, in the same order.
df.columns = [
    'image_filename',
    'img_w',
    'img_h',
    'bbx_xtl',
    'bbx_ytl',
    'bbx_xbr',
    'bbx_ybr',
    'class_label',
]

In [46]:
# Summarise the full annotated set: size, label cardinality and class balance.
print(
    "Training Data",
    f"Number of images: {len(df)}",
    f"Number of classes: {df['class_label'].nunique()}",
    f"Classes:{list(df['class_label'].unique())}",
    f"Class counts:\n{df['class_label'].value_counts()}",
    sep="\n",
)

Training Data
Number of images: 8025
Number of classes: 6
Classes:['albopictus', 'culex', 'anopheles', 'culiseta', 'japonicus/koreicus', 'aegypti']
Class counts:
albopictus            3567
culex                 3544
culiseta               492
japonicus/koreicus     321
anopheles               63
aegypti                 38
Name: class_label, dtype: int64


In [47]:
# Hold out 20% of the rows for validation. Stratifying on the class label
# keeps the class proportions the same in both splits, and the fixed
# random_state makes the split (and everything derived from it) reproducible
# across kernel restarts.
df_train, df_valid = train_test_split(
    df,
    stratify=df['class_label'].values,
    test_size=0.2,
    random_state=42,
)

In [48]:
# Summarise the training split: size, label cardinality and class balance.
print(
    "Train Data",
    f"Number of images: {len(df_train)}",
    f"Number of classes: {df_train['class_label'].nunique()}",
    f"Classes:{list(df_train['class_label'].unique())}",
    f"Class counts:\n{df_train['class_label'].value_counts()}",
    sep="\n",
)

Train Data
Number of images: 6420
Number of classes: 6
Classes:['culex', 'culiseta', 'albopictus', 'anopheles', 'japonicus/koreicus', 'aegypti']
Class counts:
albopictus            2854
culex                 2835
culiseta               394
japonicus/koreicus     257
anopheles               50
aegypti                 30
Name: class_label, dtype: int64


In [49]:
# Summarise the validation split: size, label cardinality and class balance.
print(
    "Valid Data",
    f"Number of images: {len(df_valid)}",
    f"Number of classes: {df_valid['class_label'].nunique()}",
    f"Classes:{list(df_valid['class_label'].unique())}",
    f"Class counts:\n{df_valid['class_label'].value_counts()}",
    sep="\n",
)

Valid Data
Number of images: 1605
Number of classes: 6
Classes:['culiseta', 'albopictus', 'culex', 'japonicus/koreicus', 'anopheles', 'aegypti']
Class counts:
albopictus            713
culex                 709
culiseta               98
japonicus/koreicus     64
anopheles              13
aegypti                 8
Name: class_label, dtype: int64


In [67]:
# Alphabetically ordered class names; a class's position in this list is its
# integer id in the label files.
classes = list(df["class_label"].sort_values().unique())
num_classes = len(classes)
labels_dict = dict(zip(classes, range(num_classes)))
print(labels_dict)


{'aegypti': 0, 'albopictus': 1, 'anopheles': 2, 'culex': 3, 'culiseta': 4, 'japonicus/koreicus': 5}


In [93]:
# Start from an empty dataset directory so files left over from a previous
# run cannot end up mixed into the new split.
if os.path.exists(DATA_PATH):
    shutil.rmtree(DATA_PATH)

# Trailing slashes are kept on every path because file paths are later built
# from them by plain string concatenation.
images_path = DATA_PATH + "images/"
labels_path = DATA_PATH + "labels/"

training_images_path = images_path + 'train/'
validation_images_path = images_path + 'valid/'
training_labels_path = labels_path + 'train/'
validation_labels_path = labels_path + 'valid/'

# os.makedirs creates DATA_PATH, images/ and labels/ on the way down to each
# leaf, and also any missing parent of DATA_PATH. DATA_PATH was just removed,
# so none of the leaves can already exist.
for split_dir in (
    training_images_path,
    validation_images_path,
    training_labels_path,
    validation_labels_path,
):
    os.makedirs(split_dir)


In [92]:
def convert_to_yolov7_format(df, images_path, labels_path):
    """Write one YOLO label file per row of ``df`` and copy the row's image.

    For every row, the pixel-space box corners (``bbx_xtl``, ``bbx_ytl``,
    ``bbx_xbr``, ``bbx_ybr``) are turned into the line
    ``class_id x_center y_center width height``, where the four box values
    are divided by the row's ``img_w`` / ``img_h`` and formatted with three
    decimals. The line is written to ``labels_path + <image stem> + ".txt"``
    and the image is copied from ``TRAIN_IMAGES_PATH`` into ``images_path``.

    Args:
        df: DataFrame with the columns ``image_filename``, ``img_w``,
            ``img_h``, ``bbx_xtl``, ``bbx_ytl``, ``bbx_xbr``, ``bbx_ybr``
            and ``class_label``.
        images_path: Directory the images are copied into.
        labels_path: Prefix for the label files. It is concatenated with the
            file name as-is, so it must end with a path separator.

    Raises:
        KeyError: If a row's ``class_label`` is not a key of ``labels_dict``.

    Note:
        Each label file is opened in write mode, so when several rows share
        an ``image_filename`` only the last row's box is kept.
    """
    # iterrows() has no length, so pass total= to let tqdm draw a real
    # progress bar instead of a bare iteration counter.
    for _, row in tqdm(df.iterrows(), total=len(df)):
        class_id = labels_dict[row['class_label']]
        b_center_x = ((row["bbx_xtl"] + row["bbx_xbr"]) / 2) / row["img_w"]
        b_center_y = ((row["bbx_ytl"] + row["bbx_ybr"]) / 2) / row["img_h"]
        b_width = (row["bbx_xbr"] - row["bbx_xtl"]) / row["img_w"]
        b_height = (row["bbx_ybr"] - row["bbx_ytl"]) / row["img_h"]

        image_file = row['image_filename']
        # splitext removes whatever extension the file actually has; slicing
        # off a fixed number of characters would mis-name the label for any
        # image whose extension is not exactly as long as ".jpeg".
        image_stem = os.path.splitext(image_file)[0]

        label_str = "{} {:.3f} {:.3f} {:.3f} {:.3f}".format(
            class_id, b_center_x, b_center_y, b_width, b_height
        )
        # The context manager closes the file even if the write fails.
        with open(labels_path + image_stem + ".txt", "w") as label_file:
            label_file.write(label_str)

        shutil.copy(TRAIN_IMAGES_PATH + image_file, images_path)

In [94]:
# Write the label files and copy the images, first for the training split and
# then for the validation split.
for split_frame, split_images_dir, split_labels_dir in (
    (df_train, training_images_path, training_labels_path),
    (df_valid, validation_images_path, validation_labels_path),
):
    convert_to_yolov7_format(split_frame, split_images_dir, split_labels_dir)

6420it [00:21, 303.51it/s]
1605it [00:07, 218.21it/s]


In [78]:
# Inspect the validation images directory path derived from DATA_PATH.
validation_images_path

'yolov7/data/mosquito_alert/images/valid/'

In [95]:
# Dataset description: image directories for each split, the number of
# classes, and the class names in id order.
# NOTE(review): the two directories are the dataset paths without the leading
# "yolov7/" of DATA_PATH -- presumably because training is launched from
# inside yolov7/; confirm before changing them.
data_config = {
    "train": 'data/mosquito_alert/images/train/',
    "val": 'data/mosquito_alert/images/valid/',
    "nc": num_classes,
    "names": classes,
}

with open(DATA_YAML_PATH, 'w') as yaml_file:
    yaml.dump(data_config, yaml_file)