<a href="https://colab.research.google.com/github/tselane2110/stray-dogs-detection-system/blob/main/Model%20Training/data_preprocessing_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Pre-Processing for Stanford Dog Dataset

## 1. Uploading the fypdataset.zip and annotations.zip files, then extracting the dataset

In [None]:
# Extract the uploaded dataset archive into the current working directory.
!unzip /content/fypdataset.zip

## 2. Functions to convert the annotation files from XML to TXT (YOLOv5 format)

In [None]:
import glob
import os
import pickle
import xml.etree.ElementTree as ET
from os import listdir, getcwd
from os.path import join

# Dataset root folders (relative to the working directory) to convert.
dirs = ['fypdataset']

# Breed names accepted by convert_annotation; any object whose <name> is not
# listed here is skipped during conversion.
classes = [
    "Chihuahua",
    "toy_terrier",
    "Rhodesian_ridgeback",
    "basset",
    "beagle",
    "bloodhound",
    "bluetick",
    "black-and-tan_coonhound",
    "Walker_hound",
    "English_foxhound",
    "redbone",
    "Italian_greyhound",
    "whippet",
    "Ibizan_hound",
    "Saluki",
    "Weimaraner",
    "Staffordshire_bullterrier",
    "American_Staffordshire_terrier",
    "golden_retriever",
    "Labrador_retriever",
    "Chesapeake_Bay_retriever",
    "German_short-haired_pointer",
    "Brittany_spaniel",
    "kuvasz",
    "schipperke",
    "malinois",
    "kelpie",
    "Rottweiler",
    "miniature_pinscher",
    "EntleBucher",
    "boxer",
    "Great_Dane",
    "Saint_Bernard",
    "Eskimo_dog",
    "basenji",
    "Leonberg",
    "Pembroke",
    "Cardigan",
    "Mexican_hairless",
    "dingo",
    "African_hunting_dog",
]

def getImagesInDir(dir_path):
    """Return the names of all entries found in ``<dir_path>/images``.

    Only bare entry names are returned (no directory prefix), in whatever
    order ``os.listdir`` yields them.
    """
    return [entry for entry in os.listdir(dir_path + "/images")]

def convert(size, box):
    """Convert a pixel bounding box into normalised (x, y, w, h) values.

    Args:
        size: ``(image_width, image_height)``.
        box: ``(xmin, xmax, ymin, ymax)``.

    Returns:
        Tuple ``(x_center, y_center, width, height)``; the x/width values are
        scaled by ``1 / image_width`` and the y/height values by
        ``1 / image_height``.
    """
    scale_x = 1. / (size[0])
    scale_y = 1. / (size[1])
    x_min, x_max, y_min, y_max = box[0], box[1], box[2], box[3]

    # Box centre, shifted back by one pixel before normalisation.
    centre_x = (x_min + x_max) / 2.0 - 1
    centre_y = (y_min + y_max) / 2.0 - 1
    box_w = x_max - x_min
    box_h = y_max - y_min

    return (centre_x * scale_x, centre_y * scale_y, box_w * scale_x, box_h * scale_y)

def convert_annotation(dir_path, output_path, image_path):
    """Convert one XML annotation file into a YOLO-style text label file.

    Reads ``<dir_path>/labels/<image stem>`` (the annotation file has no
    extension) and writes ``<output_path>/<image stem>.txt`` containing one
    line per kept object: ``<class_id> <x> <y> <w> <h>``.

    Objects are skipped when their ``<name>`` is not in ``classes`` or when
    they are flagged ``<difficult>1</difficult>``. A missing ``<difficult>``
    tag is treated as "not difficult".

    Args:
        dir_path: Dataset root, e.g. ``/content/fypdataset``.
        output_path: Folder for the generated label files,
            e.g. ``/content/fypdataset/yolo``.
        image_path: Image file name or path, e.g. ``n02085620_10074.jpg``;
            only its stem is used.
    """
    basename_no_ext = os.path.splitext(os.path.basename(image_path))[0]
    in_path = dir_path + '/labels/' + basename_no_ext
    out_path = output_path + "/" + basename_no_ext + '.txt'

    # Parse first: a missing or corrupt annotation then fails before any
    # output file is created. ET.parse opens and closes the file itself.
    root = ET.parse(in_path).getroot()
    size = root.find('size')
    w = int(size.find('width').text)
    h = int(size.find('height').text)

    # `with` guarantees the label file is flushed and closed.
    with open(out_path, 'w') as out_file:
        for obj in root.iter('object'):
            cls = obj.find('name').text
            if cls not in classes:
                continue
            difficult_node = obj.find('difficult')
            if difficult_node is not None and int(difficult_node.text) == 1:
                continue
            # Every accepted breed is written as the single class id 0.
            cls_id = 0
            xmlbox = obj.find('bndbox')
            b = (float(xmlbox.find('xmin').text), float(xmlbox.find('xmax').text),
                 float(xmlbox.find('ymin').text), float(xmlbox.find('ymax').text))
            bb = convert((w, h), b)
            out_file.write(str(cls_id) + " " + " ".join([str(a) for a in bb]) + '\n')

# Convert every dataset folder listed in `dirs`: write one YOLO label file per
# image into <dir>/yolo and a list of image names into <dir>.txt.
cwd = getcwd()

for dir_path in dirs:
    full_dir_path = cwd + '/' + dir_path
    print(full_dir_path)
    output_path = full_dir_path + '/yolo'
    print(output_path)

    if not os.path.exists(output_path):
        os.makedirs(output_path)
        print("output path created!")

    image_paths = getImagesInDir(full_dir_path)

    # `with` ensures the image list is flushed and closed even if one of the
    # conversions raises part-way through.
    with open(full_dir_path + '.txt', 'w') as list_file:
        for image_path in image_paths:
            list_file.write(image_path + '\n')
            convert_annotation(full_dir_path, output_path, image_path)

    print("Finished processing: " + dir_path)

/content/fypdataset
/content/fypdataset/yolo
Finished processing: fypdataset


## 3. Train, Val, Test split:

In [None]:
from sklearn.model_selection import train_test_split
import re

# Pair every YOLO label file with its image, then split the pairs into
# train / validation / test sets (80% / 10% / 10%).

path_to_annotation_files = []
path_to_image_files = []
annotation_root = "/content/fypdataset/yolo"
root_path = "/content/fypdataset"

# sorted(): os.listdir returns entries in arbitrary order, so without sorting
# random_state alone would not make the split reproducible.
for label_name in sorted(os.listdir(annotation_root)):
  # label_name is like n02085620_1298.txt
  match = re.search(r'\d+_\d+', label_name)
  if match is None:
    # Not a "<digits>_<digits>" label file: skip it rather than pairing it
    # with an image name left over from a previous iteration.
    print("skipping unexpected file: ", label_name)
    continue
  image_name = "n" + match.group(0) + ".jpg"     # e.g. n02085620_1298.jpg
  annotation_path = os.path.join(annotation_root, label_name)    # /content/fypdataset/yolo/n02085620_1298.txt
  image_path = os.path.join(root_path, "images/" + image_name)   # /content/fypdataset/images/n02085620_1298.jpg
  path_to_annotation_files.append(annotation_path)
  path_to_image_files.append(image_path)

print("no of images: ", len(path_to_image_files))
print("no of labels: ", len(path_to_annotation_files))

# First split: 80% train, 20% held out.
train_images, val_images, train_annotations, val_annotations = train_test_split(
    path_to_image_files, path_to_annotation_files, test_size=0.2, random_state=1)

# Second split: divide the held-out 20% evenly into validation and test.
val_images, test_images, val_annotations, test_annotations = train_test_split(
    val_images, val_annotations, test_size=0.5, random_state=1)



no of images:  6876
no of labels:  6876


In [None]:
# Preview a few entries instead of dumping every path into the cell output.
print(train_images[:5])       # e.g. /content/fypdataset/images/n02085620_10074.jpg
print(train_annotations[:5])  # e.g. /content/fypdataset/yolo/n02085620_10074.txt

## 4. Copying the train, test and val split files into their respective folders:

* I created two new folders, `images1` and `labels1`, inside the `fypdataset` folder, and then created `training`, `testing` and `validation` folders within each of them, so that I can delete the old `images` and `yolo` folders afterwards (I also deleted the initial labels/annotations folder).

In [None]:
import shutil

def moves_files_to_folder(list_of_files, destination_folder):
  """Copy every file in ``list_of_files`` into ``destination_folder``.

  Despite the name, files are copied with ``shutil.copy`` and the sources
  are left in place.

  The destination folder is created when it does not exist. Without this,
  ``shutil.copy`` would treat the missing folder path as a file name and
  overwrite that single file on every iteration.

  Args:
    list_of_files: Iterable of source file paths.
    destination_folder: Folder that receives the copies.

  Raises:
    The underlying copy error (e.g. ``FileNotFoundError``), re-raised after
    the offending source path has been printed.
  """
  os.makedirs(destination_folder, exist_ok=True)
  for f in list_of_files:
    try:
      shutil.copy(f, destination_folder)
    except Exception:
      # Show which file failed, then surface the real error with its traceback.
      print(f)
      raise

# Copy each split's images, then each split's labels, into its own sub-folder.
for split_files, split_folder in [
    (train_images, "/content/fypdataset/images1/training"),
    (val_images, "/content/fypdataset/images1/validation"),
    (test_images, "/content/fypdataset/images1/testing"),
    (train_annotations, "/content/fypdataset/labels1/training"),
    (val_annotations, "/content/fypdataset/labels1/validation"),
    (test_annotations, "/content/fypdataset/labels1/testing"),
]:
  moves_files_to_folder(split_files, split_folder)



## 5. Now deleting the old images and yolo folders:

In [None]:
# Delete the original, un-split image and label folders.
for stale_folder in ("/content/fypdataset/images", "/content/fypdataset/yolo"):
  shutil.rmtree(stale_folder)

## 6. Converting fypdataset into a zipped file and downloading it, so that we don't lose our data.


In [None]:

import shutil

# Archive the processed dataset. The previous command pointed at
# /content/content/..., which no other cell in this notebook creates; the
# dataset lives at /content/fypdataset.
# make_archive writes a fresh /content/fypdataset.zip (replacing any existing
# file of that name) whose entries are rooted at "fypdataset/".
shutil.make_archive("/content/fypdataset", "zip", root_dir="/content", base_dir="fypdataset")
# from google.colab import files
# files.download("/content/fypdataset.zip")

# you can also just double-click on the zip file as shown in the files tab