# Dataset Organiser

> This notebook sorts the image files into train and test sets, and into damaged and undamaged regions.
> <p> Note: All pre-event images are treated as undamaged.

In [53]:
import json
import numpy as np
import glob
import shutil
import os
import random 


# Seed the module-level `random` generator so any later use of the global RNG is repeatable.
random.seed(42)

## Prepare folder for storing training and testing dataset

The aim is to arrange the dataset in this format.
<pre>
|__train
    |_____damage
    |_____no_damage
|__test
    |_____damage
    |_____no_damage
</pre>

In [54]:
# Root of the organised dataset, relative to the notebook's working directory.
dirpath = os.path.join('.', 'dataset')

# Final split folders.
trainpath = os.path.join(dirpath, 'train')
testpath = os.path.join(dirpath, 'test')

# Staging folders: images are first sorted here by class, then split into train/test.
damagepath = os.path.join(dirpath, 'damage')
no_damagepath = os.path.join(dirpath, 'no_damage')

# Per-class folders inside each split.
train_damage = os.path.join(trainpath, 'damage')
train_no_damage = os.path.join(trainpath, 'no_damage')
test_damage = os.path.join(testpath, 'damage')
test_no_damage = os.path.join(testpath, 'no_damage')

# Start from a clean tree so a re-run never mixes in files from a previous run.
if os.path.exists(dirpath):
    shutil.rmtree(dirpath)

# os.makedirs also creates the missing parents (dirpath, trainpath, testpath),
# so listing the leaf folders is enough to build the whole layout.
for folder in (
    damagepath,
    no_damagepath,
    train_damage,
    train_no_damage,
    test_damage,
    test_no_damage,
):
    os.makedirs(folder)



In [55]:
# Label files are looked up under ./selected_data/train/labels, resolved against the
# working directory, so the notebook does not depend on one machine's drive layout.
labels_dir = os.path.abspath(os.path.join('.', 'selected_data', 'train', 'labels'))

# Store the paths of all pre-disaster label files
pre_disaster_metadata = glob.glob(os.path.join(labels_dir, '*pre*'))

# Store the paths of all post-disaster label files
post_disaster_metadata = glob.glob(os.path.join(labels_dir, '*post*'))

In [56]:
# Sort every post-disaster image into the damage / no_damage staging folders,
# based on the damage subtypes recorded in its label file.

# Images are read from ./selected_data/train/images, resolved against the working directory.
images_dir = os.path.abspath(os.path.join('.', 'selected_data', 'train', 'images'))

# One feature with any of these subtypes is enough to file the image under "damage".
damage_subtypes = {'destroyed', 'major-damage', 'minor-damage'}

for metadata in post_disaster_metadata:
    # The label and its image share a stem: keep the file name up to the first dot.
    file_name = os.path.basename(metadata).split('.')[0]
    image_location = os.path.join(images_dir, file_name + '.png')

    try:
        with open(metadata) as f:
            json_object = json.load(f)

        # Collect the subtype of every feature listed under features -> lng_lat.
        status = {
            feature['properties']['subtype']
            for feature in json_object['features']['lng_lat']
        }

        target_dir = damagepath if status & damage_subtypes else no_damagepath
        final_dir = os.path.abspath(os.path.join(target_dir, file_name + '.png'))
        shutil.copy(image_location, final_dir)

    except (OSError, ValueError, KeyError, TypeError) as error:
        # Best effort: skip files that cannot be read, parsed or copied, but say
        # which one failed and why instead of hiding the cause.
        print('Error occured while processing {}: {!r}'.format(metadata, error))

In [57]:
# Collect the staged images of each class and count the available label files.
dataset_root = os.path.abspath(os.path.join('.', 'dataset'))
labels_pattern = os.path.join(os.path.abspath('.'), 'selected_data', 'train', 'labels', '*')

damaged_region = glob.glob(os.path.join(dataset_root, 'damage', '*'))
undamaged_region = glob.glob(os.path.join(dataset_root, 'no_damage', '*'))
total_image = len(glob.glob(labels_pattern))

# Report the size of each class followed by the number of label files.
summary = "Post flood event, their are {} images of damaged locations and {} images of undamaged locations\n"
print(summary.format(len(damaged_region), len(undamaged_region)))
print(f"The total available images is {total_image}.")

Post flood event, their are 109 images of damaged locations and 170 images of undamaged locations

The total available images is 558.


In [58]:
# Every pre-disaster image is filed under no_damage.
# Images are read from ./selected_data/train/images, resolved against the working directory.
images_dir = os.path.abspath(os.path.join('.', 'selected_data', 'train', 'images'))

for metadata in pre_disaster_metadata:
    # The label and its image share a stem: keep the file name up to the first dot.
    file_name = os.path.basename(metadata).split('.')[0]
    image_location = os.path.join(images_dir, file_name + '.png')
    final_dir = os.path.abspath(os.path.join(no_damagepath, file_name + '.png'))
    shutil.copy(image_location, final_dir)
    

In [59]:
# Re-read the staging folders and count the label files again.
staging_dir = os.path.abspath(os.path.join('.', 'dataset'))
label_glob = os.path.join(os.path.abspath('.'), 'selected_data', 'train', 'labels', '*')

damaged_region = glob.glob(os.path.join(staging_dir, 'damage', '*'))
undamaged_region = glob.glob(os.path.join(staging_dir, 'no_damage', '*'))
total_image = len(glob.glob(label_glob))

# Print the per-class counts, then the number of label files.
report = "Post flood event, their are {} images of damaged locations and {} images of undamaged locations\n"
print(report.format(len(damaged_region), len(undamaged_region)))
print(f"The total available images is {total_image}.")

Post flood event, their are 109 images of damaged locations and 449 images of undamaged locations

The total available images is 558.


## Split data into training and test sets

In [60]:
# Fraction of each class that is held out for the test set.
test_fraction = 0.2

# glob returns files in an order that depends on the file system, so sort first:
# only then does the fixed seed give the same split on every machine. Working on
# sorted copies also keeps this cell idempotent (re-running it yields the same split).
undamaged_shuffled = sorted(undamaged_region)
damaged_shuffled = sorted(damaged_region)
random.Random(42).shuffle(undamaged_shuffled)
random.Random(42).shuffle(damaged_shuffled)

# Class sizes: `no_` here stands for "number of", not the no_damage class.
no_undamage = len(undamaged_shuffled)
no_damage = len(damaged_shuffled)

# Number of test images per class (rounded down).
no_undamage_test = int(no_undamage * test_fraction)
no_damage_test = int(no_damage * test_fraction)

# The first slice of each shuffled list is the test set, the rest is the training set.
train_undamage_images = undamaged_shuffled[no_undamage_test:]
train_damage_images = damaged_shuffled[no_damage_test:]
test_undamage_images = undamaged_shuffled[:no_undamage_test]
test_damage_images = damaged_shuffled[:no_damage_test]

# Every image must land in exactly one split.
assert len(train_undamage_images) + len(test_undamage_images) == no_undamage
assert len(train_damage_images) + len(test_damage_images) == no_damage
assert not set(train_undamage_images) & set(test_undamage_images)
assert not set(train_damage_images) & set(test_damage_images)


In [61]:
# Display the first path in the no-damage training list.
train_undamage_images[0]

'd:\\uoc\\AI4Good-Flood_Detection_Using_Deep_Learning\\dataset\\no_damage\\midwest-flooding_00000082_pre_disaster.png'

In [62]:
def train_test_split(image_paths: list, final_path: str) -> None:
    """Copy every image in ``image_paths`` into the folder ``final_path``.

    Each copy is named after the source file's name up to its first dot,
    with a ``.png`` extension appended.

    Args:
        image_paths: Paths of the image files to copy.
        final_path: Existing destination folder.

    Raises:
        OSError: If a source file cannot be read or the destination cannot be written.
    """
    for image in image_paths:
        # os.path.basename understands the platform's separator, so the file name
        # is extracted correctly on both Windows and POSIX paths.
        file_name = os.path.basename(image).split('.')[0]
        final_dir = os.path.join(final_path, file_name + '.png')
        shutil.copy(image, final_dir)

In [63]:
# Copy each subset into its class folder: both training subsets first, then both test subsets.
for subset_images, subset_folder in (
    (train_undamage_images, train_no_damage),
    (train_damage_images, train_damage),
    (test_undamage_images, test_no_damage),
    (test_damage_images, test_damage),
):
    train_test_split(subset_images, subset_folder)