RarePlanes Github: https://github.com/jdc08161063/RarePlanes

RarePlanes Paper citation:

@misc{shermeyer2020rareplanes,
title={RarePlanes: Synthetic Data Takes Flight},
author={Jacob Shermeyer and Thomas Hossler and Adam Van Etten and Daniel Hogan and Ryan Lewis and Daeil Kim},
year={2020},
eprint={2006.02963},
archivePrefix={arXiv},
primaryClass={cs.CV}
}

# Packages

In [None]:
import os
import json
import shutil
from pathlib import Path
from collections import Counter
import geopandas as gpd
import glob
from glob import glob
import pandas as pd

In [None]:
# Colab-only: mount Google Drive so the /content/drive/MyDrive/... paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Dataset Analysis

## Number of Planes in the Tiled Annotations Provided with the Dataset (COCO .json files)

In [None]:
def geojson_role_counter(path):
    """Summarise the annotations of a COCO-style JSON file by ``role_id``.

    Args:
        path: Path to a JSON file whose top-level ``"annotations"`` list holds
            dicts that each carry a ``"role_id"`` key.

    Returns:
        A string with the total number of annotations and the per-role
        ``Counter``.

    Raises:
        KeyError: If ``"annotations"`` or an annotation's ``"role_id"`` is missing.
    """
    with open(path, "r") as handle:
        annotations = json.load(handle)["annotations"]

    counter = Counter(annotation["role_id"] for annotation in annotations)
    total = sum(counter.values())
    return f"Total Number of Aircraft: {total}, In roles: {counter}"

# Per-role summary of the tiled COCO annotations shipped with the dataset.
train_annotation_file = "/content/drive/MyDrive/RarePlanes/Metadata_Annotations/RarePlanes_Train_Coco_Annotations_tiled.json"
test_annotation_file = "/content/drive/MyDrive/RarePlanes/Metadata_Annotations/RarePlanes_Test_Coco_Annotations_tiled.json"

train_counts = geojson_role_counter(train_annotation_file)
test_counts = geojson_role_counter(test_annotation_file)
print(f"Train: {train_counts},\n Test: {test_counts}")

# NOTE: the two operands are the totals reported by the line above, copied in by hand;
# they must be updated manually if the annotation files change.
print(f"Train+Test = {18393 + 6812}")

# The tiled total is larger than the number of hand-annotated aircraft because the
# same aircraft can be duplicated across tiles or appear only partially in a tile.

Train: Total Number of Aircraft: 18393, In roles: Counter({1.0: 10328, 2.0: 6123, 3.0: 1450, 4.0: 284, 6.0: 185, 7.0: 17, 5.0: 6}),
 Test: Total Number of Aircraft: 6812, In roles: Counter({2.0: 2874, 1.0: 2616, 3.0: 967, 4.0: 230, 6.0: 111, 7.0: 10, 5.0: 4})
Train+Test = 25205


In [None]:
def _count_partial_and_truncated(annotation_path):
    """Return ``(partial, truncated)`` annotation counts for a COCO-style JSON file.

    An annotation is counted as partial when its ``partialDec`` value is below 1.0
    and as truncated when its ``truncated`` value equals 1.0. A missing field is
    treated as "complete" (``partialDec`` -> 1.0, ``truncated`` -> 0.0), which is the
    same convention ``clean_partial_aircraft`` applies through ``fillna``.
    """
    with open(annotation_path) as f:
        annotations = json.load(f)["annotations"]

    partial = sum(1 for ann in annotations if ann.get("partialDec", 1.0) < 1.0)
    truncated = sum(1 for ann in annotations if ann.get("truncated", 0.0) == 1.0)
    return partial, truncated


# One pass per split; the split name is printed so the four output lines are unambiguous.
for split_name, annotation_path in (
    ("Train", "/content/drive/MyDrive/RarePlanes/Metadata_Annotations/RarePlanes_Train_Coco_Annotations_tiled.json"),
    ("Test", "/content/drive/MyDrive/RarePlanes/Metadata_Annotations/RarePlanes_Test_Coco_Annotations_tiled.json"),
):
    partial_count, truncated_count = _count_partial_and_truncated(annotation_path)
    print(f"{split_name} - Partial aircraft: {partial_count}")
    print(f"{split_name} - Truncated aircraft: {truncated_count}")


Partial aircraft: 4756
Truncated aircraft: 4756
Partial aircraft: 2409
Truncated aircraft: 2409


## Number of Planes in the Original Hand-Annotated Annotations.geojson (geojson file)

In [None]:
def geojson_count(path):
    """Count the features of a GeoJSON file and tally them by ``role_id``.

    Args:
        path: Path to a GeoJSON file with a top-level ``"features"`` list.

    Returns:
        A string holding the number of features and a ``Counter`` of their
        ``properties["role_id"]`` values; a feature without ``role_id`` is
        tallied under ``None``.
    """
    with open(path, "r") as handle:
        features = json.load(handle)["features"]

    role_ids = [feature["properties"].get("role_id") for feature in features]
    return f"Total: {len(role_ids)}, {Counter(role_ids)}"

In [None]:
print(geojson_count("/content/drive/MyDrive/RarePlanes/Metadata_Annotations/RarePlanes_Public_All_Annotations.geojson"))

# Totals copied by hand from the recorded outputs: tiled train + test annotations
# versus the hand-annotated aircraft in the public GeoJSON.
tiled_total = 18393 + 6812  # = 25205 (the original cell mistyped this as 25025)
hand_annotated_total = 14707
print(tiled_total - hand_annotated_total)

# The tiled annotations hold 25205 - 14707 = 10498 instances more than the
# hand-annotated set; potentially duplicates and/or partial aircraft.

Total: 14707, Counter({1: 8002, 2: 5132, 3: 1098, 4: 283, 6: 171, 7: 15, 5: 6})
10318


## Checking Overlap between Train and Test Set

In [None]:
def extract_image_names(folder):
    """Collect the source-image names that have PNG tiles in ``folder``.

    The name of a tile's source image is the part of the file name before the
    first ``"_tile_"``; a ``.png`` file without that marker contributes its
    whole file name.

    Args:
        folder: Directory to scan (non-recursive).

    Returns:
        A set of source-image names.
    """
    return {
        entry.split("_tile_")[0]
        for entry in os.listdir(folder)
        if entry.endswith(".png")
    }

train_dir = '/content/drive/MyDrive/RarePlanes/Train/images'
test_dir = '/content/drive/MyDrive/RarePlanes/Test/images'

train_images = extract_image_names(train_dir)
test_images = extract_image_names(test_dir)

# Source images that contributed tiles to both splits.
overlap = train_images & test_images

# Test the set's truthiness: `overlap is True` compares identity with the bool
# singleton and is False for every set, so the check could never report an overlap.
if overlap:
    print("Overlapping image:")
    for image_name in sorted(overlap):
        print(image_name)
else:
    print("No overlap")

No overlap


# Original Number of Images and Aircraft on Train and Test Sets

In [None]:
import json

def count_classes_original(json_path):
    """Report how many annotations of a COCO-style JSON file are civil vs. military.

    An annotation is civil when its lower-cased ``role`` contains ``"civil"``,
    military when it contains ``"military"``; anything else (including a missing
    or null ``role``) is counted as unclassified. Percentages are relative to
    civil + military only.

    Args:
        json_path: Path to a JSON file with a top-level ``"annotations"`` list.

    Returns:
        A summary string. ``", Unclassified: N"`` is appended only when some
        annotation matched neither class.
    """
    with open(json_path) as f:
        data = json.load(f)

    civil = 0
    military = 0
    na = 0

    for annotation in data["annotations"]:
        # `role` may be absent or null; both are treated as an empty label.
        role = (annotation.get("role") or "").lower()
        if "civil" in role:
            civil += 1
        elif "military" in role:
            military += 1
        else:
            # Counted and reported once instead of printing "none" per annotation.
            na += 1

    total = civil + military

    def share(count):
        # Avoid ZeroDivisionError when no annotation was classified.
        return count / total if total else 0.0

    summary = f"Civil: {civil} ({share(civil):.2%}), Military: {military} ({share(military):.2%})"
    if na:
        summary += f", Unclassified: {na}"
    return summary

In [None]:
# Civil/military split of the provided tiled COCO annotations, per split.
train_annotation_file = '/content/drive/MyDrive/RarePlanes/Metadata_Annotations/RarePlanes_Train_Coco_Annotations_tiled.json'
test_annotation_file = '/content/drive/MyDrive/RarePlanes/Metadata_Annotations/RarePlanes_Test_Coco_Annotations_tiled.json'

print(f"Train set: {count_classes_original(train_annotation_file)}")
print(f"Test set: {count_classes_original(test_annotation_file)}")

Train set: Civil: 17901 (97.33%), Military: 492 (2.67%))
Test set: Civil: 6457 (94.79%), Military: 355 (5.21%))


In [None]:
# Original number of images, train: 5815
# Original number of images, test: 2710

# Creating New Tiled Dataset with Original Images (Overlap=0)

## On Docker

**Create Tiles**
*   **Train**

python /code/rareplanes_package/create_tiles.py \
  --image_dir /code/RarePlanes/real/tarballs/train/PS-RGB_cog \
  --geojson_dir /code/RarePlanes/real/tarballs/train/geojson_aircraft \
  --tile_image_dir /code/RarePlanes/real/train/images \
  --tile_geojson_dir /code/RarePlanes/real/train/geojson \
  --tile_size 1024 \
  --overlap 0


* **Test**

python /code/rareplanes_package/create_tiles.py \
  --image_dir /code/RarePlanes/real/tarballs/test/PS-RGB_cog \
  --geojson_dir /code/RarePlanes/real/tarballs/test/geojson_aircraft \
  --tile_image_dir /code/RarePlanes/real/test/images \
  --tile_geojson_dir /code/RarePlanes/real/test/geojson \
  --tile_size 1024 \
  --overlap 0

**Merge Files in Geojson Folder**

python merge_geojson.py \
  --input_dir /code/RarePlanes/real/train/geojson \
  --output_file /code/RarePlanes/real/train/train_geojson.geojson

python merge_geojson.py \
  --input_dir /code/RarePlanes/real/test/geojson \
  --output_file /code/RarePlanes/real/test/test_geojson.geojson



## On Drive

In [None]:
# Number of aircraft in the re-tiled (overlap = 0) train and test sets.
train_geojson_file = "/content/drive/MyDrive/RarePlanes/Train/train_geojson.geojson"
test_geojson_file = "/content/drive/MyDrive/RarePlanes/Test/test_geojson.geojson"

print(geojson_count(train_geojson_file))  # 11365
print(geojson_count(test_geojson_file))  # 3998

# Still more than the hand-annotated total because partial aircraft are included.

Total: 11365, Counter({1: 6567, 2: 3751, 3: 751, 4: 174, 6: 109, 7: 9, 5: 4})
Total: 3998, Counter({2: 1690, 1: 1625, 3: 485, 4: 126, 6: 64, 7: 6, 5: 2})


In [None]:
# Combined train + test aircraft count after re-tiling (the two totals printed by the previous cell).
11365+3998

15363

## Cleaning Partial Aircraft

In [None]:
def clean_partial_aircraft(input,output):
    """Write a copy of a GeoJSON file that keeps only complete, untruncated aircraft.

    A row is kept when ``partialDec`` equals 1.0 and ``truncated`` equals 0;
    missing values are filled with those same "complete" defaults before the
    comparison. The before/after/removed row counts are printed.

    Args:
        input: Path of the GeoJSON file to read.
        output: Path of the GeoJSON file to write.

    Returns:
        None.
    """
    gdf = gpd.read_file(input)

    is_complete = gdf["partialDec"].fillna(1.0) == 1.0
    is_untruncated = gdf["truncated"].fillna(0) == 0
    gdf_clean = gdf[is_complete & is_untruncated]
    gdf_clean.to_file(output, driver="GeoJSON")

    n_before, n_after = len(gdf), len(gdf_clean)
    print(f"Original aircraft: {n_before}")
    print(f"Remaining aircraft: {n_after}")
    print(f"Removed aircraft: {n_before - n_after}")

In [None]:
# Drop partial/truncated aircraft from both splits and report the counts.
train_geojson_in = "/content/drive/MyDrive/RarePlanes/Train/train_geojson.geojson"
train_geojson_out = "/content/drive/MyDrive/RarePlanes/Train/train_geojson_clean.geojson"
test_geojson_in = "/content/drive/MyDrive/RarePlanes/Test/test_geojson.geojson"
test_geojson_out = "/content/drive/MyDrive/RarePlanes/Test/test_geojson_clean.geojson"

# clean_partial_aircraft has no return statement, so `train` and `test` are bound to None.
train = clean_partial_aircraft(train_geojson_in, train_geojson_out)
test = clean_partial_aircraft(test_geojson_in, test_geojson_out)

Original aircraft: 11365
Remaining aircraft: 9842
Removed aircraft: 1523
Original aircraft: 3998
Remaining aircraft: 3310
Removed aircraft: 688


In [None]:
# Final geojson: per-role counts of the cleaned files (partial/truncated aircraft removed).

print(geojson_count("/content/drive/MyDrive/RarePlanes/Train/train_geojson_clean.geojson")) # 9842
print(geojson_count("/content/drive/MyDrive/RarePlanes/Test/test_geojson_clean.geojson")) # 3310

Total: 9842, Counter({1: 6074, 2: 3062, 3: 456, 4: 131, 6: 106, 7: 9, 5: 4})
Total: 3310, Counter({1: 1511, 2: 1349, 3: 282, 4: 102, 6: 59, 7: 6, 5: 1})


## Role Simplified (7 categories to 2 categories)

In [None]:
def role_simplify(input, output):
    """Add a two-class ``role_simplified`` column to a GeoJSON file.

    Each value of the ``role`` column is mapped to ``"civil"`` or
    ``"military"``; roles not listed below become ``"unknown"``. The result is
    written to ``output`` as GeoJSON.

    Args:
        input: Path of the GeoJSON file to read.
        output: Path of the GeoJSON file to write (may equal ``input``).

    Returns:
        Whatever ``GeoDataFrame.to_file`` returns.
    """
    civil_roles = (
        "Small Civil Transport/Utility",
        "Medium Civil Transport/Utility",
        "Large Civil Transport/Utility",
    )
    military_roles = (
        "Military Transport/Utility/AWAC",
        "Military Bomber",
        "Military Fighter/Interceptor/Attack",
        "Military Trainer",
    )
    role_map = {
        **dict.fromkeys(civil_roles, "civil"),
        **dict.fromkeys(military_roles, "military"),
    }

    gdf = gpd.read_file(input)
    simplified = gdf["role"].map(role_map).fillna("unknown")
    gdf["role_simplified"] = simplified
    return gdf.to_file(output, driver= "GeoJSON")

In [None]:
# Add `role_simplified` to both cleaned files; each file is read and then overwritten in place.
for clean_geojson_file in (
    "/content/drive/MyDrive/RarePlanes/Train/train_geojson_clean.geojson",
    "/content/drive/MyDrive/RarePlanes/Test/test_geojson_clean.geojson",
):
    role_simplify(clean_geojson_file, clean_geojson_file)

## Updating Image and Label Files According to New .geojson

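1. Creating a set of valid aircraft keys (`clean_aircraft_keys`), each built from `cat_id` plus the first vertex of the geometry's first ring, formatted to 6 decimal places (a deterministic string key rather than a hash).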

2. Looping over tile GeoJSONs and filtering out aircraft not in the clean set.

3. Writing cleaned GeoJSONs only for tiles that contain valid aircraft.

4. Then using those cleaned GeoJSONs to sync files, keeping only what's needed.

## Train

In [None]:
# Load the merged, cleaned train annotations and index every aircraft by a string key.
# NOTE: `merged_clean` and `clean_aircraft_keys` are rebound by the Test section further
# down, so the Train cells must run before that section.
with open("/content/drive/MyDrive/RarePlanes/Train/train_geojson_clean.geojson") as f:
    merged_clean = json.load(f)

clean_aircraft_keys = set()
for clean_feature in merged_clean["features"]:
    clean_cat_id = clean_feature["properties"].get("cat_id")
    rings = clean_feature["geometry"]["coordinates"]
    if not rings or not rings[0]:
        continue  # no vertex to build a key from
    first_vertex = rings[0][0]
    # Key = cat_id + first vertex rounded to 6 decimals.
    clean_aircraft_keys.add(f"{clean_cat_id}_{first_vertex[0]:.6f}_{first_vertex[1]:.6f}")

### Labels (Geojson) Folder

In [None]:
tile_dir = Path("/content/drive/MyDrive/RarePlanes/Train/geojson")
output_dir = Path("/content/drive/MyDrive/RarePlanes/Train/geojson_cleaned")
output_dir.mkdir(parents=True, exist_ok=True)

# Rewrite every tile GeoJSON keeping only aircraft whose key is in clean_aircraft_keys.
for geo_file in tile_dir.glob("*.geojson"):
    with open(geo_file) as f:
        tile_data = json.load(f)

    filtered_features = []
    for feature in tile_data["features"]:
        cat_id = feature["properties"].get("cat_id")
        coords = (feature.get("geometry") or {}).get("coordinates")
        # Same guard the key-building cell applies: a feature without a usable
        # first vertex cannot have a key, so skip it instead of raising.
        if not coords or not coords[0]:
            continue
        key = f"{cat_id}_{coords[0][0][0]:.6f}_{coords[0][0][1]:.6f}"
        if key in clean_aircraft_keys:
            filtered_features.append(feature)

    # Tiles left without any valid aircraft get no output file at all.
    if filtered_features:
        with open(output_dir / geo_file.name, "w") as f:
            json.dump({
                "type": "FeatureCollection",
                "features": filtered_features
            }, f, indent=2)

In [None]:
# Use pathlib here: the imports cell runs `from glob import glob` after `import glob`,
# so `glob` names the function and `glob.glob(...)` raises AttributeError on a fresh kernel.
geojson_files = [
    str(path)
    for path in Path("/content/drive/MyDrive/RarePlanes/Train/geojson_cleaned").glob("*.geojson")
]
print(f"Number of files: {len(geojson_files)}")


Number of files: 1889


### Image Folder

In [None]:
import os
from pathlib import Path

cleaned_geojson_dir = Path("/content/drive/MyDrive/RarePlanes/Train/geojson_cleaned") #output of the previous
image_tile_dir = Path("/content/drive/MyDrive/RarePlanes/Train/images")

# Tile names (file stems) that still have at least one valid aircraft.
valid_tiles = {f.stem for f in cleaned_geojson_dir.glob("*.geojson")}

# Safety net: with no cleaned labels every image would be deleted below, and the
# deletion cannot be undone.
if not valid_tiles:
    raise RuntimeError(
        f"No cleaned GeoJSON tiles found in {cleaned_geojson_dir}; refusing to delete images."
    )

png_files_to_keep = set()
removed_count = 0
for file in image_tile_dir.glob("*.png"):
    stem = file.stem
    if stem in valid_tiles:
        png_files_to_keep.add(stem)
    else:
        file.unlink()  # permanently removes the image from Drive
        removed_count += 1

print(f"Kept {len(png_files_to_keep)} images, removed {removed_count}")

In [None]:
# Use pathlib here: the imports cell runs `from glob import glob` after `import glob`,
# so `glob` names the function and `glob.glob(...)` raises AttributeError on a fresh kernel.
images = [
    str(path)
    for path in Path("/content/drive/MyDrive/RarePlanes/Train/images").glob("*.png")
]
print(f"Number of files: {len(images)}")

Number of files: 1889


### Number of Aircraft after Cleaning (To Check the Match)

In [None]:
# Sum the features of every cleaned per-tile GeoJSON in the Train folder.
geojson_folder = "/content/drive/MyDrive/RarePlanes/Train/geojson_cleaned"
tile_label_paths = [
    os.path.join(geojson_folder, filename)
    for filename in os.listdir(geojson_folder)
    if filename.endswith(".geojson")
]

total = 0
for file_path in tile_label_paths:
    with open(file_path, 'r') as f:
        total += len(json.load(f).get("features", []))

print(f"Total aircraft: {total}")

# 9842 equals the total reported for train_geojson_clean.geojson, so the per-tile
# files and the merged clean file agree.


Total aircraft: 9842


## Test

In [None]:
# Load the merged, cleaned test annotations and index every aircraft by a string key.
# NOTE: this rebinds `merged_clean` and `clean_aircraft_keys`, which the Train section
# above also uses; re-run the Train key cell before re-running the Train filter.
with open("/content/drive/MyDrive/RarePlanes/Test/test_geojson_clean.geojson") as f:
    merged_clean = json.load(f)

clean_aircraft_keys = set()
for clean_feature in merged_clean["features"]:
    clean_cat_id = clean_feature["properties"].get("cat_id")
    rings = clean_feature["geometry"]["coordinates"]
    if not rings or not rings[0]:
        continue  # no vertex to build a key from
    first_vertex = rings[0][0]
    # Key = cat_id + first vertex rounded to 6 decimals.
    clean_aircraft_keys.add(f"{clean_cat_id}_{first_vertex[0]:.6f}_{first_vertex[1]:.6f}")

### Labels (Geojson) Folder

In [None]:
tile_dir = Path("/content/drive/MyDrive/RarePlanes/Test/geojson")
output_dir = Path("/content/drive/MyDrive/RarePlanes/Test/geojson_cleaned")
output_dir.mkdir(parents=True, exist_ok=True)

# Rewrite every tile GeoJSON keeping only aircraft whose key is in clean_aircraft_keys.
for geo_file in tile_dir.glob("*.geojson"):
    with open(geo_file) as f:
        tile_data = json.load(f)

    filtered_features = []
    for feature in tile_data["features"]:
        cat_id = feature["properties"].get("cat_id")
        coords = (feature.get("geometry") or {}).get("coordinates")
        # Same guard the key-building cell applies: a feature without a usable
        # first vertex cannot have a key, so skip it instead of raising.
        if not coords or not coords[0]:
            continue
        key = f"{cat_id}_{coords[0][0][0]:.6f}_{coords[0][0][1]:.6f}"
        if key in clean_aircraft_keys:
            filtered_features.append(feature)

    # Tiles left without any valid aircraft get no output file at all.
    if filtered_features:
        with open(output_dir / geo_file.name, "w") as f:
            json.dump({
                "type": "FeatureCollection",
                "features": filtered_features
            }, f, indent=2)

In [None]:
# Use pathlib here: the imports cell runs `from glob import glob` after `import glob`,
# so `glob` names the function and `glob.glob(...)` raises AttributeError on a fresh kernel.
geojson_files = [
    str(path)
    for path in Path("/content/drive/MyDrive/RarePlanes/Test/geojson_cleaned").glob("*.geojson")
]
print(f"Number of files: {len(geojson_files)}")


Number of files: 831


### Image Folder

In [None]:
cleaned_geojson_dir = Path("/content/drive/MyDrive/RarePlanes/Test/geojson_cleaned") #output of the previous
image_tile_dir = Path("/content/drive/MyDrive/RarePlanes/Test/images")

# Tile names (file stems) that still have at least one valid aircraft.
valid_tiles = {f.stem for f in cleaned_geojson_dir.glob("*.geojson")}

# Safety net: with no cleaned labels every image would be deleted below, and the
# deletion cannot be undone.
if not valid_tiles:
    raise RuntimeError(
        f"No cleaned GeoJSON tiles found in {cleaned_geojson_dir}; refusing to delete images."
    )

png_files_to_keep = set()
removed_count = 0
for file in image_tile_dir.glob("*.png"):
    stem = file.stem
    if stem in valid_tiles:
        png_files_to_keep.add(stem)
    else:
        file.unlink()  # permanently removes the image from Drive
        removed_count += 1

print(f"Kept {len(png_files_to_keep)} images, removed {removed_count}")

### Number of Aircraft after Cleaning (To Check the Match with .geojson)

In [None]:
# Sum the features of every cleaned per-tile GeoJSON in the Test folder.
geojson_folder = "/content/drive/MyDrive/RarePlanes/Test/geojson_cleaned"
tile_label_paths = [
    os.path.join(geojson_folder, filename)
    for filename in os.listdir(geojson_folder)
    if filename.endswith(".geojson")
]

total = 0
for file_path in tile_label_paths:
    with open(file_path, 'r') as f:
        total += len(json.load(f).get("features", []))

print(f"Total aircraft: {total}")


Total aircraft: 3310


# Train-Test Distribution

In [None]:
def count_classes(json_path):
    """Report the civil/military balance of a GeoJSON file by ``role_simplified``.

    Args:
        json_path: Path to a GeoJSON file whose features all carry
            ``properties["role_simplified"]``.

    Returns:
        A string with the civil and military counts, each followed by its share
        of all features as a raw ratio, plus the number of features that are
        neither civil nor military (labelled "Any of them").

    Raises:
        KeyError: If a feature has no ``role_simplified`` property.
    """
    with open(json_path) as f:
        data = json.load(f)

    civil = 0
    military = 0
    na = 0
    for feature in data["features"]:
        role = feature["properties"]["role_simplified"]
        if role == "civil":
            civil += 1
        elif role == "military":
            military += 1
        else:
            na += 1

    # Named `total` rather than `all` so the builtin is not shadowed.
    total = military + civil + na

    def ratio(count):
        # Avoid ZeroDivisionError for a file without features.
        return count / total if total else 0.0

    # The closing parenthesis after the civil ratio was missing in the original string.
    return f"Civil: {civil} ({ratio(civil)}), Military: {military} ({ratio(military)}), Any of them: {na}"

In [None]:
# Civil/military balance of the cleaned train and test sets.
train_clean_geojson = '/content/drive/MyDrive/RarePlanes/Train/train_geojson_clean.geojson'
test_clean_geojson = '/content/drive/MyDrive/RarePlanes/Test/test_geojson_clean.geojson'

print(f"Train set: {count_classes(train_clean_geojson)}")
print(f"Test set: {count_classes(test_clean_geojson)}")

# Both splits are extremely imbalanced towards the civil class.

Train set: Civil: 9592 (0.9745986588091852, Military: 250 (0.025401341190814875), Any of them: 0
Test set: Civil: 3142 (0.9492447129909366, Military: 168 (0.05075528700906345), Any of them: 0


* COCO styled .json files are created on Docker by role_simplified as the label:

python /code/rareplanes_package/create_coco_real.py \
  --image_dir /code/Dataset_RarePlanes/train/images \
  --geojson_dir /code/Dataset_RarePlanes/train/geojson_clean \
  --output_path /code/roboflow_train_coco.json \
  --category_attribute role_simplified

python /code/rareplanes_package/create_coco_real.py \
  --image_dir /code/Dataset_RarePlanes/test/images \
  --geojson_dir /code/Dataset_RarePlanes/test/geojson_clean \
  --output_path /code/roboflow_test_coco.json \
  --category_attribute role_simplified

* They are uploaded on Roboflow and YOLO labels are extracted as .txt files

### Aircraft Keys

In [None]:
# Aircraft IDs (will be used for the XGBoost train/test split)

def label_class_ids_to_named_df(label_folder):
    """Build a table with one row per labelled object found in a folder of ``.txt`` files.

    Each non-blank line of a label file yields one row. Its ``image`` value is
    ``"<file stem>_<line number>"`` (1-based, blank lines still advance the
    number) and its ``class_id`` is the first whitespace-separated token of the
    line converted to ``int``.

    Args:
        label_folder: Directory containing the ``.txt`` label files.

    Returns:
        A DataFrame with columns ``image`` and ``class_id``. The columns are
        present even when the folder holds no labels, so downstream code and
        the CSV header do not depend on the folder being non-empty.

    Raises:
        ValueError: If the first token of a non-blank line is not an integer.
    """
    data = []

    # Sorted so the row order is deterministic across runs.
    label_files = sorted(glob(os.path.join(label_folder, "*.txt")))

    for label_file in label_files:
        base = os.path.splitext(os.path.basename(label_file))[0]  # file name without extension

        with open(label_file, 'r') as f:
            for i, line in enumerate(f):
                parts = line.split()
                if not parts:
                    continue  # blank line
                data.append({"image": f"{base}_{i+1}", "class_id": int(parts[0])})

    return pd.DataFrame(data, columns=["image", "class_id"])

# Folders holding the YOLO .txt label files for each split.
gt_train_labels_folder = "/content/drive/MyDrive/YOLO_roboflow_/train/labels"
gt_test_labels_folder = "/content/drive/MyDrive/YOLO_roboflow_/test/labels"

# One row per labelled object: "<label file stem>_<line number>" plus its class id.
gt_train = label_class_ids_to_named_df(gt_train_labels_folder)
gt_test = label_class_ids_to_named_df(gt_test_labels_folder)

In [None]:
# Persist the ground-truth tables next to each split's data, without the index column.
gt_train.to_csv("/content/drive/MyDrive/YOLO_roboflow_/train/gt_train.csv", index=False)
gt_test.to_csv("/content/drive/MyDrive/YOLO_roboflow_/test/gt_test.csv", index=False)

## Checking Overlap between Train and Test Set after Processing (for Data Leakage)

In [None]:
def extract_image_names(folder):
    """Collect the source-image names that have PNG tiles in ``folder``.

    NOTE: this repeats the definition from the earlier "Checking Overlap
    between Train and Test Set" section and rebinds the same name.

    Args:
        folder: Directory to scan (non-recursive).

    Returns:
        A set with, for every ``.png`` file, the part of its file name before
        the first ``"_tile_"`` (the whole file name if the marker is absent).
    """
    source_names = set()
    for entry in os.listdir(folder):
        if not entry.endswith(".png"):
            continue
        source_name, _, _ = entry.partition("_tile_")
        source_names.add(source_name)
    return source_names

train_dir = '/content/drive/MyDrive/YOLO_roboflow_/train/images'
test_dir = '/content/drive/MyDrive/YOLO_roboflow_/test/images'

train_images = extract_image_names(train_dir)
test_images = extract_image_names(test_dir)

# Source images that contributed tiles to both splits (would mean data leakage).
overlap = train_images & test_images

# Test the set's truthiness: `overlap is True` compares identity with the bool
# singleton and is False for every set, so the check could never report an overlap.
if overlap:
    print("Overlapping image:")
    for image_name in sorted(overlap):
        print(image_name)
else:
    print("No overlap")

No overlap
