In [1]:
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import io
import shutil
import re 
import time

RANDOM_SEED = 8675309

## Set up PASCAL_2010 image dataset

In [2]:
# Root of the prepared datasets, relative to the notebook's working directory
DATASETS_DIR = r"../datasets"
PASCAL_2010_DIR = os.path.join(DATASETS_DIR, r"PASCAL2010")

# Where the raw Pascal2010 download lives. These locations are machine-specific, so an
# environment variable of the same name takes precedence; the literal is only the fallback.
PASCAL_2010_ORIGINAL_IMAGES_DIR = os.environ.get(
    "PASCAL_2010_ORIGINAL_IMAGES_DIR",
    r"G:\PASCAL_tinker\datasets\VOC2010_Context\JPEGImages",  # the JPEGImages directory of Pascal2010
)
PASCAL_2010_ORIGINAL_LABELS_DIR = os.environ.get(
    "PASCAL_2010_ORIGINAL_LABELS_DIR",
    r"G:\PASCAL_tinker\datasets\VOC2010_Context\SegmentationRawClass",  # the SegmentationRawClass directory of Pascal2010
)

# Destinations for the renamed image copies and the converted .npy label arrays
PASCAL_2010_IMAGES_DIR = os.path.join(PASCAL_2010_DIR, r"images")
PASCAL_2010_LABELS_DIR = os.path.join(PASCAL_2010_DIR, r"labels")


def set_up_pascal2010_dataset(
    original_images_dir=PASCAL_2010_ORIGINAL_IMAGES_DIR, 
    original_labels_dir=PASCAL_2010_ORIGINAL_LABELS_DIR, 
    images_dir=PASCAL_2010_IMAGES_DIR,
    labels_dir=PASCAL_2010_LABELS_DIR, 
    zfill=8):
    """Convert the raw PASCAL2010 label matrices and copy their images under a common naming scheme.

    For the i-th file of ``original_labels_dir`` (in sorted name order) the 'LabelMap'
    entry of the .mat file is saved as ``PASCAL2010_<i>.npy`` in ``labels_dir`` and the
    image ``<part0>_<part1>.jpg`` from ``original_images_dir`` is copied to
    ``PASCAL2010_<i>.jpg`` in ``images_dir``. ``<i>`` is zero-padded to ``zfill`` digits.

    Parameters:
        original_images_dir: directory holding the source .jpg images.
        original_labels_dir: directory holding the source .mat label files.
        images_dir: destination directory for the renamed image copies (created if missing).
        labels_dir: destination directory for the .npy labels (created if missing).
        zfill: width of the zero-padded index in the new file names.
    """
    os.makedirs(images_dir, exist_ok=True)
    os.makedirs(labels_dir, exist_ok=True)

    # os.listdir returns names in arbitrary order; sort so an instance always gets the same index
    for i, matrix_file in enumerate(sorted(os.listdir(original_labels_dir))):
        # Split on '_' and '.'; the first two parts name the matching image file
        name_parts = re.split(r'[_.]', matrix_file)
        source_image_name = name_parts[0] + "_" + name_parts[1] + ".jpg"
        instance_stem = "PASCAL2010_" + str(i).zfill(zfill)

        mat_label = io.loadmat(os.path.join(original_labels_dir, matrix_file))['LabelMap']
        np.save(os.path.join(labels_dir, instance_stem + ".npy"), mat_label)
        shutil.copyfile(
            os.path.join(original_images_dir, source_image_name),
            os.path.join(images_dir, instance_stem + ".jpg"),
        )

        
def run_set_up_pascal2010_dataset():
    """Run set_up_pascal2010_dataset() with all of its default directories."""
    set_up_pascal2010_dataset()


## Set up ADE image dataset

In [2]:
''' We already have the unprocessed .npy files, converted from the .png segmentation files of ADE. Let's rename them to a compatible convention '''

# Root of the prepared datasets, relative to the notebook's working directory
DATASETS_DIR = "../datasets"
ADE_DIR = os.path.join(DATASETS_DIR, "ADE")
# Sub-directories of the ADE dataset whose files get renamed in this cell
ADE_LABELS_DIR = os.path.join(ADE_DIR, "labels")
ADE_IMAGES_DIR = os.path.join(ADE_DIR, "images")


def rename_ade_labels(labels_dir=ADE_LABELS_DIR, zfill=8):
    """Rename every file in ``labels_dir`` in place to ``ADE_<id>.<ext>``.

    The file name is split on '_' and '.'; the second-to-last part is read as the
    integer instance id (re-padded to ``zfill`` digits) and the last part is kept as
    the extension.

    Parameters:
        labels_dir: directory whose files are renamed.
        zfill: width of the zero-padded id in the new name.

    Raises:
        ValueError: if the second-to-last name part is not an integer.
    """
    for label_file in os.listdir(labels_dir):
        # Raw-string pattern: the original '\_|\.' relied on invalid string escape sequences
        name_parts = re.split(r'[_.]', label_file)
        new_file_name = "ADE_" + str(int(name_parts[-2])).zfill(zfill) + "." + name_parts[-1]
        os.rename(os.path.join(labels_dir, label_file), os.path.join(labels_dir, new_file_name))
            
            
def rename_ade_images(images_dir=ADE_IMAGES_DIR, zfill=8):
    """Rename every file in ``images_dir`` in place to ``ADE_<id>.<ext>``.

    The file name is split on '_' and '.'; the second-to-last part is read as the
    integer instance id (re-padded to ``zfill`` digits) and the last part is kept as
    the extension.

    Parameters:
        images_dir: directory whose files are renamed.
        zfill: width of the zero-padded id in the new name.

    Raises:
        ValueError: if the second-to-last name part is not an integer.
    """
    for image_file in os.listdir(images_dir):
        # Raw-string pattern: the original '\_|\.' relied on invalid string escape sequences
        name_parts = re.split(r'[_.]', image_file)
        new_file_name = "ADE_" + str(int(name_parts[-2])).zfill(zfill) + "." + name_parts[-1]
        os.rename(os.path.join(images_dir, image_file), os.path.join(images_dir, new_file_name))
        
        
def run_setup_ade_image_dataset():
    """Rename the ADE images, then the ADE labels, using the default directories."""
    rename_ade_images()
    rename_ade_labels()


## Create ADE Dataframe 

In [2]:
# Directory layout of the prepared ADE dataset
DATASETS_DIR = "../datasets"
ADE_DIR = os.path.join(DATASETS_DIR, "ADE")
ADE_LABELS_DIR = os.path.join(ADE_DIR, "labels")
ADE_DATAFRAMES_DIR = os.path.join(ADE_DIR, "dataframes")
ADE_MISC_DIR = os.path.join(ADE_DIR, "miscellaneous")
# Tab-separated class table read by get_ADE_classes
ADE_CLASS_FILE = os.path.join(ADE_MISC_DIR, "class_list.txt")

# Label value counted towards the 'invalid' ratio of an ADE instance
ADE_CLASS_ID_INVALID = 0


def get_ADE_classes(class_file):
    """Return the ADE class names as a 1-D object array.

    ``class_file`` is read as a tab-separated table with a header row. For every entry
    of its 'Name' column only the text before the first comma is kept, and trailing
    whitespace is stripped.

    Parameters:
        class_file: path of the tab-separated class list.

    Returns:
        numpy array (dtype object) with one cleaned name per row of the file.
    """
    classes_df = pd.read_csv(class_file, header=0, sep='\t', lineterminator='\n')
    # Build a fresh array instead of writing into the array that backs the DataFrame
    # column: that view can be read-only (pandas copy-on-write) and is shared with classes_df.
    return np.asarray(
        [raw_name.split(',')[0].rstrip() for raw_name in classes_df['Name']],
        dtype=object,
    )


def get_num_instances(directory, extension):
    """Return the number of entries in ``directory`` whose name ends in ``extension``.

    A name counts when its last part, after splitting on '_' and '.', equals
    ``extension`` (given without the leading dot, e.g. "jpg").
    """
    # Raw-string pattern: the original '\_|\.' relied on invalid string escape sequences
    return sum(
        1
        for file_name in os.listdir(directory)
        if re.split(r'[_.]', file_name)[-1] == extension
    )


def save_dataframe(df, dataset_dir, name, index=False):
    """Write ``df`` as CSV to ``<dataset_dir>/dataframes/<name>``.

    Parameters:
        df: DataFrame to write.
        dataset_dir: dataset root; the file goes into its "dataframes" sub-directory.
        name: file name of the CSV.
        index: forwarded to ``DataFrame.to_csv`` (whether to write the row index).
    """
    dataframes_dir = os.path.join(dataset_dir, "dataframes")
    # to_csv does not create missing directories, so make sure the target exists
    os.makedirs(dataframes_dir, exist_ok=True)
    df.to_csv(os.path.join(dataframes_dir, name), index=index)

    
def load_dataframe(filename, dataset_dir, header=None, sep=',', index_col=0):
    """Read ``<dataset_dir>/dataframes/<filename>`` as CSV and return the DataFrame.

    ``header``, ``sep`` and ``index_col`` are forwarded to ``pandas.read_csv``; lines are
    always split on '\\n'.
    """
    csv_path = os.path.join(dataset_dir, "dataframes", filename)
    return pd.read_csv(
        csv_path,
        header=header,
        sep=sep,
        index_col=index_col,
        lineterminator='\n',
    )


def create_initial_df(dataset_dir, class_id_invalid):
    ''' Build the per-instance base table of a dataset: id, width, height, scene (optional) and invalid ratio.

    Every file in ``<dataset_dir>/labels`` is loaded with ``np.load``; its instance id is
    the second-to-last part of the file name after splitting on '_' and '.', and the id
    is used as the row position. ``width``/``height`` are ``label.shape[1]``/``label.shape[0]``
    and ``invalid`` is the fraction of label entries equal to ``class_id_invalid``.

    When ``<dataset_dir>/miscellaneous/scene_categories.txt`` is readable, its second
    space-separated column becomes the ``scene`` column.
    NOTE(review): this assumes the rows of that file are ordered by instance id - confirm.

    Parameters:
        dataset_dir: dataset root containing "images" and "labels".
        class_id_invalid: label value that counts as invalid.

    Returns:
        DataFrame with columns id, width, height, [scene,] invalid.

    Raises:
        AssertionError: if the number of .jpg images and .npy labels differ, or the scene
            file does not have one row per instance.
    '''
    images_dir = os.path.join(dataset_dir, "images")
    labels_dir = os.path.join(dataset_dir, "labels")

    total_num_images = get_num_instances(images_dir, "jpg")
    total_num_labels = get_num_instances(labels_dir, "npy")
    assert total_num_images == total_num_labels, "number of images and labels differ"
    total_num_instances = total_num_images

    id_col = np.zeros(total_num_instances, dtype=np.int64)
    width_col = np.zeros(total_num_instances, dtype=np.int64)
    height_col = np.zeros(total_num_instances, dtype=np.int64)
    invalid_class_node_ratio_col = np.zeros(total_num_instances, dtype=np.float64)

    for label_file in os.listdir(labels_dir):
        label = np.load(os.path.join(labels_dir, label_file))

        # Raw-string pattern: the original '\_|\.' relied on invalid string escape sequences
        instance_id = int(re.split(r'[_.]', label_file)[-2])
        height, width = label.shape[0], label.shape[1]

        id_col[instance_id] = instance_id
        width_col[instance_id] = width
        height_col[instance_id] = height
        invalid_class_node_ratio_col[instance_id] = ((label == class_id_invalid).sum())/(width*height)

    # Assemble the columns once; dict insertion order fixes the column order
    columns = {
        "id": id_col,
        "width": width_col,
        "height": height_col,
    }

    # Create scene column using the text file given, when the dataset ships one
    scene_categories_file = os.path.join(dataset_dir, "miscellaneous", "scene_categories.txt")
    if os.path.isfile(scene_categories_file) and os.access(scene_categories_file, os.R_OK):
        scene_col = (pd.read_csv(scene_categories_file, header=None, sep=' ', lineterminator='\n').iloc[:,1]).to_numpy()
        assert scene_col.shape[0] == total_num_instances, "scene file does not have one row per instance"
        columns["scene"] = scene_col

    columns["invalid"] = invalid_class_node_ratio_col

    return pd.DataFrame(columns)


def _class_proportions(label, num_valid_classes):
    ''' Return, for class ids 1..num_valid_classes, the fraction of entries of label equal to that id '''
    proportions = np.zeros(num_valid_classes, dtype=np.float64)
    # One pass over the label: every distinct value together with its number of occurrences
    present_ids, pixel_counts = np.unique(label, return_counts=True)
    # Compare as float64 so small unsigned label dtypes cannot overflow against num_valid_classes
    present_ids = present_ids.astype(np.float64)
    is_valid_class = (
        (present_ids >= 1)
        & (present_ids <= num_valid_classes)
        & (present_ids == np.floor(present_ids))
    )
    # Class id k is stored in column k-1
    proportions[present_ids[is_valid_class].astype(np.int64) - 1] = pixel_counts[is_valid_class] / label.size
    return proportions


def create_class_proportion_df(dataset_dir, classes):
    ''' Create pixel-ratio of each class to measure its apparency in an image. We can use pearson correlation to see which classes can be binned together

    Every file in ``<dataset_dir>/labels`` is loaded with ``np.load``; its instance id
    (second-to-last part of the file name after splitting on '_' and '.') selects the
    row, and column k holds the fraction of label entries equal to class id k+1.

    The original implementation scanned the whole label once per class through
    np.vectorize (its recorded run time was about 1445 s); the counts are now obtained
    from a single np.unique call per label and give the same ratios.

    Parameters:
        dataset_dir: dataset root containing "images" and "labels".
        classes: sequence of class names; entry k names class id k+1.

    Returns:
        DataFrame of shape (instances, len(classes)) with ``classes`` as column names.

    Raises:
        AssertionError: if the number of .jpg images and .npy labels differ.
    '''
    images_dir = os.path.join(dataset_dir, "images")
    labels_dir = os.path.join(dataset_dir, "labels")

    total_num_images = get_num_instances(images_dir, "jpg")
    total_num_labels = get_num_instances(labels_dir, "npy")
    assert total_num_images == total_num_labels, "number of images and labels differ"
    total_num_instances = total_num_images

    num_valid_classes = len(classes)
    class_proportion = np.zeros((total_num_instances, num_valid_classes), dtype=np.float64)

    for label_file in os.listdir(labels_dir):
        # Raw-string pattern: the original '\_|\.' relied on invalid string escape sequences
        instance_id = int(re.split(r'[_.]', label_file)[-2])
        label = np.load(os.path.join(labels_dir, label_file))
        class_proportion[instance_id] = _class_proportions(label, num_valid_classes)

    return pd.DataFrame(class_proportion, columns=classes)
    

def create_master_df(dataset_dir, classes, class_id_invalid, name):
    ''' Merge all the dataframes by column, tag them with the dataset name and save the result.

    The base table from create_initial_df and the class-ratio table from
    create_class_proportion_df are concatenated column-wise. The row index is then
    materialised as an 'index' column, trailing whitespace is stripped from every column
    name, a 'dataset' column holding ``os.path.basename(dataset_dir)`` is inserted at
    position 2, and the frame is written through save_dataframe under ``name``.

    Parameters:
        dataset_dir: dataset root; its base name becomes the 'dataset' value.
        classes: class names forwarded to create_class_proportion_df.
        class_id_invalid: invalid label value forwarded to create_initial_df.
        name: file name of the CSV to write.

    Returns:
        The merged DataFrame.

    Raises:
        AssertionError: if the two tables do not have the same number of rows.
    '''
    initial_df = create_initial_df(dataset_dir=dataset_dir, class_id_invalid=class_id_invalid)
    class_proportion_df = create_class_proportion_df(dataset_dir=dataset_dir, classes=classes)

    assert initial_df.shape[0] == class_proportion_df.shape[0], "row counts of the two tables differ"

    # Track the id of the instance: reset_index adds the row position as an 'index' column
    master_df = pd.concat([initial_df, class_proportion_df], axis=1).reset_index()

    # Remove carriage return from the column names. A new list is assigned rather than
    # editing the array behind master_df.columns in place.
    master_df.columns = [column.rstrip() for column in master_df.columns]

    # Add a column to indicate what dataset this instance belongs to (scalar is broadcast to all rows)
    master_df.insert(loc=2, column='dataset', value=os.path.basename(dataset_dir))

    save_dataframe(df=master_df, dataset_dir=dataset_dir, name=name)

    return master_df

    
def create_ade_dataframe():
    """Build and save "ADE_master.csv" for the ADE dataset, then print the elapsed seconds."""
    started_at = time.time()
    create_master_df(
        dataset_dir=ADE_DIR,
        classes=get_ADE_classes(class_file=ADE_CLASS_FILE),
        class_id_invalid=ADE_CLASS_ID_INVALID,
        name="ADE_master.csv",
    )
    elapsed = time.time() - started_at
    print("Time elapsed: {}".format(elapsed)) #Time elapsed: 1928.032068014145

Time elapsed: 0.0029981136322021484


## Create PASCAL Dataframe 

In [6]:
# Directory layout of the prepared PASCAL2010 dataset
DATASETS_DIR = "../datasets"
PASCAL2010_DIR = os.path.join(DATASETS_DIR, "PASCAL2010")
PASCAL2010_LABELS_DIR = os.path.join(PASCAL2010_DIR, "labels")
PASCAL2010_DATAFRAMES_DIR = os.path.join(PASCAL2010_DIR, "dataframes")
PASCAL2010_MISC_DIR = os.path.join(PASCAL2010_DIR, "miscellaneous")
# Class list read by get_PASCAL2010_classes
PASCAL2010_CLASS_FILE = os.path.join(PASCAL2010_MISC_DIR, "class_list.txt")

# Label value counted towards the 'invalid' ratio of a PASCAL2010 instance
PASCAL2010_CLASS_ID_INVALID = 431

def get_PASCAL2010_classes(class_file):
    ''' Read the PASCAL2010 class list and return the class names as a flat array.

    Every entry read from class_file is split on ': ' and the second piece is kept,
    with trailing whitespace stripped.

    Open question from the original author: move unknown from 431 to 0 and slide the rest down?
    '''
    # NOTE(review): 'python-fwf' is not among the engines documented for read_csv, and a
    # two-character lineterminator may not be accepted by every pandas version - confirm
    # this call against the installed pandas before relying on it.
    classes_df = pd.read_csv(class_file, header=None, engine = 'python-fwf', lineterminator='\r\n')
    classes = np.asarray(classes_df).flatten()
    for i in range(len(classes)):
        # Keep what follows the first ': ' separator, then drop trailing whitespace
        classes[i] = re.split(': ', classes[i])[1]
        classes[i] = classes[i].rstrip()
   
    # Disabled: removing the "unknown" entry from the returned names
    #classes = list(classes)
    #classes.remove("unknown")
        
    return np.asarray(classes)


def rearrange_PASCAL2010_df(dataset_dir, name):
    ''' Overwrite the invalid column with the values of the unknown column, drop unknown, and save the frame back under the same name '''
    master = load_dataframe(filename=name, header=0, dataset_dir=dataset_dir, index_col=None)

    # Copy by position (.values) so the assignment does not depend on index alignment
    master["invalid"] = master["unknown"].values
    master = master.drop(["unknown"], axis=1)

    save_dataframe(df=master, dataset_dir=dataset_dir, name=name)


def create_pascal_dataframe():
    """Build and save "PASCAL2010_master.csv", move its 'unknown' ratio into 'invalid', and print the elapsed seconds."""
    start = time.time()
    pascal2010_classes = get_PASCAL2010_classes(class_file=PASCAL2010_CLASS_FILE)
    # Fixed: the original passed PASCAL_CLASS_ID_INVALID, a name that is never defined;
    # the constant declared in this cell is PASCAL2010_CLASS_ID_INVALID.
    create_master_df(
        dataset_dir=PASCAL2010_DIR,
        classes=pascal2010_classes,
        class_id_invalid=PASCAL2010_CLASS_ID_INVALID,
        name="PASCAL2010_master.csv",
    )
    rearrange_PASCAL2010_df(dataset_dir=PASCAL2010_DIR, name="PASCAL2010_master.csv")
    end = time.time()
    print("Time elapsed: {}".format(end-start)) #Time elapsed: 1489.9916152954102

Time elapsed: 0.009996652603149414


## Separate ADE dataset by indoor and outdoor

In [9]:
# Location of the prepared ADE dataset, relative to the notebook's working directory
DATASETS_DIR = "../datasets"
ADE_DIR = os.path.join(DATASETS_DIR, "ADE")

def separate_ade_indoor_outdoor():
    """Print exploratory counts that separate ADE instances into indoor and outdoor scenes.

    Loads "ADE_master.csv" and derives boolean masks from the per-class pixel ratios
    (wall, floor, sky, road, sidewalk, car, truck, van, grass). Nothing is returned;
    the counts and the selected ids are printed.
    """
    ade = load_dataframe(filename="ADE_master.csv", dataset_dir=ADE_DIR, header=0, index_col=None)
    print(len(ade))

    # ---- Indoor ----
    wall_present = ade['wall'] > 0.0
    print(wall_present.sum())

    floor_present = ade['floor'] > 0.0
    print(floor_present.sum())

    print((wall_present | floor_present).sum())

    more_wall_than_sky = ade['wall'] > ade['sky']
    print(more_wall_than_sky.sum())

    # Either is a picture of house with floor or if no floor, there would be walls (wall > sky means to include possible windows)
    indoor = more_wall_than_sky | floor_present
    print("Indoor: {}".format(indoor.sum()))

    # Instances that count as indoor only because of the floor rule
    indoor_by_floor_only = indoor ^ more_wall_than_sky
    print("XIndoor: {}".format(indoor_by_floor_only.sum()))
    print(ade.loc[indoor_by_floor_only, 'id'])

    # ---- Outdoor ----
    sky_present = ade['sky'] > 0.0
    print(sky_present.sum())

    road_present = ade['road'] > 0.0
    print(road_present.sum())

    sidewalk_present = ade['sidewalk'] > 0.0
    print("Sidewalk: {}".format(sidewalk_present.sum()))

    road_without_sidewalk = road_present & ~sidewalk_present
    print("Road no sidewalk: {}".format(road_without_sidewalk.sum()))

    # Isolate possible sidewalk/road environments using cars/trucks/vans as reference
    car_present = ade['car'] > 0.0
    truck_present = ade['truck'] > 0.0
    van_present = ade['van'] > 0.0

    vehicle_present = (car_present | truck_present) | van_present
    road_without_vehicle = road_present & ~vehicle_present
    print(road_without_vehicle.sum())

    grass_present = ade['grass'] > 0.0
    grass_outside = grass_present & ~indoor
    print("Walkable grass: {}".format(grass_outside.sum()))

    print(list(ade.loc[grass_outside, 'id']))
    
    
def run_separate_ade_indoor_outdoor():
    """Run separate_ade_indoor_outdoor(), which prints its indoor/outdoor counts."""
    separate_ade_indoor_outdoor()


22210
12755
10237
13055
11453
Indoor: 11821
XIndoor: 368
588        588
944        944
951        951
960        960
968        968
         ...  
21870    21870
21956    21956
22089    22089
22121    22121
22145    22145
Name: id, Length: 368, dtype: int64
9040
4397
Sidewalk: 3372
Road no sidewalk: 1694
1360
Walkable grass: 2446
[562, 563, 571, 572, 579, 582, 933, 934, 941, 945, 950, 969, 970, 973, 975, 976, 979, 980, 982, 983, 984, 986, 987, 988, 991, 992, 994, 996, 1002, 1003, 1004, 1005, 1007, 1010, 1011, 1012, 1014, 1015, 1018, 1021, 1024, 1025, 1029, 1032, 1033, 1034, 1036, 1038, 1080, 1084, 1085, 1086, 1090, 1313, 1315, 1323, 1324, 1369, 1376, 1403, 1411, 1414, 1417, 1422, 1424, 1428, 1429, 1432, 1438, 1458, 1466, 1481, 1482, 1483, 1505, 1507, 1510, 1511, 1516, 1520, 1527, 1528, 1530, 1535, 1545, 1546, 1547, 1588, 1589, 1621, 1853, 1854, 2037, 2039, 2040, 2071, 2081, 2082, 2083, 2087, 2091, 2093, 2096, 2107, 2111, 2116, 2258, 2317, 2352, 2354, 2355, 2357, 2360, 2362, 2363, 2430,

## Decide whether to interpolate based on the missing (invalid) data in each dataset

In [25]:
# Locations of the two prepared datasets, relative to the notebook's working directory
DATASETS_DIR = "../datasets"
PASCAL2010_DIR = os.path.join(DATASETS_DIR, "PASCAL2010")
ADE_DIR = os.path.join(DATASETS_DIR, "ADE")


def print_missing_data_quantiles():
    """Print quantiles of the 'invalid' ratio for ADE, for PASCAL2010, and for both combined.

    The two per-dataset summaries use eight fixed quantile levels; the combined summary
    uses every 5% step from 0 to 1.
    """
    summary_levels = [0, 0.05, 0.25,0.5,0.75, 0.90, 0.95, 1.0]

    ade_frame = load_dataframe(filename="ADE_master.csv", dataset_dir=ADE_DIR, header=0, index_col=None)
    pascal_frame = load_dataframe(filename="PASCAL2010_master.csv", dataset_dir=PASCAL2010_DIR, header=0, index_col=None)

    # Per-dataset view first: ADE, then PASCAL2010
    for frame in (ade_frame, pascal_frame):
        print(frame['invalid'].quantile(summary_levels))

    # Combined view on a finer grid
    combined = pd.concat([ade_frame, pascal_frame])
    fine_levels = [step*0.01 for step in range(0, 100+5, 5)]
    print(combined['invalid'].quantile(fine_levels))
    
def run_print_missing_data_quantiles():
    """Run print_missing_data_quantiles(), which prints the quantile tables."""
    print_missing_data_quantiles()

'''
# Do not interpolate erroneous data that has too much invalid labels 
# Erroneous data 1: images with large amount of invalid labels
# Here we will cull 2/10 (20%) of the data with the most invalid nodes
qt, bins = pd.qcut(df_ade['invalid'], q=10, precision=0, retbins=True)
print(qt)
threshold_invalid_ratio = bins[8]
print("Threshold invalid: {}".format(threshold_invalid_ratio))
#ADE_df_cleaned = ADE_df[ADE_df['invalid'] <= threshold_invalid_ratio]
'''


0.00    0.000000
0.05    0.007164
0.25    0.014094
0.50    0.027032
0.75    0.091356
0.90    0.269002
0.95    0.423342
1.00    1.000000
Name: invalid, dtype: float64
0.00    0.000000
0.05    0.000000
0.25    0.000000
0.50    0.000000
0.75    0.009342
0.90    0.074192
0.95    0.212617
1.00    0.958399
Name: invalid, dtype: float64
0.00    0.000000
0.05    0.000000
0.10    0.000000
0.15    0.000000
0.20    0.000616
0.25    0.006334
0.30    0.009039
0.35    0.011261
0.40    0.013458
0.45    0.015784
0.50    0.018402
0.55    0.021820
0.60    0.026721
0.65    0.033970
0.70    0.045306
0.75    0.062729
0.80    0.090853
0.85    0.137828
0.90    0.223347
0.95    0.379932
1.00    1.000000
Name: invalid, dtype: float64


'\n# Do not interpolate erroneous data that has too much invalid labels \n# Erroneous data 1: images with large amount of invalid labels\n# Here we will cull 2/10 (20%) of the data with the most invalid nodes\nqt, bins = pd.qcut(df_ade[\'invalid\'], q=10, precision=0, retbins=True)\nprint(qt)\nthreshold_invalid_ratio = bins[8]\nprint("Threshold invalid: {}".format(threshold_invalid_ratio))\n#ADE_df_cleaned = ADE_df[ADE_df[\'invalid\'] <= threshold_invalid_ratio]\n'

## Create the first train_test_split via the invalid-label ratio (TODO: confirm the splitting criterion)

In [59]:
# Combine the datasets only by their invalid ratio, so that we can run train_test_split
# and then decide how much to interpolate based on the training part.
DATASETS_DIR = "../datasets"
PASCAL2010_DIR = os.path.join(DATASETS_DIR, "PASCAL2010")
ADE_DIR = os.path.join(DATASETS_DIR, "ADE")

df_ade = load_dataframe(dataset_dir=ADE_DIR, filename="ADE_master.csv", header=0, index_col=None)
df_pascal2010 = load_dataframe(dataset_dir=PASCAL2010_DIR, filename="PASCAL2010_master.csv", header=0, index_col=None)

# Keep only the seven leading columns (the metadata up to and including 'invalid')
df_ade = df_ade.iloc[:, :7]
print(df_ade.head())

ade_invalid = df_ade['invalid']
print(ade_invalid)


   index  id dataset  width  height             scene   invalid
0      0   0     ADE    683     512  airport_terminal  0.098448
1      1   1     ADE    711     512  airport_terminal  0.036494
2      2   2     ADE    683     512       art_gallery  0.200949
3      3   3     ADE    384     251          badlands  0.006111
4      4   4     ADE    683     512          ball_pit  0.733860
0        0.098448
1        0.036494
2        0.200949
3        0.006111
4        0.733860
           ...   
22205    0.005130
22206    0.023865
22207    0.012251
22208    0.420563
22209    0.706917
Name: invalid, Length: 22210, dtype: float64


In [None]:
# Combine and Separate into two datasets: One for inside and one for outside
#     This is based on correlation between scene and wall; ceiling; sky [floor, wall, ceiling, sky]

## Decide whether to interpolate based on the missing data

## Combine the ADE and PASCAL image dataset 

In [35]:
# Machine-specific locations; an environment variable of the same name takes precedence
# and the literal is only the fallback.
ADE_JPEG_IMAGES_DIR = os.environ.get(
    "ADE_JPEG_IMAGES_DIR",
    r"G:\segmentation_tinker\datasets\ADEChallengeData2016\images",
)
COMMON_JPEG_IMAGES_DIR = os.environ.get(
    "COMMON_JPEG_IMAGES_DIR",
    r"G:\segmentation_environment\datasets\ADE_PASCAL2010\images",
)

# Index layout of the combined dataset:
#   ADE INDEX IS 0-22209
#   PASCAL INDEX 22210-33530
OFFSET = 22210  # first index handed to a PASCAL instance
Z_FILL = 8  # zero-padded width of the numeric part of each combined file name

def rename_pascal(images_dir=PASCAL_2010_ORIGINAL_IMAGES_DIR, common_images_dir=COMMON_JPEG_IMAGES_DIR, offset=OFFSET):
    """Copy the PASCAL images into the common directory as ``IMAGE_<offset+i>.<ext>``.

    ``i`` is the position of the file in the sorted listing of ``images_dir``; the number
    is zero-padded to Z_FILL digits and ``<ext>`` is the third part of the source name
    after splitting on '_' and '.'. The source files are copied, not moved.

    NOTE(review): the original default was PASCAL_2010_JPEG_IMAGES_DIR, a name that is not
    defined anywhere in this notebook (NameError when the cell runs on a fresh kernel).
    PASCAL_2010_ORIGINAL_IMAGES_DIR is used instead - confirm it is the intended source.

    Parameters:
        images_dir: directory holding the PASCAL images.
        common_images_dir: destination directory (created if missing).
        offset: index given to the first PASCAL image.
    """
    os.makedirs(common_images_dir, exist_ok=True)
    # os.listdir returns names in arbitrary order; sort so an image always gets the same index
    for i, image_file in enumerate(sorted(os.listdir(images_dir))):
        # Raw-string pattern: the original '\_|\.' relied on invalid string escape sequences
        name_parts = re.split(r'[_.]', image_file)
        new_image_file = "IMAGE_" + str(offset + i).zfill(Z_FILL) + "." + name_parts[2]
        shutil.copy(os.path.join(images_dir, image_file), os.path.join(common_images_dir, new_image_file))

        
def rename_ade(images_dir=ADE_JPEG_IMAGES_DIR, common_images_dir=COMMON_JPEG_IMAGES_DIR, offset=OFFSET):
    """Copy the ADE images into the common directory as ``IMAGE_<id>.<ext>``.

    The source name is split on '_' and '.'; its third part is read as the integer id
    (zero-padded to Z_FILL digits) and its fourth part is kept as the extension. The
    source files are copied, not moved.

    Parameters:
        images_dir: directory holding the ADE images.
        common_images_dir: destination directory (created if missing).
        offset: not used by this function; kept so the signature mirrors rename_pascal.

    Raises:
        ValueError: if the third name part is not an integer.
        IndexError: if a name has fewer than four parts.
    """
    os.makedirs(common_images_dir, exist_ok=True)
    for image_file in os.listdir(images_dir):
        # Raw-string pattern: the original '\_|\.' relied on invalid string escape sequences
        name_parts = re.split(r'[_.]', image_file)
        new_image_file = "IMAGE_" + str(int(name_parts[2])).zfill(Z_FILL) + "." + name_parts[3]
        shutil.copy(os.path.join(images_dir, image_file), os.path.join(common_images_dir, new_image_file))

    
def run_combine_ade_pascal_images():
    """Copy the PASCAL images, then the ADE images, into the common images directory."""
    rename_pascal()
    rename_ade()

## Combine the ADE and PASCAL labels dataset 

In [50]:
''' 
Note, the labels in the common labels directory hold two kinds of labels: 
one for ADE which has a class range of [0,150] and the other for PASCAL2010 which has a class range of [0, 459].
It is up to the labels in the methodology to be cleaned, preprocessed, and consistent.
The common labels directory should just be used as a baseline reference.
'''

# Directory that receives the labels of both datasets under the LABEL_<index> naming
COMMON_LABELS_DIR = r"G:\segmentation_environment\datasets\ADE_PASCAL2010\labels"
# Source of the PASCAL label matrices read by concatenate_pascal_labels
PASCAL_LABELS_DIR = os.path.join(PASCAL_2010_DIR, "SegmentationRawClass")


def rename_ade_labels(labels_dir=COMMON_LABELS_DIR):
    """Rename the ``ADE_*`` files in ``labels_dir`` in place to ``LABEL_<id>.<ext>``.

    Only files whose first name part (split on '_' and '.') is "ADE" are touched. The
    second-to-last part is read as the integer id, zero-padded to Z_FILL digits, and the
    last part is kept as the extension.

    Note: this definition replaces the earlier rename_ade_labels(labels_dir, zfill)
    defined in the ADE set-up cell once this cell has run.

    Parameters:
        labels_dir: directory whose ADE label files are renamed.
    """
    # Fixed: the original listed COMMON_LABELS_DIR here and ignored the labels_dir argument
    for label_file in os.listdir(labels_dir):
        # Raw-string pattern: the original '\_|\.' relied on invalid string escape sequences
        name_parts = re.split(r'[_.]', label_file)
        if name_parts[0] == "ADE":
            new_file_name = "LABEL_" + str(int(name_parts[-2])).zfill(Z_FILL) + "." + name_parts[-1]
            os.rename(os.path.join(labels_dir, label_file), os.path.join(labels_dir, new_file_name))


def concatenate_pascal_labels(labels_dir=COMMON_LABELS_DIR, pascal_labels_dir=PASCAL_LABELS_DIR, offset=OFFSET):
    """Convert the PASCAL .mat labels to ``LABEL_<offset+i>.npy`` files in ``labels_dir``.

    ``i`` is the position of the file in the sorted listing of ``pascal_labels_dir``; the
    'LabelMap' entry of each .mat file is saved with ``np.save`` and the number is
    zero-padded to Z_FILL digits.

    Parameters:
        labels_dir: destination directory for the .npy files.
        pascal_labels_dir: directory holding the PASCAL .mat label files.
        offset: index given to the first PASCAL label.
    """
    # os.listdir returns names in arbitrary order; sort so a label always gets the same index
    for i, matrix_file in enumerate(sorted(os.listdir(pascal_labels_dir))):
        mat_label = io.loadmat(os.path.join(pascal_labels_dir, matrix_file))['LabelMap']
        label_file_name = "LABEL_" + str(i + offset).zfill(Z_FILL) + ".npy"
        np.save(os.path.join(labels_dir, label_file_name), mat_label)

    
def run_combine_ade_pascal_labels():
    """Rename the ADE labels in the common directory, then add the converted PASCAL labels."""
    rename_ade_labels()
    concatenate_pascal_labels()

## Find any amount of invalid labels in ADE_PASCAL2010 to be candidates for interpolation or culling

In [12]:
# Master tables of the two datasets. The locations are machine-specific, so an
# environment variable of the same name takes precedence over the fallback literal.
PASCAL2010_CSV = os.environ.get(
    "PASCAL2010_CSV",
    r"G:\segmentation_master\datasets\PASCAL2010\dataframes\PASCAL2010_master.csv",
)
ADE_CSV = os.environ.get(
    "ADE_CSV",
    r"G:\segmentation_master\datasets\ADE\dataframes\ADE_master.csv",
)

def find_invalid_set(csv):
    """Print and return the ids of the instances whose 'invalid' ratio is non-zero.

    Parameters:
        csv: path of a comma-separated master table with 'id' and 'invalid' columns.

    Returns:
        The 'id' values of the rows with ``invalid != 0``, as a pandas Series. The
        original returned nothing although run_find_invalid stores the result.
    """
    df = pd.read_csv(csv, header=0, sep=',', lineterminator='\n')
    # Compute the mask once and reuse it for both the count and the selection
    has_invalid = df['invalid'] != 0
    invalid_ids = df.loc[has_invalid, 'id']
    print(has_invalid.sum())
    print(invalid_ids)
    return invalid_ids
    
    
def run_find_invalid():
    """Run find_invalid_set() on the ADE master CSV, then on the PASCAL2010 master CSV.

    The results are bound to local names but are not used further or returned.
    """
    invalid_set_ade = find_invalid_set(csv=ADE_CSV)
    invalid_set_pascal = find_invalid_set(csv=PASCAL2010_CSV)

0
Series([], Name: id, dtype: int64)
0
Series([], Name: id, dtype: int64)
