In [2]:
import aug_util as aug
import wv_util as wv
import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image
import numpy as np
import csv
import tqdm
%matplotlib inline


In [5]:
#Loading our labels
coords1, chips1, classes1 = wv.get_labels('data/xView_train.geojson')

100%|██████████| 601937/601937 [00:02<00:00, 294968.26it/s]


In [6]:
# Grouped classes [1,2,3,4,5]
grouped_classes = [[11,12],[13],[17,18,20,21],\
       [19,23,24,25,28,29,60,61,65,26],[41,42,50,40,44,45,47,49]]
def transpose_class_counts(chips,classes):
    chip_names = np.unique(chips[5000:20000]) 
    ### Uncomment this to run over entire train dataset ###
    # chip_names = np.unique(chips) 
    results = np.zeros((len(chip_names),5))
    chip_strs = []
    for c_idx, c in tqdm.tqdm(enumerate(chip_names)):
        chip_strs.append(c)
        classes_chip = classes[chips==c]
        idx_filter = np.isin(classes_chip,grouped_classes[0])
        # initialize to all false
        for i,gc in (enumerate(grouped_classes)):
            is_in_idxs = np.isin(classes_chip,gc)
            classes_chip[is_in_idxs] = i
            idx_filter = np.logical_or(idx_filter,is_in_idxs)
        classes_chip = classes_chip[idx_filter]
        labels, counts = np.unique(classes_chip,return_counts=True)
        for label_idx,label in enumerate(labels):
            results[int(c_idx),int(label)] = counts[label_idx]
            pass
    chip_strs_col = np.array(chip_strs).reshape(-1,1)
    return (np.hstack((chip_strs_col,results)))

# make the table where filename is rowwise, columns are class
results = transpose_class_counts(chips1,classes1)

17it [00:00, 39.93it/s]


In [7]:
columns = \
    ['fname',
    'Small Aircraft',
    'Large Aircraft',
    'Small Vehicle',
    'Bus/Truck',
    'Boat']

df = pd.DataFrame(results).head()
df.columns = columns
df.head(10)

Unnamed: 0,fname,Small Aircraft,Large Aircraft,Small Vehicle,Bus/Truck,Boat
0,2268.tif,0.0,0.0,0.0,0.0,1.0
1,2270.tif,0.0,0.0,0.0,0.0,1.0
2,2278.tif,0.0,0.0,0.0,0.0,20.0
3,2279.tif,0.0,0.0,23.0,10.0,264.0
4,2281.tif,0.0,0.0,3.0,12.0,1.0


In [97]:
# the train test split code starts here

In [None]:
no_tif_col = results[:, 1:]

In [44]:
no_tif_col

array([['0.0', '0.0', '0.0', '0.0', '1.0'],
       ['0.0', '0.0', '0.0', '0.0', '1.0'],
       ['0.0', '0.0', '0.0', '0.0', '20.0'],
       ['0.0', '0.0', '23.0', '10.0', '264.0'],
       ['0.0', '0.0', '3.0', '12.0', '1.0'],
       ['0.0', '0.0', '134.0', '43.0', '159.0'],
       ['0.0', '0.0', '276.0', '179.0', '5.0'],
       ['0.0', '0.0', '52.0', '27.0', '5.0'],
       ['0.0', '0.0', '789.0', '50.0', '0.0'],
       ['0.0', '0.0', '180.0', '45.0', '0.0'],
       ['1.0', '1.0', '1295.0', '239.0', '0.0'],
       ['11.0', '2.0', '938.0', '52.0', '0.0'],
       ['0.0', '0.0', '1352.0', '239.0', '0.0'],
       ['42.0', '5.0', '1103.0', '60.0', '0.0'],
       ['0.0', '0.0', '833.0', '16.0', '0.0'],
       ['0.0', '0.0', '1172.0', '157.0', '0.0'],
       ['13.0', '14.0', '807.0', '45.0', '0.0']], dtype='<U32')

In [59]:
from sklearn.model_selection import train_test_split

In [85]:
def indToTifName(data, inds):
    res = []
    for ind in inds:
        res.append(data[ind][0])
    return res

In [86]:
def showDistribution(data, selected_indexes):
    res = []
    total = 0
    class_num = len(data[0])
    for i in range(class_num):
        for index in selected_indexes:
            total += float(data[index][i])
    for i in range(class_num):
        total_of_this_class = 0
        for index in selected_indexes:
            total_of_this_class += float(data[index][i])
        res.append(float(total_of_this_class)/total)
    return res

In [87]:
def checkThreshold(distr1, distr2, thres):
    if (len(distr1) != len(distr2)):
        print("columns' numbers don't fit.")
        return -1
    for i in range(len(distr1)):
        diff = abs(distr1[i] - distr2[i])
        if diff > thres:
            return False
    return True

In [88]:
def findBalance(data, train_percent, thres):
    tifs = len(data)
    class_num = len(data[0])
    for i in range(1000000):
        tr_set, te_set = train_test_split(np.array(list(range(len(data)))), test_size=1-train_percent)
        tr_d = showDistribution(data, tr_set)
        te_d = showDistribution(data, te_set)
        check = checkThreshold(tr_d, te_d, thres)
        if (check == -1):
            return -1
        elif (check == True):
            return tr_set, te_set
    return [], []

In [89]:
#input data (here "no_tif_col") has to be complete columns of classes of counts, no other columns like "tif_names"
#set train test split percentage (0.8) and threshold here (0.02)
train_ind, test_ind = findBalance(no_tif_col, 0.8, 0.02)

In [90]:
train_ind

array([ 0,  6,  9, 16, 15,  7, 14,  3, 13,  4,  8, 10,  2])

In [91]:
test_ind

array([ 1, 11, 12,  5])

In [92]:
# distributions
showDistribution(no_tif_col, train_ind)

[0.00723047127178825,
 0.0025823111684958036,
 0.8435119431891543,
 0.10845706907682376,
 0.03821820529373789]

In [93]:
showDistribution(no_tif_col, test_ind)

[0.0037529853292391675,
 0.000682360968952576,
 0.827021494370522,
 0.11395428181508017,
 0.05458887751620607]

In [94]:
#result tifs, need to use dataset with tif names as the input
train_tifs = indToTifName(results, train_ind)
test_tifs = indToTifName(results, test_ind)

In [95]:
train_tifs

['2268.tif',
 '2293.tif',
 '2303.tif',
 '2399.tif',
 '2398.tif',
 '2294.tif',
 '2387.tif',
 '2279.tif',
 '2386.tif',
 '2281.tif',
 '2301.tif',
 '2370.tif',
 '2278.tif']

In [96]:
test_tifs

['2270.tif', '2371.tif', '2384.tif', '2292.tif']