# Generate train / test split

In [1]:
import os
from collections import defaultdict
import numpy as np
import joblib

import torch

In [2]:
# List every entry in the raw-mask directory and print how many there are.
# NOTE(review): presumably one file per raw image that has annotations -- confirm.
mask_files = os.listdir('/scratch/zach/masks_raw/')
print(len(mask_files))

310


In [3]:
# List the entries of the mask-tensor directory. The following cell groups
# these names by the text that precedes their first underscore.
mask_tensors = os.listdir('/scratch/zach/mask_tensors/')

In [4]:
# Count how many mask-tensor files share each filename prefix (the text before
# the first underscore). Nothing is deleted here; this only tallies names.
# NOTE(review): the prefix looks like the id of the raw image a tile was cut
# from -- confirm against the tiling code.
count_dict = defaultdict(int)

for tensor_name in mask_tensors:
    prefix = tensor_name.partition("_")[0]
    count_dict[prefix] += 1

In [5]:
# Sanity check: print one (prefix, file count) entry and stop.
for raw_id in count_dict:
    print(raw_id)
    print(count_dict[raw_id])
    break

11ska565830
136


In [6]:
# Dry run of the split: prefixes at enumeration positions 0, 5, 10, ... of
# count_dict are counted as test, all others as train. Only the per-prefix
# file counts are summed here; no assignment is made yet.
files_per_prefix = list(count_dict.values())
test_count = sum(files_per_prefix[::5])
train_count = sum(files_per_prefix) - test_count

print(test_count)
print(train_count)
print(test_count+train_count)

3717
14098
17815


In [19]:
# Skeleton of the split. For each of "test" and "train":
#   "raw"   -- filename prefixes assigned to that split
#   "mask"  -- image names that have a mask (filled in a later cell)
#   "empty" -- image names without a mask (filled in a later cell)
dataset = {
    split_name: {"raw": [], "mask": [], "empty": []}
    for split_name in ("test", "train")
}

# Assign prefixes: every fifth key of count_dict (positions 0, 5, 10, ...)
# goes to test, the remaining keys go to train.
for position, raw_id in enumerate(count_dict):
    target = "test" if position % 5 == 0 else "train"
    dataset[target]["raw"].append(raw_id)

In [20]:
# List every entry in the image directory and print the total. These names are
# sorted into the "mask" / "empty" buckets of `dataset` in a later cell.
all_images = os.listdir('/scratch/zach/solar-pv/')
print(len(all_images))

421888


In [9]:
# Peek at the first listed name to see the filename pattern.
all_images[0]

'11ska475815_15_09.tif'

In [10]:
# List the mask directory. An image counts as "masked" later on when its
# filename also appears in this listing.
all_masks = os.listdir('/scratch/zach/masks/')
len(all_masks)

18302

In [21]:
# Assign every image to the train or test split according to its filename
# prefix (the text before the first "_"): prefixes listed in
# dataset['test']['raw'] go to test, everything else goes to train.
#   * Images whose name also appears in all_masks are always kept ("mask").
#   * Images without a mask are randomly subsampled ("empty"): each one is
#     kept with probability EMPTY_KEEP_PROB.
EMPTY_KEEP_PROB = 0.1  # fraction of unmasked images to keep

# Sets give constant-time membership tests. With plain lists, every one of the
# images triggered a linear scan of all_masks and of the test prefixes.
mask_name_set = set(all_masks)
test_raw_set = set(dataset['test']['raw'])

# A seeded generator plus sorted iteration makes the subsample reproducible;
# os.listdir makes no promise about the order of the names it returns.
split_rng = np.random.default_rng(seed=42)

# Start from empty lists so re-running this cell does not append duplicates.
for split_name in ('test', 'train'):
    dataset[split_name]['mask'] = []
    dataset[split_name]['empty'] = []

for image_name in sorted(all_images):
    has_mask = image_name in mask_name_set
    # Only unmasked images consume a random draw, and most of them are dropped.
    if not has_mask and split_rng.random() >= EMPTY_KEEP_PROB:
        continue
    raw_name = image_name.split("_")[0]
    split_name = 'test' if raw_name in test_raw_set else 'train'
    dataset[split_name]['mask' if has_mask else 'empty'].append(image_name)

In [12]:
# Report the size of every bucket, one number per line, in the order:
# train raw, train mask, train empty, test raw, test mask, test empty.
for split_name in ('train', 'test'):
    for bucket in ('raw', 'mask', 'empty'):
        print(len(dataset[split_name][bucket]))

248
14098
34388
62
3717
6076


In [33]:
# Optional: balance the TEST split so that the number of unmasked ("empty")
# examples is roughly equal to the number of masked examples.
# Each empty test image is kept with probability
#     prob = (#masked test images) / (#empty test images), capped at 1.
# The ratio is computed from the current lists rather than hard-coded, because
# the empty list comes from a random subsample and its length changes from
# run to run.
# NOTE(review): only the test split is balanced here; the train split is left
# as is -- confirm that this is intended.
n_test_mask = len(dataset['test']['mask'])
n_test_empty = len(dataset['test']['empty'])
# Guard against an empty list (no division by zero); then nothing is kept.
prob = min(1.0, n_test_mask / n_test_empty) if n_test_empty else 0.0

# Seeded so that the same subset is chosen on every run.
balance_rng = np.random.default_rng(seed=42)
test_empty = [
    fn for fn in dataset['test']['empty']
    if balance_rng.random() < prob
]

print(len(test_empty))

3725


In [34]:
# If the count printed above is close to the number of masked test examples,
# commit the down-sampled list by replacing the test split's "empty" list.
# Note that this overwrites the earlier, larger list.
dataset['test']['empty'] = test_empty

In [36]:
# Persist the split dictionary for later use. The file is written with
# joblib.dump (a joblib file, not JSON) and should be read back with joblib.load.
joblib.dump(dataset, '/scratch/zach/train_test_split.joblib')

# The label-limited case
We think it would be helpful to have a dataset in which the train and test splits
each contain only ~1000 examples (500 masked and 500 empty images per split, as sampled below).
That gives us a truly label-limited setting in which to identify which model works best.

In [32]:
# Empty container for the label-limited split: for each of "test" and "train",
# one list of masked image names and one list of empty image names.
small = {
    split_name: {bucket: [] for bucket in ("mask", "empty")}
    for split_name in ("test", "train")
}



In [33]:
# Fill every bucket of `small` with 500 names drawn without replacement from
# the matching bucket of `dataset`. The generator is seeded, so the draw is
# the same on every run for the same input lists.
generator = np.random.default_rng(seed=42)
for split_name, buckets in small.items():
    for bucket in buckets:
        # Re-assigning an existing key while iterating is safe: no key is added or removed.
        buckets[bucket] = generator.choice(dataset[split_name][bucket], 500, replace=False)

In [34]:
# Save the smaller dataset for future experiments.
# This must dump `small`; dumping `dataset` here would write a second copy of
# the full split under the "small" filename.
joblib.dump(small, '/scratch/zach/train_test_split_small.joblib')

['/scratch/zach/train_test_split_small.joblib']