# Exploratory Data Analysis (TBD)

In [1]:
import os
import sys
# Add the parent of the current working directory to the import path.
# Presumably that is the repository root holding the `utils` package
# imported below — confirm against the project layout.
sys.path.append(os.path.abspath(".."))

from utils.loading import load_masked_images, load_images, load_labels
from utils.visualization import plot_image

### Loading the data

In [None]:
# Load the three datasets used throughout the notebook. Each result is later
# indexed by image file name (e.g. "train_0.tif").
labels = load_labels()
# NOTE(review): subset=True presumably loads only part of the dataset to keep
# exploration fast — confirm in utils.loading.
images = load_images(subset=True)
masked_images = load_masked_images(subset=True)

### Exploring the loaded data

In [3]:
# File name used as the lookup key for the single sample inspected in the
# cells below (labels, images and masked images are all indexed with it).
example_image = "train_0.tif"

#### Labels

In [None]:
# Overview of the label collection: how many entries it has and what the
# first few keys look like.
label_keys = list(labels.keys())
print("Size of labels:", len(labels))
print("Keys for labels:", label_keys[:5], end="\n\n")

# Dump every label entry stored for the example image, one per line.
example_polygons = labels[example_image]
print(f"Example of labels for {example_image} with {len(example_polygons)} polygons:")
for polygon in example_polygons:
    print(polygon)

#### Images

In [None]:
# Look at one image entry: which keys it has, then its "profile" and "image"
# values, each followed by a blank line.
example_entry = images[example_image]
print("Keys for images:", example_entry.keys(), end="\n\n")

print(f"Profile for {example_image}:")
print(example_entry["profile"], end="\n\n")

print(f"Image for {example_image}:")
print(example_entry["image"], end="\n\n")

#### Masked images

In [None]:
# Same inspection as for the raw images, but on the masked variant of the
# example image: keys first, then the "profile" and "image" values.
example_masked_entry = masked_images[example_image]
print("Keys for masked_images:", example_masked_entry.keys(), end="\n\n")

print(f"Profile for {example_image}:")
print(example_masked_entry["profile"], end="\n\n")

print(f"Masked image for {example_image}:")
print(example_masked_entry["image"], end="\n\n")

### Checking for nan values

In [None]:
import numpy as np

# Full pixel array of the example image (all bands, not just the first one).
values = images[example_image]["image"]
print("Shape of an image:", values.shape)

# Collect the names of all images that contain at least one NaN anywhere.
# Checking the whole array at once avoids hard-coding the number of bands,
# so images with fewer or more than 12 bands are handled correctly.
found_nan = [img for img in images if np.isnan(images[img]["image"]).any()]

print(f"Found {len(found_nan)} images with NaN values: {found_nan}")

But do all bands of an image with NaN values have their NaNs at the same pixel positions?

In [None]:
# Find images whose NaN positions are not identical across bands.
#
# The first axis of each image array is treated as the band axis (the same
# assumption the per-band indexing `image[i]` relies on). A pixel position is
# inconsistent when it is NaN in at least one band but not in every band.
# Comparing "NaN in any band" with "NaN in all bands" detects exactly that,
# whereas only checking that the intersection of the masks is empty would miss
# images whose bands share some NaN pixels but differ elsewhere.
different_nan = []
for img in found_nan:
    nan_masks = np.isnan(images[img]["image"])  # one boolean mask per band
    nan_in_some_band = nan_masks.any(axis=0)
    nan_in_every_band = nan_masks.all(axis=0)
    if (nan_in_some_band != nan_in_every_band).any():
        different_nan.append(img)

print(f"Found {len(different_nan)} with different NaN values per band: {different_nan}")

In [None]:
# Plot every image that was flagged for having band-dependent NaN positions,
# requesting three plots per image.
for flagged_name in different_nan:
    plot_image(flagged_name, num_plots=3)

### Is other data cleaning necessary?

In [None]:
# TODO: decide whether further data cleaning is needed beyond the NaN checks above.

### Looking at a data point's profile

In [None]:
# Show the "profile" entry stored alongside the example image.
example_profile = images[example_image]["profile"]
print(example_profile)

### Visualizing a data point

### Can we visualize the locations on a map? (Do tools for this already exist?)

### Normalizing the data

### Is other preprocessing necessary?