## Notebook for data exploration and visualization

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

In [None]:
# Load the three split tables (train, val, fix_val). The frames are unpacked
# in the same order as the CSV paths listed below.
df_train, df_val, df_fix = (
    pd.read_csv(csv_path)
    for csv_path in (
        "datasets_pytorch/ai4forest_camera/train.csv",
        "datasets_pytorch/ai4forest_camera/val.csv",
        "datasets_pytorch/ai4forest_camera/fix_val.csv",
    )
)

In [None]:
# Fraction of the summed g* columns of the training table that belongs to
# "g30" (described by the original author as the proportion of 30m pixels).
g30, g25, g20, g15, g10, g5 = (
    df_train[column].sum()
    for column in ("g30", "g25", "g20", "g15", "g10", "g5")
)
gsum = g30 + g25 + g20 + g15 + g10 + g5
# Last expression of the cell: the notebook displays this ratio.
g30 / gsum

In [None]:
def cleanup(df):
    """Encode ``hemisphere`` as integer category codes and rename it ``hemi_S``.

    The frame is modified in place and the same object is returned.

    Calling this on a frame that was already cleaned (``hemi_S`` present,
    ``hemisphere`` absent) is a no-op, so the notebook cell can be re-run
    without a ``KeyError``. A frame that has neither column still raises
    ``KeyError``.

    NOTE(review): the codes are assigned per frame from the sorted distinct
    values present in that frame. A frame containing a single hemisphere
    value therefore gets code 0 for it, whatever that value is — confirm that
    every table contains both values before relying on ``hemi_S`` == 1
    meaning "south".
    """
    columns = df.columns
    if 'hemisphere' not in columns and 'hemi_S' in columns:
        # Already cleaned by an earlier run of this cell.
        return df

    # Change hemisphere from categorical labels to integer codes.
    df['hemisphere'] = df['hemisphere'].astype('category').cat.codes
    # Rename hemisphere to hemi_S (the column position is preserved).
    df.rename(columns={'hemisphere': 'hemi_S'}, inplace=True)
    return df

# Run cleanup() on each split table, in order, and rebind the names to the
# returned frames.
df_train, df_val, df_fix = (
    cleanup(split) for split in (df_train, df_val, df_fix)
)

In [None]:
# Load the samples table (samples.csv) from the dataset folder.
df = pd.read_csv("datasets_pytorch/ai4forest_camera/samples.csv")

In [None]:
# Display the samples table held in `df`.
df

In [None]:
# df_eu = df[
#         (df['lat']>35)  & (-12<df['long']) & (df['long']<60) 
#         ]
# df_eu.to_csv("datasets_pytorch/ai4forest_camera/samples_eu.csv")
# len(df_eu)

In [None]:
# Scatter the sample coordinates ('long' / 'lat' columns) of the validation
# and training tables on a fixed longitude/latitude grid.
fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(111)

# Each table gets one fixed colour. Validation is drawn first (black), then
# training (light grey), so training points are painted on top.
ax.scatter(x=df_val['long'], y=df_val['lat'], s=1, c="black", alpha=0.7)
ax.scatter(x=df_train['long'], y=df_train['lat'], s=1, c="lightgrey", alpha=0.7)
# Disabled variants kept from the original cell:
# ax.scatter(df_fix['long'], df_fix['lat'], c="black", alpha=1, s=1)
# ax.scatter(lon, lat, c=data['zone_number'], cmap='prism', alpha=0.5, s=12) # time zones
# ax.set_xlabel('Longitude')
# ax.set_ylabel('Latitude')

# Whole-globe extent with a tick every 30 degrees; the x range is padded by
# 5 degrees on each side.
ax.set_xlim(-185, 185)
ax.set_xticks(np.arange(-180, 181, 30))
ax.set_ylim(-90, 90)
ax.set_yticks(np.arange(-90, 91, 30))
plt.show()

# TODO: overlay world map

In [None]:
# Alias (same object, not a copy) of df_fix; the index reset is left disabled.
df_fix_idx = df_fix #.reset_index()

In [None]:
# Display the table referenced by df_fix_idx.
df_fix_idx

In [None]:
import numpy as np

# Inspect one sample archive: print its metadata and plot every channel of
# its 'data' array in a 7 x 2 grid.

# Sample to inspect: the 'path' entry with index label 0 of the training table.
# Alternatives used previously:
# npz_name = df_fix_idx['path'][2]
# npz_name = "sentinel_1_2_worldwide_utm_1N_25-0000012544-0000012544_0.npz"
npz_name = df_train['path'][0]

# The context manager closes the archive even if reading fails. The array is
# read exactly once because every npz[...] access re-reads it from the file.
with np.load(os.path.join("datasets_pytorch/ai4forest_camera/data", npz_name)) as npz:
    print("Contents:", npz.files)
    print("Type npz: ", type(npz))
    arr = npz['data']

# The original note expects a shape of (14, 512, 512).
print("Array shape:", arr.shape)
print("Data struct type:", type(arr))
print("Data type:", arr.dtype)

# 7 rows x 2 columns = 14 panels, flattened for easy iteration.
fig, axes = plt.subplots(nrows=7, ncols=2, figsize=(15, 30))
axes = axes.ravel()

# Never index past the available channels or past the available panels.
n_channels = min(arr.shape[0], len(axes))

# Plot each channel with its own colour bar.
for i in range(n_channels):
    im = axes[i].imshow(arr[i], cmap='viridis')
    axes[i].set_title(f'Channel {i+1}')
    fig.colorbar(im, ax=axes[i], fraction=0.046, pad=0.04)

# Hide any unused subplots.
for j in range(n_channels, len(axes)):
    axes[j].axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Plot up to three layers of the 'labels' array of the archive named by
# `npz_name` (assigned in an earlier cell).

# Read the labels once and close the archive right away: every npz[...]
# access re-reads the array from the file, and the handle was previously
# left open.
with np.load(os.path.join("datasets_pytorch/ai4forest_camera/data", npz_name)) as npz:
    labels = npz['labels']

# Check contents
print(labels.shape)

# 2 rows x 2 columns = 4 panels, flattened for easy iteration.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 15))
axes = axes.ravel()

# Show at most three label layers, fewer if the array holds fewer.
n_labels = min(3, len(labels))

for i in range(n_labels):
    im = axes[i].imshow(labels[i], cmap='viridis')
    axes[i].set_title(f'Label {i+1}')
    fig.colorbar(im, ax=axes[i], fraction=0.046, pad=0.04)

# Hide any unused subplots.
for j in range(n_labels, len(axes)):
    axes[j].axis('off')

plt.show()

In [None]:
import numpy as np

# Overview of the table behind df_fix_idx: one panel per archive, each
# showing the channel at index 6 of its 'data' array, in a 2 x 5 grid.

# Kept from the original cell: rebinds npz_name to the 'path' entry with
# index label 8. It is not used by the plot below.
npz_name = df_fix_idx['path'][8]
# npz_name = "sentinel_1_2_worldwide_utm_1N_25-0000012544-0000012544_0.npz"

# File names of all archives listed in the table.
npz_arr = list(df_fix_idx['path'])

# 2 rows x 5 columns = 10 panels, flattened for easy iteration.
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(30, 10))
axes = axes.ravel()

# Never plot more archives than there are panels. This also covers an empty
# table, where nothing is plotted and every panel is hidden.
n_shown = min(len(npz_arr), len(axes))

for i, name in enumerate(npz_arr[:n_shown]):
    # Each archive is closed as soon as its channel has been read.
    with np.load(os.path.join("datasets_pytorch/ai4forest_camera/data", name)) as sample:
        im = axes[i].imshow(sample['data'][6], cmap='viridis')
    axes[i].set_title(f'Img {i}')
    # fig.colorbar(im, ax=axes[i], fraction=0.046, pad=0.04)

# Hide any unused subplots.
for j in range(n_shown, len(axes)):
    axes[j].axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Build three views of the exported model runs:
#   df_models    - the export without bookkeeping columns, columns sorted by
#                  name, rows sorted by 'val/l1', rounded to two decimals
#   df_condensed - df_models without the columns in condense_cols
#                  (a plain copy of df_models when `condense` is False)
#   df_report    - df_condensed without the columns in report_drops
df_models = pd.read_csv('wandb_export_models.csv')

# Drop every column whose name matches one of these regex fragments.
for pattern in ("fixval", "train/l", "train/shift_l"):
    df_models = df_models.drop(columns=df_models.filter(regex=pattern).columns)

drop_cols = ['sampler', 'train/ips_throughput', 'optim','phase_runtime', 'learning_rate', 
             'num_workers_per_gpu','use_weighting_quantile', 'Created', 'n_lr_cycles', 'cyclic_mode', 
             'use_mixup', 'fp16', 'Notes', 'wandb', 'loss_name', 'arch', 'iteration', 'profiler', 'Tags', 
             'optimizer', 'weight_decay', 'dataset', 'initial_lr', 'use_amp', 'model_path', 'Sweep', 
             'device', 'use_input_clipping', 'model', 'seed', 'use_grad_clipping', 'use_label_rescaling', 
             'State', 'n_iterations', 'User', 'computer', 'log_freq'] 
# errors="ignore" skips names that are absent from this export without
# hiding any other kind of failure (the bare `except: pass` used before did).
df_models = df_models.drop(columns=drop_cols, errors="ignore")
# df_models.rename(columns={'use_weighted_sampler': 'sampler'}, inplace=True)

# sort columns by name
df_models = df_models.reindex(sorted(df_models.columns), axis=1)

# sort rows by val/l1, best (lowest) first
df_models = df_models.sort_values(by=['val/l1'], ascending=True)
# round all floats to two decimals
df_models = df_models.round(2)

# optional condense
condense = True
condense_cols = ['use_weighted_sampler','train/huber', 'train/shift_huber', 'samples_seen', 'n_params', 'backbone', 'Runtime', 'batch_size', ]
# Always bind df_condensed so the following code and cells also work when
# `condense` is switched off.
df_condensed = df_models.copy()
if condense:
    df_condensed = df_condensed.drop(columns=condense_cols, errors="ignore")

report_drops = [ 'val/l1_20', 'val/l1_25', 'val/l2', 'val/shift_l1', 'val/shift_l2', 'val/huber', 'val/l1_30', 'val/loss']
# Drop each listed column that exists. Previously the whole list was dropped
# in one all-or-nothing call, so one missing column meant nothing was dropped.
# drop() returns a new frame, so df_report stays independent of df_condensed.
df_report = df_condensed.drop(columns=report_drops, errors="ignore")

df_models.columns



In [None]:
# Remove every "val/" occurrence from the names of the df_condensed columns
# that contain it. The frame is renamed in place, one column at a time.
val_columns = df_condensed.filter(regex="val/")
for col in val_columns:
    try:
        new_name = col.replace("val/", "")
        df_condensed.rename(columns={col: new_name}, inplace=True)
    except Exception as e:
        # Report the failing column and carry on with the remaining ones.
        print(f"Failed to rename {col}: {str(e)}")
# Last expression of the cell: the notebook displays the renamed table.
df_condensed

In [None]:
# Display the report table (df_report) built in the model-table cell.
df_report

In [None]:
# Write the condensed model table as CSV. The path is relative (resolved
# against the current working directory), like every other path in this
# notebook; the previous absolute path pointed into one user's Windows
# profile and failed on any other machine.
df_condensed.to_csv("model_result_table.csv")