# Exploratory Analysis

In [7]:
%load_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv ../../.env

import os, sys
import collections

import wandb
import pandas as pd

from dotenv import load_dotenv
load_dotenv()

sys.path.append(os.path.abspath('../src'))

from file_handler import download_data

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [None]:
download_data("../data")

Data already downloaded.


From a glance, we have optical - thermal videos and images. There is a test-dev set presumably for debugging with a much larger trainval set

In [13]:
def get_number_of_files_in_folder(folder):
    counter = collections.Counter()
    for root, dirs, files in os.walk(f"../data/Images/{folder}/Optical"):
        for dir in dirs:
            counter[dir] += len(os.listdir(os.path.join(root, dir)))
    
    print(f'Number of folders in {folder}: {len(counter)}')
    
    # Total number of files
    total = sum(counter.values())
        
    print(f'Total number of files in {folder}: {total}')
    
    return counter


dev_counter = get_number_of_files_in_folder('test-dev')
train_counter = get_number_of_files_in_folder('trainval')

Number of folders in test-dev: 6
Total number of files in test-dev: 1675
Number of folders in trainval: 30
Total number of files in trainval: 8714


## Evaluation Testing

In [2]:
from vos_benchmark.benchmark import benchmark

In [13]:
gt_dir = "../data/optic_thermal/Images/trainval/val/DAVIS_Masks"
pred_dir = "../results/2025-01-25_17-46-41/results/Annotations"

results = benchmark([gt_dir], [pred_dir])

J, JF, F = results[:3]
print(f"J: {J}, JF: {JF}, F: {F}")

We are *SKIPPING* the evaluation of the first and the last frame (standard for semi-supervised video object segmentation).
In dataset ../data/optic_thermal/Images/trainval/val/DAVIS_Masks, we are evaluating on 3 videos: ['Mavic1_DJI_0872', 'Mavic1_DJI_0876', 'Mavic1_DJI_0882']


100%|██████████| 3/3 [00:13<00:00,  4.48s/it]

sequence        obj   J&F     J     F
Global score         11.1  6.3  15.9
Mavic1_DJI_0872 001  11.6   7.5  15.8
Mavic1_DJI_0876 001  18.1  10.7  25.5
Mavic1_DJI_0882 001   3.6   0.7   6.5

Summary:
Global score: J&F: 11.1 J: 6.3 F: 15.9
Time taken: 13.48s
J: [11.115216914350716], JF: [6.2982486238266695], F: [15.932185204874763]





## Report Tables

In [15]:
# Some old code I made
def access_wandb_runs(entity=None, 
                      project="aml-blade-clustering", 
                      filters={}, get_baseline=False):
    """
    Retrieve and analyze runs from a Weights & Biases project
    
    Parameters:
    - entity: Your wandb username or team name
    - project: The project containing your runs
    - filters: Optional dictionary to filter runs
    
    Returns:
    - List of run objects with their details
    """
    # Initialize the wandb API
    api = wandb.Api()
    
    # Get the entity from the environment variable if not provided
    if entity is None:
        
        if os.getenv("WANDB_ENTITY") is None:
            raise ValueError("Please provide an entity or set the WANDB_ENTITY environment variable. This is your wandb username or team name")
        
        entity = os.getenv("WANDB_ENTITY")
    
    # Default filters
    if filters is not None:
        additional_filters = {
            'created_at' : {
                '$gte': '2025-02-01T00:00:00Z'    
            },
            'state': 'finished'
        }
        filters = {**filters, **additional_filters}
    
    # Fetch runs from the specified project
    runs = api.runs(
        path=f"{entity}/{project}", 
        filters=filters
    )
    
    return runs

In [32]:
runs = access_wandb_runs(filters={'config.clustering_algorithm': 'kmeans'})

kmeans_df = pd.DataFrame()

for run in runs:
    metrics = run.summaryMetrics
    F = metrics['Boundary F Measure']
    JF = metrics['Mean']
    J = metrics['Region Similarity']
    fps = metrics['Frames per Second']
    
    num_iterations = run.config['n_iter']
    
    # Concat
    kmeans_df = pd.concat([kmeans_df, pd.DataFrame({
        'F': [F],
        'JF': [JF],
        'J': [J],
        'fps': [fps],
        'num_iter': [num_iterations]
    })])

{'config.clustering_algorithm': 'kmeans', 'created_at': {'$gte': '2025-02-01T00:00:00Z'}, 'state': 'finished'}


In [50]:
symbol_map = {
    'J': '$\mathcal{J}$',
    'JF': '$\mathcal{J}\&\mathcal{F}$',
    'F': '$\mathcal{F}$',
    'fps': 'FPS',
}

In [51]:
# Put Num Iterations first, then J, JF, F, FPS
kmeans_df = kmeans_df[['num_iter', 'J', 'JF', 'F', 'fps']]

# Create copy
latex_df = kmeans_df.copy()

# Round to 2 decimal places
latex_df = latex_df.round(2)

# Rename num_iter to Iterations
latex_df = latex_df.rename(columns={'num_iter': 'Iterations'})

# Rename using symbol map
latex_df = latex_df.rename(columns=symbol_map)

# Change fps to FPS
latex_df = latex_df.rename(columns={'fps': 'FPS'})

In [52]:
# Latex
print(latex_df.to_latex(index=False, float_format="%.2f"))

\begin{tabular}{rrrrr}
\toprule
Iterations & $\mathcal{J}$ & $\mathcal{J}\&\mathcal{F}$ & $\mathcal{F}$ & FPS \\
\midrule
2 & 16.60 & 24.43 & 8.78 & 43.38 \\
4 & 13.06 & 16.33 & 9.79 & 42.61 \\
8 & 19.00 & 27.59 & 10.40 & 37.14 \\
16 & 19.10 & 27.62 & 10.58 & 31.52 \\
\bottomrule
\end{tabular}

