# Exploratory Analysis

In [7]:
%load_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv ../../.env

import os, sys
import collections

import wandb
import pandas as pd

from dotenv import load_dotenv
load_dotenv()

sys.path.append(os.path.abspath('../src'))

from file_handler import download_data

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [None]:
download_data("../data")

Data already downloaded.


From a glance, we have optical - thermal videos and images. There is a test-dev set presumably for debugging with a much larger trainval set

In [13]:
def get_number_of_files_in_folder(folder):
    counter = collections.Counter()
    for root, dirs, files in os.walk(f"../data/Images/{folder}/Optical"):
        for dir in dirs:
            counter[dir] += len(os.listdir(os.path.join(root, dir)))
    
    print(f'Number of folders in {folder}: {len(counter)}')
    
    # Total number of files
    total = sum(counter.values())
        
    print(f'Total number of files in {folder}: {total}')
    
    return counter


dev_counter = get_number_of_files_in_folder('test-dev')
train_counter = get_number_of_files_in_folder('trainval')

Number of folders in test-dev: 6
Total number of files in test-dev: 1675
Number of folders in trainval: 30
Total number of files in trainval: 8714


## Evaluation Testing

In [2]:
from vos_benchmark.benchmark import benchmark

In [13]:
gt_dir = "../data/optic_thermal/Images/trainval/val/DAVIS_Masks"
pred_dir = "../results/2025-01-25_17-46-41/results/Annotations"

results = benchmark([gt_dir], [pred_dir])

J, JF, F = results[:3]
print(f"J: {J}, JF: {JF}, F: {F}")

We are *SKIPPING* the evaluation of the first and the last frame (standard for semi-supervised video object segmentation).
In dataset ../data/optic_thermal/Images/trainval/val/DAVIS_Masks, we are evaluating on 3 videos: ['Mavic1_DJI_0872', 'Mavic1_DJI_0876', 'Mavic1_DJI_0882']


100%|██████████| 3/3 [00:13<00:00,  4.48s/it]

sequence        obj   J&F     J     F
Global score         11.1  6.3  15.9
Mavic1_DJI_0872 001  11.6   7.5  15.8
Mavic1_DJI_0876 001  18.1  10.7  25.5
Mavic1_DJI_0882 001   3.6   0.7   6.5

Summary:
Global score: J&F: 11.1 J: 6.3 F: 15.9
Time taken: 13.48s
J: [11.115216914350716], JF: [6.2982486238266695], F: [15.932185204874763]





## Report Tables

In [95]:
# Some old code I made
def access_wandb_runs(entity=None, 
                      project="aml-blade-clustering", 
                      filters={}, get_baseline=False,
                      override=False):
    """
    Retrieve and analyze runs from a Weights & Biases project
    
    Parameters:
    - entity: Your wandb username or team name
    - project: The project containing your runs
    - filters: Optional dictionary to filter runs
    
    Returns:
    - List of run objects with their details
    """
    # Initialize the wandb API
    api = wandb.Api()
    
    # Get the entity from the environment variable if not provided
    if entity is None:
        
        if os.getenv("WANDB_ENTITY") is None:
            raise ValueError("Please provide an entity or set the WANDB_ENTITY environment variable. This is your wandb username or team name")
        
        entity = os.getenv("WANDB_ENTITY")
    
    # Default filters
    if filters is not None and not override:
        additional_filters = {
            'created_at' : {
                '$gte': '2025-02-01T00:00:00Z'    
            },
            'state': 'finished'
        }
        filters = {**filters, **additional_filters}
    
    # Fetch runs from the specified project
    runs = api.runs(
        path=f"{entity}/{project}", 
        filters=filters
    )
    
    return runs

In [71]:
runs = access_wandb_runs(filters={'config.clustering_algorithm': 'kmeans'})

kmeans_df = pd.DataFrame()

metrics_to_collect = ['Boundary F Measure', 'Mean', 'Region Similarity', 'Frames per Second']

for run in runs:
    metrics = run.summaryMetrics
    metrics = {k: v for k, v in metrics.items() if k in metrics_to_collect}
    
    num_iterations = run.config['n_iter']
    
    # Concat
    kmeans_df = pd.concat([kmeans_df, pd.DataFrame({
        **metrics,
        'num_iter': [num_iterations]
    })])

{'config.clustering_algorithm': 'kmeans', 'created_at': {'$gte': '2025-02-01T00:00:00Z'}, 'state': 'finished'}


In [58]:
symbol_map = {
    'Region Similarity': '$\mathcal{J}$',
    'Mean': '$\mathcal{J}\&\mathcal{F}$',
    'Boundary F Measure': '$\mathcal{F}$',
    'Frames per Second': 'FPS',
}

In [59]:
# Put Num Iterations first, then J, JF, F, FPS
kmeans_df = kmeans_df[['num_iter', 'Region Similarity', 'Mean', 'Boundary F Measure', 'Frames per Second']]

# Create copy
latex_df = kmeans_df.copy()

# Round to 2 decimal places
latex_df = latex_df.round(2)

# Rename num_iter to Iterations
latex_df = latex_df.rename(columns={'num_iter': 'Iterations'})

# Rename using symbol map
latex_df = latex_df.rename(columns=symbol_map)

# Change fps to FPS
latex_df = latex_df.rename(columns={'fps': 'FPS'})

In [60]:
# Latex
print(latex_df.to_latex(index=False, float_format="%.2f"))

\begin{tabular}{rrrrr}
\toprule
Iterations & $\mathcal{J}$ & $\mathcal{J}\&\mathcal{F}$ & $\mathcal{F}$ & FPS \\
\midrule
2 & 16.60 & 24.43 & 8.78 & 43.38 \\
4 & 13.06 & 16.33 & 9.79 & 42.61 \\
8 & 19.00 & 27.59 & 10.40 & 37.14 \\
16 & 19.10 & 27.62 & 10.58 & 31.52 \\
\bottomrule
\end{tabular}



## Algorithm Comparison

In [77]:
# Created before 2:35 AM on 2025-02-02
filter = {
    'created_at': {
        '$lte': '2025-02-02T02:35:00Z'
    }
}

metrics_to_collect_cluster = ['val/F', 'val/J', 'val/JF', 'val/fps']

runs = access_wandb_runs(project="blade_segmentation_report", filters=filter)

algorithm_df = pd.DataFrame()

for run in runs:
    
    
    metrics = run.summaryMetrics
    
    # Ternary operator, I should have been cleaner from the start
    collection = metrics_to_collect if run.name == 'laced-night-9' else metrics_to_collect_cluster
    
    metrics = {k: v for k, v in metrics.items() if k in collection}
    
    # Get clustering_algorithm
    clustering_algorithm = run.config['clustering_algorithm']
    
    # Get use_bgs
    use_bgs = run.config['use_bgs']
    
    # Concat
    algorithm_df = pd.concat([algorithm_df, pd.DataFrame({
        **metrics,
        'clustering_algorithm': [clustering_algorithm],
        'use_bgs': [use_bgs]
    })])
    
algorithm_df

{'created_at': {'$gte': '2025-02-01T00:00:00Z'}, 'state': 'finished'}


Unnamed: 0,val/F,val/J,val/JF,val/fps,clustering_algorithm,use_bgs,Boundary F Measure,Frames per Second,Mean,Region Similarity
0,18.634423,15.696718,12.759012,37.412723,kmeans,False,,,,
0,20.498714,28.57447,36.650225,37.798657,kmeans,True,,,,
0,0.0,4.019031,8.038062,17.431185,hierarchical,False,,,,
0,4.41175,5.937526,7.463301,17.125018,hierarchical,True,,,,
0,,,,,bgs,True,35.270781,564.294677,15.213037,25.241909


In [84]:
# Step 1: Create mapping dictionary
metric_mapping = {
    'val/F': 'Boundary F Measure',
    'val/J': 'Region Similarity',
    'val/JF': 'Mean',
    'val/fps': 'Frames per Second'
}

# Step 2 & 3: Create copies with renamed columns
metrics_val = algorithm_df[list(metric_mapping.keys())].rename(columns=metric_mapping)
metrics_direct = algorithm_df[list(metric_mapping.values())]

# Step 4: Combine metrics using coalesce
combined_metrics = pd.DataFrame()
for new_col in metric_mapping.values():
    combined_metrics[new_col] = metrics_val[new_col].combine_first(metrics_direct[new_col])

# Add algorithm info
result_df = pd.concat([
    combined_metrics,
    algorithm_df[['clustering_algorithm', 'use_bgs']]
], axis=1)

# Round to 2 decimal places
result_df = result_df.round(2)

# Print result
result_df

Unnamed: 0,Boundary F Measure,Region Similarity,Mean,Frames per Second,clustering_algorithm,use_bgs
0,18.63,15.7,12.76,37.41,kmeans,False
0,20.5,28.57,36.65,37.8,kmeans,True
0,0.0,4.02,8.04,17.43,hierarchical,False
0,4.41,5.94,7.46,17.13,hierarchical,True
0,35.27,25.24,15.21,564.29,bgs,True


In [92]:
# Rename using symbol map
latex_df = result_df.rename(columns=symbol_map)

# Change fps to FPS
latex_df = latex_df.rename(columns={'fps': 'FPS'})

# Rename use_bgs to Use BGS
latex_df = latex_df.rename(columns={'use_bgs': 'Use BGS'})

# Rename clustering_algorithm to Algorithm
latex_df = latex_df.rename(columns={'clustering_algorithm': 'Algorithm'})

# In clustering_algorithm, replace kmeans with K-Means, hierarchical with Hierarchical and bgs with Background Subtractor
latex_df['Algorithm'] = latex_df['Algorithm'].replace({
    'kmeans': 'K-Means',
    'hierarchical': 'Hierarchical',
    'bgs': 'BGS'
})

# Reorder columns, put Algorithm and Use BGS first, then the others following the symbol map
latex_df = latex_df[['Algorithm', 'Use BGS', '$\mathcal{J}$', '$\mathcal{J}\&\mathcal{F}$', '$\mathcal{F}$', 'FPS']]

# Put the Background Subtractor algorithm first
latex_df = latex_df.sort_values(by='Algorithm', ascending=True)

latex_df

Unnamed: 0,Algorithm,Use BGS,$\mathcal{J}$,$\mathcal{J}\&\mathcal{F}$,$\mathcal{F}$,FPS
0,BGS,True,25.24,15.21,35.27,564.29
0,Hierarchical,False,4.02,8.04,0.0,17.43
0,Hierarchical,True,5.94,7.46,4.41,17.13
0,K-Means,False,15.7,12.76,18.63,37.41
0,K-Means,True,28.57,36.65,20.5,37.8


In [93]:
print(latex_df.to_latex(index=False, float_format="%.2f"))

\begin{tabular}{lrrrrr}
\toprule
Algorithm & Use BGS & $\mathcal{J}$ & $\mathcal{J}\&\mathcal{F}$ & $\mathcal{F}$ & FPS \\
\midrule
BGS & True & 25.24 & 15.21 & 35.27 & 564.29 \\
Hierarchical & False & 4.02 & 8.04 & 0.00 & 17.43 \\
Hierarchical & True & 5.94 & 7.46 & 4.41 & 17.13 \\
K-Means & False & 15.70 & 12.76 & 18.63 & 37.41 \\
K-Means & True & 28.57 & 36.65 & 20.50 & 37.80 \\
\bottomrule
\end{tabular}



## Test Set

In [106]:
runs = access_wandb_runs(project="blade_segmentation_report", filters=filter, override=False)

test_df = pd.DataFrame()

for run in runs:
    
    metrics = run.summaryMetrics
    if run.name == 'astral-yogurt-10':
        result = {k: v for k, v in metrics.items() if k in metrics_to_collect}
        break
        
# Concat
test_df = pd.concat([test_df, pd.DataFrame(result, index=[0])])

test_df

Unnamed: 0,Boundary F Measure,Frames per Second,Mean,Region Similarity
0,13.796672,37.717932,36.57981,25.188241


In [111]:
latex_df = test_df.copy()

# Rearrange columns
latex_df = latex_df[['Region Similarity', 'Mean', 'Boundary F Measure', 'Frames per Second']]

# Round to 2 decimal places
latex_df = latex_df.round(2)

# Rename using symbol map
latex_df = latex_df.rename(columns=symbol_map)

# Drop FPS
latex_df = latex_df.drop(columns='FPS')

latex_df

Unnamed: 0,$\mathcal{J}$,$\mathcal{J}\&\mathcal{F}$,$\mathcal{F}$
0,25.19,36.58,13.8


In [112]:
print(latex_df.to_latex(index=False, float_format="%.2f"))

\begin{tabular}{rrr}
\toprule
$\mathcal{J}$ & $\mathcal{J}\&\mathcal{F}$ & $\mathcal{F}$ \\
\midrule
25.19 & 36.58 & 13.80 \\
\bottomrule
\end{tabular}

