In [1]:
%run imports.py

In [2]:
def get_mask_area_ratio(mask):
    """Return the fraction of elements in `mask` that are equal to 1.

    Args:
        mask: Object supporting ``== 1``, ``.sum().item()`` and ``.numel()``
            (presumably a torch tensor holding a segmentation mask).

    Returns:
        Number of elements equal to 1 divided by the total element count.

    Raises:
        ZeroDivisionError: If `mask` has no elements.
    """
    is_foreground = mask == 1
    return is_foreground.sum().item() / mask.numel()

def get_test_dataset(subsets_path, choosen_test_subsets = ["i", "j"]):
    """Build the test `KvasirDataset` from the pickled subsets whose key is selected.

    A subset's key is the second underscore-separated token of the file name
    part before the first dot (e.g. ``"i"`` for ``"subset_i.pkl"``).

    Args:
        subsets_path: Directory containing the pickled subset files.
        choosen_test_subsets: Keys of the subsets that form the test split.
            The default list is only read, never mutated.

    Returns:
        A `KvasirDataset` in ``"test"`` mode wrapping the concatenation of the
        selected subsets, sized according to ``env_vars["image_size"]`` and
        ``env_vars["mask_size"]``.
    """
    test_subsets = []
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order of the concatenated dataset reproducible across runs and machines.
    for subset in sorted(os.listdir(subsets_path)):
        name_tokens = subset.split(".")[0].split("_")
        # Stray entries without a key token (e.g. ".ipynb_checkpoints") are
        # skipped instead of crashing with an IndexError.
        if len(name_tokens) < 2 or name_tokens[1] not in choosen_test_subsets:
            continue
        # SECURITY: pickle.load executes arbitrary code; only use trusted files.
        with open(f'{subsets_path}/{subset}', "rb") as f:
            data = pickle.load(f)
        test_subsets.append(data)
    test_data = ConcatDataset(test_subsets)

    test_dataset = KvasirDataset(data=test_data,
                                 mode="test",
                                 image_size=ast.literal_eval(env_vars["image_size"]),
                                 mask_size=ast.literal_eval(env_vars["mask_size"]))

    return test_dataset

def get_train_dataset(subsets_path, choosen_test_subsets = ["i", "j"]):
    """Build the training `KvasirDataset` from every pickled subset NOT held out for testing.

    A subset's key is the second underscore-separated token of the file name
    part before the first dot (e.g. ``"a"`` for ``"subset_a.pkl"``).

    Args:
        subsets_path: Directory containing the pickled subset files.
        choosen_test_subsets: Keys of the subsets reserved for testing; all
            other subsets are loaded. The default list is only read, never mutated.

    Returns:
        A `KvasirDataset` wrapping the concatenation of the non-test subsets,
        sized according to ``env_vars["image_size"]`` and ``env_vars["mask_size"]``.
    """
    train_subsets = []
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order of the concatenated dataset reproducible across runs and machines.
    for subset in sorted(os.listdir(subsets_path)):
        name_tokens = subset.split(".")[0].split("_")
        # Entries without a key token (e.g. ".ipynb_checkpoints") are not
        # subset files: skip them rather than crash or try to unpickle them.
        if len(name_tokens) < 2 or name_tokens[1] in choosen_test_subsets:
            continue
        # SECURITY: pickle.load executes arbitrary code; only use trusted files.
        with open(f'{subsets_path}/{subset}', "rb") as f:
            data = pickle.load(f)
        train_subsets.append(data)
    train_data = ConcatDataset(train_subsets)

    # NOTE(review): the training data is wrapped in "test" mode — presumably so
    # it is served without training-time transforms for analysis; confirm.
    train_dataset = KvasirDataset(data=train_data,
                                  mode="test",
                                  image_size=ast.literal_eval(env_vars["image_size"]),
                                  mask_size=ast.literal_eval(env_vars["mask_size"]))

    return train_dataset

In [3]:
def get_sas_modelwise_area_vs_dice(data, model_dir, model_name, model_config, device, savepath):
    """Score every checkpoint in `model_dir` on `data`, pairing each sample's mask area ratio with its Dice score.

    Checkpoint file names are split on ``"."``: the first part is the stage
    (``"presas"``; anything else is stored under ``"postsas"``) and the
    second-to-last part, cut at the first ``"%"``, is the split key.

    Args:
        data: Indexable dataset; ``data[idx][0]`` is the image and
            ``data[idx][1]`` the mask, both supporting ``.to(device)``.
        model_dir: Directory containing the checkpoint files.
        model_name: Model identifier passed to `select_model` and
            `calculate_dice_score`.
        model_config: Model configuration passed to `select_model`.
        device: Device on which the model and the samples are placed.
        savepath: Directory in which ``area_vs_score_results.json`` is written.

    Returns:
        ``{split_key: {"presas": {...}, "postsas": {...}}}`` where each stage
        dict holds ``"area_vs_score"``: a list of ``(area_ratio, dice)`` tuples,
        one per sample. The same structure is dumped to JSON.
    """
    model = select_model(model_name=model_name, model_config=model_config)
    model_files = sorted(os.listdir(model_dir))

    results = {}
    for fname in model_files:
        fname_parts = fname.split(".")
        if len(fname_parts) < 2:
            # No "<...>.<split>%.<ext>" structure to parse: not a checkpoint name.
            continue
        sas_status = fname_parts[0]
        split_key = fname_parts[-2].split("%")[0]

        if split_key not in results:
            results[split_key] = {"presas": {},
                                  "postsas" : {}}

        # Work on a copy so every checkpoint is loaded into a pristine model.
        sas_model = copy.deepcopy(model)
        # map_location lets checkpoints saved on another device load onto `device`.
        checkpoint = torch.load(f"{model_dir}/{fname}", map_location=device, weights_only=True)
        sas_model.load_state_dict(checkpoint['model_state_dict'])

        sas_model.to(device)
        sas_model.eval()

        area_vs_score_list = []
        # Pure inference: disabling autograd avoids building a graph per sample.
        with torch.no_grad():
            for idx in range(len(data)):
                # Fetch the sample once so image and mask come from the same
                # __getitem__ call (and the loading cost is not paid twice).
                sample = data[idx]
                image, mask = sample[0].to(device), sample[1].to(device)

                area_ratio = get_mask_area_ratio(mask)

                # Add a leading batch dimension for the forward pass, then drop it again.
                preds = sas_model(image.unsqueeze(0)).squeeze(0)
                dice_score = calculate_dice_score(preds=preds,
                                                  targets=mask,
                                                  device=device,
                                                  model_name=model_name)

                area_vs_score_list.append((area_ratio, dice_score.item()))

        if sas_status=="presas":
            results[split_key]["presas"]["area_vs_score"] = area_vs_score_list
        else:
            results[split_key]["postsas"]["area_vs_score"] = area_vs_score_list

    # JSON has no tuple type: the pairs are written (and later read back) as lists.
    with open(f"{savepath}/area_vs_score_results.json", "w") as file:
        json.dump(results, file, indent=4)

    return results

In [4]:
# Rebuild both splits from the pickled subsets (default test keys: "i" and "j")
# and chain them into one dataset covering every sample.
train_dataset = get_train_dataset(subsets_path=f"../data/subsets")
test_dataset = get_test_dataset(subsets_path=f"../data/subsets")
full_dataset = ConcatDataset([train_dataset, test_dataset])

In [None]:
# Score every checkpoint on the full dataset; the call also writes the results
# to <savepath>/area_vs_score_results.json.
results = get_sas_modelwise_area_vs_dice(data=full_dataset, 
                                         model_dir=f"{env_vars['results_root']}/final-results-with-segformer/train_semi_auto/models", 
                                         model_name=f"{env_vars['model_name']}", 
                                         model_config=f"{env_vars['model_config']}", 
                                         device="cuda", 
                                         savepath=f"{env_vars['output_folder_path']}")

# Reload the results from disk, replacing the value returned above.
# NOTE(review): this path is hard-coded while the save location comes from
# env_vars['output_folder_path'] — confirm both resolve to the same file,
# otherwise stale results are loaded here.
with open(f"../outputs/area_vs_score_results.json", "r") as file:
    results = json.load(file)

In [None]:
splits = list(results)
stages = ["presas", "postsas"]
key = "area_vs_score"

# JSON stores tuples as lists, so turn every (area, score) record back into a
# tuple, in place.
for split in splits:
    for stage in stages:
        records = results[split][stage][key]
        for idx, record in enumerate(records):
            records[idx] = tuple(record)

In [None]:
def plot_butterfly_mask_vs_score(ax, n_bins, threshold, results, split_key, stage_key, key="area_vs_score"):
    """Draw a butterfly bar chart of per-bin sample counts, split by a score threshold.

    The mask-area range [0, 1] is cut into `n_bins` equal-width bins. For each
    bin, samples scoring at or above `threshold` are counted and drawn as an
    upward bar; samples scoring below it are drawn as a downward bar.

    Args:
        ax: Matplotlib axes to draw on.
        n_bins: Number of equal-width mask-area bins over [0, 1].
        threshold: Score cut-off. A score exactly equal to it counts on the
            "above" side, so no sample is left out of both bars.
        results: Nested mapping where ``results[split_key][stage_key][key]`` is
            a sequence of ``(area_ratio, score)`` pairs.
        split_key: First-level key into `results`.
        stage_key: Second-level key into `results`.
        key: Third-level key holding the pairs.

    Returns:
        None. The chart is drawn on `ax`.
    """
    data = results[split_key][stage_key][key]

    mask_bins = np.linspace(0.0, 1.0, n_bins + 1)
    areas = np.array([pair[0] for pair in data], dtype=float)
    scores = np.array([pair[1] for pair in data], dtype=float)

    # np.histogram uses half-open bins except for the last one, which is closed
    # on the right — so an area ratio of exactly 1.0 lands in the last bin
    # instead of being dropped.
    above_counts = np.histogram(areas[scores >= threshold], bins=mask_bins)[0].tolist()
    below_counts = np.histogram(areas[scores < threshold], bins=mask_bins)[0].tolist()

    bar_width = 0.35
    index = np.arange(n_bins)

    ax.bar(index, -np.array(below_counts), bar_width, color='blue', label='Below Threshold')
    ax.bar(index, np.array(above_counts), bar_width, color='red', label='Above Threshold')

    # The y-axis holds counts, not scores: the divider between the two halves
    # is the zero baseline. The score threshold is reported through the label.
    ax.axhline(y=0, color='black', linestyle='--', label=f'Threshold = {threshold}')

    # Annotate every bar with its count, just outside the bar's end.
    for i in range(n_bins):
        ax.text(i, -below_counts[i] - 0.5, str(below_counts[i]), ha='center', va='top', fontsize=9)
        ax.text(i, above_counts[i] + 0.5, str(above_counts[i]), ha='center', va='bottom', fontsize=9)

    ax.set_xlabel('Mask Sizes')
    ax.set_ylabel('Counts')
    # Title currently disabled; it needs the split percentage parsed from split_key:
    # split_pct = int(split_key.split("_")[-1])
    # ax.set_title(f'{split_pct}%:{80-split_pct}% ({stage_key})')

    ax.set_xticks(index)
    ax.set_xticklabels([f'{mask_bins[i]:.2f}-{mask_bins[i+1]:.2f}' for i in range(n_bins)])
    ax.legend()

In [24]:
# fig, axes = plt.subplots(len(splits), len(stages), figsize=(20, 30))

# for i, split_key in enumerate(splits):
#     for j, stage_key in enumerate(stages):
#         plot_butterfly_mask_vs_score(axes[i, j], n_bins=10, 
#                                      threshold=0.5,
#                                      results=results, 
#                                      split_key=split_key, 
#                                      stage_key=stage_key, 
#                                      key="area_vs_score")

# plt.tight_layout()
# plt.savefig('../outputs/figures/area_vs_score_butterfly_plot.pdf', dpi=300)
# plt.close()


In [25]:
fig, axes = plt.subplots(1, 1, figsize=(20, 8))

# The loop over `stages` is disabled, so `stage_key` has to be set explicitly;
# otherwise this cell raises NameError on a fresh kernel (Restart & Run All).
# NOTE(review): the last stage ("postsas") is what a previously run
# `for ... stage_key in enumerate(stages)` loop would have left behind —
# confirm this is the stage the figure is meant to show.
stage_key = stages[-1]

plot_butterfly_mask_vs_score(axes,
                             n_bins=10,
                             threshold=0.5,
                             results=results,
                             split_key=splits[0],
                             stage_key=stage_key,
                             key="area_vs_score")

plt.tight_layout()
plt.savefig(f'../outputs/figures/area_vs_score_butterfly_plot_{splits[0]}.pdf', dpi=300)
plt.close()


#### Train vs Test dataset distribution balancing

In [None]:
def analyze_train_vs_test_dist(train_dataset, test_dataset, savefig_name):
    """Plot and save KDEs of the mask-area ratios of a training and a testing dataset.

    The two samples are also compared with a two-sample Kolmogorov-Smirnov
    test, whose statistic and p-value are shown in the figure title.

    Args:
        train_dataset: Indexable dataset; item ``[1]`` of each sample is the mask.
        test_dataset: Indexable dataset with the same sample layout.
        savefig_name: File name (without extension) of the PDF written under
            ``.{env_vars["output_folder_path"]}/figures/``.

    Returns:
        None. The figure is saved to disk and closed.
    """
    def _mask_area_ratios(dataset):
        # Area ratio of every sample's mask, in dataset order.
        return [get_mask_area_ratio(dataset[i][1]) for i in range(len(dataset))]

    train_ratios = _mask_area_ratios(train_dataset)
    test_ratios = _mask_area_ratios(test_dataset)

    # Evaluate both density estimates on a common grid over [0, 1].
    grid = np.linspace(0, 1, 1000)
    train_density = gaussian_kde(train_ratios, bw_method=0.5)(grid)
    test_density = gaussian_kde(test_ratios, bw_method=0.5)(grid)
    ks_stat, p_value = ks_2samp(train_ratios, test_ratios)

    fig, ax = plt.subplots(figsize=(10, 6))
    curves = (
        (train_density, "blue", "KDE on training data "),
        (test_density, "red", "KDE on testing data"),
    )
    for density, colour, legend_label in curves:
        ax.fill_between(grid, density, color=colour, alpha=0.3)
        ax.plot(grid, density, color=colour, linewidth=2, label=legend_label)

    ax.set_title(f"KDE for training and testing dataset mask distribution (KS Stat: {ks_stat:.4f}, P-value: {p_value:.4f})")
    ax.set_xlabel("Mask Area Ratios")
    ax.set_ylabel("Density")
    ax.set_xticks(np.arange(0.0, 1.1, 0.1))

    plt.legend()
    plt.savefig(f'.{env_vars["output_folder_path"]}/figures/{savefig_name}.pdf', format="pdf", bbox_inches="tight")

    plt.close()

In [None]:
# Baseline: mask-area distributions of the original train/test split.
analyze_train_vs_test_dist(train_dataset, 
                           test_dataset,
                           savefig_name="train_vs_test_kde_original")

##### 1st approach: making all bins have equal samples (first split each bin into train/test, then oversample every training bin up to the size of the largest one, "fmax")

In [None]:
def get_binwise_data(dataset, area_bins):
    """Group the samples of `dataset` by the bin their mask-area ratio falls into.

    Bins are half-open ``[area_bins[j], area_bins[j+1])`` except the last one,
    which also includes its upper edge, so a ratio equal to ``area_bins[-1]``
    (e.g. a fully-foreground mask with ratio 1.0) is kept instead of being
    silently dropped. Ratios outside the overall range are ignored.

    Args:
        dataset: Indexable collection of samples; item ``[1]`` of each sample
            is the mask passed to `get_mask_area_ratio`.
        area_bins: Ascending sequence of bin edges.

    Returns:
        A list with ``len(area_bins) - 1`` lists, one per bin, holding the
        samples assigned to that bin in dataset order.
    """
    n_bins = len(area_bins) - 1
    data_bins = [[] for _ in range(n_bins)]
    for idx in range(len(dataset)):
        # Fetch the sample once: indexing may load/transform data each time.
        sample = dataset[idx]
        area_ratio = get_mask_area_ratio(mask=sample[1])
        for j in range(n_bins):
            if j == n_bins - 1:
                below_upper_edge = area_ratio <= area_bins[j+1]
            else:
                below_upper_edge = area_ratio < area_bins[j+1]
            if area_ratio >= area_bins[j] and below_upper_edge:
                data_bins[j].append(sample)
                break  # a sample belongs to exactly one bin
    return data_bins

In [None]:
def dataset_balancing_fmax(full_dataset, seed=42):
    """Split `full_dataset` bin-wise into train/test and oversample the training bins to equal size.

    Samples are grouped into 10 equal-width mask-area bins over [0, 1]. Every
    bin with more than one sample is split 80/20 into train/test. The training
    samples are then re-binned and each non-empty bin is padded, by sampling
    with replacement from itself, up to the size of the largest training bin.

    Args:
        full_dataset: Indexable collection of samples accepted by
            `get_binwise_data`.
        seed: Seed of the private random generator used for oversampling, so
            the balanced training set is reproducible (the split itself is
            already seeded through ``random_state=42``).

    Returns:
        ``(resampled_train_dataset, resampled_test_dataset)`` as two lists of
        samples. The training list is ordered bin by bin.
    """
    resampled_train_dataset, resampled_test_dataset = [], []
    area_bins = np.linspace(0, 1, 11)
    data_bins = get_binwise_data(full_dataset, area_bins)

    for bin_samples in data_bins:
        # NOTE(review): a bin holding a single sample cannot be split and is
        # dropped from both outputs — confirm that losing it is intended.
        if len(bin_samples) > 1:
            train_images, test_images = train_test_split(bin_samples, train_size=0.8, test_size=0.2, random_state=42)
            resampled_train_dataset.extend(train_images)
            resampled_test_dataset.extend(test_images)

    data_bins_train = get_binwise_data(resampled_train_dataset, area_bins)
    # Target size ("fmax"): the largest training bin, measured before any padding.
    target_count = max(len(bin_samples) for bin_samples in data_bins_train)

    # Private generator: reproducible and independent of the global `random` state.
    rng = random.Random(seed)
    for bin_samples in data_bins_train:
        to_add = target_count - len(bin_samples)
        if bin_samples and to_add > 0:
            bin_samples.extend(rng.choices(bin_samples, k=to_add))

    resampled_train_dataset = [item for sublist in data_bins_train for item in sublist]

    return resampled_train_dataset, resampled_test_dataset


In [None]:
# Balance the full dataset, then display the resulting train/test sizes.
resampled_train_dataset, resampled_test_dataset = dataset_balancing_fmax(full_dataset)
len(resampled_train_dataset), len(resampled_test_dataset)

In [None]:
def create_subsets(data, num_subsets):
    """Randomly partition `data` into `num_subsets` lists of near-equal size.

    The items are shuffled on a copy, so the caller's sequence is left
    untouched. Sizes differ by at most one: the first ``len(data) % num_subsets``
    subsets receive one extra item.

    Args:
        data: Sequence of items to distribute.
        num_subsets: Number of subsets to create; must be at least 1.

    Returns:
        A list of `num_subsets` lists that together contain every item of
        `data` exactly once.

    Raises:
        ValueError: If `num_subsets` is smaller than 1.
    """
    if num_subsets < 1:
        raise ValueError(f"num_subsets must be >= 1, got {num_subsets}")

    # Shuffle a copy: shuffling `data` itself would reorder the caller's list.
    data_copy = list(data)
    random.shuffle(data_copy)

    quotient, remainder = divmod(len(data_copy), num_subsets)

    subsets = []
    start_idx = 0
    for i in range(num_subsets):
        # The first `remainder` subsets absorb the leftover items.
        end_idx = start_idx + quotient + (1 if i < remainder else 0)
        subsets.append(data_copy[start_idx:end_idx])
        start_idx = end_idx

    return subsets

In [None]:
# Scratch check: shows the first lowercase ASCII letter ("a").
# NOTE(review): presumably a probe for naming subsets by letter; this import
# sits outside the notebook's import cell — confirm the cell is still needed.
import string
string.ascii_lowercase[0]

In [None]:
# Same distribution comparison as for the original split, now on the
# fmax-balanced train/test sets.
analyze_train_vs_test_dist(resampled_train_dataset, 
                           resampled_test_dataset,
                           savefig_name="train_vs_test_kde_after_fmax_balancing_wo_binwise_split")

##### 2nd approach, First binwise train-test splitting then adding

In [None]:
# def dataset_balancing_binwise(full_dataset):
#     resampled_train_dataset, resampled_test_dataset = [], []
    
#     area_bins = np.linspace(0, 1, 11)
#     data_bins = [[] for _ in range(len(area_bins)-1)]

#     for idx in range(len(full_dataset)):
#         area_ratio = get_mask_area_ratio(mask=full_dataset[idx][1])
#         for j in range(len(area_bins)-1):
#             if area_ratio>=area_bins[j] and area_ratio<area_bins[j+1]:
#                 data_bins[j].append(full_dataset[idx])
                
#     for j in range(len(data_bins)):
#         if len(data_bins[j]) <2:
#             resampled_train_dataset.extend(data_bins[j])
#         else:
#             train_images, test_images = train_test_split(data_bins[j], train_size=0.8, test_size=0.2, random_state=42)
#             resampled_train_dataset.extend(train_images)
#             resampled_test_dataset.extend(test_images)
            
#     return resampled_train_dataset, resampled_test_dataset

In [None]:
# resampled_train_dataset, resampled_test_dataset = dataset_balancing_binwise(full_dataset)

In [None]:
# analyze_train_vs_test_dist(train_data, 
#                            test_data,
#                            savefig_name="train_vs_test_kde_after_binwise_balancing")