In [1]:
import git
from pathlib import Path
import os

# Resolve the repository root by searching upward from the current directory
# for the enclosing git working tree.
ROOT_DIR = Path(
    git.Repo('.', search_parent_directories=True).working_tree_dir
)

# Switch into <repo>/utilities before importing `learned`, so the import below
# is resolved relative to that directory.
os.chdir(ROOT_DIR / "utilities")
from learned import *

# NOTE(review): `np` is not imported in this cell; presumably it is provided by
# the star import above -- confirm.
np.random.seed(0)

In [None]:
# --- Dataset selection -------------------------------------------------------
DATASET = "segmentAnything"
# Prefix of the output files; the saving cells append "-learned.pickle" and
# "-learned-size.pickle" to this name.
FINAL_DATA_NAME = 'segmentAnything-full'
RAW_DATA_SUFFIX = "segmentAnything-croppedDeblurred"
# Upper bound on the number of coefficients kept per filter when saving.
CONSTANT_SAMPLE_SIZE = 100_000

# --- Run options -------------------------------------------------------------
# Number of images to load when the skew test is run on a smaller sample of the
# dataset. None = use all images.
num_images = 300
# Set to True if the raw data has not been jittered yet.
jitter = True
# Set to True if the raw data has not been normalized yet.
normalize = True
# Both values are forwarded to run_skew_test_with_filters.
n_bootstrap = 1
sample_size = 1

# Raw images are read from <repo>/raw-data/<DATASET>/<RAW_DATA_SUFFIX>.
data_dir = os.path.join(ROOT_DIR, 'raw-data', DATASET, RAW_DATA_SUFFIX)

In [3]:


# Load the raw images from `data_dir`; `num_images`, `jitter` and `normalize`
# are forwarded unchanged to the loader.
images = load_images_from_directory(
    data_dir,
    n=num_images,
    jitter=jitter,
    normalize=normalize,
)
print(images.shape)

# For each group, pick that group's filters out of `filters`, using the value
# stored under the group name in `filter_groups` as the index.
filter_groups_coef = {
    group_name: filters[filter_ids]
    for group_name, filter_ids in filter_groups.items()
}

# Apply each group's filters to the full image stack, keyed by group name.
group_transform_coef = {
    group_name: transform_images(images, group_filters)
    for group_name, group_filters in filter_groups_coef.items()
}

Loading images:   0%|          | 0/200 [00:00<?, ?it/s]

(200, 512, 512, 3)


Applying filters:   0%|          | 0/200 [00:00<?, ?it/s]

Feature maps shape: (200, 20, 506, 506)


Applying filters:   0%|          | 0/200 [00:00<?, ?it/s]

Feature maps shape: (200, 10, 506, 506)


Applying filters:   0%|          | 0/200 [00:00<?, ?it/s]

Feature maps shape: (200, 6, 506, 506)


Applying filters:   0%|          | 0/200 [00:00<?, ?it/s]

Feature maps shape: (200, 13, 506, 506)


Applying filters:   0%|          | 0/200 [00:00<?, ?it/s]

Feature maps shape: (200, 9, 506, 506)


Applying filters:   0%|          | 0/200 [00:00<?, ?it/s]

Feature maps shape: (200, 6, 506, 506)


In [24]:
# Run the skew test over every group's transformed coefficients. The helper
# returns six values: data, group labels and filter indices, each split into a
# skewed and a non-skewed part.
(
    skewed_data,
    nonskewed_data,
    skewed_groups,
    nonskewed_groups,
    skewed_indices,
    nonskewed_indices,
) = run_skew_test_with_filters(
    group_transform_coef,
    filter_groups,
    filters,
    n_bootstrap=n_bootstrap,
    sample_size=sample_size,
)

In [25]:
# Record this run's non-skewed filter indices in the shared CSV log, which is
# indexed by (dataset, num_images, num_bootstrap, sample_size).
nonskewed_csv = os.path.join(ROOT_DIR, 'learned-filters', 'nonskewed_indices_df.csv')
index_cols = ['dataset', 'num_images', 'num_bootstrap', 'sample_size']
nonskewed_df = pd.read_csv(nonskewed_csv).set_index(index_cols)

# num_images=None means "use all images", so log the number actually loaded.
# NOTE(review): when num_images is set but fewer images are available, the
# requested value is still what gets logged -- confirm this is intended.
effective_num_images = num_images if num_images is not None else len(images)

# The row now describes itself with the same level names and values as the
# index key. Previously it used 'bootstrap_size' instead of 'sample_size' and
# could carry num_images=None while the key used the resolved count.
new_row = {
    'dataset': FINAL_DATA_NAME,
    'num_images': effective_num_images,
    'num_bootstrap': n_bootstrap,
    'sample_size': sample_size,
    'nonskewed_indices': nonskewed_indices,
}

# Derive the key from the row so the two can never drift apart.
row_key = tuple(new_row[col] for col in index_cols)

# Adds a new row, or overwrites the existing row with the same key.
nonskewed_df.loc[row_key] = new_row
nonskewed_df.to_csv(nonskewed_csv)
nonskewed_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,nonskewed_indices
dataset,num_images,num_bootstrap,sample_size,Unnamed: 4_level_1
pastis-full,,100,100,"[3, 6, 10, 11, 12, 13, 14, 23, 24, 28, 29, 30,..."
segmentAnything-full,10.0,10,10,"[3, 6, 11, 12, 13, 14, 23, 24, 29, 30, 32, 34,..."
segmentAnything-full,100.0,100,100,"[3, 6, 10, 11, 12, 13, 14, 23, 24, 28, 32, 34,..."


In [26]:
# Number of filters belonging to each group.
total_filters = {name: len(members) for name, members in filter_groups.items()}

# Tally the entries of `nonskewed_groups` per group name; each entry counts as
# one filter that passed the skew test.
passed_counts = defaultdict(int)
for passed_group in nonskewed_groups:
    passed_counts[passed_group] += 1

# One (group, total, passed, failed) tuple per group. `.get` is used so that a
# group with no passing filter is reported as 0 without adding a key.
summary = [
    (
        name,
        n_total,
        passed_counts.get(name, 0),
        n_total - passed_counts.get(name, 0),
    )
    for name, n_total in total_filters.items()
]

summary_df = pd.DataFrame(
    summary,
    columns=["group", "total_filters", "passed_skew_test", "failed_skew_test"],
)

# The failed count is not reported; only totals, passes and their ratio are.
summary_df = summary_df.drop(columns=["failed_skew_test"])
summary_df["proportion_passed"] = np.round(
    summary_df["passed_skew_test"] / summary_df["total_filters"], 3
)

# Column sums across all groups.
summary_totals = summary_df[["total_filters", "passed_skew_test"]].sum()
grand_total = summary_totals["total_filters"]
grand_passed = summary_totals["passed_skew_test"]

# Single-row frame labelled 'all' that carries the overall totals.
all_row = pd.DataFrame([{
    "group": "all",
    "total_filters": grand_total,
    "passed_skew_test": grand_passed,
    "proportion_passed": np.round(grand_passed / grand_total, 3),
}])

# Per-group rows followed by the 'all' row, re-indexed 0..n.
summary_df_with_total = pd.concat([summary_df, all_row], ignore_index=True)

summary_df_with_total

Unnamed: 0,group,total_filters,passed_skew_test,proportion_passed
0,single_edge,20,18,0.9
1,multi_edge,10,9,0.9
2,eye,6,5,0.833
3,dual_color,13,13,1.0
4,inside_out,9,7,0.778
5,misc,6,6,1.0
6,all,64,58,0.906


# Saving Data

In [27]:
# Both outputs are written to <repo>/transformed-data and share the
# FINAL_DATA_NAME prefix: one file for the coefficients, one for their sizes.
transformed_dir = os.path.join(ROOT_DIR, 'transformed-data')
data_output_file = os.path.join(transformed_dir, f"{FINAL_DATA_NAME}-learned.pickle")
size_output_file = os.path.join(transformed_dir, f"{FINAL_DATA_NAME}-learned-size.pickle")

In [28]:
# Build one row per non-skewed filter: its overall filter index, its group, its
# position inside that group, and a size-capped, sorted sample of its
# transformed coefficients.
rows = []

for i in range(len(nonskewed_data)):
    group = nonskewed_groups[i]
    alex_idx = nonskewed_indices[i]

    # Position of this filter inside its group. A filter that is missing from
    # its group used to yield None here, which was then used as an index: on a
    # NumPy array that adds an axis instead of selecting a filter, and on a
    # list it raises an unrelated TypeError. Fail loudly and clearly instead.
    try:
        id_in_group = filter_groups[group].index(alex_idx)
    except ValueError as exc:
        raise ValueError(
            f"filter index {alex_idx!r} is not a member of filter_groups[{group!r}]"
        ) from exc

    coefs = group_transform_coef[group][id_in_group]

    # Keep at most CONSTANT_SAMPLE_SIZE values, taken at evenly spaced
    # positions of the sorted coefficients.
    # NOTE(review): np.sort sorts along the last axis, so this assumes `coefs`
    # is 1-D -- confirm against transform_images.
    n_keep = min(coefs.size, CONSTANT_SAMPLE_SIZE)
    keep_positions = np.round(np.linspace(0, coefs.size - 1, n_keep)).astype(int)
    coefs = np.sort(coefs)[keep_positions]

    rows.append({
        'alex_idx': alex_idx,
        'group': group,
        'group_idx': id_in_group,
        'data': coefs
    })

df = pd.DataFrame(rows)
df.head()

Unnamed: 0,alex_idx,group,group_idx,data
0,3,single_edge,0,"[-50.97711, -32.965393, -30.577969, -29.020397..."
1,6,single_edge,1,"[-34.565567, -23.078531, -21.108414, -20.04415..."
2,10,single_edge,2,"[-33.692085, -23.404648, -21.975685, -20.92823..."
3,11,single_edge,3,"[-28.96775, -24.230911, -23.525198, -22.967728..."
4,12,single_edge,4,"[-41.26574, -24.699814, -22.714813, -21.518356..."


In [29]:
# Number of stored coefficients in each row of `df`.
group_size_counts = df['data'].apply(len)

# Same metadata columns as `df`, with the 'data' arrays replaced by their
# length in a new 'size' column. `df` itself is left unchanged.
size_df = df.drop(columns=['data']).assign(size=group_size_counts)
size_df.head()

Unnamed: 0,alex_idx,group,group_idx,size
0,3,single_edge,0,100000
1,6,single_edge,1,100000
2,10,single_edge,2,100000
3,11,single_edge,3,100000
4,12,single_edge,4,100000


In [30]:
# Pickle the coefficient frame and the size frame to their output files,
# printing a confirmation after each file has been written.
save_jobs = (
    (df, data_output_file, f"coefficient data has been saved to {data_output_file}"),
    (size_df, size_output_file, f"size data has been saved to {size_output_file}"),
)

for frame, target_path, message in save_jobs:
    with open(target_path, "wb") as handle:
        pickle.dump(frame, handle)

    # A blank line separates the two confirmations, except after the last one.
    print(message)

coefficient data has been saved to /Users/brandonmarks/Desktop/hierarchical-bayesian-model-validation/transformed-data/segmentAnything-full-learned.pickle
size data has been saved to /Users/brandonmarks/Desktop/hierarchical-bayesian-model-validation/transformed-data/segmentAnything-full-learned-size.pickle
