In [1]:
import os
from pathlib import Path

import git

# Repository root, found by searching upward from the current directory.
ROOT_DIR = Path(git.Repo('.', search_parent_directories=True).working_tree_dir)

# Change into <repo>/utilities before the star import below
# (presumably so that the `learned` module can be found there -- confirm).
os.chdir(ROOT_DIR / "utilities")

# NOTE(review): the star import appears to be the source of np, pd, pickle,
# defaultdict, filters, filter_groups and the helper functions used in the
# cells below -- confirm against utilities/learned.py.
from learned import *

# Seed NumPy's global random state.
np.random.seed(0)

KeyboardInterrupt: 

In [None]:
# Names used to build the input and output paths.
DATASET = "coco"
RAW_DATA_SUFFIX = "coco-indoor-cropped"
FINAL_DATA_NAME = "coco-indoor"

# Cap on how many coefficients are kept per filter when the data is saved.
CONSTANT_SAMPLE_SIZE = 100_000

# Options passed to load_images_from_directory.
n = 10
jitter = False
normalize = False

# Folder the raw images are read from.
data_dir = os.path.join(ROOT_DIR, "raw-data", DATASET, RAW_DATA_SUFFIX)

In [None]:
# Read the raw images using the options from the configuration cell.
images = load_images_from_directory(data_dir, n=n, jitter=jitter, normalize=normalize)

# For each filter group, select that group's entries from `filters`.
filter_groups_coef = {}
for group_name, member_ids in filter_groups.items():
    filter_groups_coef[group_name] = filters[member_ids]

# Run every group's filters over the loaded images.
group_transform_coef = {}
for group_name, group_filters in filter_groups_coef.items():
    group_transform_coef[group_name] = transform_images(images, group_filters)

KeyboardInterrupt: 

In [None]:
# Settings forwarded to run_skew_test_with_filters.
N_BOOTSTRAP = 100
SKEW_SAMPLE_SIZE = 200

# The call returns six values, unpacked in this order.
(
    skewed_data,
    nonskewed_data,
    skewed_groups,
    nonskewed_groups,
    skewed_indices,
    nonskewed_indices,
) = run_skew_test_with_filters(
    group_transform_coef,
    filter_groups,
    filters,
    n_bootstrap=N_BOOTSTRAP,
    sample_size=SKEW_SAMPLE_SIZE,
)

ValueError: Must have equal len keys and value when setting with an iterable

In [None]:
# Record this dataset's non-skewed filter indices in the shared CSV registry.
nonskewed_csv_path = os.path.join(ROOT_DIR, 'learned-filters', 'nonskewed_indices_df.csv')
nonskewed_df = pd.read_csv(nonskewed_csv_path, index_col='dataset')

# 'dataset' is the index here (index_col above), not a column, so only the
# 'nonskewed_indices' cell is written. The value is stored as the string form
# of a plain-int list:
#   * rows read back from the CSV are already strings, so the column stays
#     homogeneous;
#   * assigning a raw list through .loc is treated as an iterable of values
#     and can fail with "Must have equal len keys and value ...";
#   * int() keeps NumPy integer reprs (e.g. "np.int64(3)") out of the file.
nonskewed_df.loc[FINAL_DATA_NAME, 'nonskewed_indices'] = str(
    [int(idx) for idx in nonskewed_indices]
)

# Write back to the same file; the row for FINAL_DATA_NAME is added or replaced.
nonskewed_df.to_csv(nonskewed_csv_path)
nonskewed_df

Unnamed: 0_level_0,nonskewed_indices
dataset,Unnamed: 1_level_1
coco-indoor,"[3, 6, 10, 11, 12, 13, 23, 24, 28, 29, 30, 34,..."


In [None]:
# Number of filters in each group. The comprehension variable is named
# `members` so it is not confused with the global `filters` used elsewhere.
total_filters = {group: len(members) for group, members in filter_groups.items()}

# Count how often each group appears in nonskewed_groups
# (assumes one group label per filter that passed the skew test -- confirm).
passed_counts = defaultdict(int)
for group in nonskewed_groups:
    passed_counts[group] += 1

# One row per group, in filter_groups order. Groups with no passing filter
# get 0 via .get(), which does not insert a key into the defaultdict.
summary_df = pd.DataFrame({
    "group": list(total_filters),
    "total_filters": list(total_filters.values()),
    "passed_skew_test": [passed_counts.get(group, 0) for group in total_filters],
})
summary_df["proportion_passed"] = np.round(
    summary_df["passed_skew_test"] / summary_df["total_filters"], 3
)

# Column sums for the aggregate "all" row.
summary_totals = summary_df[["total_filters", "passed_skew_test"]].sum()
all_total = summary_totals["total_filters"]
all_passed = summary_totals["passed_skew_test"]

all_row = pd.DataFrame([{
    "group": "all",
    "total_filters": all_total,
    "passed_skew_test": all_passed,
    "proportion_passed": np.round(all_passed / all_total, 3),
}])

# Per-group rows followed by the "all" row, with a fresh 0..n index.
summary_df_with_total = pd.concat([summary_df, all_row], ignore_index=True)

summary_df_with_total

Unnamed: 0,group,total_filters,passed_skew_test,proportion_passed
0,single_edge,20,15,0.75
1,multi_edge,10,8,0.8
2,eye,6,5,0.833
3,dual_color,13,11,0.846
4,inside_out,9,8,0.889
5,misc,6,5,0.833
6,all,64,52,0.812


# Saving Data

In [None]:
# Destination files: one for the coefficient table, one for the size table.
output_dir = os.path.join(ROOT_DIR, 'transformed-data')
data_output_file = os.path.join(output_dir, f"{FINAL_DATA_NAME}.pickle")
size_output_file = os.path.join(output_dir, f"{FINAL_DATA_NAME}-size.pickle")

In [None]:
# Build one record per filter that passed the skew test: its index, its group,
# its position inside the group, and a sorted (possibly thinned) coefficient array.
rows = []

for i in range(len(nonskewed_data)):
    group = nonskewed_groups[i]
    alex_idx = nonskewed_indices[i]

    # Fail loudly if the filter is not listed in its group. Falling back to
    # None and indexing with it would either add an axis (NumPy arrays) or
    # raise an unrelated TypeError (lists) instead of selecting a filter.
    if alex_idx not in filter_groups[group]:
        raise ValueError(
            f"filter index {alex_idx!r} is not a member of group {group!r}"
        )
    id_in_group = filter_groups[group].index(alex_idx)

    coefs = np.sort(group_transform_coef[group][id_in_group])

    # Keep at most CONSTANT_SAMPLE_SIZE values, taken at evenly spaced
    # positions of the sorted array (first and last element included).
    n_keep = min(coefs.size, CONSTANT_SAMPLE_SIZE)
    keep_positions = np.round(np.linspace(0, coefs.size - 1, n_keep)).astype(int)

    rows.append({
        'alex_idx': alex_idx,
        'group': group,
        'group_idx': id_in_group,
        'data': coefs[keep_positions]
    })

df = pd.DataFrame(rows)
df.head()

Unnamed: 0,alex_idx,group,group_idx,data
0,3,single_edge,0,"[-54.37113021641114, -34.08181313527538, -30.9..."
1,6,single_edge,1,"[-37.932721286590066, -27.61167403257441, -24...."
2,10,single_edge,2,"[-47.258124294031695, -26.930818387694952, -23..."
3,12,single_edge,4,"[-40.841302907424634, -25.74030673575521, -22...."
4,14,single_edge,6,"[-47.61700268868441, -31.186843907789488, -26...."


In [None]:
# Length of each row's coefficient array.
group_size_counts = df['data'].apply(len)

# Same metadata columns as df, with the arrays replaced by their lengths.
size_df = df.drop(columns=['data']).assign(size=group_size_counts)
size_df.head()

Unnamed: 0,alex_idx,group,group_idx,size
0,3,single_edge,0,100000
1,6,single_edge,1,100000
2,10,single_edge,2,100000
3,12,single_edge,4,100000
4,14,single_edge,6,100000


In [None]:
# Save the coefficient table (df) and the size table (size_df) as pickle files.

# open(..., "wb") does not create missing folders, so create them first.
# `or "."` covers a bare filename, whose dirname is the empty string.
for output_file in (data_output_file, size_output_file):
    os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)

with open(data_output_file, "wb") as f:
    pickle.dump(df, f)

print(f"coefficient data has been saved to {data_output_file}")

with open(size_output_file, "wb") as f:
    pickle.dump(size_df, f)

print(f"size data has been saved to {size_output_file}")

coefficient data has been saved to c:\Users\yashd\Desktop\hierarchical-bayesian-model-validation\transformed-data\coco-indoor-learned.pickle
size data has been saved to c:\Users\yashd\Desktop\hierarchical-bayesian-model-validation\transformed-data\coco-indoor-learned-size.pickle
