In [13]:
import git
from pathlib import Path
import os

# Locate the repository root by searching upward from the current directory
# for the enclosing git working tree.
ROOT_DIR = Path(git.Repo('.', search_parent_directories=True).working_tree_dir)
# Switch the process working directory to <ROOT_DIR>/utilities before the
# import below runs.
# NOTE(review): presumably `learned` lives in utilities/ and is only
# importable from there — confirm.
os.chdir(os.path.join(ROOT_DIR, "utilities"))
# NOTE(review): this star import is presumably what supplies np, pd, pickle,
# defaultdict, filters, filter_groups and the helper functions used in the
# cells below — confirm against utilities/learned.py.
from learned import *

# Seed NumPy's global random number generator.
np.random.seed(0)

In [14]:
# --- Dataset selection and naming -------------------------------------------
DATASET = "pastis"
RAW_DATA_SUFFIX = "full-pastis-RGB-jitter"
FINAL_DATA_NAME = 'pastis-full'

# Cap on the number of coefficients kept per filter when the data is saved.
CONSTANT_SAMPLE_SIZE = 100_000

# --- Options passed to load_images_from_directory ---------------------------
n = None
jitter = False
normalize = False

# Raw images are read from <ROOT_DIR>/raw-data/<DATASET>/<RAW_DATA_SUFFIX>.
data_dir = os.path.join(ROOT_DIR, 'raw-data', DATASET, RAW_DATA_SUFFIX)

In [15]:
# Load the raw images using the options chosen in the configuration cell.
images = load_images_from_directory(data_dir, n=n, jitter=jitter, normalize=normalize)

# For every group, select that group's entries from `filters` using the
# indices stored in filter_groups.
filter_groups_coef = {
    group_name: filters[member_indices]
    for group_name, member_indices in filter_groups.items()
}

# Apply each group's filters to the images; results are keyed by group name.
group_transform_coef = {
    group_name: transform_images(images, group_filters)
    for group_name, group_filters in filter_groups_coef.items()
}

In [None]:
# Run the skew test on the transformed coefficients. The non-skewed outputs
# (data, group labels, filter indices) are consumed by the cells that follow,
# so this cell has to be executed before them.
(
    skewed_data,
    nonskewed_data,
    skewed_groups,
    nonskewed_groups,
    skewed_indices,
    nonskewed_indices,
) = run_skew_test_with_filters(
    group_transform_coef,
    filter_groups,
    filters,
    n_bootstrap=1000,
    sample_size=10000,
)

In [17]:
# Record this dataset's non-skewed filter indices in the shared CSV.
# The file is read, the row labelled FINAL_DATA_NAME is set, and the file is
# then overwritten in place at the same path.
nonskewed_csv_path = os.path.join(ROOT_DIR, 'learned-filters', 'nonskewed_indices_df.csv')

nonskewed_df = pd.read_csv(nonskewed_csv_path, index_col='dataset')

# NOTE(review): 'dataset' is already the index, so this key looks redundant;
# presumably pandas aligns the dict on column names and ignores it — verify.
new_row = {'dataset': FINAL_DATA_NAME, 'nonskewed_indices': nonskewed_indices}
nonskewed_df.loc[FINAL_DATA_NAME] = new_row

nonskewed_df.to_csv(nonskewed_csv_path)
nonskewed_df

Unnamed: 0_level_0,nonskewed_indices
dataset,Unnamed: 1_level_1
coco-indoor,"[3, 6, 10, 11, 12, 13, 23, 24, 28, 29, 30, 34,..."
pastis-full,"[6, 10, 11, 12, 13, 24, 30, 32, 34, 43, 48, 49..."


In [18]:
# Build a per-group summary of how many filters passed the skew test, plus an
# "all" row with the overall totals.

# Number of filters listed for each group. The loop variable is deliberately
# not called `filters`, to avoid confusion with the module-level `filters`.
total_filters = {group: len(members) for group, members in filter_groups.items()}

# Each entry of nonskewed_groups is counted as one passing filter for that group.
passed_counts = defaultdict(int)
for group in nonskewed_groups:
    passed_counts[group] += 1

# One (group, total, passed) record per group; groups with no passing filter
# get a count of 0. The previous "failed" column was computed and then dropped
# straight away, so it is no longer built.
summary = [
    (group, total, passed_counts.get(group, 0))
    for group, total in total_filters.items()
]

summary_df = pd.DataFrame(summary, columns=[
    "group", "total_filters", "passed_skew_test"
])

summary_df["proportion_passed"] = np.round(summary_df["passed_skew_test"] / summary_df["total_filters"], 3)

# Column sums across all groups.
summary_totals = summary_df[["total_filters", "passed_skew_test"]].sum()

# Single-row frame labelled 'all' holding the totals and the overall pass rate.
all_row = pd.DataFrame([{
    "group": "all",
    "total_filters": summary_totals["total_filters"],
    "passed_skew_test": summary_totals["passed_skew_test"],
    "proportion_passed": np.round(summary_totals["passed_skew_test"] / summary_totals["total_filters"], 3)
}])

# Per-group rows followed by the 'all' row, with a fresh 0..n index.
summary_df_with_total = pd.concat([summary_df, all_row], ignore_index=True)

summary_df_with_total

Unnamed: 0,group,total_filters,passed_skew_test,proportion_passed
0,single_edge,20,15,0.75
1,multi_edge,10,10,1.0
2,eye,6,4,0.667
3,dual_color,13,12,0.923
4,inside_out,9,4,0.444
5,misc,6,3,0.5
6,all,64,48,0.75


# Saving Data

In [19]:
# Both output pickles are written to <ROOT_DIR>/transformed-data and are named
# after FINAL_DATA_NAME.
transformed_dir = os.path.join(ROOT_DIR, 'transformed-data')
data_output_file = os.path.join(transformed_dir, f"{FINAL_DATA_NAME}-learned.pickle")
size_output_file = os.path.join(transformed_dir, f"{FINAL_DATA_NAME}-learned-size.pickle")

In [20]:
def _even_rank_subsample(values, max_size):
    """Return at most ``max_size`` order statistics of ``values``.

    ``values`` is sorted and then sampled at evenly spaced rank positions
    (always including the smallest and largest element when more than one
    position is taken). If ``values`` has ``max_size`` elements or fewer, the
    full sorted array is returned.
    """
    n_keep = min(values.size, max_size)
    positions = np.round(np.linspace(0, values.size - 1, n_keep)).astype(int)
    return np.sort(values)[positions]


# One row per filter that passed the skew test: its index in the full filter
# set, its group, its position inside that group, and a size-capped sorted
# sample of its coefficients.
rows = []

for i in range(len(nonskewed_data)):
    group = nonskewed_groups[i]
    alex_idx = nonskewed_indices[i]

    group_members = filter_groups[group]
    # Fail loudly if the filter is not listed in its group. The previous
    # fallback of None was then used as an index, which either raised an
    # opaque TypeError or (on a NumPy array) silently selected the whole
    # group's coefficients with an extra leading axis.
    if alex_idx not in group_members:
        raise ValueError(
            f"filter index {alex_idx} is not listed in filter_groups[{group!r}]"
        )
    id_in_group = group_members.index(alex_idx)

    coefs = group_transform_coef[group][id_in_group]
    coefs = _even_rank_subsample(coefs, CONSTANT_SAMPLE_SIZE)
    rows.append({
        'alex_idx': alex_idx,
        'group': group,
        'group_idx': id_in_group,
        'data': coefs
    })

df = pd.DataFrame(rows)
df.head()

Unnamed: 0,alex_idx,group,group_idx,data
0,6,single_edge,1,"[-63.66270401354562, -45.10696570006463, -35.7..."
1,10,single_edge,2,"[-79.01661657646684, -43.23753708637163, -35.2..."
2,11,single_edge,3,"[-53.35163405501882, -31.0742783406018, -26.64..."
3,12,single_edge,4,"[-80.95108330777086, -41.93901805097575, -30.7..."
4,13,single_edge,5,"[-73.48932042092838, -44.60804577284734, -36.7..."


In [21]:
# Length of the coefficient array stored in each row of df.
group_size_counts = df['data'].apply(len)

# Same metadata columns as df, with the bulky 'data' column replaced by its
# length in a new 'size' column.
size_df = df.drop(columns=['data']).assign(size=group_size_counts)
size_df.head()

Unnamed: 0,alex_idx,group,group_idx,size
0,6,single_edge,1,100000
1,10,single_edge,2,100000
2,11,single_edge,3,100000
3,12,single_edge,4,100000
4,13,single_edge,5,100000


In [22]:
# Write the coefficient DataFrame (df) to its pickle file.
with open(data_output_file, "wb") as data_handle:
    pickle.dump(df, data_handle)

print(f"coefficient data has been saved to {data_output_file}")

# Write the companion size DataFrame (size_df) to its own pickle file.
with open(size_output_file, "wb") as size_handle:
    pickle.dump(size_df, size_handle)

print(f"size data has been saved to {size_output_file}")

coefficient data has been saved to c:\Users\yashd\Desktop\hierarchical-bayesian-model-validation\transformed-data\pastis-full-learned.pickle
size data has been saved to c:\Users\yashd\Desktop\hierarchical-bayesian-model-validation\transformed-data\pastis-full-learned-size.pickle
