In [19]:
import json
import csv
from collections import defaultdict

# JSON input files, one per data split.
# Order matters: the integer ids given to time periods and collections further
# down in this cell follow first-seen order while iterating over this list, so
# reordering the entries changes the resulting mappings.
file_paths = [
    '/graft3/code/tracy/data/final_may24_ver2/test_data_2.json',
    '/graft3/code/tracy/data/final_may24_ver2/test_data_3.json',
    '/graft3/code/tracy/data/final_may24_ver2/test_data.json',
    '/graft3/code/tracy/data/final_may24_ver2/train_data.json',
    '/graft3/code/tracy/data/final_may24_ver2/valid_data.json'
]

# Occurrence counts for every distinct 'time' and 'collection_id' value.
# Dict insertion order (first time a value is seen) is what later determines
# the integer id assigned to each value.
all_times = defaultdict(int)
all_collections = defaultdict(int)

# Read each JSON file exactly once and tally both fields in the same pass.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding.
for file_path in file_paths:
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # Each file is a JSON object; only its values (the records) are used.
    for item in data.values():
        all_times[item['time']] += 1
        all_collections[item['collection_id']] += 1

# Print all unique time strings
unique_times = list(all_times.keys())
print("Unique times found:", unique_times)

# Create time mapping: time string -> integer id (position in first-seen order)
time_mapping = {time: idx for idx, time in enumerate(unique_times)}
print("Time mapping:", time_mapping)

# Print all unique collection ids
unique_collections = list(all_collections.keys())
print("Unique collections found:", unique_collections)

# Create collection mapping: collection id -> integer id (first-seen order)
collection_mapping = {collection: idx for idx, collection in enumerate(unique_collections)}
print("Collection mapping:", collection_mapping)

Unique times found: ['Ur III (ca. 2100-2000 BC)', 'ED IIIb (ca. 2500-2340 BC)', 'Old Akkadian (ca. 2340-2200 BC)', 'Lagash II (ca. 2200-2100 BC)', 'Early Old Babylonian (ca. 2000-1900 BC)', 'Neo-Babylonian (ca. 626-539 BC)', 'Old Babylonian (ca. 1900-1600 BC)', 'ED I-II (ca. 2900-2700 BC)', 'ED IIIa (ca. 2600-2500 BC)', 'Middle Assyrian (ca. 1400-1000 BC)', 'Neo-Assyrian (ca. 911-612 BC)', 'Old Assyrian (ca. 1950-1850 BC)', 'Middle Babylonian (ca. 1400-1100 BC)', 'Ebla (ca. 2350-2250 BC)']
Time mapping: {'Ur III (ca. 2100-2000 BC)': 0, 'ED IIIb (ca. 2500-2340 BC)': 1, 'Old Akkadian (ca. 2340-2200 BC)': 2, 'Lagash II (ca. 2200-2100 BC)': 3, 'Early Old Babylonian (ca. 2000-1900 BC)': 4, 'Neo-Babylonian (ca. 626-539 BC)': 5, 'Old Babylonian (ca. 1900-1600 BC)': 6, 'ED I-II (ca. 2900-2700 BC)': 7, 'ED IIIa (ca. 2600-2500 BC)': 8, 'Middle Assyrian (ca. 1400-1000 BC)': 9, 'Neo-Assyrian (ca. 911-612 BC)': 10, 'Old Assyrian (ca. 1950-1850 BC)': 11, 'Middle Babylonian (ca. 1400-1100 BC)': 12, '

In [20]:
import csv

# Destination of the label table derived from time_mapping.
output_categories_csv_path = '/graft2/code/yufei/Ancient_Artifact_Dating_front/wilds/data/iwildcam_v2.0/categories.csv'

# Write categories.csv: a header row followed by one row per time period.
with open(output_categories_csv_path, 'w', newline='') as out_file:
    category_writer = csv.DictWriter(out_file, fieldnames=['y', 'category_id', 'name'])
    category_writer.writeheader()
    # category_id simply mirrors y; name is the original time string.
    category_writer.writerows(
        {'y': label, 'category_id': label, 'name': period}
        for period, label in time_mapping.items()
    )

print(f"categories.csv has been generated at {output_categories_csv_path}")

categories.csv has been generated at /graft2/code/yufei/Ancient_Artifact_Dating_front/wilds/data/iwildcam_v2.0/categories.csv


In [21]:
import csv
import json
import os

# Specify the output path for metadata.csv
output_csv_path = '/graft2/code/yufei/Ancient_Artifact_Dating_front/wilds/data/iwildcam_v2.0/metadata.csv'

# (marker, split) pairs, checked in order against the input file's base name.
# 'test_data_2' / 'test_data_3' must come before the more general 'test_data'.
SPLIT_RULES = (
    ('train', 'train'),
    ('valid', 'val'),
    ('test_data_2', 'id_test'),
    ('test_data_3', 'id_val'),
    ('test_data', 'test'),
)


def _split_for_file(file_path):
    """Return the split label for an input JSON file.

    Only the base name is inspected, so directory names that happen to contain
    a marker such as 'train' cannot influence the result.

    Raises:
        ValueError: if the file name matches none of SPLIT_RULES. Failing here
            prevents rows from being written under a stale or undefined split.
    """
    file_name = os.path.basename(file_path)
    for marker, split_label in SPLIT_RULES:
        if marker in file_name:
            return split_label
    raise ValueError(f"Cannot determine split for input file: {file_path}")


# Number of rows written to metadata.csv for each split.
split_counters = {
    'train': 0,
    'val': 0,
    'id_test': 0,
    'id_val': 0,
    'test': 0
}

# Number of records skipped because of a short id. Not printed; left in the
# namespace so it can be inspected after the cell has run.
missing = 0

with open(output_csv_path, 'w', newline='') as csvfile:
    fieldnames = ['split', 'location_remapped', 'location', 'sequence_remapped', 'seq_id', 'y', 'category_id', 'datetime', 'filename', 'image_id']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()

    for file_path in file_paths:
        # Resolve the split before reading, so an unknown file fails fast.
        split = _split_for_file(file_path)

        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        for item in data.values():
            image_id = item['id']

            # Skip records whose id, written as text, is at most 5 characters
            # long. Presumably these have no image (the counter is named
            # `missing`) -- TODO confirm.
            if len(str(image_id)) <= 5:
                missing += 1
                continue

            # Image file name = last path component of the image URL.
            filename = item['img_url'].split('/')[-1]
            # Integer label of the record's time period.
            label = time_mapping[item['time']]

            # NOTE(review): 'location_remapped' receives the raw collection_id
            # while 'location' receives the integer index from
            # collection_mapping; the column names suggest the opposite.
            # Left unchanged -- confirm against whatever consumes metadata.csv.
            writer.writerow({
                'split': split,
                'location_remapped': item['collection_id'],
                'location': collection_mapping[item['collection_id']],
                'sequence_remapped': 1,  # Constant placeholder
                'seq_id': image_id,
                'y': label,
                'category_id': label,
                'datetime': '2100-01-01 00:00:00.000',  # Constant placeholder
                'filename': filename,
                'image_id': image_id
            })

            # Increment the counter for the current split
            split_counters[split] += 1

print(f"metadata.csv has been generated at {output_csv_path}")

# Print the count of files for each split
for split_name, count in split_counters.items():
    print(f"Number of files in {split_name}: {count}")

metadata.csv has been generated at /graft2/code/yufei/Ancient_Artifact_Dating_front/wilds/data/iwildcam_v2.0/metadata.csv
Number of files in train: 30109
Number of files in val: 2114
Number of files in id_test: 1958
Number of files in id_val: 2037
Number of files in test: 2065


In [22]:
# NOTE: this whole cell is commented out, so running it does nothing.
# If re-enabled, the code below would move rows out of the 'train' split of
# metadata.csv into 'id_val' / 'id_test', sampling per label `y` up to the
# counts observed in valid_data.json / test_data.json, and then overwrite
# metadata.csv. It reads and writes under /trunk2/..., not the /graft2/...
# location used by the active cells, and it needs pandas
# (train_test_split is imported but never used).

# import pandas as pd
# import json
# from sklearn.model_selection import train_test_split

# # Load metadata
# metadata_path = '/trunk2/yufei/Ancient_Artifact_Dating_front/wilds/data/iwildcam_v2.0/metadata.csv'
# metadata = pd.read_csv(metadata_path)

# # Load valid_data.json and test_data.json
# with open('/trunk2/yufei/Ancient_Artifact_Dating_front/wilds/data/mydataset/collection/valid_data.json', 'r') as f:
#     valid_data = json.load(f)
# with open('/trunk2/yufei/Ancient_Artifact_Dating_front/wilds/data/mydataset/collection/test_data.json', 'r') as f:
#     test_data = json.load(f)

# # Count the occurrences of each time (mapped to y) in valid_data and test_data
# valid_y_counts = pd.Series([item['time'] for item in valid_data.values()]).map(time_mapping).value_counts()
# test_y_counts = pd.Series([item['time'] for item in test_data.values()]).map(time_mapping).value_counts()

# # Separate train data
# train_data = metadata[metadata['split'] == 'train']

# # Function to sample data based on y distribution
# def stratified_sample(data, counts):
#     sampled_data = pd.DataFrame()
#     for y, count in counts.items():
#         y_data = data[data['y'] == y]
#         if len(y_data) >= count:
#             sampled_y_data = y_data.sample(n=count, random_state=42)
#         else:
#             sampled_y_data = y_data
#         sampled_data = pd.concat([sampled_data, sampled_y_data])
#     return sampled_data

# # Sample id_val and id_test from train_data
# id_val_data = stratified_sample(train_data, valid_y_counts)
# # Avoid overlap
# id_test_data = stratified_sample(train_data.drop(id_val_data.index), test_y_counts)

# # Assign new splits
# id_val_data['split'] = 'id_val'
# id_test_data['split'] = 'id_test'

# # Remove the sampled data from train_data
# train_data = train_data.drop(id_val_data.index).drop(id_test_data.index)

# # Concatenate the new datasets with the original metadata
# new_metadata = pd.concat([metadata[metadata['split'] != 'train'], train_data, id_val_data, id_test_data])

# # Save the new metadata
# new_metadata_path = '/trunk2/yufei/Ancient_Artifact_Dating_front/wilds/data/iwildcam_v2.0/metadata.csv'
# new_metadata.to_csv(new_metadata_path, index=False)

# print(f"New metadata.csv has been generated at {new_metadata_path}")


In [23]:
import csv
import os

# Inputs: the generated metadata table and the directory holding the images.
metadata_path = '/graft2/code/yufei/Ancient_Artifact_Dating_front/wilds/data/iwildcam_v2.0/metadata.csv'
images_dir = '/graft2/datasets/danlu/cuneiform/segmented_images/segmented_images'

# Collect, in file order, every 'filename' from metadata.csv for which no
# regular file exists under images_dir.
with open(metadata_path, 'r') as csvfile:
    missing_files = [
        row['filename']
        for row in csv.DictReader(csvfile)
        if not os.path.isfile(os.path.join(images_dir, row['filename']))
    ]

# Report the outcome: either a single all-clear line or the list of absentees.
if not missing_files:
    print("All files in metadata.csv are present in the images directory.")
else:
    print(f"Missing files ({len(missing_files)}):")
    for missing_name in missing_files:
        print(missing_name)


All files in metadata.csv are present in the images directory.
