In [1]:
import os
from huggingface_hub import HfFileSystem
from dotenv import load_dotenv
import fnmatch
from pathlib import Path

# Target dataset repository and the path it is addressed by in later cells
repo_id = "INDEEPA/clip-siamese"
repo_path = f"datasets/{repo_id}"

# Read a local .env file into the process environment
# (the notebook expects HF_TOKEN to be provided this way)
load_dotenv()

# Filesystem-style client shared by every later cell
fs = HfFileSystem()

print(f"Connected to repository: {repo_id}")

Connected to repository: INDEEPA/clip-siamese


In [2]:
def list_files_recursive(path, max_depth=10):
    """Collect the detail entries of every file found beneath ``path``.

    The tree is walked depth-first through the module-level ``fs`` object,
    descending into each directory as soon as it is encountered.

    Args:
        path: Directory to start from (passed straight to ``fs.ls``).
        max_depth: Deepest directory level to list; the starting directory
            is level 0, and directories below this level are not listed.

    Returns:
        A list of the entries returned by ``fs.ls(..., detail=True)`` whose
        ``'type'`` is ``'file'``, in traversal order.
    """
    collected = []

    def _walk(directory, depth=0):
        # Stop descending once we are past the allowed depth.
        if depth > max_depth:
            return

        try:
            for entry in fs.ls(directory, detail=True):
                kind = entry['type']
                if kind == 'file':
                    collected.append(entry)
                elif kind == 'directory':
                    _walk(entry['name'], depth + 1)
        except Exception as err:
            # Best effort: report the problem and give up on the rest of
            # this directory, keeping whatever was collected so far.
            print(f"Error listing {directory}: {err}")

    _walk(path)
    return collected

# List all files in the repository
# (walks everything under repo_path; the resulting list of file entries is
# what the later pattern-matching step filters)
print("Getting file list from repository...")
all_files = list_files_recursive(repo_path)
print(f"Found {len(all_files)} total files")

Getting file list from repository...
Found 142 total files


In [3]:
# Define the glob pattern to match.
# NOTE: `pattern` must be assigned here; the cells below read it, and leaving
# every option commented out makes a fresh "Restart & Run All" fail with a
# NameError. The active value is the one used for the recorded outputs.
pattern = "*/tables_OZ_geo_5500/processed/topk-siamese-emb-pairwise-dataset*"

# Alternative patterns you can use:
# pattern = "*/topk-siamese-emb-pairwise-dataset*"  # Match anywhere
# pattern = "*.csv"  # Match all CSV files
# pattern = "*processed*"  # Match anything with 'processed' in the path

def find_files_by_pattern(files, pattern, repo_prefix=None):
    """Return the file entries whose path matches a glob pattern.

    Each entry is tested twice with ``fnmatch``: once against its path
    relative to the repository root and once against its full path. An
    entry is kept if either test matches. ``fnmatch`` does not treat ``/``
    specially, so ``*`` also spans directory separators.

    Args:
        files: Iterable of dict entries that each have a ``'name'`` key
            holding the file's full path.
        pattern: ``fnmatch``-style glob pattern.
        repo_prefix: Repository root to strip from the start of each path
            before the relative match. Defaults to the module-level
            ``repo_path`` when omitted.

    Returns:
        A list of the matching entries, in their original order.
    """
    if repo_prefix is None:
        repo_prefix = repo_path
    prefix = repo_prefix.rstrip("/") + "/"

    matching_files = []

    for file_info in files:
        file_path = file_info['name']
        # Strip the repo prefix only when it is a true leading prefix;
        # str.replace would also remove it from the middle of a nested path.
        if file_path.startswith(prefix):
            relative_path = file_path[len(prefix):]
        else:
            relative_path = file_path

        if fnmatch.fnmatch(relative_path, pattern) or fnmatch.fnmatch(file_path, pattern):
            matching_files.append(file_info)

    return matching_files

# Find matching files
# (filters the full repository listing down to entries matching `pattern`)
matching_files = find_files_by_pattern(all_files, pattern)

# Report each match with its size converted from bytes to MB
print(f"Files matching pattern '{pattern}':")
print(f"Found {len(matching_files)} files:")
for file_info in matching_files:
    file_path = file_info['name']
    file_size = file_info.get('size', 0)
    # A missing or zero size is reported as 0 MB
    size_mb = file_size / (1024 * 1024) if file_size else 0
    print(f"  📄 {file_path} ({size_mb:.2f} MB)")

Files matching pattern '*/tables_OZ_geo_5500/processed/topk-siamese-emb-pairwise-dataset*':
Found 4 files:
  📄 datasets/INDEEPA/clip-siamese/tables_OZ_geo_5500/processed/topk-siamese-emb-pairwise-dataset_k=5_num-queries=5562_query-seller=ИНТЕРТРЕЙД_model=siamese_contrastive_test-f1=0.301_splitting-by-query_cc12m_rubert_tiny_ep_1.pt.csv (0.75 MB)
  📄 datasets/INDEEPA/clip-siamese/tables_OZ_geo_5500/processed/topk-siamese-emb-pairwise-dataset_k=5_num-queries=5562_query-seller=ИНТЕРТРЕЙД_model=siamese_contrastive_test-f1=0.520_splitting-by-query_cc12m_rubert_tiny_ep_1.pt_best-threshold=0.19597989949748743.pt.csv (0.80 MB)
  📄 datasets/INDEEPA/clip-siamese/tables_OZ_geo_5500/processed/topk-siamese-emb-pairwise-dataset_k=None_num-queries=5562_query-seller=ИНТЕРТРЕЙД_model=siamese_contrastive_test-f1=0.301_splitting-by-query_cc12m_rubert_tiny_ep_1.pt.csv (829.89 MB)
  📄 datasets/INDEEPA/clip-siamese/tables_OZ_geo_5500/processed/topk-siamese-emb-pairwise-dataset_k=None_num-queries=5562_query-

In [4]:
# Preview everything the current pattern selects before anything is deleted
if not matching_files:
    print("❌ No files found matching the pattern")
else:
    # Combined size of all selected files, in bytes and then in MB
    total_size = sum(file_info.get('size', 0) for file_info in matching_files)
    total_size_mb = total_size / (1024 * 1024)

    # Header with the selection summary
    print(f"\n{'='*60}")
    print("DELETION PREVIEW")
    print(f"{'='*60}")
    print(f"Pattern: {pattern}")
    print(f"Files to delete: {len(matching_files)}")
    print(f"Total size: {total_size_mb:.2f} MB")
    print(f"{'='*60}")

    # Numbered listing, one entry per file, with its individual size
    for i, file_info in enumerate(matching_files, 1):
        file_path = file_info['name']
        file_size = file_info.get('size', 0)
        size_mb = file_size / (1024 * 1024) if file_size else 0
        print(f"{i:2d}. {file_path}")
        print(f"    Size: {size_mb:.2f} MB")


DELETION PREVIEW
Pattern: */tables_OZ_geo_5500/processed/topk-siamese-emb-pairwise-dataset*
Files to delete: 4
Total size: 1661.37 MB
 1. datasets/INDEEPA/clip-siamese/tables_OZ_geo_5500/processed/topk-siamese-emb-pairwise-dataset_k=5_num-queries=5562_query-seller=ИНТЕРТРЕЙД_model=siamese_contrastive_test-f1=0.301_splitting-by-query_cc12m_rubert_tiny_ep_1.pt.csv
    Size: 0.75 MB
 2. datasets/INDEEPA/clip-siamese/tables_OZ_geo_5500/processed/topk-siamese-emb-pairwise-dataset_k=5_num-queries=5562_query-seller=ИНТЕРТРЕЙД_model=siamese_contrastive_test-f1=0.520_splitting-by-query_cc12m_rubert_tiny_ep_1.pt_best-threshold=0.19597989949748743.pt.csv
    Size: 0.80 MB
 3. datasets/INDEEPA/clip-siamese/tables_OZ_geo_5500/processed/topk-siamese-emb-pairwise-dataset_k=None_num-queries=5562_query-seller=ИНТЕРТРЕЙД_model=siamese_contrastive_test-f1=0.301_splitting-by-query_cc12m_rubert_tiny_ep_1.pt.csv
    Size: 829.89 MB
 4. datasets/INDEEPA/clip-siamese/tables_OZ_geo_5500/processed/topk-siamese

In [5]:
def delete_files_with_confirmation(files_to_delete):
    """Remove the given files after an explicit typed confirmation.

    The user must type exactly ``DELETE`` at the prompt; any other answer
    cancels the operation. Files are removed one at a time through the
    module-level ``fs`` object. A failure on one file is reported and does
    not stop the remaining deletions. A summary is printed at the end.

    Args:
        files_to_delete: Sequence of dict entries that each have a
            ``'name'`` key holding the path to remove.

    Returns:
        None.
    """
    if not files_to_delete:
        print("No files to delete.")
        return

    # Require an exact, case-sensitive confirmation before touching anything.
    print(f"\n⚠️  You are about to delete {len(files_to_delete)} files")
    if input("Type 'DELETE' to confirm: ") != "DELETE":
        print("❌ Deletion cancelled.")
        return

    succeeded = 0
    failed = 0

    for entry in files_to_delete:
        target = entry['name']
        try:
            print(f"🗑️  Deleting: {target}")
            fs.rm(target)
            print(f"✅ Deleted: {target}")
        except Exception as err:
            print(f"❌ Failed to delete {target}: {err}")
            failed += 1
        else:
            succeeded += 1

    # Final tally
    divider = '=' * 50
    print(f"\n{divider}")
    print("DELETION SUMMARY")
    print(divider)
    print(f"✅ Successfully deleted: {succeeded} files")
    if failed > 0:
        print(f"❌ Failed to delete: {failed} files")
    print(divider)

# Run the deletion
# (destructive: prompts for a typed 'DELETE' confirmation before removing
# the files selected by the pattern in the cells above)
delete_files_with_confirmation(matching_files)


⚠️  You are about to delete 4 files
🗑️  Deleting: datasets/INDEEPA/clip-siamese/tables_OZ_geo_5500/processed/topk-siamese-emb-pairwise-dataset_k=5_num-queries=5562_query-seller=ИНТЕРТРЕЙД_model=siamese_contrastive_test-f1=0.301_splitting-by-query_cc12m_rubert_tiny_ep_1.pt.csv
✅ Deleted: datasets/INDEEPA/clip-siamese/tables_OZ_geo_5500/processed/topk-siamese-emb-pairwise-dataset_k=5_num-queries=5562_query-seller=ИНТЕРТРЕЙД_model=siamese_contrastive_test-f1=0.301_splitting-by-query_cc12m_rubert_tiny_ep_1.pt.csv
🗑️  Deleting: datasets/INDEEPA/clip-siamese/tables_OZ_geo_5500/processed/topk-siamese-emb-pairwise-dataset_k=5_num-queries=5562_query-seller=ИНТЕРТРЕЙД_model=siamese_contrastive_test-f1=0.520_splitting-by-query_cc12m_rubert_tiny_ep_1.pt_best-threshold=0.19597989949748743.pt.csv
✅ Deleted: datasets/INDEEPA/clip-siamese/tables_OZ_geo_5500/processed/topk-siamese-emb-pairwise-dataset_k=5_num-queries=5562_query-seller=ИНТЕРТРЕЙД_model=siamese_contrastive_test-f1=0.520_splitting-by-que