In [48]:
import os
import pandas as pd
import numpy as np

In [49]:
# 🗑️ XÓA DÒNG KHÔNG CÓ ẢNH TƯƠNG ỨNG
print("🗑️ CLEANING: XÓA DÒNG CSV KHÔNG CÓ ẢNH")
print("=" * 40)

# FUNCTION CHÍNH
def clean_labels_remove_orphans(csv_path, images_path):
    """Xóa dòng CSV không có ảnh tương ứng"""
    print(f"📖 CSV: {csv_path}")
    print(f"📁 Images: {images_path}")
    
    # Check files exist
    if not os.path.exists(csv_path):
        print(f"❌ CSV không tồn tại: {csv_path}")
        return
    if not os.path.exists(images_path):
        print(f"❌ Folder không tồn tại: {images_path}")
        return
    
    # Load CSV
    df = pd.read_csv(csv_path)
    print(f"✅ CSV: {len(df):,} dòng")
    
    # Get actual images
    extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff']
    actual_images = [f for f in os.listdir(images_path) 
                    if any(f.lower().endswith(ext) for ext in extensions)]
    images_set = set(actual_images)
    print(f"✅ Images: {len(actual_images):,} files")
    
    # Find image column
    img_col = None
    for col in ['img_name', 'image_name', 'filename']:
        if col in df.columns:
            img_col = col
            break
    
    if not img_col:
        print(f"❌ Không tìm thấy cột ảnh: {list(df.columns)}")
        return
    
    print(f"📋 Column: '{img_col}'")
    
    # Filter matching rows
    mask = df[img_col].isin(images_set)
    cleaned_df = df[mask].copy()
    
    # Stats
    original = len(df)
    cleaned = len(cleaned_df)
    removed = original - cleaned
    
    print()
    print("📊 RESULTS:")
    print(f"   📋 Original: {original:,}")
    print(f"   ✅ Kept: {cleaned:,}")
    print(f"   🗑️ Removed: {removed:,}")
    print(f"   📈 Keep rate: {cleaned/original*100:.1f}%")
    
    # Save cleaned file
    output = csv_path.replace('.csv', '_cleaned.csv')
    cleaned_df.to_csv(output, index=False)
    print(f"💾 Saved: {output}")
    
    return cleaned_df

# AUTO-DETECT FILES
print("🔍 AUTO-DETECTING FILES...")

# Find CSV files
csv_candidates = []
for root, dirs, files in os.walk('.'):
    for file in files:
        if file.endswith('.csv'):
            path = os.path.join(root, file).replace('\\', '/')
            csv_candidates.append(path)

# Find image folders
img_candidates = []
for root, dirs, files in os.walk('.'):
    img_count = sum(1 for f in files if f.lower().endswith(('.jpg', '.jpeg', '.png')))
    if img_count > 0:
        path = root.replace('\\', '/')
        img_candidates.append((path, img_count))

print("📄 Found CSV files:")
for path in csv_candidates:
    size = os.path.getsize(path) / 1024 / 1024
    print(f"   {path} ({size:.1f}MB)")

print("📁 Found image folders:")
for path, count in img_candidates:
    print(f"   {path}/ ({count:,} images)")

# Process if found
if csv_candidates and img_candidates:
    # Pick largest CSV and folder with most images
    main_csv = max(csv_candidates, key=os.path.getsize)
    main_img = max(img_candidates, key=lambda x: x[1])[0]
    
    print()
    print("🚀 PROCESSING:")
    print(f"   CSV: {main_csv}")
    print(f"   Images: {main_img}/")
    print()
    
    # Clean the data
    result = clean_labels_remove_orphans(main_csv, main_img)
    
    if result is not None:
        print()
        print("✅ SUCCESS!")
        print(f"🎯 Dataset synced: {len(result):,} samples")
        print("📁 Ready for RGB training!")
        
else:
    print()
    print("❌ Files not found automatically")
    print("💡 Manual usage:")
    print("clean_labels_remove_orphans('labels.csv', 'images/')")


🗑️ CLEANING: XÓA DÒNG CSV KHÔNG CÓ ẢNH
🔍 AUTO-DETECTING FILES...
📄 Found CSV files:
   ./age_gender_labels_only.csv (0.9MB)
   ./age_gender_labels_only_FINAL_CLEANED.csv (0.3MB)
📁 Found image folders:
   ./data/ (10,137 images)

🚀 PROCESSING:
   CSV: ./age_gender_labels_only.csv
   Images: ./data/

📖 CSV: ./age_gender_labels_only.csv
📁 Images: ./data
✅ CSV: 23,705 dòng
✅ Images: 10,137 files
📋 Column: 'img_name'

📊 RESULTS:
   📋 Original: 23,705
   ✅ Kept: 0
   🗑️ Removed: 23,705
   📈 Keep rate: 0.0%
💾 Saved: ./age_gender_labels_only_cleaned.csv

✅ SUCCESS!
🎯 Dataset synced: 0 samples
📁 Ready for RGB training!


In [50]:
# 🗑️ XÓA DÒNG KHÔNG CÓ ẢNH TƯƠNG ỨNG
print("🗑️ CLEANING: XÓA DÒNG CSV KHÔNG CÓ ẢNH")
print("=" * 40)

# FUNCTION CHÍNH
def clean_labels_remove_orphans(csv_path, images_path):
    """Xóa dòng CSV không có ảnh tương ứng"""
    print(f"📖 CSV: {csv_path}")
    print(f"📁 Images: {images_path}")
    
    # Check files exist
    if not os.path.exists(csv_path):
        print(f"❌ CSV không tồn tại: {csv_path}")
        return
    if not os.path.exists(images_path):
        print(f"❌ Folder không tồn tại: {images_path}")
        return
    
    # Load CSV
    df = pd.read_csv(csv_path)
    print(f"✅ CSV: {len(df):,} dòng")
    
    # Get actual images
    extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff']
    actual_images = [f for f in os.listdir(images_path) 
                    if any(f.lower().endswith(ext) for ext in extensions)]
    images_set = set(actual_images)
    print(f"✅ Images: {len(actual_images):,} files")
    
    # Find image column
    img_col = None
    for col in ['img_name', 'image_name', 'filename']:
        if col in df.columns:
            img_col = col
            break
    
    if not img_col:
        print(f"❌ Không tìm thấy cột ảnh: {list(df.columns)}")
        return
    
    print(f"📋 Column: '{img_col}'")
    
    # Filter matching rows
    mask = df[img_col].isin(images_set)
    cleaned_df = df[mask].copy()
    
    # Stats
    original = len(df)
    cleaned = len(cleaned_df)
    removed = original - cleaned
    
    print()
    print("📊 RESULTS:")
    print(f"   📋 Original: {original:,}")
    print(f"   ✅ Kept: {cleaned:,}")
    print(f"   🗑️ Removed: {removed:,}")
    print(f"   📈 Keep rate: {cleaned/original*100:.1f}%")
    
    # Save cleaned file
    output = csv_path.replace('.csv', '_cleaned.csv')
    cleaned_df.to_csv(output, index=False)
    print(f"💾 Saved: {output}")
    
    return cleaned_df

# AUTO-DETECT FILES
print("🔍 AUTO-DETECTING FILES...")

# Find CSV files
csv_candidates = []
for root, dirs, files in os.walk('.'):
    for file in files:
        if file.endswith('.csv'):
            path = os.path.join(root, file).replace('\\', '/')
            csv_candidates.append(path)

# Find image folders
img_candidates = []
for root, dirs, files in os.walk('.'):
    img_count = sum(1 for f in files if f.lower().endswith(('.jpg', '.jpeg', '.png')))
    if img_count > 0:
        path = root.replace('\\', '/')
        img_candidates.append((path, img_count))

print("📄 Found CSV files:")
for path in csv_candidates:
    size = os.path.getsize(path) / 1024 / 1024
    print(f"   {path} ({size:.1f}MB)")

print("📁 Found image folders:")
for path, count in img_candidates:
    print(f"   {path}/ ({count:,} images)")

# Process if found
if csv_candidates and img_candidates:
    # Pick largest CSV and folder with most images
    main_csv = max(csv_candidates, key=os.path.getsize)
    main_img = max(img_candidates, key=lambda x: x[1])[0]
    
    print()
    print("🚀 PROCESSING:")
    print(f"   CSV: {main_csv}")
    print(f"   Images: {main_img}/")
    print()
    
    # Clean the data
    result = clean_labels_remove_orphans(main_csv, main_img)
    
    if result is not None:
        print()
        print("✅ SUCCESS!")
        print(f"🎯 Dataset synced: {len(result):,} samples")
        print("📁 Ready for RGB training!")
        
else:
    print()
    print("❌ Files not found automatically")
    print("💡 Manual usage:")
    print("clean_labels_remove_orphans('labels.csv', 'images/')")


🗑️ CLEANING: XÓA DÒNG CSV KHÔNG CÓ ẢNH
🔍 AUTO-DETECTING FILES...
📄 Found CSV files:
   ./age_gender_labels_only.csv (0.9MB)
   ./age_gender_labels_only_cleaned.csv (0.0MB)
   ./age_gender_labels_only_FINAL_CLEANED.csv (0.3MB)
📁 Found image folders:
   ./data/ (10,137 images)

🚀 PROCESSING:
   CSV: ./age_gender_labels_only.csv
   Images: ./data/

📖 CSV: ./age_gender_labels_only.csv
📁 Images: ./data
✅ CSV: 23,705 dòng
✅ Images: 10,137 files
📋 Column: 'img_name'

📊 RESULTS:
   📋 Original: 23,705
   ✅ Kept: 0
   🗑️ Removed: 23,705
   📈 Keep rate: 0.0%
💾 Saved: ./age_gender_labels_only_cleaned.csv

✅ SUCCESS!
🎯 Dataset synced: 0 samples
📁 Ready for RGB training!


In [51]:
# 🔧 FIX CLEANING - SỬA LẠI TỪ ĐẦU
print("🔧 FIXED CLEANING SOLUTION")
print("=" * 35)

def simple_clean_csv(csv_file, data_folder):
    """Simple và hiệu quả - xóa dòng không có ảnh"""
    
    print(f"📖 Loading: {csv_file}")
    df = pd.read_csv(csv_file)
    print(f"✅ CSV loaded: {len(df):,} rows")
    print(f"   Columns: {list(df.columns)}")
    
    print(f"\n📁 Scanning folder: {data_folder}")
    all_files = os.listdir(data_folder)
    print(f"✅ Total files found: {len(all_files):,}")
    
    # Lấy 10 sample files để debug
    print(f"\n🔍 SAMPLE FILES IN FOLDER:")
    for i, file in enumerate(all_files[:10]):
        print(f"   {i+1}. {file}")
    
    # Lấy 10 sample từ CSV
    print(f"\n🔍 SAMPLE NAMES IN CSV:")
    img_col = 'img_name'
    for i, name in enumerate(df[img_col].head(10)):
        print(f"   {i+1}. {name}")
    
    # Tạo set tất cả files (không filter extension)
    files_set = set(all_files)
    
    # Check exact matches
    print(f"\n🔍 CHECKING EXACT MATCHES...")
    exact_matches = []
    for idx, row in df.iterrows():
        img_name = row[img_col]
        if img_name in files_set:
            exact_matches.append(idx)
            if len(exact_matches) <= 5:  # Show first 5 matches
                print(f"   ✅ MATCH: {img_name}")
    
    print(f"\n📊 EXACT MATCH RESULTS:")
    print(f"   Total exact matches: {len(exact_matches):,}")
    print(f"   Match rate: {len(exact_matches)/len(df)*100:.1f}%")
    
    if len(exact_matches) == 0:
        print(f"\n🔍 NO EXACT MATCHES - TRYING PARTIAL MATCHING...")
        
        # Thử partial matching với timestamp
        import re
        
        # Extract timestamps từ CSV
        csv_timestamps = {}
        for idx, row in df.iterrows():
            img_name = row[img_col]
            # Tìm timestamp 17 digits
            match = re.search(r'(\d{17})', img_name)
            if match:
                timestamp = match.group(1)
                csv_timestamps[timestamp] = idx
        
        print(f"   CSV timestamps found: {len(csv_timestamps):,}")
        
        # Extract timestamps từ files
        file_timestamps = {}
        for file in all_files:
            match = re.search(r'(\d{17})', file)
            if match:
                timestamp = match.group(1)
                file_timestamps[timestamp] = file
        
        print(f"   File timestamps found: {len(file_timestamps):,}")
        
        # Match by timestamp
        timestamp_matches = []
        mapping = {}
        
        for timestamp in csv_timestamps:
            if timestamp in file_timestamps:
                csv_idx = csv_timestamps[timestamp]
                actual_file = file_timestamps[timestamp]
                timestamp_matches.append(csv_idx)
                
                # Store mapping for later use
                original_name = df.iloc[csv_idx][img_col]
                mapping[original_name] = actual_file
                
                if len(timestamp_matches) <= 5:
                    print(f"   ✅ TIMESTAMP MATCH: {original_name} → {actual_file}")
        
        print(f"\n📊 TIMESTAMP MATCH RESULTS:")
        print(f"   Timestamp matches: {len(timestamp_matches):,}")
        print(f"   Match rate: {len(timestamp_matches)/len(df)*100:.1f}%")
        
        # Use timestamp matches
        if len(timestamp_matches) > 0:
            matches_to_use = timestamp_matches
            print(f"\n🎯 USING TIMESTAMP MATCHES")
            
            # Update img_name với actual filenames
            df_copy = df.copy()
            for idx in matches_to_use:
                original_name = df_copy.iloc[idx][img_col]
                if original_name in mapping:
                    df_copy.iloc[idx, df_copy.columns.get_loc(img_col)] = mapping[original_name]
            
            cleaned_df = df_copy.iloc[matches_to_use].copy()
        else:
            print(f"❌ NO MATCHES FOUND AT ALL")
            return None
    else:
        # Use exact matches
        matches_to_use = exact_matches
        cleaned_df = df.iloc[matches_to_use].copy()
        print(f"\n🎯 USING EXACT MATCHES")
    
    # Save cleaned dataset
    output_file = csv_file.replace('.csv', '_FINAL_CLEANED.csv')
    cleaned_df.to_csv(output_file, index=False)
    
    print(f"\n✅ CLEANING COMPLETED!")
    print(f"📋 Original: {len(df):,} rows")
    print(f"📋 Cleaned: {len(cleaned_df):,} rows")
    print(f"📈 Success rate: {len(cleaned_df)/len(df)*100:.1f}%")
    print(f"💾 Saved to: {output_file}")
    
    # Show final sample
    print(f"\n🔍 FINAL CLEANED SAMPLE:")
    for name in cleaned_df[img_col].head(5):
        print(f"   ✅ {name}")
    
    return cleaned_df

# RUN THE FIXED CLEANING
print("🚀 STARTING FIXED CLEANING PROCESS...")
result = simple_clean_csv('./age_gender_labels_only.csv', './data')

if result is not None and len(result) > 0:
    print(f"\n🎉 SUCCESS! Dataset ready with {len(result):,} samples!")
    print(f"📁 Use file: age_gender_labels_only_FINAL_CLEANED.csv")
    print(f"🚀 Ready for RGB training!")
else:
    print(f"\n❌ Cleaning failed - need manual investigation")

print(f"\n📋 NEXT: Use the FINAL_CLEANED.csv file for training!")


🔧 FIXED CLEANING SOLUTION
🚀 STARTING FIXED CLEANING PROCESS...
📖 Loading: ./age_gender_labels_only.csv
✅ CSV loaded: 23,705 rows
   Columns: ['age', 'ethnicity', 'gender', 'img_name']

📁 Scanning folder: ./data
✅ Total files found: 10,137

🔍 SAMPLE FILES IN FOLDER:
   1. 100_1_0_20170110183726390.jpg
   2. 100_1_2_20170105174847679.jpg
   3. 100_1_2_20170110182836729.jpg
   4. 101_1_2_20170105174739309.jpg
   5. 10_0_0_20161220222308131.jpg
   6. 10_0_0_20170103200329407.jpg
   7. 10_0_0_20170103200522151.jpg
   8. 10_0_0_20170103233459275.jpg
   9. 10_0_0_20170104013211746.jpg
   10. 10_0_0_20170110215927291.jpg

🔍 SAMPLE NAMES IN CSV:
   1. 20161219203650636.jpg.chip.jpg
   2. 20161219222752047.jpg.chip.jpg
   3. 20161219222832191.jpg.chip.jpg
   4. 20161220144911423.jpg.chip.jpg
   5. 20161220144914327.jpg.chip.jpg
   6. 20161220144957407.jpg.chip.jpg
   7. 20161220145040127.jpg.chip.jpg
   8. 20170109191125532.jpg.chip.jpg
   9. 20161219222749039.jpg.chip.jpg
   10. 201701091912099

In [52]:
# 🗑️ XÓA CÁC DÒNG KHÔNG CÓ ẢNH TƯƠNG ỨNG
print("🗑️ CLEANING CSV - XÓA DÒNG KHÔNG CÓ ẢNH")
print("=" * 45)

def clean_csv_remove_missing_images(csv_path, image_folder_path, output_path=None):
    """
    Xóa các dòng trong CSV mà không có ảnh tương ứng
    
    Args:
        csv_path: Đường dẫn file CSV
        image_folder_path: Đường dẫn folder chứa ảnh
        output_path: File output (tự động tạo nếu None)
    """
    
    print(f"📖 Đọc CSV: {csv_path}")
    print(f"📁 Kiểm tra ảnh trong: {image_folder_path}")
    print()
    
    # 1. Kiểm tra file tồn tại
    if not os.path.exists(csv_path):
        print(f"❌ File CSV không tồn tại: {csv_path}")
        return None
        
    if not os.path.exists(image_folder_path):
        print(f"❌ Folder ảnh không tồn tại: {image_folder_path}")
        return None
    
    # 2. Đọc CSV
    try:
        df = pd.read_csv(csv_path)
        print(f"✅ Loaded CSV: {len(df):,} dòng")
        print(f"   Columns: {list(df.columns)}")
    except Exception as e:
        print(f"❌ Lỗi đọc CSV: {e}")
        return None
    
    # 3. Lấy danh sách ảnh thực tế
    try:
        image_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
        actual_images = []
        
        for file in os.listdir(image_folder_path):
            if any(file.lower().endswith(ext) for ext in image_extensions):
                actual_images.append(file)
        
        actual_images_set = set(actual_images)
        print(f"✅ Tìm thấy: {len(actual_images):,} ảnh thực tế")
        
    except Exception as e:
        print(f"❌ Lỗi đọc folder ảnh: {e}")
        return None
    
    # 4. Tìm cột chứa tên ảnh
    img_column = None
    possible_columns = ['img_name', 'image_name', 'filename', 'file_name', 'image', 'img']
    
    for col in possible_columns:
        if col in df.columns:
            img_column = col
            break
    
    if img_column is None:
        print(f"❌ Không tìm thấy cột tên ảnh!")
        print(f"   Có sẵn: {list(df.columns)}")
        print(f"   Tìm kiếm: {possible_columns}")
        return None
    
    print(f"📋 Sử dụng cột: '{img_column}'")
    print()
    
    # 5. Lọc các dòng có ảnh tương ứng
    print("🔍 Kiểm tra từng dòng...")
    
    indices_to_keep = []
    indices_to_remove = []
    
    for idx, row in df.iterrows():
        img_name = str(row[img_column])
        
        if img_name in actual_images_set:
            indices_to_keep.append(idx)
        else:
            indices_to_remove.append(idx)
    
    # 6. Tạo DataFrame đã lọc
    cleaned_df = df.iloc[indices_to_keep].copy()
    
    # 7. Thống kê
    original_count = len(df)
    kept_count = len(cleaned_df)
    removed_count = len(indices_to_remove)
    
    print("📊 KẾT QUẢ:")
    print(f"   📋 Dòng gốc: {original_count:,}")
    print(f"   ✅ Giữ lại: {kept_count:,}")
    print(f"   🗑️  Đã xóa: {removed_count:,}")
    print(f"   📈 Tỷ lệ giữ: {kept_count/original_count*100:.1f}%")
    
    # 8. Lưu file đã clean
    if output_path is None:
        output_path = csv_path.replace('.csv', '_cleaned.csv')
    
    try:
        cleaned_df.to_csv(output_path, index=False)
        print(f"💾 Đã lưu file clean: {output_path}")
    except Exception as e:
        print(f"❌ Lỗi lưu file: {e}")
        return None
    
    # 9. Hiển thị thống kê chi tiết
    print()
    print("📈 PHÂN TÍCH CHI TIẾT:")
    print("-" * 25)
    
    if 'gender' in cleaned_df.columns:
        print("👥 Gender distribution:")
        gender_counts = cleaned_df['gender'].value_counts()
        for gender, count in gender_counts.items():
            gender_name = 'Male' if gender == 0 else 'Female' if gender == 1 else f'Gender_{gender}'
            print(f"   {gender_name}: {count:,} ({count/len(cleaned_df)*100:.1f}%)")
    
    if 'ethnicity' in cleaned_df.columns:
        print()
        print("🌍 Ethnicity distribution:")
        ethnicity_counts = cleaned_df['ethnicity'].value_counts()
        for eth, count in ethnicity_counts.items():
            print(f"   Ethnicity {eth}: {count:,} ({count/len(cleaned_df)*100:.1f}%)")
    
    if 'age' in cleaned_df.columns:
        print()
        print("📅 Age statistics:")
        print(f"   Min: {cleaned_df['age'].min()} years")
        print(f"   Max: {cleaned_df['age'].max()} years")
        print(f"   Mean: {cleaned_df['age'].mean():.1f} years")
        print(f"   Median: {cleaned_df['age'].median():.1f} years")
    
    print()
    print("🔍 Sample cleaned data:")
    print(cleaned_df.head())
    
    print()
    print("✅ HOÀN THÀNH!")
    print(f"📁 File gốc: {csv_path} ({original_count:,} dòng)")
    print(f"📁 File clean: {output_path} ({kept_count:,} dòng)")
    print(f"🎯 Sẵn sàng train với {kept_count:,} ảnh có màu!")
    
    return cleaned_df

# AUTO-DETECT VÀ THỰC HIỆN CLEANING
print("🚀 TỰ ĐỘNG TÌM VÀ CLEAN DATA...")
print()

# Tìm file CSV
csv_files = []
for root, dirs, files in os.walk('.'):
    for file in files:
        if file.endswith('.csv'):
            path = os.path.join(root, file).replace('\\', '/')
            csv_files.append(path)

# Tìm folder ảnh  
image_folders = []
for root, dirs, files in os.walk('.'):
    image_count = sum(1 for f in files if f.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp', '.tiff')))
    if image_count > 0:
        path = root.replace('\\', '/')
        image_folders.append((path, image_count))

print("📄 CSV files tìm thấy:")
for i, path in enumerate(csv_files, 1):
    try:
        size = os.path.getsize(path) / 1024 / 1024
        print(f"   {i}. {path} ({size:.1f}MB)")
    except:
        print(f"   {i}. {path}")

print()
print("📁 Image folders tìm thấy:")
for i, (path, count) in enumerate(image_folders, 1):
    print(f"   {i}. {path}/ ({count:,} ảnh)")

# Thực hiện cleaning tự động nếu tìm thấy
if csv_files and image_folders:
    # Chọn file CSV lớn nhất (có thể là file chính)
    main_csv = max(csv_files, key=lambda x: os.path.getsize(x))
    
    # Chọn folder có nhiều ảnh nhất
    main_folder = max(image_folders, key=lambda x: x[1])[0]
    
    print()
    print("🎯 AUTO-SELECTED:")
    print(f"   📄 CSV: {main_csv}")
    print(f"   📁 Images: {main_folder}/")
    print()
    
    # Thực hiện cleaning
    result = clean_csv_remove_missing_images(main_csv, main_folder)
    
else:
    print()
    print("❌ Không tìm thấy đủ files để auto-process")
    print("💡 Sử dụng manual:")
    print("   clean_csv_remove_missing_images('path/to/labels.csv', 'path/to/images/')")

print()
print("📋 NEXT STEPS:")
print("1️⃣ Kiểm tra file *_cleaned.csv")
print("2️⃣ Verify cùng số lượng với folder ảnh")
print("3️⃣ Ready để train RGB model! 🎨")


🗑️ CLEANING CSV - XÓA DÒNG KHÔNG CÓ ẢNH
🚀 TỰ ĐỘNG TÌM VÀ CLEAN DATA...

📄 CSV files tìm thấy:
   1. ./age_gender_labels_only.csv (0.9MB)
   2. ./age_gender_labels_only_cleaned.csv (0.0MB)
   3. ./age_gender_labels_only_FINAL_CLEANED.csv (0.3MB)

📁 Image folders tìm thấy:
   1. ./data/ (10,137 ảnh)

🎯 AUTO-SELECTED:
   📄 CSV: ./age_gender_labels_only.csv
   📁 Images: ./data/

📖 Đọc CSV: ./age_gender_labels_only.csv
📁 Kiểm tra ảnh trong: ./data

✅ Loaded CSV: 23,705 dòng
   Columns: ['age', 'ethnicity', 'gender', 'img_name']
✅ Tìm thấy: 10,137 ảnh thực tế
📋 Sử dụng cột: 'img_name'

🔍 Kiểm tra từng dòng...
📊 KẾT QUẢ:
   📋 Dòng gốc: 23,705
   ✅ Giữ lại: 0
   🗑️  Đã xóa: 23,705
   📈 Tỷ lệ giữ: 0.0%
💾 Đã lưu file clean: ./age_gender_labels_only_cleaned.csv

📈 PHÂN TÍCH CHI TIẾT:
-------------------------
👥 Gender distribution:

🌍 Ethnicity distribution:

📅 Age statistics:
   Min: nan years
   Max: nan years
   Mean: nan years
   Median: nan years

🔍 Sample cleaned data:
Empty DataFrame
Colum