# **1. File Integrity**
•	  No corrupted or unreadable image files.


•	  All images open correctly and are in supported formats (.jpg, .png, etc.).


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Standard-library and third-party imports first, then the Colab-only helper.
import os

from PIL import Image
from google.colab import drive

# Make Google Drive available under /content/drive.
drive.mount('/content/drive')

def check_image_status(directory):
    """Print, entry by entry, whether Pillow can verify each file in a directory.

    Every regular file directly inside ``directory`` is opened with
    ``Image.open`` and passed to ``verify()``; one status line is printed per
    directory entry. Sub-directories are reported and skipped, so the scan is
    not recursive.

    Args:
        directory (str): Path of the folder to scan.

    Returns:
        None. Results are only printed. When ``directory`` is not an existing
        directory an error line is printed and nothing is scanned.
    """
    if not os.path.isdir(directory):
        print(f"Error: Directory not found at {directory}")
        return

    print(f"Checking images in directory: {directory}")
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        if not os.path.isfile(filepath):
            print(f"{filename}: IS NOT a file (skipping)")
            continue

        try:
            # The context manager releases the file handle even when verify()
            # raises, so scanning thousands of files does not pile up open handles.
            with Image.open(filepath) as img:
                # NOTE(review): per the Pillow docs verify() does not decode the
                # pixel data, so some truncated files may still pass - confirm
                # whether a full load() is wanted here.
                img.verify()
            print(f"{filename}: NOT corrupted")
        except (IOError, SyntaxError) as e:
            print(f"{filename}: IS corrupted - {e}")
        except Exception as e:
            # Anything else is reported but must not abort the scan of the remaining files.
            print(f"{filename}: Could not process - {e}")



In [None]:
# Root folder of the dataset on the mounted Drive.
# NOTE(review): "(15,15,70)" presumably encodes the test/valid/train split percentages - confirm.
address = '/content/drive/My Drive/PROJECT1/data/data_set(15,15,70)'

# Split folders looked up directly under `address`.
class_es = ['test', 'train', 'valid']

# Label folders looked up inside every split folder.
sub_class_es = ['Immature', 'Mature', 'Normal']

In [None]:
# Run the integrity check on every <split>/<label> folder of the dataset.
for cls in class_es:
    for sub_cls in sub_class_es:
        target_dir = f"{address}/{cls}/{sub_cls}"
        print(target_dir)
        check_image_status(target_dir)

Streaming output truncated to the last 5000 lines.
Immature (492).jpg: NOT corrupted
Immature (493).jpg: NOT corrupted
Immature (494).jpg: NOT corrupted
Immature (495).jpg: NOT corrupted
Immature (496).jpg: NOT corrupted
Immature (497).jpg: NOT corrupted
Immature (498).jpg: NOT corrupted
Immature (499).jpg: NOT corrupted
Immature (500).jpg: NOT corrupted
Immature (501).jpg: NOT corrupted
Immature (502).jpg: NOT corrupted
Immature (503).jpg: NOT corrupted
Immature (504).jpg: NOT corrupted
Immature (505).jpg: NOT corrupted
Immature (506).jpg: NOT corrupted
Immature (507).jpg: NOT corrupted
Immature (508).jpg: NOT corrupted
Immature (509).jpg: NOT corrupted
Immature (510).jpg: NOT corrupted
Immature (511).jpg: NOT corrupted
Immature (512).jpg: NOT corrupted
Immature (513).jpg: NOT corrupted
Immature (514).jpg: NOT corrupted
Immature (515).jpg: NOT corrupted
Immature (516).jpg: NOT corrupted
Immature (517).jpg: NOT corrupted
Immature (518).jpg: NOT corrupted
Immature (519).jp

# **2. Consistent Structure**

•	  Folder names match class labels exactly, including capitalization (Immature, Mature, Normal).

•	  No extra or misplaced files (e.g., .txt, .DS_Store, etc.).

•	  Each folder contains only images of that class.


In [None]:
import os

def check_folder_names(base_directory, expected_folders):
    """
    Checks if the folders in a base directory match the expected list of folder names.

    The comparison is exact (case-sensitive) and only considers the immediate
    sub-directories of ``base_directory``; plain files are ignored. Missing and
    unexpected folders are listed in alphabetical order so the report reads the
    same on every run.

    Args:
        base_directory (str): The path to the base directory.
        expected_folders (list): A list of expected folder names.

    Returns:
        None. The findings are printed. When ``base_directory`` is not an
        existing directory an error line is printed and nothing is compared.
    """
    print(f"Checking folder names in base directory: {base_directory}")

    if not os.path.isdir(base_directory):
        print(f"Error: Base directory not found at {base_directory}")
        return

    found_folders = {
        item
        for item in os.listdir(base_directory)
        if os.path.isdir(os.path.join(base_directory, item))
    }
    expected_folders_set = set(expected_folders)

    # Iteration order of a set of strings can differ between interpreter runs
    # (hash randomization), so sort before printing to keep the report stable.
    missing_folders = sorted(expected_folders_set - found_folders)
    if missing_folders:
        print("Warning: Expected folders not found:")
        for folder in missing_folders:
            print(f"- {folder}")

    unexpected_folders = sorted(found_folders - expected_folders_set)
    if unexpected_folders:
        print("Warning: Unexpected folders found:")
        for folder in unexpected_folders:
            print(f"- {folder}")

    if not missing_folders and not unexpected_folders:
        print("All expected folders found and no unexpected folders.")


def check_image_files_only(directory):
    """
    Reports every file in a directory whose extension is not a known image extension.

    Only the file name is inspected (``.png``, ``.jpg``, ``.jpeg``,
    case-insensitive); file contents are never opened. Entries that are not
    regular files are mentioned and skipped.

    Args:
        directory (str): The path to the directory to check.

    Returns:
        None. The findings are printed.
    """
    print(f"Checking files in directory: {directory}")

    if not os.path.isdir(directory):
        print(f"Error: Directory not found at {directory}")
        return

    image_suffixes = ('.png', '.jpg', '.jpeg')
    only_images = True
    for entry_name in os.listdir(directory):
        if not os.path.isfile(os.path.join(directory, entry_name)):
            print(f"Info: Non-file item found (skipping): {entry_name}")
            continue
        if entry_name.lower().endswith(image_suffixes):
            continue
        # A regular file without an image extension.
        print(f"Warning: Non-image file found: {entry_name}")
        only_images = False

    if only_images:
        print("All files appear to be images.")

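# Example usage (using variables from your notebook):
# address='/content/drive/My Drive/PROJECT1/data/data_set(15,15,70)'
# class_es=['test','train','valid']
# sub_class_es=['Immature','Mature','Normal']

# Check the main class folders:
# check_folder_names(address, class_es)

# Check the sub-class folders within each class folder:
# for cls in class_es:
#     check_folder_names(os.path.join(address, cls), sub_class_es)

# Check the files within each sub-class folder:
# for cls in class_es:
#     for sub_cls in sub_class_es:
#         check_image_files_only(os.path.join(address, cls, sub_cls))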

In [None]:
# The dataset root must contain exactly the split folders.
check_folder_names(base_directory=address, expected_folders=class_es)

Checking folder names in base directory: /content/drive/My Drive/PROJECT1/data/data_set(15,15,70)
All expected folders found and no unexpected folders.


In [None]:
# Every split folder must contain exactly the label folders.
for cls in class_es:
    split_dir = os.path.join(address, cls)
    check_folder_names(split_dir, sub_class_es)

Checking folder names in base directory: /content/drive/My Drive/PROJECT1/data/data_set(15,15,70)/test
All expected folders found and no unexpected folders.
Checking folder names in base directory: /content/drive/My Drive/PROJECT1/data/data_set(15,15,70)/train
All expected folders found and no unexpected folders.
Checking folder names in base directory: /content/drive/My Drive/PROJECT1/data/data_set(15,15,70)/valid
All expected folders found and no unexpected folders.


In [None]:
# Flag any file without an image extension inside each <split>/<label> folder.
for cls in class_es:
    for sub_cls in sub_class_es:
        label_dir = os.path.join(address, cls, sub_cls)
        check_image_files_only(label_dir)

Checking files in directory: /content/drive/My Drive/PROJECT1/data/data_set(15,15,70)/test/Immature
All files appear to be images.
Checking files in directory: /content/drive/My Drive/PROJECT1/data/data_set(15,15,70)/test/Mature
All files appear to be images.
Checking files in directory: /content/drive/My Drive/PROJECT1/data/data_set(15,15,70)/test/Normal
All files appear to be images.
Checking files in directory: /content/drive/My Drive/PROJECT1/data/data_set(15,15,70)/train/Immature
All files appear to be images.
Checking files in directory: /content/drive/My Drive/PROJECT1/data/data_set(15,15,70)/train/Mature
All files appear to be images.
Checking files in directory: /content/drive/My Drive/PROJECT1/data/data_set(15,15,70)/train/Normal
All files appear to be images.
Checking files in directory: /content/drive/My Drive/PROJECT1/data/data_set(15,15,70)/valid/Immature
All files appear to be images.
Checking files in directory: /content/drive/My Drive/PROJECT1/data/data_set(15,15,70)/v

In [None]:
import os

def check_filename_label_accuracy(base_directory, class_es, sub_class_es):
    """Check that every file name carries the label of the folder it sits in.

    For each ``<base_directory>/<class>/<sub-class>`` folder, every regular
    file is reported as either containing or not containing the sub-class name.
    The match is case-insensitive and requires the label to appear as a whole
    word (no letter directly before or after it), so ``Immature (1).jpg`` is
    NOT accepted inside a ``Mature`` folder even though "mature" is a
    substring of "immature".

    Args:
        base_directory (str): Root folder that holds the class folders.
        class_es (list): Names of the class folders to visit under the root.
        sub_class_es (list): Names of the sub-class folders expected in each
            class folder; these are the labels searched for in file names.

    Returns:
        None. The findings are printed. Missing class or sub-class folders are
        reported with a warning and skipped.
    """
    import re  # function-scope import: the cell defining this function only imports `os`

    print(f"Checking filename label accuracy in base directory: {base_directory}")
    if not os.path.isdir(base_directory):
        print(f"Error: Base directory not found at {base_directory}")
        return
    for cls in class_es:
        class_folder_path = os.path.join(base_directory, cls)
        if not os.path.isdir(class_folder_path):
            print(f"Warning: Class folder not found (skipping): {class_folder_path}")
            continue
        for sub_cls in sub_class_es:
            sub_class_folder_path = os.path.join(class_folder_path, sub_cls)
            print(f"\nChecking folder: {sub_class_folder_path}")
            if not os.path.isdir(sub_class_folder_path):
                print(f"Warning: Sub-class folder not found (skipping): {sub_class_folder_path}")
                continue
            # A letter on either side would mean the label is only part of a
            # longer word (e.g. "mature" inside "Immature"), which must not count.
            label_pattern = re.compile(
                rf"(?<![A-Za-z]){re.escape(sub_cls)}(?![A-Za-z])", re.IGNORECASE
            )
            for filename in os.listdir(sub_class_folder_path):
                filepath = os.path.join(sub_class_folder_path, filename)
                if os.path.isfile(filepath):
                    if label_pattern.search(filename) is None:
                        print(f"Warning: Filename '{filename}' in folder '{sub_cls}' does not contain the folder name.")
                    else:
                        print(f"Filename '{filename}' in folder '{sub_cls}' contains the folder name.")
                else:
                    print(f"Info: Non-file item found (skipping): {filename}")
# Example usage (using variables from your notebook):
# address='/content/drive/My Drive/PROJECT1/data/data_set(15,15,70)'
# class_es=['test','train','valid']
# sub_class_es=['Immature','Mature','Normal']
# check_filename_label_accuracy(address, class_es, sub_class_es)

In [None]:
 check_filename_label_accuracy(address, class_es, sub_class_es)

Streaming output truncated to the last 5000 lines.
Filename 'Immature (492).jpg' in folder 'Immature' contains the folder name.
Filename 'Immature (493).jpg' in folder 'Immature' contains the folder name.
Filename 'Immature (494).jpg' in folder 'Immature' contains the folder name.
Filename 'Immature (495).jpg' in folder 'Immature' contains the folder name.
Filename 'Immature (496).jpg' in folder 'Immature' contains the folder name.
Filename 'Immature (497).jpg' in folder 'Immature' contains the folder name.
Filename 'Immature (498).jpg' in folder 'Immature' contains the folder name.
Filename 'Immature (499).jpg' in folder 'Immature' contains the folder name.
Filename 'Immature (500).jpg' in folder 'Immature' contains the folder name.
Filename 'Immature (501).jpg' in folder 'Immature' contains the folder name.
Filename 'Immature (502).jpg' in folder 'Immature' contains the folder name.
Filename 'Immature (503).jpg' in folder 'Immature' contains the folder name.
Filename 'I

In [None]:


def check_image_size_consistency(directory, expected_size):
    """Report every image in a directory whose size differs from ``expected_size``.

    Each regular file directly inside ``directory`` is opened with
    ``Image.open`` and its ``size`` attribute is compared with
    ``expected_size``. Files that cannot be opened are reported as errors and
    also suppress the final "all images ..." success line.

    Args:
        directory (str): Path of the folder to scan (not recursive).
        expected_size (tuple): Size to compare against ``img.size``, e.g.
            ``(224, 224)``.

    Returns:
        None. The findings are printed. When ``directory`` is not an existing
        directory an error line is printed and nothing is scanned.
    """
    print(f"Checking image sizes in directory: {directory}")

    if not os.path.isdir(directory):
        print(f"Error: Directory not found at {directory}")
        return

    size_mismatch_found = False
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        if not os.path.isfile(filepath):
            print(f"Info: Non-file item found (skipping): {filename}")
            continue

        try:
            # Reading `size` leaves the file open; the context manager closes
            # it so a large folder does not accumulate open file handles.
            with Image.open(filepath) as img:
                actual_size = img.size
        except IOError:
            print(f"Error: Could not open or process image file: {filename}")
            size_mismatch_found = True  # Consider inability to open as a mismatch for the success message
            continue

        if actual_size != expected_size:
            print(f"Warning: Image '{filename}' has size {actual_size}, expected {expected_size}.")
            size_mismatch_found = True

    if not size_mismatch_found:
        print(f"All images in directory '{directory}' are of expected size {expected_size}.")


# Checking for pixel value normalization (e.g., divided by 255) is more complex
# and typically done during the data loading and preprocessing steps for model training.
# It's not something easily checked by just looking at the image files themselves
# without knowing the original pixel range.


In [None]:
# Size every image is expected to have; it is compared against Pillow's
# `img.size`, which Pillow documents as (width, height).
expected_image_size = (224, 224)

In [None]:
# Verify the image size in every <split>/<label> folder.
for cls in class_es:
    for sub_cls in sub_class_es:
        label_dir = os.path.join(address, cls, sub_cls)
        check_image_size_consistency(label_dir, expected_image_size)

Checking image sizes in directory: /content/drive/My Drive/PROJECT1/data/data_set(15,15,70)/test/Immature
All images in directory '/content/drive/My Drive/PROJECT1/data/data_set(15,15,70)/test/Immature' are of expected size (224, 224).
Checking image sizes in directory: /content/drive/My Drive/PROJECT1/data/data_set(15,15,70)/test/Mature
All images in directory '/content/drive/My Drive/PROJECT1/data/data_set(15,15,70)/test/Mature' are of expected size (224, 224).
Checking image sizes in directory: /content/drive/My Drive/PROJECT1/data/data_set(15,15,70)/test/Normal
All images in directory '/content/drive/My Drive/PROJECT1/data/data_set(15,15,70)/test/Normal' are of expected size (224, 224).
Checking image sizes in directory: /content/drive/My Drive/PROJECT1/data/data_set(15,15,70)/train/Immature
All images in directory '/content/drive/My Drive/PROJECT1/data/data_set(15,15,70)/train/Immature' are of expected size (224, 224).
Checking image sizes in directory: /content/drive/My Drive/PRO

In [None]:
import os
import pandas as pd

def check_class_distribution(base_directory, class_es, sub_class_es):
    """Count the images in every <class>/<sub-class> folder and display them as a table.

    A file counts as an image when it is a regular file whose name ends in
    ``.png``, ``.jpg`` or ``.jpeg`` (case-insensitive). The resulting table has
    one column per class folder that exists, one row per sub-class plus a
    ``Total`` row, and a ``Sub-class Total`` column summing each row across the
    classes (so the ``Total`` row of that column holds the grand total).

    Args:
        base_directory (str): Root folder that holds the class folders.
        class_es (list): Names of the class folders to count (table columns).
        sub_class_es (list): Names of the sub-class folders to count (table rows).

    Returns:
        None. The table is shown with ``display`` and the grand total is
        printed. Missing class folders are skipped with a warning; missing
        sub-class folders are counted as 0 with a warning.
    """
    print(f"Checking class distribution in base directory: {base_directory}")
    if not os.path.isdir(base_directory):
        print(f"Error: Base directory not found at {base_directory}")
        return

    image_extensions = ('.png', '.jpg', '.jpeg')
    distribution_data = {}
    total_images = 0
    for cls in class_es:
        class_folder_path = os.path.join(base_directory, cls)
        if not os.path.isdir(class_folder_path):
            print(f"Warning: Class folder not found (skipping): {class_folder_path}")
            continue
        distribution_data[cls] = {}
        class_total = 0
        for sub_cls in sub_class_es:
            sub_class_folder_path = os.path.join(class_folder_path, sub_cls)
            if not os.path.isdir(sub_class_folder_path):
                print(f"Warning: Sub-class folder not found (skipping): {sub_class_folder_path}")
                distribution_data[cls][sub_cls] = 0
                continue
            image_count = sum(
                1
                for filename in os.listdir(sub_class_folder_path)
                if filename.lower().endswith(image_extensions)
                and os.path.isfile(os.path.join(sub_class_folder_path, filename))
            )
            distribution_data[cls][sub_cls] = image_count
            class_total += image_count
        distribution_data[cls]['Total'] = class_total
        total_images += class_total

    df_distribution = pd.DataFrame(distribution_data)
    # Sum across the class columns for EVERY row, including 'Total': summing
    # only the sub-class rows left the 'Total' cell as NaN (turning the column
    # into floats) and raised KeyError when no class folder was found at all.
    df_distribution['Sub-class Total'] = df_distribution.sum(axis=1)
    print("\nImage Distribution per Sub-class and Class:")
    display(df_distribution)  # `display` is provided by the IPython/Colab session
    print(f"\nTotal number of images found: {total_images}")

In [None]:
# Tabulate how many images each split holds per label.
check_class_distribution(
    base_directory=address,
    class_es=class_es,
    sub_class_es=sub_class_es,
)

Checking class distribution in base directory: /content/drive/My Drive/PROJECT1/data/data_set(15,15,70)

Image Distribution per Sub-class and Class:


Unnamed: 0,test,train,valid,Sub-class Total
Immature,281,1317,185,1783.0
Mature,320,1499,321,2140.0
Normal,324,1510,332,2166.0
Total,925,4326,838,



Total number of images found: 6089
