In [1]:
import os
from PIL import Image
from collections import defaultdict, Counter
import numpy as np

# Path to your main folder
main_folder = "train_data"

# Initialize a dictionary to store data info
data_info = defaultdict(list)

# Traverse the folder
for subfolder in os.listdir(main_folder):
    subfolder_path = os.path.join(main_folder, subfolder)
    if os.path.isdir(subfolder_path):  # Check if it is a folder
        for file_name in os.listdir(subfolder_path):
            if file_name.endswith(".png"):  # Check if it's a PNG image
                file_path = os.path.join(subfolder_path, file_name)
                try:
                    with Image.open(file_path) as img:
                        resolution = img.size  # (width, height)
                        data_info[subfolder].append(resolution)
                except Exception as e:
                    print(f"Error processing {file_path}: {e}")

# Function to calculate statistics
def calculate_statistics(resolutions):
    widths, heights = zip(*resolutions)
    return {
        "Total Images": len(resolutions),
        "Min Resolution": (min(widths), min(heights)),
        "Max Resolution": (max(widths), max(heights)),
        "Mean Resolution": (np.mean(widths), np.mean(heights)),
        "Most Common Resolutions": Counter(resolutions).most_common(3)
    }

# Summarize data
print("Data Summary:")
for class_name, resolutions in data_info.items():
    print(f"\nClass {class_name}:")
    stats = calculate_statistics(resolutions)
    print(f"  Total images: {stats['Total Images']}")
    print(f"  Min resolution: {stats['Min Resolution']}")
    print(f"  Max resolution: {stats['Max Resolution']}")
    print(f"  Mean resolution: {stats['Mean Resolution']}")
    print("  Most common resolutions:")
    for res, count in stats["Most Common Resolutions"]:
        print(f"    {res}: {count} images")

Data Summary:

Class bothcells:
  Total images: 3448
  Min resolution: (107, 107)
  Max resolution: (2000, 2000)
  Mean resolution: (345.6798143851508, 365.8532482598608)
  Most common resolutions:
    (2000, 2000): 4 images
    (330, 338): 3 images
    (329, 354): 3 images

Class healthy:
  Total images: 28895
  Min resolution: (107, 102)
  Max resolution: (2000, 2000)
  Mean resolution: (345.60647170790793, 361.3189133068005)
  Most common resolutions:
    (2000, 2000): 22 images
    (344, 345): 8 images
    (364, 388): 7 images

Class rubbish:
  Total images: 50371
  Min resolution: (102, 102)
  Max resolution: (2000, 2000)
  Mean resolution: (314.7550971789323, 312.5858529709555)
  Most common resolutions:
    (2000, 2000): 384 images
    (110, 243): 32 images
    (110, 275): 22 images

Class unhealthy:
  Total images: 2366
  Min resolution: (115, 102)
  Max resolution: (2000, 1310)
  Mean resolution: (317.7912087912088, 326.6128486897718)
  Most common resolutions:
    (216, 220):

In [7]:
import pandas 
# read csv file 
df = pandas.read_csv('isbi2025-ps3c-test-dataset.csv')


In [12]:
# addq new column to the dataframe
# df['label'] = 'rubbish'
# 0.50889

# df['label'] = 'healthy'
# 0.15586

df['label'] = 'bothcells'
df


Unnamed: 0,image_name,label
0,isbi2025_ps3c_test_image_04577.png,bothcells
1,isbi2025_ps3c_test_image_08558.png,bothcells
2,isbi2025_ps3c_test_image_02443.png,bothcells
3,isbi2025_ps3c_test_image_12392.png,bothcells
4,isbi2025_ps3c_test_image_13368.png,bothcells
...,...,...
18154,isbi2025_ps3c_test_image_13285.png,bothcells
18155,isbi2025_ps3c_test_image_01568.png,bothcells
18156,isbi2025_ps3c_test_image_14151.png,bothcells
18157,isbi2025_ps3c_test_image_15941.png,bothcells


In [13]:
# save the dataframe to a new csv file
df.to_csv('check.csv', index=False)

In [1]:
import os
from PIL import Image
from tqdm import tqdm 

# Paths for input and output directories
input_root = "train_data"
output_root = "resized_train_data"

# Target resolution
target_size = (224, 224)

# Create the output directory if it doesn't exist
os.makedirs(output_root, exist_ok=True)

# Traverse the input folder
for subfolder in os.listdir(input_root):
    input_subfolder_path = os.path.join(input_root, subfolder)
    # output_subfolder_path = os.path.join(output_root, subfolder)
    
    if subfolder in ['bothcells', 'healthy']: 
        continue

    if os.path.isdir(input_subfolder_path):  # Check if it's a folder
        os.makedirs(output_root, exist_ok=True)  # Create the subfolder in the output directory

        for file_name in tqdm(os.listdir(input_subfolder_path)):
            if file_name.endswith(".png"):  # Check if it's a PNG image
                input_file_path = os.path.join(input_subfolder_path, file_name)
                output_file_path = os.path.join(output_root, file_name)

                try:
                    # Open and resize the image
                    with Image.open(input_file_path) as img:
                        resized_img = img.resize(target_size, Image.ANTIALIAS)
                        # Save to the output path
                        resized_img.save(output_file_path)
                except Exception as e:
                    print(f"Error processing {input_file_path}: {e}")

print("Image resizing completed!")

  resized_img = img.resize(target_size, Image.ANTIALIAS)
  0%|          | 1/50371 [00:00<1:30:17,  9.30it/s]

  1%|          | 262/50371 [00:13<44:14, 18.88it/s]  


KeyboardInterrupt: 

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Đường dẫn tới tệp CSV
csv_file = "train.csv"
data = pd.read_csv(csv_file)
data.head()

Unnamed: 0,label,image_path
0,0,resized_train_data/isbi2025_ps3c_train_image_7...
1,1,resized_train_data/isbi2025_ps3c_train_image_8...
2,0,resized_train_data/isbi2025_ps3c_train_image_8...
3,2,resized_train_data/isbi2025_ps3c_train_image_0...
4,2,resized_train_data/isbi2025_ps3c_train_image_3...


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Đường dẫn tới tệp CSV
csv_file = "train.csv"

# Đọc dữ liệu từ CSV
data = pd.read_csv(csv_file)

# Kiểm tra cấu trúc dữ liệu
print(data.head())

# Chia dữ liệu thành tập train và validation theo kiểu phân tầng
train_data, val_data = train_test_split(
    data,
    test_size=0.2,  # Tỉ lệ validation (20%)
    random_state=42,
    stratify=data['label']  # Phân tầng dựa trên nhãn
)

# Kiểm tra số lượng mẫu sau khi chia
print(f"Train size: {len(train_data)}")
print(f"Validation size: {len(val_data)}")

# Lưu kết quả ra file CSV
train_data.to_csv("split/train.csv", index=False)
val_data.to_csv("split/val.csv", index=False)

print("Data split and saved successfully!")

   label                                         image_path
0      0  resized_train_data/isbi2025_ps3c_train_image_7...
1      1  resized_train_data/isbi2025_ps3c_train_image_8...
2      0  resized_train_data/isbi2025_ps3c_train_image_8...
3      2  resized_train_data/isbi2025_ps3c_train_image_0...
4      2  resized_train_data/isbi2025_ps3c_train_image_3...
Train size: 68064
Validation size: 17016
Data split and saved successfully!
