In [1]:
# %pip installs into the environment of the running kernel (a bare `!pip` may
# resolve to a different interpreter). The version is pinned so a fresh run
# installs the same kagglehub release.
%pip install kagglehub==0.3.13

Collecting kagglehub
  Downloading kagglehub-0.3.13-py3-none-any.whl.metadata (38 kB)
Collecting pyyaml (from kagglehub)
  Downloading pyyaml-6.0.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (2.4 kB)
Collecting requests (from kagglehub)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting tqdm (from kagglehub)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting charset_normalizer<4,>=2 (from requests->kagglehub)
  Downloading charset_normalizer-3.4.4-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (37 kB)
Collecting idna<4,>=2.5 (from requests->kagglehub)
  Downloading idna-3.11-py3-none-any.whl.metadata (8.4 kB)
Collecting urllib3<3,>=1.21.1 (from requests->kagglehub)
  Downloading urllib3-2.6.2-py3-none-any.whl.metadata (6.6 kB)
Downloading kagglehub-0.3.13-py3-none-any.whl (68 kB)
Downloading pyyaml-6.0.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_

In [3]:
import kagglehub

# Download a pinned dataset version (6) rather than "latest", so that re-running
# the notebook fetches the same files and the same ".../versions/6" directory.
path = kagglehub.dataset_download("briscdataset/brisc2025/versions/6")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/briscdataset/brisc2025?dataset_version_number=6...


100%|██████████| 250M/250M [00:19<00:00, 13.6MB/s] 

Extracting files...





Path to dataset files: /home/whsung8451/.cache/kagglehub/datasets/briscdataset/brisc2025/versions/6


In [4]:
import os
import shutil
import random
from pathlib import Path

# Locate the classification data inside the directory that kagglehub returned
# from the download cell (`path`). Building on `path` instead of spelling out
# "~/.cache/kagglehub/..." keeps this cell valid when the cache lives elsewhere
# or a different dataset version has been downloaded.
base_path = Path(path) / "brisc2025" / "classification_task"
train_dir = base_path / "train"
val_dir = base_path / "val"

# Fraction of each class's training images that is moved to the validation set
# (0.2 -> 20% validation data).
val_ratio = 0.2

# Seed Python's global RNG, which the split function uses for shuffling.
random.seed(42)

def create_validation_split(train_root=None, val_root=None, ratio=None, seed=None):
    """Move a random fraction of every class's training images into a validation folder.

    For each class sub-directory of the training directory, a sub-directory of
    the same name is created under the validation directory and
    ``int(n_files * ratio)`` files are *moved* (not copied) into it, so the
    training directory shrinks accordingly.

    The function is a no-op (apart from a message) when the training directory
    is missing or when the validation directory already contains anything,
    which prevents splitting the same data twice.

    Args:
        train_root: Directory holding one sub-directory per class. Defaults to
            the notebook-level ``train_dir``.
        val_root: Destination directory. Defaults to the notebook-level
            ``val_dir``.
        ratio: Fraction in [0, 1] of each class to move. Defaults to the
            notebook-level ``val_ratio``.
        seed: Optional seed for a private RNG. When omitted, the global
            ``random`` module state is used (seed it beforehand to get a
            reproducible split).

    Returns:
        None. Progress is reported through ``print``.

    Raises:
        ValueError: If ``ratio`` is outside [0, 1].
    """
    train_root = Path(train_dir if train_root is None else train_root)
    val_root = Path(val_dir if val_root is None else val_root)
    ratio = val_ratio if ratio is None else ratio
    if not 0.0 <= ratio <= 1.0:
        raise ValueError(f"ratio must be between 0 and 1, got {ratio}")

    # A private generator leaves the global RNG untouched when a seed is given.
    rng = random if seed is None else random.Random(seed)

    if not train_root.exists():
        print(f"Error: Train directory not found at {train_root}")
        return

    # Check if val directory already exists and is not empty to prevent double splitting
    if val_root.exists() and any(val_root.iterdir()):
        print(f"Validation directory {val_root} already exists and is not empty. Skipping split.")
        return

    # Create val directory
    if not val_root.exists():
        val_root.mkdir(parents=True)
        print(f"Created validation directory: {val_root}")

    # iterdir() yields entries in filesystem order, which differs between
    # machines; sorting first makes the seeded shuffle below reproducible.
    class_folders = sorted(d for d in train_root.iterdir() if d.is_dir())

    for class_folder in class_folders:
        class_name = class_folder.name

        # Create corresponding class folder in val
        val_class_dir = val_root / class_name
        val_class_dir.mkdir(parents=True, exist_ok=True)

        # Sorted for the same reason as the class folders, then shuffled so the
        # selection does not depend on file naming/ordering.
        files = sorted(f for f in class_folder.iterdir() if f.is_file())
        rng.shuffle(files)

        # Truncating division: small classes may contribute zero files.
        num_val = int(len(files) * ratio)
        val_files = files[:num_val]

        print(f"Processing class '{class_name}': Moving {num_val} of {len(files)} images to validation set.")

        # Move files from train to val
        for file in val_files:
            shutil.move(str(file), str(val_class_dir / file.name))

    print("Validation split completed successfully.")

# Run the split. The function returns early without moving anything when the
# validation directory already exists and is non-empty, so re-running this cell
# does not split the data a second time.
create_validation_split()

Created validation directory: /home/whsung8451/.cache/kagglehub/datasets/briscdataset/brisc2025/versions/6/brisc2025/classification_task/val
Processing class 'no_tumor': Moving 213 of 1067 images to validation set.
Processing class 'meningioma': Moving 265 of 1329 images to validation set.
Processing class 'glioma': Moving 229 of 1147 images to validation set.
Processing class 'pituitary': Moving 291 of 1457 images to validation set.
Validation split completed successfully.


In [5]:
import os

# Report how many image files each class folder of each split contains.
# The dataset is expected at './classification_task' relative to the working directory.
dataset_path = './classification_task'
splits = ['train', 'val', 'test']
image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.webp')

print(f"Counting images in {dataset_path}...")

for split in splits:
    split_path = os.path.join(dataset_path, split)

    # Guard clause: report a missing split and move on to the next one.
    if not os.path.exists(split_path):
        print(f"Directory not found: {split_path}")
        continue

    print(f"\n--- {split.upper()} SET ---")
    total_images = 0

    for folder, _subdirs, filenames in os.walk(split_path):
        # Files lying directly in the split folder belong to no class; only
        # sub-folders (split/class/image layout) are counted.
        if folder == split_path:
            continue

        n_images = sum(1 for name in filenames if name.lower().endswith(image_extensions))
        if n_images > 0:
            print(f"  {os.path.basename(folder)}: {n_images}")
            total_images += n_images

    print(f"Total {split} images: {total_images}")

Counting images in ./classification_task...

--- TRAIN SET ---
  no_tumor: 854
  meningioma: 1064
  glioma: 918
  pituitary: 1166
Total train images: 4002

--- VAL SET ---
  no_tumor: 213
  meningioma: 265
  glioma: 229
  pituitary: 291
Total val images: 998

--- TEST SET ---
  no_tumor: 140
  meningioma: 306
  glioma: 254
  pituitary: 300
Total test images: 1000


In [None]:
import torch
from torchvision import transforms
from PIL import Image
import matplotlib.pyplot as plt
import os
import random
import numpy as np

# Dataset root (relative to the working directory) and its test split, from
# which the sample image for the augmentation preview is drawn.
base_path = "./classification_task"
test_dir = os.path.join(base_path, "test")

# Helper class for Gaussian Noise
class AddGaussianNoise(object):
    """Callable transform that adds Gaussian noise to a tensor.

    Calling an instance returns ``tensor + noise * std + mean`` where ``noise``
    is standard-normal with the same shape as ``tensor``. The input tensor is
    not modified and the result is not clamped.

    Args:
        mean: Constant offset added to every element.
        std: Scale applied to the standard-normal noise.
    """

    def __init__(self, mean=0., std=1.):
        self.std = std
        self.mean = mean

    def __call__(self, tensor):
        """Return a noisy copy of ``tensor``."""
        # Sample on the input's device (and in its dtype when it is floating
        # point) so the addition neither fails on a device mismatch nor changes
        # the dtype. Non-floating inputs fall back to the default float dtype.
        if tensor.is_floating_point():
            noise_dtype = tensor.dtype
        else:
            noise_dtype = torch.get_default_dtype()
        noise = torch.randn(tensor.shape, dtype=noise_dtype, device=tensor.device)
        return tensor + noise * self.std + self.mean

    def __repr__(self):
        return f"{self.__class__.__name__}(mean={self.mean}, std={self.std})"

# Function to get a random image
def get_random_image(root_dir):
    """Pick a random image from a random class folder under ``root_dir``.

    A class is chosen uniformly among the sub-directories of ``root_dir``, then
    a file is chosen uniformly among the image files in that class folder.
    Both choices use the global ``random`` module, and the candidates are
    sorted first so that a seeded RNG gives the same pick on every machine.

    Args:
        root_dir: Directory laid out as ``root_dir/<class>/<image>``.

    Returns:
        A ``(image, class_name)`` tuple, where ``image`` is a PIL image
        converted to RGB and ``class_name`` is the chosen folder's name.

    Raises:
        ValueError: If ``root_dir`` has no sub-directories, or the chosen class
            folder contains no image files.
        OSError: If a directory cannot be listed or the image cannot be opened.
    """
    # Extensions treated as images (case-insensitive).
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.webp')

    classes = sorted(
        d for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))
    )
    if not classes:
        raise ValueError(f"No classes found in {root_dir}")

    random_class = random.choice(classes)
    class_path = os.path.join(root_dir, random_class)

    # Require a regular file so a directory with an image-like name is never picked.
    images = sorted(
        f for f in os.listdir(class_path)
        if f.lower().endswith(image_extensions)
        and os.path.isfile(os.path.join(class_path, f))
    )
    if not images:
        raise ValueError(f"No images found in {class_path}")

    image_path = os.path.join(class_path, random.choice(images))
    # convert() returns an independent, fully loaded image, so the underlying
    # file can be closed as soon as the with-block exits.
    with Image.open(image_path) as img:
        return img.convert('RGB'), random_class

# Get a sample image. Only the failures expected from a wrong or empty dataset
# path are handled: ValueError (raised by get_random_image when no classes or
# images are found) and OSError (missing directory, unreadable image file).
# Anything else is a programming error and is allowed to surface instead of
# being masked by a dummy image.
try:
    original_img, label = get_random_image(test_dir)
    print(f"Selected image from class: {label}")
except (OSError, ValueError) as e:
    print(f"Error loading image: {e}")
    # Fallback to creating a dummy image if path is wrong
    original_img = Image.fromarray(np.uint8(np.random.rand(224, 224, 3) * 255))
    # Keep `label` defined on this path as well.
    label = "dummy"
    print("Using dummy image.")

# Base transforms
to_tensor = transforms.ToTensor()
to_pil = transforms.ToPILImage()
resize = transforms.Resize((224, 224))

# Resize first, then convert to a tensor; every variant below is derived from
# this one tensor so the panels are directly comparable.
img_tensor = to_tensor(resize(original_img))

# Panel title -> image tensor, in display order.
transformations = {
    'Original': img_tensor,
}

# Gaussian Noise: additive noise of increasing strength, clamped back to [0, 1]
# so that the result can be converted to an image.
for sigma in [0.1, 0.5, 1.0]:
    noisy_tensor = AddGaussianNoise(0., sigma)(img_tensor.clone())
    noisy_tensor = torch.clamp(noisy_tensor, 0, 1)
    transformations[f'Noise (σ={sigma})'] = noisy_tensor

# Blur: with its default sigma=(0.1, 2.0), GaussianBlur draws a random sigma on
# every call, which would make the panels depend on chance rather than on the
# kernel size. Fix sigma per kernel size instead, using the conventional
# kernel-size-derived value (the same formula torchvision's functional
# gaussian_blur applies when sigma is None).
for k in [3, 5, 7]:
    blur_sigma = 0.3 * ((k - 1) * 0.5 - 1) + 0.8
    blur_transform = transforms.GaussianBlur(kernel_size=k, sigma=blur_sigma)
    blurred_tensor = blur_transform(img_tensor.clone())
    transformations[f'Blur (k={k})'] = blurred_tensor

# Contrast: a (f, f) range pins the jitter factor to exactly f.
for f in [0.5, 0.8]:
    contrast_transform = transforms.ColorJitter(contrast=(f, f))
    contrast_tensor = contrast_transform(img_tensor.clone())
    transformations[f'Contrast (f={f})'] = contrast_tensor

# Plotting: lay the panels out on a grid with `cols` columns (ceil division for rows).
num_imgs = len(transformations)
cols = 4
rows = (num_imgs + cols - 1) // cols

# Explicit Figure/Axes objects instead of the implicit pyplot "current axes".
fig = plt.figure(figsize=(15, 4 * rows))
for i, (name, tensor) in enumerate(transformations.items()):
    ax = fig.add_subplot(rows, cols, i + 1)
    ax.imshow(to_pil(tensor))
    ax.set_title(name)
    ax.axis('off')

fig.tight_layout()
plt.show()