In [1]:
import os
from pathlib import Path

import pandas as pd
from PIL import Image

In [2]:
# Root directory that is searched recursively for the .jpg images.
# Set the LUNGHIST700_IMAGES_DIR environment variable to run the notebook on
# another machine; when it is unset the original hard-coded location is used.
folder_path = Path(
    os.environ.get(
        "LUNGHIST700_IMAGES_DIR",
        "/home/valentin/workspaces/histolung/data/raw/LungHist700/data/images",
    )
)

In [None]:
# Column order of the metadata table. Passing it explicitly to the DataFrame
# constructor keeps the schema stable even when no image could be collected.
metadata_columns = [
    "filename",
    "filepath",
    "cancer_type",
    "differentiation",
    "magnification",
    "width",
    "height",
]


def _parse_image_stem(stem):
    """Split an image file stem into (cancer_type, differentiation, magnification).

    Two underscore-separated layouts are recognised:

    * three fields whose first field is "nor": the second field is the
      magnification and there is no differentiation (returned as None);
    * four fields: cancer type, differentiation, magnification, image id.

    The magnification field is converted to an integer after removing every
    "x" character (e.g. "20x" -> 20).

    Returns None when the stem matches neither layout or when the
    magnification field is not an integer.
    """
    parts = stem.split("_")
    if len(parts) == 3 and parts[0] == "nor":
        cancer_type, differentiation, magnification = parts[0], None, parts[1]
    elif len(parts) == 4:
        cancer_type, differentiation, magnification, _image_id = parts
    else:
        return None

    try:
        return cancer_type, differentiation, int(magnification.replace("x", ""))
    except ValueError:
        # A malformed magnification is just another unexpected file name;
        # report it to the caller instead of aborting the whole scan.
        return None


# One record (dict) per successfully parsed and readable image
file_info = []

# Walk all .jpg files below folder_path; sorting makes the row order of the
# resulting table independent of the filesystem enumeration order.
for file in sorted(folder_path.rglob("*.jpg")):
    parsed = _parse_image_stem(file.stem)
    if parsed is None:
        print(f"Skipping file with unexpected format: {file}")
        continue
    cancer_type, differentiation, magnification = parsed

    # Read the pixel dimensions; unreadable files are reported and skipped
    # (deliberately best-effort so one bad file does not stop the scan).
    try:
        with Image.open(file) as img:
            width, height = img.size
    except Exception as e:
        print(f"Error reading file {file}: {e}")
        continue

    file_info.append({
        "filename": file.name,
        "filepath": str(file),
        "cancer_type": cancer_type,
        "differentiation": differentiation,
        "magnification": magnification,
        "width": width,
        "height": height
    })

# Build the metadata table from the collected records
df = pd.DataFrame(file_info, columns=metadata_columns)

# Persist the table next to the notebook
df.to_csv("image_metadata.csv", index=False)
print("Metadata saved to image_metadata.csv")

In [None]:
# Preview the first five rows of the metadata table
df.head(n=5)

In [None]:
import matplotlib.pyplot as plt

# Metadata columns whose distribution is plotted, one figure per column
columns_to_plot = ["cancer_type", "differentiation", "magnification", "width", "height"]

for column in columns_to_plot:
    series = df[column]
    fig, ax = plt.subplots(figsize=(8, 5))

    # Treat a column as categorical when it is not numeric or contains missing
    # values. Checking for a numeric dtype (rather than comparing against
    # "object") also covers text columns stored with a dedicated string dtype,
    # which newer pandas versions can infer instead of object.
    if not pd.api.types.is_numeric_dtype(series) or series.isnull().any():
        # Bar plot of value counts; missing values get their own "None" bar
        series.fillna("None").value_counts().plot(kind="bar", ax=ax)
        ax.set_ylabel("Count")
    else:
        # Histogram for fully populated numeric columns
        series.plot(kind="hist", bins=10, edgecolor="black", ax=ax)
        ax.set_ylabel("Frequency")

    ax.set_xlabel(column)
    ax.set_title(f"Distribution of {column}")
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()
