In [None]:
from pathlib import Path
from dotenv import load_dotenv

import openslide
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

from histolung.utils.yaml import load_yaml_with_env

In [None]:
# Project root is taken as the parent of the current working directory.
# NOTE(review): assumes the kernel is started from a direct subdirectory of the repository (e.g. notebooks/) — confirm.
project_dir = Path.cwd().parent

In [None]:
# Load variables from a .env file into the process environment before reading the config.
load_dotenv()

# Dataset configuration, read from a path relative to project_dir.
# Presumably load_yaml_with_env expands environment variables inside the YAML — verify in histolung.utils.yaml.
data_config = load_yaml_with_env(project_dir / "histolung/config/datasets_config.yaml")

In [None]:
# Display the loaded configuration as a quick sanity check.
# NOTE(review): if the config holds env-expanded paths or secrets, clear this output before sharing — confirm.
data_config

In [None]:
def get_files_by_pattern(dataset_info):
    """Return the paths of a dataset that match its glob pattern, sorted.

    Args:
        dataset_info: Mapping with the keys ``"data_dir"`` (root directory to
            search) and ``"input_pattern"`` (glob pattern applied recursively).

    Returns:
        list[Path]: Every path under ``data_dir`` matching ``input_pattern``,
        in sorted order. Empty when nothing matches.

    Raises:
        KeyError: If ``"data_dir"`` or ``"input_pattern"`` is missing.
    """
    data_dir = Path(dataset_info["data_dir"])
    input_pattern = dataset_info["input_pattern"]
    # rglob yields entries in filesystem order, which differs between machines
    # and runs; sorting keeps downstream row order (and positional lookups)
    # reproducible.
    return sorted(data_dir.rglob(input_pattern))


def _parse_float_property(properties, key):
    """Read ``key`` from the slide properties as a float; NaN when absent or not numeric."""
    try:
        return float(properties.get(key, "nan"))
    except (TypeError, ValueError):
        return float("nan")


def _resolution_to_mpp(pixels_per_unit, resolution_unit):
    """Convert a TIFF resolution (pixels per unit) into microns per pixel.

    Returns NaN when the unit is neither ``"centimeter"`` nor ``"inch"``, or
    when the resolution is missing, zero or negative (no meaningful pixel size
    can be derived, and a zero would otherwise raise ZeroDivisionError).
    """
    microns_per_unit = {"centimeter": 10000, "inch": 25400}.get(resolution_unit)
    # `not x > 0` is also true for NaN, so missing values fall through to NaN.
    if microns_per_unit is None or not pixels_per_unit > 0:
        return float("nan")
    return 1 / pixels_per_unit * microns_per_unit


def get_wsi_metadata(file_path):
    """Collect size, pixel-size, vendor and TIFF-resolution metadata of one WSI file.

    Args:
        file_path: Path of the slide; converted to ``str`` before being passed
            to ``openslide.OpenSlide``.

    Returns:
        tuple: ``(width, height, mpp_x, mpp_y, width_microns, height_microns,
        manufacturer, magnification, x_res, y_res, resolution_mpp_x,
        resolution_mpp_y)`` where

        * ``width``/``height`` come from ``slide.dimensions``;
        * ``mpp_x``/``mpp_y`` are the ``openslide.mpp-x``/``openslide.mpp-y``
          properties as floats (NaN when missing or not numeric);
        * ``width_microns``/``height_microns`` are dimensions times mpp (NaN
          when the mpp is NaN);
        * ``manufacturer``/``magnification`` are the raw ``openslide.vendor``
          and ``openslide.objective-power`` property values, or ``"Unknown"``;
        * ``x_res``/``y_res`` are ``tiff.XResolution``/``tiff.YResolution`` as
          floats (NaN when missing or not numeric);
        * ``resolution_mpp_x``/``resolution_mpp_y`` are those resolutions
          converted to microns per pixel using ``tiff.ResolutionUnit``
          (treated as ``"inch"`` when absent); NaN when not convertible.

    Raises:
        Whatever ``openslide.OpenSlide`` raises when the file cannot be opened.
    """
    with openslide.OpenSlide(str(file_path)) as slide:
        properties = slide.properties

        # Pixel dimensions and pixel size.
        width, height = slide.dimensions
        mpp_x = _parse_float_property(properties, "openslide.mpp-x")
        mpp_y = _parse_float_property(properties, "openslide.mpp-y")

        # NaN propagates through the product, so no explicit missing-value check is needed.
        width_microns = width * mpp_x
        height_microns = height * mpp_y

        # Scanner vendor and objective power, kept as the raw property strings.
        manufacturer = properties.get("openslide.vendor", "Unknown")
        magnification = properties.get("openslide.objective-power", "Unknown")

        # TIFF resolution tags, expressed in pixels per resolution unit.
        x_res = _parse_float_property(properties, "tiff.XResolution")
        y_res = _parse_float_property(properties, "tiff.YResolution")
        resolution_unit = properties.get("tiff.ResolutionUnit", "inch")

        resolution_mpp_x = _resolution_to_mpp(x_res, resolution_unit)
        resolution_mpp_y = _resolution_to_mpp(y_res, resolution_unit)

    return (width, height, mpp_x, mpp_y, width_microns, height_microns,
            manufacturer, magnification, x_res, y_res, resolution_mpp_x,
            resolution_mpp_y)

In [None]:
# Column names for the values returned by get_wsi_metadata, in return order.
metadata_columns = (
    "width", "height", "mpp_x", "mpp_y", "width_microns", "height_microns",
    "manufacturer", "magnification", "tiff.XResolution", "tiff.YResolution",
    "resolution_mpp_x", "resolution_mpp_y",
)

# Collect one record per slide, then build the DataFrame once at the end.
# The list has its own name so `pixel_data` only ever refers to the DataFrame.
slide_records = []
for dataset_name, dataset_info in data_config["datasets"].items():
    # One progress bar per dataset, labelled with the dataset name.
    for file_path in tqdm(get_files_by_pattern(dataset_info), desc=str(dataset_name)):
        record = {"dataset": dataset_name, "file_name": file_path.name}
        record.update(zip(metadata_columns, get_wsi_metadata(file_path)))
        slide_records.append(record)

# Passing the columns explicitly keeps the schema stable even when no file was found.
pixel_data = pd.DataFrame(
    slide_records,
    columns=["dataset", "file_name", *metadata_columns],
)

# Preview only the first rows instead of printing the whole table.
pixel_data.head()

In [None]:
# Histograms of the physical slide size (microns) and the pixel size (mpp) columns.
pixel_data[["width_microns", "height_microns", "mpp_x", "mpp_y"]].hist(
    bins=30,
    figsize=(12, 8),
    layout=(2, 2),
    alpha=0.7,
    color='blue',
)
plt.suptitle("Histograms of WSI Dimensions and Pixel Sizes")
plt.show()

In [None]:
# Slides whose mpp_x is larger than 0.25 (rows with a NaN mpp_x are excluded).
pixel_data.loc[pixel_data["mpp_x"].gt(0.25)]

In [None]:
# Flag every column that contains at least one missing value.
na_columns = pixel_data.isna().any()
print("Columns with NaN values:")
# A boolean Series can index itself directly; comparing it to True is redundant.
print(na_columns[na_columns])


In [None]:
# Keep only the slides for which at least one metadata field is missing.
rows_with_na = pixel_data.loc[pixel_data.isna().any(axis=1)]
print("Rows with NaN values:")
print(rows_with_na)


In [None]:
# Show the file name of the second slide that has missing metadata.
# NOTE(review): the positional index 1 raises IndexError when fewer than two rows contain NaN — confirm this lookup is still needed.
rows_with_na["file_name"].values[1]

In [None]:
# Save the metadata table as CSV; the relative path resolves against the current working directory,
# and the DataFrame index is written as the first column (to_csv default).
# NOTE(review): "yo.csv" is not a descriptive name — confirm nothing reads this file before renaming it.
pixel_data.to_csv("yo.csv")