In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:

# Workbook locations (relative to the notebook's working directory).
RADIOMICS_XLSX = 'OpenBTAI_RADIOMICS.xlsx'
MORPHOLOGICAL_XLSX = 'OpenBTAI_MORPHOLOGICAL_MEASUREMENTS.xlsx'
CLINICAL_XLSX = 'OpenBTAI_METS_ClinicalData_Sept2023.xlsx'

# Read each workbook into its own DataFrame; later cells refer to these names.
radiomics_df = pd.read_excel(RADIOMICS_XLSX)
morphological_df = pd.read_excel(MORPHOLOGICAL_XLSX)
clinical_data_df = pd.read_excel(CLINICAL_XLSX)


In [4]:
# Function for basic EDA
def basic_eda(df, title):
    """Print a plain-text overview of ``df`` under a banner built from ``title``.

    The report lists, in order: shape, column index, dtypes, per-column
    missing-value counts, ``describe(include='all')`` statistics and the
    first rows, followed by an end marker and blank lines.

    Args:
        df: The pandas DataFrame to summarise.
        title: Label shown in the opening banner.

    Returns:
        None. Everything is written to standard output.
    """
    # Each section is (heading, producer). Producers are evaluated lazily so
    # that output appears section by section, exactly as separate prints would.
    report_sections = (
        ("Shape of the dataset:", lambda: df.shape),
        ("\nColumns:\n", lambda: df.columns),
        ("\nData Types:\n", lambda: df.dtypes),
        ("\nMissing Values:\n", lambda: df.isnull().sum()),
        ("\nBasic Statistics:\n", lambda: df.describe(include='all')),
        ("\nFirst Few Rows:\n", lambda: df.head()),
    )

    print(f"--- {title} ---")
    for heading, produce in report_sections:
        print(heading, produce())
    print("\n--- End of EDA ---\n")
    print("\n")

In [5]:
# Print the text summary for every loaded dataset, in load order.
for _eda_frame, _eda_title in (
    (radiomics_df, "Radiomics Dataset"),
    (morphological_df, "Morphological Measurements Dataset"),
    (clinical_data_df, "Clinical Data Dataset"),
):
    basic_eda(_eda_frame, _eda_title)

--- Radiomics Dataset ---
Shape of the dataset: (1276, 1159)

Columns:
 Index(['Patient', 'Timepoint', 'Label', 'Lesion', 'Segment', 'Image', 'Mask',
       'diagnostics_Configuration_EnabledImageTypes',
       'diagnostics_Configuration_Settings',
       'diagnostics_Image-original_Dimensionality',
       ...
       'wavelet-LLL_glszm_SmallAreaHighGrayLevelEmphasis',
       'wavelet-LLL_glszm_SmallAreaLowGrayLevelEmphasis',
       'wavelet-LLL_glszm_ZoneEntropy', 'wavelet-LLL_glszm_ZonePercentage',
       'wavelet-LLL_glszm_ZoneVariance', 'wavelet-LLL_ngtdm_Busyness',
       'wavelet-LLL_ngtdm_Coarseness', 'wavelet-LLL_ngtdm_Complexity',
       'wavelet-LLL_ngtdm_Contrast', 'wavelet-LLL_ngtdm_Strength'],
      dtype='object', length=1159)

Data Types:
 Patient                           int64
Timepoint                         int64
Label                             int64
Lesion                            int64
Segment                          object
                                 ...

In [None]:





# Missing-value maps: one panel per dataset, drawn side by side.
# Each cell of a panel is coloured by whether the corresponding entry is null.
fig, axes = plt.subplots(1, 3, figsize=(16, 6))

missing_value_panels = (
    (radiomics_df, "Radiomics Dataset - Missing Values"),
    (morphological_df, "Morphological Measurements Dataset - Missing Values"),
    (clinical_data_df, "Clinical Data Dataset - Missing Values"),
)
for panel, (frame, panel_title) in zip(axes, missing_value_panels):
    sns.heatmap(frame.isnull(), cbar=False, cmap="viridis", ax=panel)
    panel.set_title(panel_title)

fig.tight_layout()
plt.show()

# Visualize distributions of numerical features for each dataset.
#
# The radiomics table has 1159 columns (see the shape printed by the EDA cell),
# so handing every numeric column to DataFrame.hist would ask matplotlib for a
# grid of roughly a thousand subplots in one 14x10 figure: very slow to render
# and unreadable. The number of panels per figure is therefore capped.
MAX_HIST_FEATURES = 20


def _plot_numeric_histograms(frame, columns, title, color, max_features=MAX_HIST_FEATURES):
    """Draw histograms for at most ``max_features`` of ``columns`` from ``frame``.

    Args:
        frame: DataFrame holding the data.
        columns: Index (or sequence) of column labels eligible for plotting.
        title: Figure super-title; a note is appended when columns are omitted.
        color: Bar colour passed through to ``DataFrame.hist``.
        max_features: Upper bound on the number of histogram panels.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    shown = list(columns[:max_features])
    if not shown:
        # DataFrame.hist raises when there is nothing numeric to plot.
        print(f"{title}: no numeric columns to plot")
        return

    frame[shown].hist(bins=20, figsize=(14, 10), color=color)
    if len(shown) < len(columns):
        title = f"{title} (first {len(shown)} of {len(columns)} features)"
    plt.suptitle(title, fontsize=16)
    plt.show()


# These selections are reused by the correlation-heatmap code further down,
# so they keep covering *all* float64/int64 columns; only the plots are capped.
numeric_features_radiomics = radiomics_df.select_dtypes(include=['float64', 'int64']).columns
numeric_features_morphological = morphological_df.select_dtypes(include=['float64', 'int64']).columns
numeric_features_clinical = clinical_data_df.select_dtypes(include=['float64', 'int64']).columns

_plot_numeric_histograms(
    radiomics_df, numeric_features_radiomics,
    'Distribution of Numerical Features - Radiomics Dataset', 'blue',
)
_plot_numeric_histograms(
    morphological_df, numeric_features_morphological,
    'Distribution of Numerical Features - Morphological Measurements Dataset', 'green',
)
_plot_numeric_histograms(
    clinical_data_df, numeric_features_clinical,
    'Distribution of Numerical Features - Clinical Data Dataset', 'orange',
)

# Correlation heatmap for each dataset, one panel per dataset.
#
# Writing the coefficient into every cell (annot=True) is only legible for
# small matrices. The radiomics table has 1159 columns, so annotating its
# correlation matrix could mean on the order of a million text labels, which
# effectively hangs the render. Annotation is therefore enabled only when the
# matrix has at most ANNOTATE_MAX_FEATURES columns.
ANNOTATE_MAX_FEATURES = 15

fig, axes = plt.subplots(1, 3, figsize=(18, 6))

correlation_panels = (
    (radiomics_df, numeric_features_radiomics, "Radiomics Dataset - Correlation Heatmap"),
    (morphological_df, numeric_features_morphological, "Morphological Measurements Dataset - Correlation Heatmap"),
    (clinical_data_df, numeric_features_clinical, "Clinical Data Dataset - Correlation Heatmap"),
)
for panel, (frame, numeric_columns, panel_title) in zip(axes, correlation_panels):
    corr_matrix = frame[numeric_columns].corr()
    if corr_matrix.empty:
        # sns.heatmap cannot draw an empty matrix; leave a placeholder instead.
        panel.text(0.5, 0.5, "No numeric columns",
                   ha="center", va="center", transform=panel.transAxes)
    else:
        sns.heatmap(
            corr_matrix,
            annot=len(numeric_columns) <= ANNOTATE_MAX_FEATURES,
            cmap="coolwarm",
            ax=panel,
        )
    panel.set_title(panel_title)

fig.tight_layout()
plt.show()
