In [None]:
# 01_data_exploration.ipynb

# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the patient table from CSV
file_path = './patient_list.csv'  # Adjust this path if necessary
patient_data = pd.read_csv(file_path)

# Tabular overview: head, summary statistics, and per-column null counts.
# Each entry pairs the printed heading with a callable that builds the table,
# so every table is computed immediately before it is displayed.
overview_steps = [
    ("First five rows of the dataset:", lambda: patient_data.head()),
    ("Dataset summary statistics:", lambda: patient_data.describe(include='all')),
    ("Missing values in each column:", lambda: patient_data.isnull().sum()),
]
for heading, build_table in overview_steps:
    print(heading)
    display(build_table())

# Histogram (15 bins) of the `age` column with a KDE curve overlaid
age_fig, age_ax = plt.subplots(figsize=(8, 6))
sns.histplot(patient_data['age'], bins=15, kde=True, ax=age_ax)
age_ax.set_title('Age Distribution of Patients')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Frequency')
plt.show()

# Distribution by sex: one bar per distinct value of the `sex` column.
# `hue` is assigned to the same column as `x` so the palette is applied through
# an explicit hue mapping; seaborn 0.13 deprecated passing `palette` without
# `hue` (removal announced for 0.14). `legend=False` drops the legend, which
# would only repeat the x tick labels.
# NOTE: the `legend` argument of countplot requires seaborn >= 0.13.
plt.figure(figsize=(6, 4))
sns.countplot(x='sex', hue='sex', data=patient_data, palette='pastel', legend=False)
plt.title('Distribution by Sex')
plt.xlabel('Sex')
plt.ylabel('Count')
plt.show()

# Distribution of tumor categories: horizontal bars, one per distinct value of
# the `tumor_categories` column.
# `hue` is assigned to the same column as `y` so the palette is applied through
# an explicit hue mapping; seaborn 0.13 deprecated passing `palette` without
# `hue` (removal announced for 0.14). `legend=False` drops the legend, which
# would only repeat the y tick labels.
# NOTE: the `legend` argument of countplot requires seaborn >= 0.13.
plt.figure(figsize=(8, 6))
sns.countplot(
    y='tumor_categories',
    hue='tumor_categories',
    data=patient_data,
    palette='viridis',
    legend=False,
)
plt.title('Distribution of Tumor Categories')
plt.xlabel('Count')
plt.ylabel('Tumor Category')
plt.show()

# Distribution by train/test set: one bar per distinct value of the `set` column.
# `hue` is assigned to the same column as `x` so the palette is applied through
# an explicit hue mapping; seaborn 0.13 deprecated passing `palette` without
# `hue` (removal announced for 0.14). `legend=False` drops the legend, which
# would only repeat the x tick labels.
# NOTE: the `legend` argument of countplot requires seaborn >= 0.13.
plt.figure(figsize=(6, 4))
sns.countplot(x='set', hue='set', data=patient_data, palette='muted', legend=False)
plt.title('Distribution by Train/Test Set')
plt.xlabel('Set')
plt.ylabel('Count')
plt.show()

# Relationship between age and tumor category: one box of `age` values per
# distinct value of the `tumor_categories` column.
# `hue` is assigned to the same column as `x` so the palette is applied through
# an explicit hue mapping; seaborn 0.13 deprecated passing `palette` without
# `hue` (removal announced for 0.14). `legend=False` drops the legend, which
# would only repeat the x tick labels.
# NOTE: the `legend` argument of boxplot requires seaborn >= 0.13.
plt.figure(figsize=(10, 6))
sns.boxplot(
    x='tumor_categories',
    y='age',
    hue='tumor_categories',
    data=patient_data,
    palette='Blues',
    legend=False,
)
plt.title('Age Distribution by Tumor Category')
plt.xlabel('Tumor Category')
plt.ylabel('Age')
plt.show()

# Histogram (10 bins) of the `images_num` column with a KDE curve overlaid
images_fig, images_ax = plt.subplots(figsize=(8, 6))
sns.histplot(
    patient_data['images_num'],
    bins=10,
    kde=True,
    color='skyblue',
    ax=images_ax,
)
images_ax.set_title('Distribution of Image Counts per Patient')
images_ax.set_xlabel('Number of Images')
images_ax.set_ylabel('Frequency')
plt.show()

# Violin plot of `age` per `tumor_categories` value, coloured by `sex`
# (split=True is passed so the hue levels share each violin).
violin_fig, violin_ax = plt.subplots(figsize=(10, 6))
sns.violinplot(
    x='tumor_categories',
    y='age',
    hue='sex',
    data=patient_data,
    split=True,
    palette='Set2',
    ax=violin_ax,
)
violin_ax.set_title('Age Distribution by Sex and Tumor Category')
violin_ax.set_xlabel('Tumor Category')
violin_ax.set_ylabel('Age')
violin_ax.legend(title='Sex')
plt.show()
