# Lab Results Analysis - Auto-Discovery Mode 🚀

**This notebook automatically discovers and analyzes data from your Workbench data collections!**

## ✨ Out-of-the-Box Experience

1. **Mount Resources** (if needed): Run `wb resource mount` in terminal or use the mount cell below
2. **Auto-Discovery**: The notebook automatically finds data files in your mounted workspaces
3. **Auto-Configuration**: No manual configuration needed - it detects CSV, Parquet, JSON, and Excel files
4. **Auto-Analysis**: Just click "Run All" to see distribution reports and visualizations

**Note**: Workbench buckets should automount on startup. If you don't see your data, mount resources first!

## 📊 What You'll See

After running all cells, you'll get:
- **Data Overview**: Summary statistics and data structure
- **Distribution Reports**: For Patient ID, Lab Type, Lab Value, and Lab Date
- **Visualizations**: Charts, graphs, and statistical analyses
- **Summary Statistics**: Comprehensive data characteristics

## 🔧 Manual Override (Optional)

If you want to specify a different data source, set the bucket and file values in the **Hardcoded Configuration** cell (section 1.5) below.


In [None]:
# Option B: best-effort mount of Workbench resources through the `wb` CLI.
# Every failure mode is reported as a hint rather than raised, so the rest of
# the notebook can still run when the CLI is missing or the mount fails.
import subprocess
import sys

separator = "=" * 70

print(separator)
print("üîß Attempting to mount Workbench resources...")
print(separator)

try:
    # Argument-list form (no shell); output is captured so it can be echoed below.
    result = subprocess.run(
        ["wb", "resource", "mount"], capture_output=True, text=True, timeout=60
    )

    if result.returncode != 0:
        # Non-zero exit status: show stderr (if any) and point to the manual route.
        print("‚ö†Ô∏è  Mount command completed with warnings:")
        if result.stderr:
            print(result.stderr)
        print("\nüí° You may need to run 'wb resource mount' manually in a terminal.")
    else:
        print("‚úÖ Resources mounted successfully!")
        if result.stdout:
            print(result.stdout)

except FileNotFoundError:
    # Raised by subprocess.run when the `wb` executable cannot be found.
    print("‚ÑπÔ∏è  'wb' CLI not found in PATH.")
    print("   This is normal - resources should automount on startup.")
    print("   If data is missing, try running 'wb resource mount' in a terminal.")
except subprocess.TimeoutExpired:
    # The command did not finish within the 60 second timeout.
    print("‚è±Ô∏è  Mount command timed out. Try running 'wb resource mount' manually.")
except Exception as exc:
    print(f"‚ÑπÔ∏è  Could not run mount command: {exc}")
    print("   This is okay - try running 'wb resource mount' manually if needed.")

print("\n" + separator)
print("Next: Run the diagnostic cell below to check what's available.")
print(separator)


## 0. Diagnostic: Check Mounted Workspaces

Run this cell to see what's available in your mounted workspaces. This will help you find your data files.


In [None]:
# Diagnostic: report whether the Workbench CLI is usable and list what is
# visible under the usual mount locations. This cell only reads and prints.
import os
import shutil
import subprocess
from pathlib import Path

print("="*70)
print("üîç DIAGNOSTIC: Checking Mounted Workspace Locations")
print("="*70)

# First, check if wb CLI is available and try to see mount status
print("\nüìã Checking Workbench CLI and mount status...")
# shutil.which searches PATH in-process, so this does not depend on an
# external `which` binary being installed in the container.
wb_path = shutil.which("wb")
if wb_path:
    print(f"   ‚úÖ Workbench CLI found at: {wb_path}")

    # Probe the mount sub-command; only a zero exit status counts as available.
    try:
        mount_result = subprocess.run(
            ["wb", "resource", "mount", "--help"],
            capture_output=True,
            text=True,
            timeout=10
        )
        mount_command_ok = mount_result.returncode == 0
    except (subprocess.SubprocessError, OSError):
        # Timeout or failure to launch; narrow catch so Ctrl-C still interrupts.
        mount_command_ok = False

    if mount_command_ok:
        print("   ‚úÖ 'wb resource mount' command is available")
    else:
        print("   ‚ö†Ô∏è  Could not verify 'wb resource mount' command")
else:
    print("   ‚ùå Workbench CLI ('wb') not found in PATH")
    print("   üí° Resources should automount, but CLI may not be installed in this container")

# Check for existing mount points
print("\nüìÅ Checking mount locations...")
mount_locations = [
    "/home/jovyan/workspace",
    "/home/jovyan/workspaces",
    "/home/jovyan/work",
    "/home/jovyan/repos",
]

data_extensions = ['.csv', '.parquet', '.json', '.xlsx', '.xls', '.tsv']
data_suffixes = set(data_extensions)

for mount_base in mount_locations:
    mount_path = Path(mount_base)
    print(f"\nüìÅ Checking: {mount_base}")
    if not mount_path.exists():
        print("   ‚ùå Directory does not exist")
        continue

    print("   ‚úÖ Directory exists")
    try:
        items = list(mount_path.iterdir())
        print(f"   üìä Found {len(items)} items")
        for item in items[:10]:  # Show first 10 items
            if item.is_dir():
                print(f"      üìÇ {item.name}/")
            else:
                print(f"      üìÑ {item.name}")
        if len(items) > 10:
            print(f"      ... and {len(items) - 10} more items")

        # One recursive walk for all extensions. The suffix test runs first so
        # is_file() (a stat call) is only paid for candidate names; it also
        # keeps directories named like data files out of the list.
        data_files = sorted(
            candidate
            for candidate in mount_path.rglob("*")
            if candidate.suffix.lower() in data_suffixes and candidate.is_file()
        )

        if data_files:
            print(f"   ‚úÖ Found {len(data_files)} data file(s):")
            for df_file in data_files[:5]:
                size_mb = df_file.stat().st_size / (1024 * 1024)
                print(f"      üìä {df_file} ({size_mb:.2f} MB)")
            if len(data_files) > 5:
                print(f"      ... and {len(data_files) - 5} more data files")
        else:
            print("   ‚ö†Ô∏è  No data files found in this location")
    except PermissionError:
        print("   ‚ùå Permission denied")
    except Exception as e:
        # Diagnostics must never abort the notebook; report and move on.
        print(f"   ‚ùå Error: {e}")

print("\n" + "="*70)
print("üí° TROUBLESHOOTING TIPS:")
print("="*70)
print("1. If '/home/jovyan/workspace' doesn't exist, resources may not be mounted yet.")
print("2. Try running in terminal: wb resource mount")
print("3. Check if your workspace has bucket resources configured in Workbench UI")
print("4. Resources should automount on app startup - wait a few minutes and check again")
print("5. If you know your GCS bucket path, you can use GCS_BUCKET instead:")
print("   GCS_BUCKET = 'your-bucket-name'")
print("   FILE_NAME = 'path/to/your/file.csv'")
print("   USE_MOUNTED_PATH = False")
print("\nüí° If you see your data files above, you can manually set:")
print("   MOUNTED_FILE_PATH = '/path/to/your/file.csv'")
print("   USE_MOUNTED_PATH = True")
print("="*70)


## 1. Configuration and Import Libraries

**Configure your data source below:**


## 1.5. Hardcoded Configuration (Use this if auto-discovery doesn't work)

**Set your bucket and file details here:**


In [None]:
# ============================================================================
# HARDCODED CONFIGURATION
# ============================================================================
# Fill in both the bucket and the file name to point the notebook at a
# specific object in a GCS bucket. Leaving either one empty keeps this cell
# inert (it only prints a notice).

HARDCODED_GCS_BUCKET = ""  # e.g., "my-workspace-bucket" (without gs:// prefix)
HARDCODED_FILE_NAME = ""   # e.g., "data/lab_results.csv" or "lab_results.csv"
HARDCODED_FILE_FORMAT = "csv"  # Options: "csv", "parquet", "json", "excel"

if not (HARDCODED_GCS_BUCKET and HARDCODED_FILE_NAME):
    print("‚ÑπÔ∏è  Hardcoded values not set - using auto-discovery or sample data")
else:
    # Both values supplied: switch the data source to the GCS bucket.
    print("üìä Using hardcoded GCS bucket configuration")
    USE_MOUNTED_PATH, MOUNTED_FILE_PATH = False, ""
    # Tolerate a pasted "gs://" prefix and surrounding whitespace.
    GCS_BUCKET = HARDCODED_GCS_BUCKET.replace("gs://", "").strip()
    FILE_NAME, FILE_FORMAT = HARDCODED_FILE_NAME, HARDCODED_FILE_FORMAT
    print(f"   ‚úÖ Bucket: {GCS_BUCKET}")
    print(f"   ‚úÖ File: {FILE_NAME}")
    print(f"   ‚úÖ Format: {FILE_FORMAT}")


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
import os
from pathlib import Path
import json
warnings.filterwarnings('ignore')

# Optional: Import google.cloud.storage (only needed for GCS bucket access).
# If it is missing, one install attempt is made. A failed attempt (e.g. no
# network access) must not abort the notebook, because mounted paths and the
# sample-data fallback do not need GCS; GCS_AVAILABLE simply stays False.
try:
    from google.cloud import storage
    GCS_AVAILABLE = True
except ImportError:
    GCS_AVAILABLE = False
    print("‚ÑπÔ∏è  Note: google-cloud-storage not installed. GCS bucket access will be unavailable.")
    print("   Installing google-cloud-storage...")
    import subprocess
    import sys
    try:
        # Install into the interpreter that runs this kernel.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "google-cloud-storage"])
        from google.cloud import storage
    except (subprocess.CalledProcessError, OSError, ImportError) as install_error:
        print(f"   Could not install google-cloud-storage: {install_error}")
        print("   Continuing without GCS support (mounted paths and sample data still work).")
    else:
        GCS_AVAILABLE = True
        print("‚úÖ google-cloud-storage installed successfully!")

# Set style for better-looking plots
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# ============================================================================
# AUTO-DISCOVERY: Automatically find data files in mounted workspaces
# ============================================================================

def auto_discover_data_files(mount_locations=None):
    """Find the most suitable data file under the mounted workspace locations.

    Each location is walked recursively once. A path is a candidate when:

    * its extension (compared case-insensitively) is one of
      .csv, .tsv, .parquet, .json, .xlsx or .xls,
    * neither the file nor any directory between it and the search root has a
      name starting with "." (this skips e.g. ``.ipynb_checkpoints`` copies),
    * it is a regular file smaller than 10 GB.

    Candidates are ranked CSV/TSV first, then Parquet, JSON and Excel; ties
    are broken by path so the choice is deterministic.

    Args:
        mount_locations: Optional iterable of directories to search. Defaults
            to the standard notebook mount points when omitted.

    Returns:
        ``None`` when nothing is found, otherwise a dict with the keys
        ``file_path`` (str), ``mount_base`` (str, the search root that
        contained the file) and ``file_format`` ("csv", "parquet", "json" or
        "excel"). Note that .tsv files are reported as "csv".
    """
    if mount_locations is None:
        mount_locations = [
            "/home/jovyan/workspace",
            "/home/jovyan/workspaces",
            "/home/jovyan/work",
        ]

    format_map = {
        '.csv': 'csv', '.tsv': 'csv', '.parquet': 'parquet',
        '.json': 'json', '.xlsx': 'excel', '.xls': 'excel'
    }
    # Lower value = preferred.
    priority = {'.csv': 0, '.tsv': 0, '.parquet': 1, '.json': 2, '.xlsx': 3, '.xls': 3}
    max_size_bytes = 10 * 1024 * 1024 * 1024  # skip files of 10 GB or more

    found_files = []
    for mount_base in mount_locations:
        mount_path = Path(mount_base)
        if not mount_path.is_dir():
            continue

        # Single walk for all extensions instead of one walk per extension.
        for file_path in mount_path.rglob("*"):
            ext = file_path.suffix.lower()
            if ext not in format_map:
                continue
            relative_parts = file_path.relative_to(mount_path).parts
            if any(part.startswith('.') for part in relative_parts):
                continue
            try:
                # is_file() rejects directories that merely look like data files.
                if file_path.is_file() and file_path.stat().st_size < max_size_bytes:
                    found_files.append((str(file_path), ext, str(mount_base)))
            except OSError:  # includes PermissionError; unreadable entries are skipped
                continue

    if not found_files:
        return None

    selected_file, ext, mount_base = min(
        found_files, key=lambda entry: (priority.get(entry[1], 99), entry[0])
    )

    return {
        'file_path': selected_file,
        'mount_base': mount_base,
        'file_format': format_map.get(ext, 'csv')
    }

# Auto-discover data
print("üîç Auto-discovering data files in mounted workspaces...")
discovered_config = auto_discover_data_files()

# ============================================================================
# CONFIGURATION: hardcoded override > auto-discovery > nothing (sample data)
# ============================================================================

# Values from the optional hardcoded-configuration cell. They are looked up
# through globals() so this cell also works when that cell was never run.
# A complete hardcoded bucket/file pair must win here; otherwise the
# assignments below would silently reset it.
hardcoded_bucket = globals().get("HARDCODED_GCS_BUCKET", "")
hardcoded_file = globals().get("HARDCODED_FILE_NAME", "")

WORKSPACE_NAME = ""
DATA_COLLECTION_NAME = ""  # Will be inferred from path if needed

if hardcoded_bucket and hardcoded_file:
    print("Using hardcoded GCS configuration (takes priority over auto-discovery).")
    if discovered_config:
        print(f"   Ignoring auto-discovered file: {discovered_config['file_path']}")
    USE_MOUNTED_PATH = False
    MOUNTED_FILE_PATH = ""
    # Tolerate a pasted "gs://" prefix and surrounding whitespace.
    GCS_BUCKET = hardcoded_bucket.replace("gs://", "").strip()
    FILE_NAME = hardcoded_file
    FILE_FORMAT = globals().get("HARDCODED_FILE_FORMAT", "csv")
elif discovered_config:
    print(f"‚úÖ Found data file: {discovered_config['file_path']}")
    USE_MOUNTED_PATH = True
    MOUNTED_FILE_PATH = discovered_config['file_path']
    FILE_FORMAT = discovered_config['file_format']
    WORKSPACE_NAME = Path(discovered_config['mount_base']).name
    GCS_BUCKET = ""
    FILE_NAME = ""
    print(f"üìä Auto-configured to use: {MOUNTED_FILE_PATH}")
    print(f"üìÅ File format: {FILE_FORMAT}")
else:
    print("‚ö†Ô∏è  No data files auto-discovered. Using manual configuration or sample data.")
    # Nothing configured: the loading cell falls back to generated sample data.
    GCS_BUCKET = ""
    FILE_NAME = ""
    USE_MOUNTED_PATH = False
    MOUNTED_FILE_PATH = ""
    FILE_FORMAT = "csv"

print("\n" + "="*60)
print("Configuration Summary:")
print("="*60)
print(f"Use Mounted Path: {USE_MOUNTED_PATH}")
print(f"Mounted File Path: {MOUNTED_FILE_PATH if MOUNTED_FILE_PATH else 'Not set'}")
print(f"GCS Bucket: {GCS_BUCKET if GCS_BUCKET else 'Not set'}")
print(f"File Name: {FILE_NAME if FILE_NAME else 'Not set'}")
print(f"File Format: {FILE_FORMAT}")
print("="*60)


In [None]:
# ============================================================================
# Load Data from Workbench Data Collection
# ============================================================================

def load_data_from_gcs(bucket_name, file_name, file_format="csv"):
    """Download one object from a GCS bucket and load it into a DataFrame.

    The object is downloaded to a uniquely named temporary file (keeping the
    original extension), parsed with pandas and the temporary file is removed
    afterwards, whether or not parsing succeeded.

    Args:
        bucket_name: Bucket name without the ``gs://`` prefix.
        file_name: Object path inside the bucket.
        file_format: "csv", "parquet", "json" or "excel" (case-insensitive).
            With "csv", an object whose name ends in ``.tsv`` is read as
            tab-separated.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        ImportError: If google-cloud-storage is not available.
        ValueError: If ``file_format`` is not supported (raised before any
            download is attempted).
        Exception: Any download or parsing error is printed and re-raised.
    """
    if not GCS_AVAILABLE:
        raise ImportError("google-cloud-storage is not available. Please install it or use mounted workspace paths.")

    import tempfile  # only needed here

    temp_file = None
    try:
        fmt = file_format.lower()
        if fmt not in ("csv", "parquet", "json", "excel"):
            # Fail fast: no point downloading something we cannot parse.
            raise ValueError(f"Unsupported file format: {file_format}")

        # Initialize GCS client
        client = storage.Client()
        bucket = client.bucket(bucket_name)
        blob = bucket.blob(file_name)

        print(f"Reading file from GCS: gs://{bucket_name}/{file_name}")

        # A unique temp file avoids clashes between runs that download objects
        # sharing a basename; the suffix is kept from the object name.
        suffix = os.path.splitext(file_name)[1]
        fd, temp_file = tempfile.mkstemp(suffix=suffix)
        os.close(fd)
        blob.download_to_filename(temp_file)
        print(f"File downloaded to: {temp_file}")

        if fmt == "csv":
            separator = "\t" if suffix.lower() == ".tsv" else ","
            df = pd.read_csv(temp_file, sep=separator)
        elif fmt == "parquet":
            df = pd.read_parquet(temp_file)
        elif fmt == "json":
            df = pd.read_json(temp_file)
        else:  # "excel"
            df = pd.read_excel(temp_file)

        return df

    except Exception as e:
        print(f"Error loading from GCS: {e}")
        raise
    finally:
        # Clean up the temp file on success and on failure alike.
        if temp_file is not None and os.path.exists(temp_file):
            os.remove(temp_file)

def load_data_from_mounted_path(file_path, file_format="csv"):
    """Load a data file from a mounted workspace path into a DataFrame.

    Args:
        file_path: Path of the file to read.
        file_format: "csv", "parquet", "json" or "excel" (case-insensitive).
            With "csv", a path ending in ``.tsv`` is read as tab-separated,
            because auto-discovery reports .tsv files with the "csv" format.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        ValueError: If ``file_format`` is not supported.
        Exception: Any read or parse error is printed and re-raised.
    """
    try:
        print(f"Reading file from mounted path: {file_path}")

        fmt = file_format.lower()
        if fmt == "csv":
            is_tsv = str(file_path).lower().endswith(".tsv")
            df = pd.read_csv(file_path, sep="\t" if is_tsv else ",")
        elif fmt == "parquet":
            df = pd.read_parquet(file_path)
        elif fmt == "json":
            df = pd.read_json(file_path)
        elif fmt == "excel":
            df = pd.read_excel(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_format}")

        return df

    except Exception as e:
        print(f"Error loading from mounted path: {e}")
        raise

# Load data based on configuration.
# Order of precedence: a mounted file path, then a GCS bucket/object pair,
# and finally a generated sample dataset so the remaining cells can still run.
if USE_MOUNTED_PATH and MOUNTED_FILE_PATH:
    # Use mounted workspace path
    df = load_data_from_mounted_path(MOUNTED_FILE_PATH, FILE_FORMAT)
elif GCS_BUCKET and FILE_NAME:
    # Use GCS bucket
    # Remove gs:// prefix if present
    bucket_name = GCS_BUCKET.replace("gs://", "").strip()
    df = load_data_from_gcs(bucket_name, FILE_NAME, FILE_FORMAT)
else:
    # Fallback: Generate sample data if configuration is not set
    print("‚ö†Ô∏è  WARNING: No data source configured. Generating sample data...")
    print("Please set GCS_BUCKET and FILE_NAME, or USE_MOUNTED_PATH and MOUNTED_FILE_PATH")
    
    # Fixed seed: the random draws below are repeatable between runs.
    # NOTE: the order of the np.random calls below determines the generated
    # values, so reordering them changes the sample data.
    np.random.seed(42)
    lab_types = ['Complete Blood Count', 'Lipid Panel', 'Liver Function', 'Kidney Function', 
                 'Thyroid Panel', 'Hemoglobin A1C', 'Vitamin D', 'Cholesterol']
    n_records = 500
    # 100 patient identifiers: PAT00001 .. PAT00100
    patient_ids = [f'PAT{str(i).zfill(5)}' for i in range(1, 101)]
    
    data = {
        'Patient ID': np.random.choice(patient_ids, n_records),
        # p gives one sampling weight per entry of lab_types (sums to 1)
        'Lab Type': np.random.choice(lab_types, n_records, p=[0.2, 0.15, 0.15, 0.15, 0.1, 0.1, 0.1, 0.05]),
        # Placeholder values; overwritten per lab type further down
        'Lab Value': np.round(np.random.normal(100, 30, n_records), 2),
        # Dates drawn from the 365 days before "now", so they vary with the run date
        'Lab Date': [(datetime.now() - timedelta(days=np.random.randint(0, 365))).strftime('%Y-%m-%d') 
                     for _ in range(n_records)]
    }
    
    df = pd.DataFrame(data)
    
    # Adjust lab values based on lab type
    # (min, max) bounds of the uniform distribution used for each lab type
    lab_value_ranges = {
        'Complete Blood Count': (4.5, 11.0),
        'Lipid Panel': (120, 200),
        'Liver Function': (10, 40),
        'Kidney Function': (0.6, 1.2),
        'Thyroid Panel': (0.5, 5.0),
        'Hemoglobin A1C': (4.0, 6.5),
        'Vitamin D': (20, 50),
        'Cholesterol': (150, 250)
    }
    
    for lab_type, (min_val, max_val) in lab_value_ranges.items():
        # Replace the placeholder values of every row with this lab type
        mask = df['Lab Type'] == lab_type
        df.loc[mask, 'Lab Value'] = np.round(np.random.uniform(min_val, max_val, mask.sum()), 2)
    
    df['Lab Date'] = pd.to_datetime(df['Lab Date'])

# Ensure required columns exist (case-insensitive matching).
# Column labels are converted with str() and stripped first, so files with
# non-string labels (e.g. integer headers) or padded header cells do not
# crash the lookup.
required_columns = ['Patient ID', 'Lab Type', 'Lab Value', 'Lab Date']
df_columns_lower = {str(col).strip().lower(): col for col in df.columns}

# Map to standard column names
column_mapping = {}
for req_col in required_columns:
    req_lower = req_col.lower()
    if req_lower in df_columns_lower:
        column_mapping[df_columns_lower[req_lower]] = req_col
    else:
        print(f"‚ö†Ô∏è  WARNING: Column '{req_col}' not found in data. Available columns: {list(df.columns)}")

if column_mapping:
    df = df.rename(columns=column_mapping)

# Ensure Lab Date is datetime (unparseable entries become NaT)
if 'Lab Date' in df.columns:
    df['Lab Date'] = pd.to_datetime(df['Lab Date'], errors='coerce')

# Ensure Lab Value is numeric, mirroring the date handling above: entries that
# cannot be parsed become NaN instead of breaking the statistics cells later.
if 'Lab Value' in df.columns:
    numeric_lab_values = pd.to_numeric(df['Lab Value'], errors='coerce')
    n_unparseable = int(numeric_lab_values.isna().sum() - df['Lab Value'].isna().sum())
    if n_unparseable:
        print(f"WARNING: {n_unparseable} non-numeric 'Lab Value' entries were set to missing (NaN).")
    df['Lab Value'] = numeric_lab_values

print(f"\n‚úì Dataset loaded successfully with {len(df)} records")
print(f"\nColumns: {list(df.columns)}")
print("\nFirst few records:")
df.head(10)


## 2. Data Overview


In [None]:
# High-level overview: record/patient/lab-type counts, covered date span,
# column dtypes, and the describe() table as the cell's displayed result.
print("Dataset Info:")
print(f"Total Records: {len(df)}")
for overview_label, overview_column in (("Patients", 'Patient ID'), ("Lab Types", 'Lab Type')):
    print(f"Total {overview_label}: {df[overview_column].nunique()}")

earliest_date = df['Lab Date'].min().date()
latest_date = df['Lab Date'].max().date()
print(f"\nDate Range: {earliest_date} to {latest_date}")

print("\nData Types:")
print(df.dtypes)
print("\nBasic Statistics:")
df.describe()


## 3. Distribution Report: Patient ID


In [None]:
# Number of lab results recorded for each patient, largest first
patient_counts = df['Patient ID'].value_counts().sort_values(ascending=False)

print("=== Patient ID Distribution Report ===")
print(f"\nTotal unique patients: {df['Patient ID'].nunique()}")
print("\nTop 10 patients by number of lab results:")
print(patient_counts.head(10))
print("\nStatistics:")
print(f"  Mean tests per patient: {patient_counts.mean():.2f}")
print(f"  Median tests per patient: {patient_counts.median():.2f}")
print(f"  Min tests per patient: {patient_counts.min()}")
print(f"  Max tests per patient: {patient_counts.max()}")

# Two panels side by side: overall histogram (left), busiest patients (right)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
hist_ax, top_ax = axes[0], axes[1]
title_style = dict(fontsize=14, fontweight='bold')

# Left: how many patients have a given number of tests
hist_ax.hist(patient_counts.values, bins=20, edgecolor='black', alpha=0.7, color='skyblue')
hist_ax.set_xlabel('Number of Lab Tests', fontsize=12)
hist_ax.set_ylabel('Number of Patients', fontsize=12)
hist_ax.set_title('Distribution of Lab Tests per Patient', **title_style)
hist_ax.grid(True, alpha=0.3)

# Right: horizontal bars for the 15 patients with the most tests
top_patients = patient_counts.head(15)
bar_positions = range(len(top_patients))
top_ax.barh(bar_positions, top_patients.values, color='coral')
top_ax.set_yticks(bar_positions)
top_ax.set_yticklabels(top_patients.index, fontsize=9)
top_ax.set_xlabel('Number of Lab Tests', fontsize=12)
top_ax.set_title('Top 15 Patients by Lab Test Count', **title_style)
top_ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()


## 4. Distribution Report: Lab Type


In [None]:
# Frequency of every lab type
lab_type_counts = df['Lab Type'].value_counts()

print("=== Lab Type Distribution Report ===")
print(f"\nTotal unique lab types: {df['Lab Type'].nunique()}")
print("\nLab type frequency:")
for lab_type, count in lab_type_counts.items():
    # Share of all records, in percent
    percentage = (count / len(df)) * 100
    print(f"  {lab_type}: {count} ({percentage:.1f}%)")

# Two panels: absolute counts as bars (left), shares as a pie (right)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
count_ax, share_ax = axes[0], axes[1]
title_style = dict(fontsize=14, fontweight='bold')

# Left: horizontal bar chart
bar_positions = range(len(lab_type_counts))
count_ax.barh(bar_positions, lab_type_counts.values, color='lightgreen')
count_ax.set_yticks(bar_positions)
count_ax.set_yticklabels(lab_type_counts.index, fontsize=10)
count_ax.set_xlabel('Number of Tests', fontsize=12)
count_ax.set_title('Lab Type Distribution (Count)', **title_style)
count_ax.grid(True, alpha=0.3, axis='x')

# Right: pie chart with percentage labels
share_ax.pie(
    lab_type_counts.values,
    labels=lab_type_counts.index,
    autopct='%1.1f%%',
    startangle=90,
    textprops={'fontsize': 9},
)
share_ax.set_title('Lab Type Distribution (Percentage)', **title_style)

plt.tight_layout()
plt.show()


## 5. Distribution Report: Lab Value


In [None]:
# Distribution of the 'Lab Value' column: printed statistics plus a 2x2 grid
# of plots (histogram, box plot, density, box plot per lab type).
print("=== Lab Value Distribution Report ===")
print("\nBasic Statistics:")
print(df['Lab Value'].describe())
print(f"\nSkewness: {df['Lab Value'].skew():.3f}")
print(f"Kurtosis: {df['Lab Value'].kurtosis():.3f}")

# Missing values are dropped for the matplotlib calls: Axes.hist cannot derive
# a bin range from NaN and Axes.boxplot draws an empty box when NaN is present.
# The pandas-based plots further down skip missing values on their own.
lab_values = df['Lab Value'].dropna()
mean_value = lab_values.mean()
median_value = lab_values.median()

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Histogram with mean/median markers
axes[0, 0].hist(lab_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
axes[0, 0].set_xlabel('Lab Value', fontsize=12)
axes[0, 0].set_ylabel('Frequency', fontsize=12)
axes[0, 0].set_title('Lab Value Distribution (Histogram)', fontsize=14, fontweight='bold')
axes[0, 0].axvline(mean_value, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_value:.2f}')
axes[0, 0].axvline(median_value, color='green', linestyle='--', linewidth=2, label=f'Median: {median_value:.2f}')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Box plot
axes[0, 1].boxplot(lab_values, vert=True, patch_artist=True,
                   boxprops=dict(facecolor='lightblue', alpha=0.7))
axes[0, 1].set_ylabel('Lab Value', fontsize=12)
axes[0, 1].set_title('Lab Value Distribution (Box Plot)', fontsize=14, fontweight='bold')
axes[0, 1].grid(True, alpha=0.3, axis='y')

# Density plot
lab_values.plot.density(ax=axes[1, 0], color='purple', linewidth=2)
axes[1, 0].set_xlabel('Lab Value', fontsize=12)
axes[1, 0].set_ylabel('Density', fontsize=12)
axes[1, 0].set_title('Lab Value Distribution (Density Plot)', fontsize=14, fontweight='bold')
axes[1, 0].grid(True, alpha=0.3)

# Lab values by lab type
df.boxplot(column='Lab Value', by='Lab Type', ax=axes[1, 1], rot=45)
# DataFrame.boxplot(by=...) adds an automatic "Boxplot grouped by ..." title to
# the whole figure, which would mislabel the other three panels; clear it.
fig.suptitle('')
axes[1, 1].set_xlabel('Lab Type', fontsize=10)
axes[1, 1].set_ylabel('Lab Value', fontsize=12)
axes[1, 1].set_title('Lab Value Distribution by Lab Type', fontsize=14, fontweight='bold')
plt.setp(axes[1, 1].xaxis.get_majorticklabels(), rotation=45, ha='right')
axes[1, 1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()


## 6. Distribution Report: Lab Date


In [None]:
# Distribution of 'Lab Date': printed breakdowns by year, month and weekday
# plus a 2x2 grid of plots. Rows without a valid date (NaT) are excluded from
# the report: with NaT present the month numbers become floats (which cannot
# index month_names), and the date histogram would receive NaT values.

# Extract date components (kept on df; NaT rows yield missing values here)
df['Year'] = df['Lab Date'].dt.year
df['Month'] = df['Lab Date'].dt.month
df['DayOfWeek'] = df['Lab Date'].dt.day_name()

valid_dates = df['Lab Date'].dropna()
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

print("=== Lab Date Distribution Report ===")
if valid_dates.empty:
    print("\nNo valid 'Lab Date' values found - skipping the date distribution report.")
else:
    n_missing_dates = len(df) - len(valid_dates)
    if n_missing_dates:
        print(f"\nNote: {n_missing_dates} record(s) without a valid Lab Date are excluded from this report.")

    first_date, last_date = valid_dates.min(), valid_dates.max()
    print(f"\nDate Range: {first_date.date()} to {last_date.date()}")
    print(f"\nTotal days covered: {(last_date - first_date).days} days")
    print("\nTests by Year:")
    print(valid_dates.dt.year.rename('Year').value_counts().sort_index())
    print("\nTests by Month:")
    month_counts = valid_dates.dt.month.rename('Month').value_counts().sort_index()
    for month, count in month_counts.items():
        print(f"  {month_names[int(month) - 1]}: {count}")
    print("\nTests by Day of Week:")
    print(valid_dates.dt.day_name().rename('DayOfWeek').value_counts())

    # Visualization
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    # Timeline of tests (one point per calendar day that has tests)
    daily_counts = valid_dates.groupby(valid_dates.dt.date).size()
    axes[0, 0].plot(daily_counts.index, daily_counts.values, marker='o', markersize=3, linewidth=1, color='darkblue')
    axes[0, 0].set_xlabel('Date', fontsize=12)
    axes[0, 0].set_ylabel('Number of Tests', fontsize=12)
    axes[0, 0].set_title('Lab Tests Over Time', fontsize=14, fontweight='bold')
    axes[0, 0].tick_params(axis='x', rotation=45)
    axes[0, 0].grid(True, alpha=0.3)

    # Tests by month (months without tests are drawn as zero-height bars)
    axes[0, 1].bar(range(1, 13), [month_counts.get(i, 0) for i in range(1, 13)], color='orange', alpha=0.7)
    axes[0, 1].set_xticks(range(1, 13))
    axes[0, 1].set_xticklabels(month_names, rotation=45, ha='right')
    axes[0, 1].set_xlabel('Month', fontsize=12)
    axes[0, 1].set_ylabel('Number of Tests', fontsize=12)
    axes[0, 1].set_title('Lab Tests by Month', fontsize=14, fontweight='bold')
    axes[0, 1].grid(True, alpha=0.3, axis='y')

    # Tests by day of week, in calendar order
    day_counts = valid_dates.dt.day_name().value_counts().reindex(day_order, fill_value=0)
    axes[1, 0].bar(range(len(day_order)), day_counts.values, color='teal', alpha=0.7)
    axes[1, 0].set_xticks(range(len(day_order)))
    axes[1, 0].set_xticklabels(day_order, rotation=45, ha='right')
    axes[1, 0].set_xlabel('Day of Week', fontsize=12)
    axes[1, 0].set_ylabel('Number of Tests', fontsize=12)
    axes[1, 0].set_title('Lab Tests by Day of Week', fontsize=14, fontweight='bold')
    axes[1, 0].grid(True, alpha=0.3, axis='y')

    # Histogram of dates
    axes[1, 1].hist(valid_dates, bins=30, edgecolor='black', alpha=0.7, color='crimson')
    axes[1, 1].set_xlabel('Date', fontsize=12)
    axes[1, 1].set_ylabel('Frequency', fontsize=12)
    axes[1, 1].set_title('Lab Date Distribution (Histogram)', fontsize=14, fontweight='bold')
    axes[1, 1].tick_params(axis='x', rotation=45)
    axes[1, 1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()


## 7. Summary Statistics


In [None]:
# Text-only roll-up of the figures reported by the earlier cells.
rule = "=" * 70

print(rule)
print(" " * 15 + "üìä COMPREHENSIVE SUMMARY REPORT üìä")
print(rule)

# --- Dataset overview -------------------------------------------------------
print(f"\n{rule}")
print("üìã DATASET OVERVIEW")
print(rule)
print(f"  Total Records: {len(df):,}")
print(f"  Date Range: {df['Lab Date'].min().date()} to {df['Lab Date'].max().date()}")
print(f"  Analysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# --- Patients ---------------------------------------------------------------
print(f"\n{rule}")
print("üë• PATIENT ID ANALYSIS")
print(rule)
print(f"  Total Unique Patients: {df['Patient ID'].nunique():,}")
tests_per_patient = df.groupby('Patient ID').size()
print(f"  Average Tests per Patient: {tests_per_patient.mean():.2f}")
print(f"  Median Tests per Patient: {tests_per_patient.median():.2f}")
print(f"  Patients with Most Tests: {tests_per_patient.max()} tests")

# --- Lab types --------------------------------------------------------------
print(f"\n{rule}")
print("üß™ LAB TYPE ANALYSIS")
print(rule)
print(f"  Total Unique Lab Types: {df['Lab Type'].nunique()}")
most_common = df['Lab Type'].mode()[0]
most_common_count = df['Lab Type'].value_counts().max()
most_common_pct = (most_common_count / len(df)) * 100
print(f"  Most Common Lab Type: {most_common} ({most_common_count} tests, {most_common_pct:.1f}%)")
print("  Lab Type Distribution:")
# Only the five most frequent lab types are listed
for lab_type, count in df['Lab Type'].value_counts().head(5).items():
    pct = (count / len(df)) * 100
    print(f"    - {lab_type}: {count} ({pct:.1f}%)")

# --- Lab values -------------------------------------------------------------
print(f"\n{rule}")
print("üìà LAB VALUE STATISTICS")
print(rule)
summary_values = df['Lab Value']
print(f"  Mean: {summary_values.mean():.2f}")
print(f"  Median: {summary_values.median():.2f}")
print(f"  Standard Deviation: {summary_values.std():.2f}")
print(f"  Minimum: {summary_values.min():.2f}")
print(f"  Maximum: {summary_values.max():.2f}")
print(f"  Range: {summary_values.max() - summary_values.min():.2f}")
print(f"  Skewness: {summary_values.skew():.3f}")

# --- Dates ------------------------------------------------------------------
print(f"\n{rule}")
print("üìÖ TEMPORAL ANALYSIS")
print(rule)
date_range_days = (df['Lab Date'].max() - df['Lab Date'].min()).days
print(f"  Date Range: {date_range_days} days")
# +1 so that the first and last day are both counted
print(f"  Average Tests per Day: {len(df) / (date_range_days + 1):.2f}")
print(f"  Total Tests: {len(df):,}")

# Calendar day with the highest number of tests
daily_counts = df.groupby(df['Lab Date'].dt.date).size()
busiest_day = daily_counts.idxmax()
busiest_count = daily_counts.max()
print(f"  Busiest Day: {busiest_day} ({busiest_count} tests)")

print(f"\n{rule}")
print("‚úÖ Analysis Complete! All distribution reports and visualizations are shown above.")
print(rule)
