# Data Exploration and Analysis

This notebook provides an overview of the two datasets used in our biomedical active learning study:
1. **Blood-Brain Barrier Penetration (BBBP)** - Molecular classification dataset
2. **Breast Cancer Wisconsin** - Clinical classification dataset

## Overview

We will explore:
- Dataset loading and inspection
- Class distributions
- Feature importance analysis
- Correlation analysis with target variables

In [ ]:
import sys
import os
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier

from data.loader import DatasetLoader

# Set up plotting style
# Select matplotlib's 'default' style sheet first, then set seaborn's "husl" palette.
# NOTE(review): the palette is applied after the style sheet, presumably so the
# style call cannot override it - confirm before reordering these two lines.
plt.style.use('default')
sns.set_palette("husl")

# Visible confirmation that the import/setup cell ran to completion.
print("Libraries imported successfully!")

## 1. Dataset Loading and Inspection

In [ ]:
# Function to load the dataset
def load_dataset(file_path):
    """
    Load dataset from either xlsx or csv file into a pandas DataFrame
    Parameters:
    file_path (str or path-like): Path to the input file. The extension is
        matched case-insensitively, so 'data.CSV' is accepted as well.
    Returns:
    pd.DataFrame: Loaded dataset
    Raises:
    ValueError: If the extension is neither .xlsx nor .csv
    """
    # Only the extension is inspected here; a missing or unreadable file is
    # reported by the pandas reader itself.
    normalized = str(file_path).lower()
    if normalized.endswith('.xlsx'):
        return pd.read_excel(file_path)
    if normalized.endswith('.csv'):
        return pd.read_csv(file_path)
    # Fail loudly instead of falling through and returning None, which would
    # only surface later as an unrelated error on the first DataFrame access.
    raise ValueError(
        f"Unsupported file type for {file_path!r}: expected a .xlsx or .csv file"
    )

# Function to inspect the generated DataFrame
def inspect_dataframe(df, dataset_name):
    """
    Print a per-column overview of a DataFrame: columns, shape, first values,
    NaN count, dtype, and either the value range or the number of unique values
    Parameters:
    df (pd.DataFrame): Input DataFrame to inspect
    dataset_name (str): Name of the dataset for clear reporting
    Returns:
    None: The report is written to stdout only
    """
    print(f"\nInspecting {dataset_name} dataset:")
    print("-" * 50)
    print("DataFrame columns:", df.columns.tolist())
    print(f"DataFrame shape: {df.shape}")
    print("\nColumn summaries:")
    for col in df.columns:
        series = df[col]
        print(f"\n{col}:")
        print("First 5 values:", series.head().tolist())
        print(f"NaN count: {series.isna().sum()}")
        print(f"Data type: {series.dtype}")
        # Treat every NumPy signed/unsigned integer and float width as numeric,
        # not just int64/float64 (e.g. float32 or int32 columns). Booleans,
        # objects, datetimes and pandas extension dtypes keep the
        # unique-value summary, as before.
        is_numeric = isinstance(series.dtype, np.dtype) and series.dtype.kind in "iuf"
        if is_numeric:
            print(f"Min: {series.min():.3f}, Max: {series.max():.3f}")
        else:
            print(f"Unique values: {series.nunique()}")

print("Helper functions defined!")

In [ ]:
# Read the BBBP spreadsheet into a DataFrame and print its per-column overview
bbb_df = load_dataset(file_path='../data/raw/BBBP.xlsx')
inspect_dataframe(df=bbb_df, dataset_name='Blood-Brain Barrier Penetration')

In [ ]:
# Read the Breast Cancer CSV into a DataFrame and print its per-column overview
bc_df = load_dataset(file_path='../data/raw/breast-cancer.csv')
inspect_dataframe(df=bc_df, dataset_name='Breast Cancer')

## 2. Basic Data Preprocessing

Let's prepare the target variables for analysis:

In [ ]:
# --- BBB dataset: show the raw label counts, then encode them as 0/1 ---
print("BBB Dataset class distribution:")
bbb_raw_counts = bbb_df['Class'].value_counts()
print(bbb_raw_counts)

# 'BBB+' becomes the positive class (1), 'BBB-' the negative class (0)
class_mapping = {'BBB+': 1, 'BBB-': 0}
bbb_df['target'] = bbb_df['Class'].map(class_mapping)
print("\nAfter mapping:")
bbb_target_counts = bbb_df['target'].value_counts()
print(bbb_target_counts)

# --- Breast Cancer dataset: same procedure on the 'diagnosis' column ---
separator = "\n" + "=" * 50
print(separator)
print("Breast Cancer Dataset class distribution:")
bc_raw_counts = bc_df['diagnosis'].value_counts()
print(bc_raw_counts)

# 'M' becomes the positive class (1), 'B' the negative class (0)
bc_df['target'] = bc_df['diagnosis'].map({'M': 1, 'B': 0})
print("\nAfter mapping:")
bc_target_counts = bc_df['target'].value_counts()
print(bc_target_counts)

## 3. Class Distribution Visualization

In [ ]:
def plot_class_distributions_side_by_side(y1, y2, dataset_names=("Dataset 1", "Dataset 2")):
    """
    Plots the class distributions for two datasets side by side (1x2 plot) using matplotlib.

    Parameters:
        y1 (array-like): Target vector for the first dataset (left panel).
        y2 (array-like): Target vector for the second dataset (right panel).
        dataset_names (sequence of str): Panel titles; only the first two
            entries are used. The default is a tuple rather than a list so
            the default object cannot be mutated between calls.

    Raises:
        IndexError: If dataset_names has fewer than two entries.
    """
    # Font sizes for the per-panel text elements (the main title is set separately)
    subplot_title_fontsize = 15
    label_fontsize = 15
    tick_fontsize = 15

    # One entry per panel: (targets, title, bar colour).
    panel_specs = (
        (y1, dataset_names[0], 'skyblue'),
        (y2, dataset_names[1], 'salmon'),
    )

    # Compute the class frequencies up front, before any figure is created.
    frequencies = [np.unique(targets, return_counts=True) for targets, _, _ in panel_specs]

    # Create a figure with 1 row and 2 columns.
    fig, axes = plt.subplots(1, 2, figsize=(9, 5.5))

    for ax, (classes, counts), (_, title, bar_color) in zip(axes, frequencies, panel_specs):
        # Bars sit at 0..k-1 and are labelled with the actual class values.
        positions = np.arange(len(classes))
        ax.bar(positions, counts, color=bar_color, edgecolor='black', width=0.7)
        ax.set_title(title, fontsize=subplot_title_fontsize)
        ax.set_xlabel('Class', fontsize=label_fontsize)
        ax.set_ylabel('Count', fontsize=label_fontsize)
        ax.set_xticks(positions)
        ax.set_xticklabels(classes, fontsize=tick_fontsize)
        ax.tick_params(axis='y', labelsize=tick_fontsize)

    fig.suptitle("Class Distribution of Datasets", fontsize=16, y=0.98)

    # Leave room at the top for the main title, then widen the gap between panels
    # so the right panel's y-axis label does not crowd the left panel.
    plt.tight_layout(rect=[0, 0, 1, 0.98])
    fig.subplots_adjust(wspace=0.3)
    plt.show()

print("Plotting class distributions side by side...")
# Left panel: Breast Cancer labels; right panel: BBB labels
plot_class_distributions_side_by_side(
    y1=bc_df['target'],
    y2=bbb_df['target'],
    dataset_names=["Breast Cancer", "Blood Brain Barrier Permeability"],
)

## 4. Basic Statistical Summary

Let's examine the basic statistics for the numerical features in the Breast Cancer dataset:

In [ ]:
# Numeric columns of the Breast Cancer data, with the derived 'target' label removed
bc_numerical = bc_df.select_dtypes(include=[np.number]).drop(columns=['target'])
n_samples, n_features = bc_numerical.shape

print("Breast Cancer Dataset - Numerical Features Summary:")
print("=" * 60)
print(f"Number of numerical features: {n_features}")
print(f"Number of samples: {n_samples}")
print("\nBasic statistics:")
summary_stats = bc_numerical.describe().round(3)
print(summary_stats)

# Report only the features that actually contain missing entries
print("\nMissing values per feature:")
missing_counts = bc_numerical.isnull().sum()
features_with_gaps = missing_counts[missing_counts > 0]
if features_with_gaps.empty:
    print("No missing values found!")
else:
    print(features_with_gaps)

# Absolute correlation of every numeric feature with the target, strongest first
print("\nCorrelation with target variable (top 10):")
target_corr = bc_df[[*bc_numerical.columns, 'target']].corr()['target']
correlations = target_corr.abs().sort_values(ascending=False)
# 11 rows: 'target' correlates perfectly with itself, followed by the top 10 features
print(correlations.head(11).round(3))

## 5. Dataset Summary

**Key Findings:**

### Blood-Brain Barrier Penetration (BBBP) Dataset:
- **Structure**: Molecular dataset with SMILES strings
- **Target**: Binary classification (BBB+ vs BBB-)
- **Features**: SMILES molecular representations
- **Preprocessing needed**: Molecular featurization using RDKit and Mol2vec

### Breast Cancer Wisconsin Dataset:
- **Structure**: Clinical dataset with numerical features
- **Target**: Binary classification (Malignant vs Benign)
- **Features**: 30 numerical clinical measurements
- **Preprocessing needed**: Standard scaling

Both datasets are well-suited for binary classification and active learning experiments. The BBB dataset requires specialized molecular featurization, while the Breast Cancer dataset needs only standard preprocessing (feature scaling).

**Next Steps:**
1. Molecular featurization for BBB dataset
2. Standard preprocessing for both datasets  
3. Train/test splitting
4. Feature scaling (fit the scaler on the training split only, then apply it to the test split)