In [2]:
"""
Environment Configuration
------------------------
This notebook sets up the development environment, including:
- Environment type (local vs. Colab)
- GitHub repository configuration
- Google Drive data directory mounting
- Device configuration (CPU/GPU)
"""

# Core environment constants
ENVIRONMENT = 'colab'
BRANCH_NAME = 'main'
REPOSITORY_OWNER = 'tristan-day-research'
REPOSITORY = 'CIBMTR_cancer_outcome_predictions'
GOOGLE_DRIVE_DIR = 'CIBMTR_data'

# Configure environment based on runtime type
if ENVIRONMENT == 'colab':
    from google.colab import userdata

    # Start from the content directory
    %cd /content

    # Get GitHub token and clone repository
    token = userdata.get('GITHUB_PAT')
    !git clone -b {BRANCH_NAME} https://{token}@github.com/{REPOSITORY_OWNER}/{REPOSITORY}.git

    # Change to repository directory
    %cd {REPOSITORY}

    # Import and run environment setup
    from src import setup
    env_config = setup.configure_environment(
        environment=ENVIRONMENT,
        google_drive_dir=GOOGLE_DRIVE_DIR
    )

# Set commonly used configuration values for easier access
data_path = env_config.data_dir
device = env_config.device

/content
Cloning into 'CIBMTR_cancer_outcome_predictions'...
remote: Enumerating objects: 69, done.[K
remote: Counting objects: 100% (69/69), done.[K
remote: Compressing objects: 100% (47/47), done.[K
remote: Total 69 (delta 27), reused 53 (delta 17), pack-reused 0 (from 0)[K
Receiving objects: 100% (69/69), 24.44 KiB | 6.11 MiB/s, done.
Resolving deltas: 100% (27/27), done.
/content/CIBMTR_cancer_outcome_predictions
Mounted at /content/drive




In [6]:
import pandas as pd


def _load_csv(stem):
    """Read `<data_path>/<stem>.csv` into a DataFrame."""
    return pd.read_csv(f'{data_path}/{stem}.csv')


# Competition files, read in the same order as before
data_dict_df = _load_csv('data_dictionary')
sample_submission_df = _load_csv('sample_submission')
test_df = _load_csv('test')
train_df = _load_csv('train')

In [None]:
# sample_submission_df.head()
# test_df.head()
# train_df.head()

In [14]:
from src.eda import analysis

# Build one overview per split: training data first, then test data
build_overview = analysis.initial_data_overview
train_overview = build_overview(train_df, "Training Data Overview")
test_overview = build_overview(test_df, "Test Data Overview")

In [28]:
# List the sections available in the training overview
overview_sections = train_overview.keys()
print(overview_sections)

# Other sections can be inspected the same way:
# print(train_overview['Missing Values'])
# print('Numeric Summary\n', train_overview['Numeric Summary'])

# Number of distinct values per categorical column
categorical_cardinality = train_overview['Categorical Cardinality']
print(categorical_cardinality)


dict_keys(['Basic Info', 'Data Types', 'Missing Values', 'Numeric Summary', 'Categorical Cardinality'])
{'dri_score': 11, 'psych_disturb': 3, 'cyto_score': 7, 'diabetes': 3, 'tbi_status': 8, 'arrhythmia': 3, 'graft_type': 2, 'vent_hist': 2, 'renal_issue': 3, 'pulm_severe': 3, 'prim_disease_hct': 18, 'cmv_status': 4, 'tce_imm_match': 8, 'rituximab': 2, 'prod_type': 2, 'cyto_score_detail': 5, 'conditioning_intensity': 6, 'ethnicity': 3, 'obesity': 3, 'mrd_hct': 2, 'in_vivo_tcd': 2, 'tce_match': 4, 'hepatic_severe': 3, 'prior_tumor': 3, 'peptic_ulcer': 3, 'gvhd_proph': 17, 'rheum_issue': 3, 'sex_match': 4, 'race_group': 6, 'hepatic_mild': 3, 'tce_div_match': 4, 'donor_related': 3, 'melphalan_dose': 2, 'cardiac': 3, 'pulm_moderate': 3}


In [29]:
# Get all categorical (object type) columns
categorical_columns = train_df.select_dtypes(include=['object']).columns

print('Categorical Variable Distributions:\n')
for col in categorical_columns:
    print(f'\n{col} Distribution:')
    print(train_df[col].value_counts(normalize=True).mul(100).round(1))
    print(f'Missing values: {train_df[col].isnull().sum()}')
    print('-' * 50)

Categorical Variable Distributions:


dri_score Distribution:
dri_score
Intermediate                                         36.4
N/A - pediatric                                      16.7
High                                                 16.4
N/A - non-malignant indication                        8.5
TBD cytogenetics                                      7.0
Low                                                   6.7
High - TED AML case <missing cytogenetics             4.9
Intermediate - TED AML case <missing cytogenetics     1.7
N/A - disease not classifiable                        0.9
Very high                                             0.7
Missing disease status                                0.0
Name: proportion, dtype: float64
Missing values: 154
--------------------------------------------------

psych_disturb Distribution:
psych_disturb
No          86.0
Yes         13.4
Not done     0.5
Name: proportion, dtype: float64
Missing values: 2062
--------------------------------------