In [1]:
# """
# Environment Configuration
# ------------------------
# This notebook sets up the development environment, including:
# - Environment type (local vs. Colab)
# - GitHub repository configuration
# - Google Drive data directory mounting
# - Device configuration (CPU/GPU)
# """

# # Core environment constants
# ENVIRONMENT = 'colab'
# BRANCH_NAME = 'main'
# REPOSITORY_OWNER = 'tristan-day-research'
# REPOSITORY = 'CIBMTR_cancer_outcome_predictions'
# GOOGLE_DRIVE_DIR = 'CIBMTR_data'

# # Configure environment based on runtime type
# if ENVIRONMENT == 'colab':
#     from google.colab import userdata

#     # Start from the content directory
#     %cd /content

#     # Get GitHub token and clone repository
#     token = userdata.get('GITHUB_PAT')
#     !git clone -b {BRANCH_NAME} https://{token}@github.com/{REPOSITORY_OWNER}/{REPOSITORY}.git

#     # Change to repository directory
#     %cd {REPOSITORY}

#     # Import and run environment setup
#     from src import setup
#     env_config = setup.configure_environment(
#         environment=ENVIRONMENT,
#         google_drive_dir=GOOGLE_DRIVE_DIR
#     )

# # Set commonly used configuration values for easier access
# data_path = env_config.data_dir
# device = env_config.device

# Core environment constants
ENVIRONMENT = 'colab'
BRANCH_NAME = 'main'
REPOSITORY_OWNER = 'tristan-day-research'
REPOSITORY = 'CIBMTR_cancer_outcome_predictions'
GOOGLE_DRIVE_DIR = 'CIBMTR_data'

# Configure environment based on runtime type
if ENVIRONMENT == 'colab':
    from google.colab import userdata
    import os

    # Set repository name as environment variable for notebook setup
    os.environ['REPOSITORY'] = REPOSITORY

    # Start from the content directory
    %cd /content

    # Get GitHub token and clone repository
    token = userdata.get('GITHUB_PAT')
    !git clone -b {BRANCH_NAME} https://{token}@github.com/{REPOSITORY_OWNER}/{REPOSITORY}.git

    # Change to repository directory
    %cd {REPOSITORY}

    # Import and run environment setup
    from src import setup
    env_config = setup.configure_environment(
        environment=ENVIRONMENT,
        google_drive_dir=GOOGLE_DRIVE_DIR
    )

# Set commonly used configuration values for easier access
data_path = env_config.data_dir
device = env_config.device

/content
Cloning into 'CIBMTR_cancer_outcome_predictions'...
remote: Enumerating objects: 131, done.[K
remote: Counting objects: 100% (131/131), done.[K
remote: Compressing objects: 100% (93/93), done.[K
remote: Total 131 (delta 45), reused 98 (delta 24), pack-reused 0 (from 0)[K
Receiving objects: 100% (131/131), 98.83 KiB | 2.25 MiB/s, done.
Resolving deltas: 100% (45/45), done.
/content/CIBMTR_cancer_outcome_predictions
Mounted at /content/drive




# Load Data

In [2]:
import pandas as pd

# Read the four competition CSVs from the data directory in one pass.
# The order of the file stems matches the order of the target names.
data_dict_df, sample_submission_df, test_df, train_df = (
    pd.read_csv(f'{data_path}/{stem}.csv')
    for stem in ('data_dictionary', 'sample_submission', 'test', 'train')
)

# EDA

In [None]:
# # Get all categorical (object type) columns
# categorical_columns = train_df.select_dtypes(include=['object']).columns

# print('Categorical Variable Distributions:\n')
# for col in categorical_columns:
#     print(f'\n{col} Distribution:')
#     print(train_df[col].value_counts(normalize=True).mul(100).round(1))
#     print(f'Missing values: {train_df[col].isnull().sum()}')
#     print('-' * 50)

Categorical Variable Distributions:


dri_score Distribution:
dri_score
Intermediate                                         36.4
N/A - pediatric                                      16.7
High                                                 16.4
N/A - non-malignant indication                        8.5
TBD cytogenetics                                      7.0
Low                                                   6.7
High - TED AML case <missing cytogenetics             4.9
Intermediate - TED AML case <missing cytogenetics     1.7
N/A - disease not classifiable                        0.9
Very high                                             0.7
Missing disease status                                0.0
Name: proportion, dtype: float64
Missing values: 154
--------------------------------------------------

psych_disturb Distribution:
psych_disturb
No          86.0
Yes         13.4
Not done     0.5
Name: proportion, dtype: float64
Missing values: 2062
--------------------------------------

In [22]:
# from src.eda import analysis
# import importlib
# importlib.reload(analysis)  # Note: reload analysis, not eda

# output_dir='results/EDA'

# # analysis.comprehensive_eda(train_df)
# # analysis.initial_data_overview(train_df, output_dir)
# analysis.analyze_categorical_distributions(train_df, output_dir)

# Analyze missing values in data

In [26]:


# Re-order the saved missing-value summary so the columns with the highest
# 'Missing Percentage' come first, then overwrite the same file.
csv_name = '/content/CIBMTR_cancer_outcome_predictions/results/EDA/missing_values.csv'

df = pd.read_csv(csv_name)
df = df.sort_values('Missing Percentage', ascending=False)  # Largest share of missing values first

# index=False keeps the row index out of the file. Without it, every run of this
# cell writes the index as an extra unnamed column, which the next read_csv
# picks up as 'Unnamed: 0', so the file gains one junk column per re-run.
df.to_csv(csv_name, index=False)

In [30]:
import importlib

from src.eda import missing_values

# Reload the missing_values module so edits made to it since the first import
# take effect without restarting the kernel.
importlib.reload(missing_values)

# Directory handed to the analysis as its output location.
output_dir = 'results/EDA'

missing_values.analyze_missing_patterns(train_df, output_dir)


In [22]:
import importlib

from src.eda import missing_values

# Reload the missing_values module so edits made to it since the first import
# take effect without restarting the kernel.
importlib.reload(missing_values)

# Directory handed to both calls below as their output location.
output_dir = 'results/EDA/missing_value_analysis'

# A single-feature check was tried here earlier and is currently disabled:
#   missing_values.chi_square_missing_by_group(train_df, 'hla_high_res_10', 'race_group')

# Column whose groups are compared.
group_variable = 'race_group'

# These are the top features with missing values
features_to_analyze = [
    'tce_match',
    'mrd_hct',
    'cyto_score_detail',
    'tce_div_match',
    'tce_imm_match',
    'cyto_score',
    'hla_high_res_10',
    'hla_high_res_8',
    'hla_high_res_6',
    'hla_match_dqb1_high',
    'hla_low_res_10',
    'conditioning_intensity',
    'hla_match_c_high',
    'hla_match_a_high',
    'hla_nmdp_6',
    'hla_match_dqb1_low',
    'hla_match_b_high',
    'hla_low_res_8',
    'hla_match_drb1_high',
    'hla_low_res_6',
]

# Run the group-difference analysis first, then its visualization, on the same inputs.
missing_values.analyze_group_differences(
    train_df,
    group_variable,
    features_to_analyze,
    output_dir=output_dir,
)
missing_values.visualize_group_differences(
    train_df,
    group_variable,
    features_to_analyze,
    output_dir=output_dir,
)


results/EDA/missing_value_analysis
tce_match 6.065719569121874e-210
mrd_hct 0.0002647971673972512
cyto_score_detail 2.9912561827733095e-120
tce_div_match 5.896528378111704e-136
tce_imm_match 3.805168518693835e-125
cyto_score 5.874445800410101e-106
hla_high_res_10 1.2124063947067201e-116
hla_high_res_8 1.1159005098018191e-115
hla_high_res_6 1.5502003070922266e-108
hla_match_dqb1_high 7.856720449187174e-87
hla_low_res_10 7.35009191602592e-95
conditioning_intensity 1.848001895532784e-134
hla_match_c_high 7.277356179002075e-96
hla_match_a_high 3.186738434882811e-86
hla_nmdp_6 1.972605617354519e-99
hla_match_dqb1_low 1.561837255536488e-71
hla_match_b_high 8.487693085539501e-85
hla_low_res_8 3.151605189171145e-88
hla_match_drb1_high 3.9244589933749467e-88
hla_low_res_6 2.4216314608639876e-86


  plt.xlabel('Maximum Difference Between Groups')
  plt.ylabel('-log10(p-value)')
