In [1]:
import pandas as pd
import numpy as np
import os
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)
sns.set_style('whitegrid')
print("Libraries imported.")

Libraries imported.


In [None]:
# --- Load Clinical Data ---
# Using the path you specified.
clinical_path = os.path.join("..", "data", "raw", "lung_cancer", "luad_tcga_pan_can_atlas_2018_clinical_data.tsv")
df_clinical = pd.read_csv(clinical_path, sep='\t', comment='#')
print(f"Clinical data shape: {df_clinical.shape}")

# --- Load Mutation Data ---
mutation_path = os.path.join("..", "data", "raw", "cancer", "data_mutations.txt")
df_mutations = pd.read_csv(mutation_path, sep='\t', comment='#')
print(f"Mutation data shape: {df_mutations.shape}")

Clinical data shape: (566, 63)


  df_mutations = pd.read_csv(mutation_path, sep='\t', comment='#')


Mutation data shape: (243229, 114)


In [6]:
# --- Prepare the Clinical Data (Definitive Fix) ---

# Check available columns from the file header
df_clinical_check = pd.read_csv(clinical_path, sep='\t', comment='#', nrows=0)
available_cols = df_clinical_check.columns.tolist()

# Define the ideal features we want.
desired_features = [
    'Patient ID',
    'Sex',
    'Age at Diagnosis',
    'Smoking History',
    'Overall Survival Status'
]

# Select only the features that are ACTUALLY AVAILABLE in the file.
features_to_use = [col for col in desired_features if col in available_cols]
print(f"Using available clinical features: {features_to_use}")

# Select only the available columns from the full dataframe
df_clinical_selected = df_clinical[features_to_use].copy() # Use .copy() to avoid SettingWithCopyWarning

# Rename columns for clarity
rename_dict = {
    'Patient ID': 'Sample ID',
    'Age at Diagnosis': 'Age',
    'Overall Survival Status': 'DEATH_EVENT'
}
columns_to_rename = {k: v for k, v in rename_dict.items() if k in df_clinical_selected.columns}
df_clinical_selected = df_clinical_selected.rename(columns=columns_to_rename)


# Clean the target variable (0 = LIVING, 1 = DECEASED)
if 'DEATH_EVENT' in df_clinical_selected.columns:
    df_clinical_selected = df_clinical_selected.dropna(subset=['DEATH_EVENT'])
    df_clinical_selected['DEATH_EVENT'] = df_clinical_selected['DEATH_EVENT'].apply(lambda x: 1 if x == '1:DECEASED' else 0)

# --- THIS IS THE FIX ---
# Define the final, essential columns that MUST NOT have missing values.
# These are the names AFTER renaming.
essential_cols = ['Sample ID', 'Sex']
# Add 'Age' to the list only if it exists in the dataframe now.
if 'Age' in df_clinical_selected.columns:
    essential_cols.append('Age')
    
# Drop rows where any of the essential columns are missing.
df_clinical_selected = df_clinical_selected.dropna(subset=essential_cols)

print("\nCleaned clinical data shape:", df_clinical_selected.shape)
display(df_clinical_selected.head())

Using available clinical features: ['Patient ID', 'Sex', 'Overall Survival Status']

Cleaned clinical data shape: (514, 3)


Unnamed: 0,Sample ID,Sex,DEATH_EVENT
0,TCGA-05-4244,Male,0
1,TCGA-05-4249,Male,0
2,TCGA-05-4250,Female,1
3,TCGA-05-4382,Male,0
4,TCGA-05-4384,Male,0


In [7]:
# List of key lung cancer driver genes
key_genes = ['EGFR', 'KRAS', 'ALK', 'TP53', 'STK11', 'KEAP1', 'BRAF', 'ROS1', 'MET']

# For each patient, we want to know if they have a mutation in any of these key genes.
# We can use one-hot encoding on the mutation data.
df_mutations_pivot = pd.crosstab(
    index=df_mutations['Tumor_Sample_Barcode'],
    columns=df_mutations['Hugo_Symbol']
)

# Keep only the columns for our key genes
# Some key genes might not be in this specific dataset, so we find the intersection
genes_in_data = [gene for gene in key_genes if gene in df_mutations_pivot.columns]
df_key_gene_mutations = df_mutations_pivot[genes_in_data]

# The values are counts of mutations. We only care if there is at least one (present/absent).
df_key_gene_mutations = (df_key_gene_mutations > 0).astype(int)

# Rename the index to match our clinical data for merging
df_key_gene_mutations.index.name = 'Sample ID'

print("Engineered gene mutation features:")
display(df_key_gene_mutations.head())

Engineered gene mutation features:


Hugo_Symbol,EGFR,KRAS,ALK,TP53,STK11,KEAP1,BRAF,ROS1,MET
Sample ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
TCGA-05-4244-01,0,1,0,0,0,0,0,0,0
TCGA-05-4249-01,0,1,0,0,0,0,1,0,0
TCGA-05-4250-01,0,1,0,0,0,0,0,0,0
TCGA-05-4382-01,1,0,1,1,0,0,1,1,0
TCGA-05-4384-01,0,0,0,1,0,1,0,0,0


In [8]:
# Set index on clinical data for merging
df_clinical_selected = df_clinical_selected.set_index('Sample ID')

# Merge the two dataframes. We use a left join to keep all patients from the clinical data.
df_merged = pd.merge(df_clinical_selected, df_key_gene_mutations, left_index=True, right_index=True, how='left')

# After merging, patients who had no mutations in our key genes will have NaN.
# We should fill these with 0 (meaning no mutation was found).
df_merged[genes_in_data] = df_merged[genes_in_data].fillna(0).astype(int)

print("Final merged data shape:", df_merged.shape)
display(df_merged.head())

Final merged data shape: (514, 11)


Unnamed: 0_level_0,Sex,DEATH_EVENT,EGFR,KRAS,ALK,TP53,STK11,KEAP1,BRAF,ROS1,MET
Sample ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
TCGA-05-4244,Male,0,0,0,0,0,0,0,0,0,0
TCGA-05-4249,Male,0,0,0,0,0,0,0,0,0,0
TCGA-05-4250,Female,1,0,0,0,0,0,0,0,0,0
TCGA-05-4382,Male,0,0,0,0,0,0,0,0,0,0
TCGA-05-4384,Male,0,0,0,0,0,0,0,0,0,0


In [10]:
# --- Final Preprocessing (Corrected for Cell 6) ---

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- One-Hot Encode Categorical Features ---
# First, identify which of our desired categorical columns actually exist in df_merged
categorical_cols_to_encode = []
if 'Sex' in df_merged.columns:
    categorical_cols_to_encode.append('Sex')
if 'Smoking History' in df_merged.columns:
    categorical_cols_to_encode.append('Smoking History')

print(f"Columns to one-hot encode: {categorical_cols_to_encode}")

# Only perform get_dummies if there are columns to encode
if categorical_cols_to_encode:
    df_final = pd.get_dummies(df_merged, columns=categorical_cols_to_encode, drop_first=True)
else:
    df_final = df_merged.copy() # If no categorical columns, just use the dataframe as is

# --- Define Features (X) and Target (y) ---
X = df_final.drop('DEATH_EVENT', axis=1)
y = df_final['DEATH_EVENT']

# --- Split the Data ---
print("\nTarget variable distribution:")
print(y.value_counts(normalize=True))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Scale Numerical Features ---
# Only scale 'Age' if it exists.
if 'Age' in X_train.columns:
    scaler = StandardScaler()
    X_train['Age'] = scaler.fit_transform(X_train[['Age']])
    X_test['Age'] = scaler.transform(X_test[['Age']])
else:
    scaler = None # No scaler needed if Age column is missing

print("\nData splitting and scaling complete.")
display(X_train.head())

Columns to one-hot encode: ['Sex']

Target variable distribution:
DEATH_EVENT
0    0.638132
1    0.361868
Name: proportion, dtype: float64

Data splitting and scaling complete.


Unnamed: 0_level_0,EGFR,KRAS,ALK,TP53,STK11,KEAP1,BRAF,ROS1,MET,Sex_Male
Sample ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
TCGA-44-7669,0,0,0,0,0,0,0,0,0,True
TCGA-44-5645,0,0,0,0,0,0,0,0,0,False
TCGA-55-8512,0,0,0,0,0,0,0,0,0,True
TCGA-55-8204,0,0,0,0,0,0,0,0,0,False
TCGA-86-A4JF,0,0,0,0,0,0,0,0,0,True


In [None]:
# Create directories
CANCER_PROCESSED_DIR = os.path.join("..", "data", "processed", "lung_cancer")
CANCER_MODELS_DIR = os.path.join("..", "models", "lung_cancer")
os.makedirs(CANCER_PROCESSED_DIR, exist_ok=True)
os.makedirs(CANCER_MODELS_DIR, exist_ok=True)

# Save data
joblib.dump(X_train, os.path.join(CANCER_PROCESSED_DIR, "X_train.joblib"))
joblib.dump(X_test, os.path.join(CANCER_PROCESSED_DIR, "X_test.joblib"))
joblib.dump(y_train, os.path.join(CANCER_PROCESSED_DIR, "y_train.joblib"))
joblib.dump(y_test, os.path.join(CANCER_PROCESSED_DIR, "y_test.joblib"))

# Save the scaler and the final list of feature columns
joblib.dump(scaler, os.path.join(CANCER_MODELS_DIR, "lung_cancer_scaler.joblib"))
joblib.dump(X.columns.tolist(), os.path.join(CANCER_MODELS_DIR, "lung_cancer_features.joblib"))

print("\nLung cancer data artifacts saved successfully!")


Lung cancer data artifacts saved successfully!
