In [1]:
import pandas as pd

# Load the tab-separated clinical data export into a DataFrame.
tsv_path = 'breast_msk_2025_clinical_data.tsv'
df = pd.read_csv(tsv_path, sep='\t')

# Show the available column names (header line first, then the list).
print("Nama kolom:", df.columns.tolist(), sep="\n")

Nama kolom:
['Study ID', 'Patient ID', 'Sample ID', 'Cancer Type', 'Cancer Type Detailed', 'Ethnicity', 'Fraction Genome Altered', 'Sex', 'Gene Panel', 'Metastatic Site', 'MSI Comment', 'MSI Score', 'MSI Type', 'Mutation Count', 'Oncotree Code', 'Overall Survival (Months)', 'Overall Survival Status', 'Primary Tumor Site', 'Race', 'Sample Class', 'Number of Samples Per Patient', 'Sample coverage', 'Sample Type', 'Somatic Status', 'Tumor Purity']


In [2]:
# Encode 'Overall Survival Status' as a numeric event flag:
# '1:DECEASED' -> 1, '0:LIVING' -> 0. Values that are not in the mapping
# (including missing values) become NaN.
status_col = 'Overall Survival Status'
status_codes = {'1:DECEASED': 1, '0:LIVING': 0}

# The column is overwritten in place, so guard against re-execution of this
# cell: once the labels have been replaced by numbers, mapping a second time
# would find no matching keys and silently turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df[status_col]):
    df[status_col] = df[status_col].map(status_codes)

# Show the distinct values after mapping as a sanity check
print("Nilai unik di kolom 'Overall Survival Status' setelah mapping:")
print(df[status_col].unique())

Nilai unik di kolom 'Overall Survival Status' setelah mapping:
[ 1.  0. nan]


In [6]:
from lifelines import CoxPHFitter
import pandas as pd

# Categorical covariates that must be one-hot encoded before modelling
categorical_cols = ['Cancer Type Detailed', 'Primary Tumor Site', 'Metastatic Site', 'MSI Type']

# Duration, event flag, and every covariate used by the model
columns_needed = [
    'Overall Survival (Months)',
    'Overall Survival Status',
    *categorical_cols,
    'Mutation Count',
    'Fraction Genome Altered',
]

# Keep complete cases only, then make sure duration is float and the event flag is int
data_cox = (
    df[columns_needed]
    .dropna()
    .astype({'Overall Survival (Months)': float, 'Overall Survival Status': int})
)

# Expand the categorical columns into dummy variables; drop_first=True omits
# the first level of each variable
data_cox_encoded = pd.get_dummies(data_cox, columns=categorical_cols, drop_first=True)

# Report the sample count and resulting columns after encoding
print(f"Jumlah sampel setelah drop NaN dan encoding: {len(data_cox_encoded)}")
print("Kolom setelah encoding:")
print(data_cox_encoded.columns.tolist())

# Fit the Cox proportional hazards model. A penalizer is added to cope with
# collinearity among the covariates. `fit` returns the fitter itself, so the
# construction and fitting can be chained.
cph = CoxPHFitter(penalizer=0.1).fit(
    data_cox_encoded,
    duration_col='Overall Survival (Months)',
    event_col='Overall Survival Status',
)

# Print the model summary table
print("\nSummary Model Cox:")
print(cph.summary)

# Test the proportional hazards assumption for the fitted model.
print("\nUji Asumsi Proportional Hazards:")
try:
    # check_assumptions prints its own report (test table plus advice for any
    # covariate that fails at the given p-value threshold). Its return value is
    # only a list of plot axes, which is empty unless show_plots=True, so it is
    # kept for later use but not printed (printing it just emitted "[]").
    results = cph.check_assumptions(data_cox_encoded, p_value_threshold=0.05)
except Exception as e:
    # Best effort: report the failure without aborting the rest of the notebook
    print(f"Error dalam uji asumsi: {e}")

Jumlah sampel setelah drop NaN dan encoding: 1929
Kolom setelah encoding:
['Overall Survival (Months)', 'Overall Survival Status', 'Mutation Count', 'Fraction Genome Altered', 'Cancer Type Detailed_Adenomyoepithelioma of the Breast', 'Cancer Type Detailed_Breast Invasive Cancer, NOS', 'Cancer Type Detailed_Breast Invasive Carcinoma, NOS', 'Cancer Type Detailed_Breast Invasive Carcinosarcoma, NOS', 'Cancer Type Detailed_Breast Invasive Ductal Carcinoma', 'Cancer Type Detailed_Breast Invasive Lobular Carcinoma', 'Cancer Type Detailed_Breast Invasive Mixed Mucinous Carcinoma', 'Cancer Type Detailed_Breast Mixed Ductal and Lobular Carcinoma', 'Cancer Type Detailed_Breast Neoplasm, NOS', 'Cancer Type Detailed_Cancer of Unknown Primary', 'Cancer Type Detailed_Invasive Breast Carcinoma', 'Cancer Type Detailed_Juvenile Secretory Carcinoma of the Breast', 'Cancer Type Detailed_Metaplastic Breast Cancer', 'Cancer Type Detailed_Metaplastic Squamous Cell Carcinoma', 'Primary Tumor Site_Cancer of U

0,1
null_distribution,chi squared
degrees_of_freedom,1
model,<lifelines.CoxPHFitter: fitted with 1929 total...
test_name,proportional_hazard_test

Unnamed: 0,Unnamed: 1,test_statistic,p,-log2(p)
Cancer Type Detailed_Adenomyoepithelioma of the Breast,km,0.56,0.45,1.14
Cancer Type Detailed_Adenomyoepithelioma of the Breast,rank,0.59,0.44,1.17
"Cancer Type Detailed_Breast Invasive Cancer, NOS",km,0.0,0.97,0.04
"Cancer Type Detailed_Breast Invasive Cancer, NOS",rank,0.0,0.98,0.03
"Cancer Type Detailed_Breast Invasive Carcinoma, NOS",km,1.41,0.24,2.09
"Cancer Type Detailed_Breast Invasive Carcinoma, NOS",rank,1.43,0.23,2.11
"Cancer Type Detailed_Breast Invasive Carcinosarcoma, NOS",km,0.61,0.44,1.2
"Cancer Type Detailed_Breast Invasive Carcinosarcoma, NOS",rank,0.53,0.47,1.1
Cancer Type Detailed_Breast Invasive Ductal Carcinoma,km,0.16,0.68,0.55
Cancer Type Detailed_Breast Invasive Ductal Carcinoma,rank,0.17,0.68,0.56




1. Variable 'Metastatic Site_Chest Wall' failed the non-proportional test: p-value is 0.0109.

   Advice: with so few unique values (only 2), you can include `strata=['Metastatic Site_Chest
Wall', ...]` in the call in `.fit`. See documentation in link [E] below.

---
[A]  https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html
[B]  https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html#Bin-variable-and-stratify-on-it
[C]  https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html#Introduce-time-varying-covariates
[D]  https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html#Modify-the-functional-form
[E]  https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html#Stratification

[]
