In [639]:
import pandas as pd
import numpy as np

# Read the breast-cancer records from the CSV (path is relative to the working directory)
data_set = pd.read_csv(filepath_or_buffer='./dataset-breast-cancer.csv')

# Show the first five rows as a quick sanity check of the load
data_set.head(n=5)


Unnamed: 0,Patient_ID,Month_of_Birth,Age,Sex,Occupation,T_Stage,N_Stage,6th_Stage,Differentiated,Grade,A_Stage,Tumor_Size,Estrogen_Status,Progesterone_Status,Regional_Node_Examined,Reginol_Node_Positive,Survival_Months,Mortality_Status
0,A0012,12,68.0,Female,Teaching,T1,N1,IIA,Poorly differentiated,3,Regional,4.0,Positive,Positive,24.0,1,60,Alive
1,A0013,12,50.0,Female,Medical,T2,N2,IIIA,Moderately differentiated,2,Regional,35.0,Positive,Positive,14.0,5,62,Alive
2,A0014,11,58.0,Female,Engineering,T3,N3,IIIC,Moderately differentiated,2,Regional,63.0,Positive,Positive,14.0,7,75,Alive
3,A0015,3,58.0,Female,Technology,T1,N1,IIA,Poorly differentiated,3,Regional,18.0,Positive,Positive,2.0,1,84,Alive
4,A0016,1,47.0,Female,Multimedia,T2,N1,IIB,Poorly differentiated,3,Regional,41.0,Positive,Positive,3.0,1,50,Alive


In [640]:
# dataset information: prints each column's name, non-null count and dtype,
# which shows which columns contain missing values before any cleaning
data_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024 entries, 0 to 4023
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Patient_ID              4024 non-null   object 
 1   Month_of_Birth          4024 non-null   int64  
 2   Age                     4015 non-null   float64
 3   Sex                     4020 non-null   object 
 4   Occupation              43 non-null     object 
 5   T_Stage                 4024 non-null   object 
 6   N_Stage                 4024 non-null   object 
 7   6th_Stage               4024 non-null   object 
 8   Differentiated          4024 non-null   object 
 9   Grade                   4024 non-null   int64  
 10  A_Stage                 4024 non-null   object 
 11  Tumor_Size              4021 non-null   float64
 12  Estrogen_Status         4024 non-null   object 
 13  Progesterone_Status     4024 non-null   object 
 14  Regional_Node_Examined  4023 non-null   

In [641]:
# Drop the columns that are not used in the rest of the preprocessing: the
# patient identifier, month of birth, sex, and occupation (only 43 of the
# 4024 occupation values are non-null, per the info() output above).
# errors='ignore' keeps this cell re-runnable: data_set is reassigned here, so
# a second run would otherwise raise KeyError for the already-removed columns.
data_set = data_set.drop(
    columns=['Patient_ID', 'Month_of_Birth', 'Sex', 'Occupation'],
    errors='ignore',
)

# confirm the remaining columns and their dtypes
data_set.dtypes

Age                       float64
T_Stage                    object
N_Stage                    object
6th_Stage                  object
Differentiated             object
Grade                       int64
A_Stage                    object
Tumor_Size                float64
Estrogen_Status            object
Progesterone_Status        object
Regional_Node_Examined    float64
Reginol_Node_Positive       int64
Survival_Months             int64
Mortality_Status           object
dtype: object

In [642]:
# Summary statistics of the numeric columns (count, mean, std, min, quartiles,
# max); the min/max rows are a quick way to spot out-of-range values
descriptive_stats = data_set.describe(percentiles=[0.25, 0.5, 0.75])
print(descriptive_stats)



               Age        Grade   Tumor_Size  Regional_Node_Examined  \
count  4015.000000  4024.000000  4021.000000             4023.000000   
mean     54.107098     2.150596    30.419299               14.373602   
std      11.715528     0.638234    21.161080                8.129293   
min     -50.000000     1.000000   -75.000000                1.000000   
25%      47.000000     2.000000    16.000000                9.000000   
50%      54.000000     2.000000    25.000000               14.000000   
75%      61.000000     3.000000    38.000000               19.000000   
max     502.000000     4.000000   140.000000               61.000000   

       Reginol_Node_Positive  Survival_Months  
count            4024.000000      4024.000000  
mean                4.158052        71.472167  
std                 5.109331        25.361855  
min                 1.000000         1.000000  
25%                 1.000000        56.000000  
50%                 2.000000        73.000000  
75%            

In [643]:
# Number of missing entries in each column
data_set.isna().sum()


Age                       9
T_Stage                   0
N_Stage                   0
6th_Stage                 0
Differentiated            0
Grade                     0
A_Stage                   0
Tumor_Size                3
Estrogen_Status           0
Progesterone_Status       0
Regional_Node_Examined    1
Reginol_Node_Positive     0
Survival_Months           0
Mortality_Status          0
dtype: int64

In [644]:
# Count rows that exactly repeat an earlier row.
# NOTE(review): Patient_ID has already been dropped, so rows are compared on the
# remaining columns only; this cell just counts such rows, it does not remove
# them -- confirm whether the flagged row should be dropped.
data_set.duplicated(keep='first').sum()

1

In [645]:
from sklearn.impute import SimpleImputer
# numeric columns that contain missing values
columns_to_impute = ['Age', 'Tumor_Size', 'Regional_Node_Examined']

# The summary statistics show impossible entries (Age min -50 and max 502,
# Tumor_Size min -75). Mark them as missing so they are imputed as well,
# instead of skewing the column mean and surviving into the saved datasets.
# The 0-120 age window is a deliberately wide plausibility bound; adjust if needed.
invalid_age = (data_set['Age'] < 0) | (data_set['Age'] > 120)
invalid_tumor_size = data_set['Tumor_Size'] < 0
data_set.loc[invalid_age, 'Age'] = np.nan
data_set.loc[invalid_tumor_size, 'Tumor_Size'] = np.nan

# create the imputer (replaces NaN with the column mean)
mean_imputer = SimpleImputer(strategy='mean')

# apply imputation to the selected columns
data_set[columns_to_impute] = mean_imputer.fit_transform(data_set[columns_to_impute])

# confirm that no missing values remain
data_set.isnull().sum()


Age                       0
T_Stage                   0
N_Stage                   0
6th_Stage                 0
Differentiated            0
Grade                     0
A_Stage                   0
Tumor_Size                0
Estrogen_Status           0
Progesterone_Status       0
Regional_Node_Examined    0
Reginol_Node_Positive     0
Survival_Months           0
Mortality_Status          0
dtype: int64

In [646]:
# dimensions of the dataset as (rows, columns)
print(f"Dataset Shape: {data_set.shape}")

Dataset Shape: (4024, 14)


In [647]:
import plotly.express as px

# Aggregate to one row per label first: px.bar without a y column draws one
# unit-height segment per dataframe row rather than a single bar per status.
# Labels are plotted as-is, so differently-cased spellings show up as
# separate bars at this point in the notebook.
mortality_counts = (
    data_set['Mortality_Status']
    .value_counts()
    .rename_axis('Mortality_Status')
    .reset_index(name='Count')
)

fig = px.bar(
    mortality_counts,
    x='Mortality_Status',
    y='Count',
    title='Mortality Status Distribution',
)
fig.update_traces(marker_color='black')
fig.update_layout(
    title_font_size=20,
    xaxis_title='Mortality Status',
    yaxis_title='Count',
    template='plotly_white',
)
fig.show()


In [648]:
# Frequency of each raw label in Mortality_Status
mortality_label_counts = data_set['Mortality_Status'].value_counts()
print(mortality_label_counts)

# Summary statistics of Survival_Months
survival_summary = data_set['Survival_Months'].describe()
print(survival_summary)


Mortality_Status
Alive    3399
Dead      598
DEAD       10
dead        8
ALIVE       5
alive       3
ALive       1
Name: count, dtype: int64
count    4024.000000
mean       71.472167
std        25.361855
min         1.000000
25%        56.000000
50%        73.000000
75%        90.000000
max       760.000000
Name: Survival_Months, dtype: float64


In [649]:
# Normalize the labels: trim surrounding whitespace, then capitalize
# (first letter upper-case, the rest lower-case) so every spelling
# collapses to 'Alive' or 'Dead'
status_labels = data_set['Mortality_Status'].str.strip()
data_set['Mortality_Status'] = status_labels.str.capitalize()

# Re-check the label frequencies after normalization
print(data_set['Mortality_Status'].value_counts())


Mortality_Status
Alive    3408
Dead      616
Name: count, dtype: int64


In [650]:
# Binary-encode the target: Alive -> 0, Dead -> 1.
# Series.map turns any label missing from the mapping into NaN.
status_codes = {'Alive': 0, 'Dead': 1}
data_set['Mortality_Status'] = data_set['Mortality_Status'].map(status_codes)

In [651]:
# Categorical features to expand into indicator columns
categorical_columns = ['T_Stage', 'N_Stage', 'Estrogen_Status', 'Progesterone_Status']

# One-hot encode; drop_first=True omits the first level of each feature so the
# indicator columns are not perfectly collinear.
# NOTE(review): 6th_Stage, Differentiated and A_Stage are not in the list and
# remain object columns -- confirm that this is intended.
data_set = pd.get_dummies(
    data=data_set,
    columns=categorical_columns,
    drop_first=True,
)


In [652]:
# inspect the dtypes after encoding: the new one-hot indicator columns are bool
data_set.dtypes

Age                             float64
6th_Stage                        object
Differentiated                   object
Grade                             int64
A_Stage                          object
Tumor_Size                      float64
Regional_Node_Examined          float64
Reginol_Node_Positive             int64
Survival_Months                   int64
Mortality_Status                  int64
T_Stage_T2                         bool
T_Stage_T3                         bool
T_Stage_T4                         bool
N_Stage_N2                         bool
N_Stage_N3                         bool
Estrogen_Status_Positive           bool
Progesterone_Status_Positive       bool
dtype: object

In [653]:
# Classification set: drop Survival_Months (presumably the regression target --
# confirm) and any row whose Mortality_Status is missing
classification_data_set = (
    data_set
    .drop(columns=['Survival_Months'])
    .dropna(subset=['Mortality_Status'])
)

# Regression set: keep only deceased patients (Mortality_Status == 1).
# .copy() makes the subset an independent frame so later column assignments
# on it do not raise SettingWithCopyWarning.
regression_data_set = data_set[data_set['Mortality_Status'] == 1].copy()

# Save cleaned datasets to CSV
classification_data_set.to_csv('classification_data_cleaned.csv', index=False)
regression_data_set.to_csv('regression_data_cleaned.csv', index=False)

# Print dataset shapes for confirmation
print("Classification Dataset Shape:", classification_data_set.shape)
print("Regression Dataset Shape (Dead):", regression_data_set.shape)


Classification Dataset Shape: (4024, 16)
Regression Dataset Shape (Dead): (616, 17)
