In [1]:
import pandas as pd
import numpy as np

# Load the raw dataset (path is relative to the notebook's working directory).
data_set = pd.read_csv('./dataset-breast-cancer.csv')

# Preview the first rows. Jupyter only renders the *last* bare expression of a
# cell, so intermediate previews must go through display() (a builtin inside
# IPython/Jupyter kernels) or their result is silently discarded.
display(data_set.head())

# Column names, non-null counts and dtypes (info() prints its report itself).
data_set.info()

# Summary statistics for every column, numeric and non-numeric alike.
display(data_set.describe(include='all'))

# Dimensions of the dataset as (rows, columns).
print("Dataset Shape:", data_set.shape)





<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024 entries, 0 to 4023
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Patient_ID              4024 non-null   object 
 1   Month_of_Birth          4024 non-null   int64  
 2   Age                     4015 non-null   float64
 3   Sex                     4020 non-null   object 
 4   Occupation              43 non-null     object 
 5   T_Stage                 4024 non-null   object 
 6   N_Stage                 4024 non-null   object 
 7   6th_Stage               4024 non-null   object 
 8   Differentiated          4024 non-null   object 
 9   Grade                   4024 non-null   int64  
 10  A_Stage                 4024 non-null   object 
 11  Tumor_Size              4021 non-null   float64
 12  Estrogen_Status         4024 non-null   object 
 13  Progesterone_Status     4024 non-null   object 
 14  Regional_Node_Examined  4023 non-null   

In [2]:
# Inspect the two target variables and list the available columns:
#   1. frequency of each raw Mortality_Status label,
#   2. descriptive statistics of Survival_Months,
#   3. all column names.
# The blank lines between the original print calls are not part of the output,
# so a single print with a newline separator emits exactly the same text.
print(
    data_set['Mortality_Status'].value_counts(),
    data_set['Survival_Months'].describe(),
    data_set.columns.tolist(),
    sep="\n",
)






Mortality_Status
Alive    3399
Dead      598
DEAD       10
dead        8
ALIVE       5
alive       3
ALive       1
Name: count, dtype: int64
count    4024.000000
mean       71.472167
std        25.361855
min         1.000000
25%        56.000000
50%        73.000000
75%        90.000000
max       760.000000
Name: Survival_Months, dtype: float64
['Patient_ID', 'Month_of_Birth', 'Age', 'Sex', 'Occupation', 'T_Stage', 'N_Stage', '6th_Stage', 'Differentiated', 'Grade', 'A_Stage', 'Tumor_Size', 'Estrogen_Status', 'Progesterone_Status', 'Regional_Node_Examined', 'Reginol_Node_Positive', 'Survival_Months', 'Mortality_Status']


In [3]:
# Missing values per column. This is not the last expression of the cell, so
# it has to be passed to display() explicitly; as a bare expression its result
# would be computed and then thrown away.
display(data_set.isnull().sum())

# Number of fully duplicated rows (rendered as the cell's result).
data_set.duplicated().sum()


0

In [4]:
# Identify categorical columns: list the dtype of every column; the ones
# reported as `object` are the string-valued candidates for encoding.
data_set.dtypes


Patient_ID                 object
Month_of_Birth              int64
Age                       float64
Sex                        object
Occupation                 object
T_Stage                    object
N_Stage                    object
6th_Stage                  object
Differentiated             object
Grade                       int64
A_Stage                    object
Tumor_Size                float64
Estrogen_Status            object
Progesterone_Status        object
Regional_Node_Examined    float64
Reginol_Node_Positive       int64
Survival_Months             int64
Mortality_Status           object
dtype: object

In [5]:
# Normalise the free-text label columns: trim surrounding whitespace and force
# "Xxxx" casing so that variants such as 'DEAD', 'dead' and 'ALive' collapse
# into a single spelling each.
for text_column in ('Mortality_Status', 'Sex'):
    data_set[text_column] = data_set[text_column].str.strip().str.capitalize()

# Encode the target numerically: Alive -> 0, Dead -> 1.
# Series.map turns any label missing from the dictionary into NaN.
data_set['Mortality_Status'] = data_set['Mortality_Status'].map(
    {'Alive': 0, 'Dead': 1}
)



In [6]:
# Nominal features that are expanded into indicator (dummy) columns.
categorical_columns = [
    'Sex',
    'T_Stage',
    'N_Stage',
    'Estrogen_Status',
    'Progesterone_Status',
]

# One-hot encode them; drop_first=True removes the first level of every
# feature so the resulting dummies are not perfectly collinear.
# NOTE(review): Sex contains a few missing values and get_dummies is called
# with its default dummy_na=False, so those rows presumably end up with all
# Sex dummies False - confirm this is acceptable.
data_set = pd.get_dummies(
    data_set,
    columns=categorical_columns,
    drop_first=True,
)


In [7]:
# Drop the month-of-birth column.
data_set = data_set.drop(columns=['Month_of_Birth'])

# Show the remaining columns and their dtypes. As the last expression of the
# cell this is the only value Jupyter renders, which is why no separate
# non-final preview call is kept above.
data_set.dtypes


Patient_ID                       object
Age                             float64
Occupation                       object
6th_Stage                        object
Differentiated                   object
Grade                             int64
A_Stage                          object
Tumor_Size                      float64
Regional_Node_Examined          float64
Reginol_Node_Positive             int64
Survival_Months                   int64
Mortality_Status                  int64
Sex_Female                         bool
T_Stage_T2                         bool
T_Stage_T3                         bool
T_Stage_T4                         bool
N_Stage_N2                         bool
N_Stage_N3                         bool
Estrogen_Status_Positive           bool
Progesterone_Status_Positive       bool
dtype: object

In [8]:
# One-hot encode Occupation, dropping the first level as the reference category.
# NOTE(review): the info() report lists only 43 non-null Occupation values out
# of 4024 rows; with the default dummy_na=False the remaining rows presumably
# get all Occupation_* dummies False - confirm these sparse columns are wanted.
data_set = pd.get_dummies(data_set, columns=['Occupation'], drop_first=True)


In [9]:
# Re-check column dtypes after the Occupation dummies were added.
data_set.dtypes

Patient_ID                       object
Age                             float64
6th_Stage                        object
Differentiated                   object
Grade                             int64
A_Stage                          object
Tumor_Size                      float64
Regional_Node_Examined          float64
Reginol_Node_Positive             int64
Survival_Months                   int64
Mortality_Status                  int64
Sex_Female                         bool
T_Stage_T2                         bool
T_Stage_T3                         bool
T_Stage_T4                         bool
N_Stage_N2                         bool
N_Stage_N3                         bool
Estrogen_Status_Positive           bool
Progesterone_Status_Positive       bool
Occupation_Agriculture             bool
Occupation_Arts                    bool
Occupation_Business                bool
Occupation_Cleaning                bool
Occupation_Construction            bool
Occupation_Consultancy             bool


In [10]:
# Classification dataset: keep only rows with a known target, then remove the
# patient identifier and Survival_Months.
classification_data_set = (
    data_set
    .dropna(subset=['Mortality_Status'])
    .drop(columns=['Patient_ID', 'Survival_Months'])
)

# Regression dataset: restricted to rows with Mortality_Status == 1 (the code
# assigned to 'Dead'), with the patient identifier removed. Alive patients are
# NOT included here.
regression_data_set = (
    data_set
    .loc[data_set['Mortality_Status'] == 1]
    .drop(columns=['Patient_ID'])
)

# Persist both cleaned datasets as CSV files, without the index column.
classification_data_set.to_csv('classification_data_cleaned.csv', index=False)
regression_data_set.to_csv('regression_data_cleaned.csv', index=False)

# Report the resulting shapes as a quick sanity check.
print("Classification Dataset Shape:", classification_data_set.shape)
print("Regression Dataset Shape (Dead):", regression_data_set.shape)


Classification Dataset Shape: (4024, 56)
Regression Dataset Shape (Dead): (616, 57)
