In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import warnings
warnings.filterwarnings("ignore")

from datetime import datetime
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [3]:
df=pd.read_csv('lung_cancer_mortality_data_test_v2.csv')
df.head()

Unnamed: 0,id,age,gender,country,diagnosis_date,cancer_stage,beginning_of_treatment_date,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,survived
0,1,64.0,Female,Slovakia,2016-04-07,Stage IV,2016-04-09,No,Former Smoker,21.2,191,0,0,0,0,Surgery,2017-02-10,0
1,2,50.0,Male,Slovenia,2023-04-22,Stage III,2023-05-05,Yes,Current Smoker,36.4,258,1,0,0,0,Chemotherapy,2024-08-23,0
2,3,65.0,Male,Italy,2023-04-07,Stage II,2023-04-12,Yes,Former Smoker,18.9,174,1,0,1,0,Chemotherapy,2025-03-24,1
3,4,51.0,Male,Latvia,2016-02-07,Stage I,2016-03-08,No,Passive Smoker,38.8,279,1,0,0,0,Combined,2017-03-01,0
4,5,37.0,Female,Spain,2023-12-01,Stage II,2023-12-04,Yes,Former Smoker,37.7,273,0,0,0,0,Combined,2025-07-16,0


In [9]:
df.columns

Index(['id', 'age', 'gender', 'country', 'diagnosis_date', 'cancer_stage',
       'beginning_of_treatment_date', 'family_history', 'smoking_status',
       'bmi', 'cholesterol_level', 'hypertension', 'asthma', 'cirrhosis',
       'other_cancer', 'treatment_type', 'end_treatment_date', 'survived'],
      dtype='object')

In [10]:
df['treatment_type'].unique()


array(['Surgery', 'Chemotherapy', 'Combined', 'Radiation'], dtype=object)

In [11]:
df.duplicated().sum()

0

In [12]:
df.isna().sum()

id                             0
age                            0
gender                         0
country                        0
diagnosis_date                 0
cancer_stage                   0
beginning_of_treatment_date    0
family_history                 0
smoking_status                 0
bmi                            0
cholesterol_level              0
hypertension                   0
asthma                         0
cirrhosis                      0
other_cancer                   0
treatment_type                 0
end_treatment_date             0
survived                       0
dtype: int64

In [14]:
df.dtypes

id                               int64
age                            float64
gender                          object
country                         object
diagnosis_date                  object
cancer_stage                    object
beginning_of_treatment_date     object
family_history                  object
smoking_status                  object
bmi                            float64
cholesterol_level                int64
hypertension                     int64
asthma                           int64
cirrhosis                        int64
other_cancer                     int64
treatment_type                  object
end_treatment_date              object
survived                         int64
dtype: object

In [15]:
df.info

<bound method DataFrame.info of        id   age  gender   country diagnosis_date cancer_stage  \
0       1  64.0  Female  Slovakia     2016-04-07     Stage IV   
1       2  50.0    Male  Slovenia     2023-04-22    Stage III   
2       3  65.0    Male     Italy     2023-04-07     Stage II   
3       4  51.0    Male    Latvia     2016-02-07      Stage I   
4       5  37.0  Female     Spain     2023-12-01     Stage II   
..    ...   ...     ...       ...            ...          ...   
995   996  59.0  Female   Finland     2023-05-17      Stage I   
996   997  46.0    Male   Belgium     2014-12-03    Stage III   
997   998  48.0  Female   Finland     2022-11-02     Stage II   
998   999  65.0  Female  Bulgaria     2015-12-25      Stage I   
999  1000  58.0  Female     Spain     2015-08-09      Stage I   

    beginning_of_treatment_date family_history  smoking_status   bmi  \
0                    2016-04-09             No   Former Smoker  21.2   
1                    2023-05-05            

In [16]:
df.describe()

Unnamed: 0,id,age,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,survived
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,500.5,54.651,30.3472,233.174,0.748,0.484,0.25,0.098,0.212
std,288.819436,10.014364,8.287035,43.613299,0.434379,0.499994,0.433229,0.297463,0.408929
min,1.0,21.0,16.0,150.0,0.0,0.0,0.0,0.0,0.0
25%,250.75,48.0,23.3,195.0,0.0,0.0,0.0,0.0,0.0
50%,500.5,55.0,30.2,241.0,1.0,0.0,0.0,0.0,0.0
75%,750.25,61.0,37.5,272.0,1.0,1.0,0.25,0.0,0.0
max,1000.0,90.0,45.0,300.0,1.0,1.0,1.0,1.0,1.0


In [94]:
# making a copy in order not to tamper with the original dataset
df1 = df.copy()

In [95]:
#converting date in datetime
df1['diagnosis_date'] = pd.to_datetime(df1['diagnosis_date'])
df1['end_treatment_date'] = pd.to_datetime(df1['end_treatment_date'])


In [96]:
# convert age data type to integer
df1['age'] = df1['age'].astype(int)

In [97]:
df1.dtypes

id                                      int64
age                                     int32
gender                                 object
country                                object
diagnosis_date                 datetime64[ns]
cancer_stage                           object
beginning_of_treatment_date            object
family_history                         object
smoking_status                         object
bmi                                   float64
cholesterol_level                       int64
hypertension                            int64
asthma                                  int64
cirrhosis                               int64
other_cancer                            int64
treatment_type                         object
end_treatment_date             datetime64[ns]
survived                                int64
dtype: object

In [98]:
df1.head()


Unnamed: 0,id,age,gender,country,diagnosis_date,cancer_stage,beginning_of_treatment_date,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,survived
0,1,64,Female,Slovakia,2016-04-07,Stage IV,2016-04-09,No,Former Smoker,21.2,191,0,0,0,0,Surgery,2017-02-10,0
1,2,50,Male,Slovenia,2023-04-22,Stage III,2023-05-05,Yes,Current Smoker,36.4,258,1,0,0,0,Chemotherapy,2024-08-23,0
2,3,65,Male,Italy,2023-04-07,Stage II,2023-04-12,Yes,Former Smoker,18.9,174,1,0,1,0,Chemotherapy,2025-03-24,1
3,4,51,Male,Latvia,2016-02-07,Stage I,2016-03-08,No,Passive Smoker,38.8,279,1,0,0,0,Combined,2017-03-01,0
4,5,37,Female,Spain,2023-12-01,Stage II,2023-12-04,Yes,Former Smoker,37.7,273,0,0,0,0,Combined,2025-07-16,0


In [99]:
# creating a new column and calculate number of days under treatment
df1['days_under_treatment'] = (df1['end_treatment_date'] - df1['diagnosis_date']).dt.days

In [100]:
df1.head()

Unnamed: 0,id,age,gender,country,diagnosis_date,cancer_stage,beginning_of_treatment_date,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,survived,days_under_treatment
0,1,64,Female,Slovakia,2016-04-07,Stage IV,2016-04-09,No,Former Smoker,21.2,191,0,0,0,0,Surgery,2017-02-10,0,309
1,2,50,Male,Slovenia,2023-04-22,Stage III,2023-05-05,Yes,Current Smoker,36.4,258,1,0,0,0,Chemotherapy,2024-08-23,0,489
2,3,65,Male,Italy,2023-04-07,Stage II,2023-04-12,Yes,Former Smoker,18.9,174,1,0,1,0,Chemotherapy,2025-03-24,1,717
3,4,51,Male,Latvia,2016-02-07,Stage I,2016-03-08,No,Passive Smoker,38.8,279,1,0,0,0,Combined,2017-03-01,0,388
4,5,37,Female,Spain,2023-12-01,Stage II,2023-12-04,Yes,Former Smoker,37.7,273,0,0,0,0,Combined,2025-07-16,0,593


In [101]:
print(df1.columns)

Index(['id', 'age', 'gender', 'country', 'diagnosis_date', 'cancer_stage',
       'beginning_of_treatment_date', 'family_history', 'smoking_status',
       'bmi', 'cholesterol_level', 'hypertension', 'asthma', 'cirrhosis',
       'other_cancer', 'treatment_type', 'end_treatment_date', 'survived',
       'days_under_treatment'],
      dtype='object')


In [102]:
df1['family_history'] = df1['family_history'].map({'Yes': 1, 'No': 0})

In [1]:
df1=df1.drop(['id','gender','country','diagnosis_date','cancer_stage','beginning_of_treatment_date','family_history','smoking_status','treatment_type','end_treatment_date'],axis=1)
df1.corr()['survived']

NameError: name 'df1' is not defined

In [104]:
df1.describe()


Unnamed: 0,age,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,survived,days_under_treatment
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,54.651,30.3472,233.174,0.748,0.484,0.25,0.098,0.212,453.87
std,10.014364,8.287035,43.613299,0.434379,0.499994,0.433229,0.297463,0.408929,137.452728
min,21.0,16.0,150.0,0.0,0.0,0.0,0.0,0.0,185.0
25%,48.0,23.3,195.0,0.0,0.0,0.0,0.0,0.0,357.75
50%,55.0,30.2,241.0,1.0,0.0,0.0,0.0,0.0,453.5
75%,61.0,37.5,272.0,1.0,1.0,0.25,0.0,0.0,542.25
max,90.0,45.0,300.0,1.0,1.0,1.0,1.0,1.0,729.0
