In [2]:
# Notebook-wide dependencies: database access, numerics, and plotting.
import warnings

import mysql.connector
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# plotly used to be imported as `plt`, which rebound the alias and made
# matplotlib.pyplot unreachable. It is now imported under its own name so
# `plt` consistently refers to matplotlib.pyplot.
import plotly

# Installs a blanket "ignore" filter: every warning raised after this point is
# hidden. NOTE(review): this also hides deprecation notices from pandas and the
# DB driver; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [3]:
import os

# Connection settings are read from the environment so credentials do not have
# to be committed with the notebook. Each fallback equals the value that was
# previously hard-coded, so an unconfigured local setup behaves as before.
conn = mysql.connector.connect(
    host=os.environ.get("MYSQL_HOST", "localhost"),
    user=os.environ.get("MYSQL_USER", "root"),
    password=os.environ.get("MYSQL_PASSWORD", ""),
    database=os.environ.get("MYSQL_DATABASE", "cancer_patients"),
)

In [4]:
# Load every row of the `cancer_p` table into a DataFrame through the open
# connection, then leave the frame as the cell's displayed value.
df = pd.read_sql_query(
    sql="select * from cancer_p",
    con=conn,
)
df

Unnamed: 0,Patient_ID,Age,Gender,Country_Region,Year,Genetic_Risk,Air_Pollution,Alcohol_Use,Smoking,Obesity_Level,Cancer_Type,Cancer_Stage,Treatment_Cost_USD,Survival_Years,Target_Severity_Score
0,PT0000000,71,Male,UK,2021,6.4,2.8,9.5,0.9,8.7,Lung,Stage III,62913.44,5.9,4.92
1,PT0000001,34,Male,China,2021,1.3,4.5,3.7,3.9,6.3,Leukemia,Stage 0,12573.41,4.7,4.65
2,PT0000002,80,Male,Pakistan,2023,7.4,7.9,2.4,4.7,0.1,Breast,Stage II,6984.33,7.1,5.84
3,PT0000003,40,Male,UK,2015,1.7,2.9,4.8,3.5,2.7,Colon,Stage I,67446.25,1.6,3.12
4,PT0000004,43,Female,Brazil,2017,5.1,2.8,2.3,6.7,0.5,Skin,Stage III,77977.12,2.9,3.62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8544,PT0008544,67,Male,Australia,2016,6.6,1.6,7.0,3.7,0.8,Cervical,Stage III,82753.84,9.1,3.77
8545,PT0008545,24,Other,Canada,2015,4.3,4.4,3.2,5.9,6.2,Breast,Stage II,24711.66,2.2,5.31
8546,PT0008546,54,Female,UK,2015,0.0,1.6,5.0,0.5,9.7,Prostate,Stage III,34005.58,10.0,3.38
8547,PT0008547,65,Female,Pakistan,2018,8.2,7.9,6.0,5.7,9.3,Prostate,Stage II,94277.82,5.6,5.91


# 1. Data Description 


| Column Name               | Description                                                                                                                | Data Type           | Example Values                                  |
| ------------------------- | -------------------------------------------------------------------------------------------------------------------------- | ------------------- | ----------------------------------------------- |
| **Country_Region**        | The country or region where the patient was registered/treated. Helps identify geographical cancer distribution trends.    | Categorical         | UK, China, Pakistan, Brazil, Australia          |
| **Year**                  | Year of diagnosis/treatment record (2014–2024). Useful for temporal analysis and trend visualization.                      | Numerical (Integer) | 2021, 2017, 2015                                |
| **Genetic_Risk**          | Represents hereditary cancer susceptibility level (scale value). Higher values indicate greater genetic risk.              | Numerical (Float)   | 6.4, 1.3, 7.4                                   |
| **Air_Pollution**         | Air pollution exposure index or pollution score in patient’s region. Used to evaluate environmental impact on cancer risk. | Numerical (Float)   | 2.8, 4.5, 7.9                                   |
| **Alcohol_Use**           | Alcohol consumption score/level of the patient or average regional consumption. Higher values indicate more usage.         | Numerical (Float)   | 9.5, 3.7, 2.4                                   |
| **Smoking**               | Measures smoking exposure/habit level. Higher scores imply more smoking history.                                           | Numerical (Float)   | 0.9, 3.9, 4.7                                   |
| **Obesity_Level**         | Indicates BMI/Obesity index. Higher values reflect greater obesity risk.                                                   | Numerical (Float)   | 8.7, 6.3, 0.1                                   |
| **Cancer_Type**           | Type of cancer diagnosed in the patient. Useful for category-wise comparison.                                              | Categorical         | Lung, Leukemia, Breast, Colon                   |
| **Cancer_Stage**          | Stage of cancer at diagnosis. Higher stage = advanced disease.                                                             | Categorical         | Stage 0, Stage I, Stage II, Stage III, Stage IV |
| **Treatment_Cost_USD**    | Total treatment expenditure in USD. Helpful for cost analysis and healthcare economics.                                    | Numerical (Float)   | 62913.44, 12573.41, 6984.33                     |
| **Survival_Years**        | Number of years the patient survived after diagnosis/treatment. Used for survival analysis.                                | Numerical (Float)   | 5.9, 4.7, 7.1                                   |
| **Target_Severity_Score** | Final severity score combining risk factors. Can be used as target variable for prediction models.                         | Numerical (Float)   | 4.92, 4.65, 5.84                                |


In [5]:
# Preview the first five rows (five is head()'s default row count).
df.head()

Unnamed: 0,Patient_ID,Age,Gender,Country_Region,Year,Genetic_Risk,Air_Pollution,Alcohol_Use,Smoking,Obesity_Level,Cancer_Type,Cancer_Stage,Treatment_Cost_USD,Survival_Years,Target_Severity_Score
0,PT0000000,71,Male,UK,2021,6.4,2.8,9.5,0.9,8.7,Lung,Stage III,62913.44,5.9,4.92
1,PT0000001,34,Male,China,2021,1.3,4.5,3.7,3.9,6.3,Leukemia,Stage 0,12573.41,4.7,4.65
2,PT0000002,80,Male,Pakistan,2023,7.4,7.9,2.4,4.7,0.1,Breast,Stage II,6984.33,7.1,5.84
3,PT0000003,40,Male,UK,2015,1.7,2.9,4.8,3.5,2.7,Colon,Stage I,67446.25,1.6,3.12
4,PT0000004,43,Female,Brazil,2017,5.1,2.8,2.3,6.7,0.5,Skin,Stage III,77977.12,2.9,3.62


In [7]:
# Print a structural summary of the frame: index range, column names,
# non-null counts per column, and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8549 entries, 0 to 8548
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Patient_ID             8549 non-null   object 
 1   Age                    8549 non-null   int64  
 2   Gender                 8549 non-null   object 
 3   Country_Region         8549 non-null   object 
 4   Year                   8549 non-null   int64  
 5   Genetic_Risk           8549 non-null   float64
 6   Air_Pollution          8549 non-null   float64
 7   Alcohol_Use            8549 non-null   float64
 8   Smoking                8549 non-null   float64
 9   Obesity_Level          8549 non-null   float64
 10  Cancer_Type            8549 non-null   object 
 11  Cancer_Stage           8549 non-null   object 
 12  Treatment_Cost_USD     8549 non-null   float64
 13  Survival_Years         8549 non-null   float64
 14  Target_Severity_Score  8549 non-null   float64
dtypes: f

In [9]:
# Dimensions of the frame as a (rows, columns) tuple.
df.shape

(8549, 15)