In [31]:
import os

import pandas as pd
from IPython.display import HTML

In [39]:
# Stylesheet for tables carrying the `dataframe` class (presumably the HTML
# tables pandas renders for DataFrames -- confirm for your pandas version):
# pink header with white text, light-pink striping on even rows, a hover
# highlight, 8px cell padding and left-aligned text. Every rule is marked
# `!important` so it wins over styles defined elsewhere on the page.
css = """
<style>
table.dataframe th {
    background-color: #e04275 !important;
    color: white !important;
    text-align: left !important;
}
table.dataframe tr:nth-child(even) {
    background-color: #ffe6e6 !important;
}
table.dataframe tr:hover {
    background-color: #ffb3b3 !important;
}
table.dataframe td {
    padding: 8px !important;
    text-align: left !important;
}
</style>
"""

# Leaving the HTML object as the cell's last expression makes the notebook
# render it, which is what puts the <style> element into the page.
HTML(css)

# Column Description

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<title>Dataset Column Descriptions</title>
<style>
  body { font-family: Arial, sans-serif; margin: 20px; background: #fff5f5; color: #993333; }
  h2 { color: #aa4444; }
  table { border-collapse: collapse; width: 100%; max-width: 900px; }
  th, td { border: 1px solid #cc6666; padding: 10px; text-align: left; }
  th { background-color: #cc6666; color: white; }
  tr:nth-child(even) { background-color: #ffe6e6; }
  tr:hover { background-color: #ffb3b3; }
</style>
</head>
<body>

<h2>Dataset Column Descriptions</h2>
<table>
  <thead>
    <tr>
      <th>Column Name</th>
      <th>Description</th>
    </tr>
  </thead>
  <tbody>
    <tr><td>PatientID</td><td>Unique identifier assigned to each patient; nominal value without predictive significance.</td></tr>
    <tr><td>Gender</td><td>Biological sex of the patient (e.g., Male, Female, Other).</td></tr>
    <tr><td>Age</td><td>Age of the patient at diagnosis, in years.</td></tr>
    <tr><td>Province</td><td>Region or province where the patient resides or received treatment.</td></tr>
    <tr><td>Ethnicity</td><td>Ethnic group of the patient (e.g., Han, Hui, Zhuang).</td></tr>
    <tr><td>TumorType</td><td>Type of diagnosed cancer (e.g., lung cancer, breast cancer).</td></tr>
    <tr><td>CancerStage</td><td>Clinical stage of cancer indicating disease progression (Stage I, II, III, IV).</td></tr>
    <tr><td>DiagnosisDate</td><td>Date when cancer diagnosis was made.</td></tr>
    <tr><td>TumorSize</td><td>Size of the tumor in centimeters or millimeters.</td></tr>
    <tr><td>Metastasis</td><td>Whether cancer has spread to other organs (Yes/No).</td></tr>
    <tr><td>TreatmentType</td><td>Primary treatment method applied (surgery, chemotherapy, radiation, combination).</td></tr>
    <tr><td>SurgeryDate</td><td>Date of surgery, if performed.</td></tr>
    <tr><td>ChemotherapySessions</td><td>Number of chemotherapy sessions administered.</td></tr>
    <tr><td>RadiationSessions</td><td>Number of radiation therapy sessions administered.</td></tr>
    <tr><td>SurvivalStatus</td><td>Patient’s survival outcome at last follow-up (Alive/Deceased).</td></tr>
    <tr><td>FollowUpMonths</td><td>Number of months the patient was followed after diagnosis.</td></tr>
    <tr><td>SmokingStatus</td><td>Patient’s smoking history (Never, Former, Current).</td></tr>
    <tr><td>AlcoholUse</td><td>Patient’s alcohol consumption status (Yes, No, Occasional).</td></tr>
    <tr><td>GeneticMutation</td><td>Presence of cancer-related genetic mutations (e.g., BRCA1/2).</td></tr>
    <tr><td>Comorbidities</td><td>Other existing medical conditions (e.g., diabetes, hypertension); possibly multi-label.</td></tr>
  </tbody>
</table>

</body>
</html>

# Import 

In [25]:
# Location of the patient-records CSV. The default is the path this notebook
# was originally run with; set the CANCER_DATA_CSV environment variable to
# point at your own copy instead of editing the notebook.
DATA_CSV = os.environ.get(
    "CANCER_DATA_CSV",
    r"C:\Users\apietka\kaggle\China Cancer Patient Records\china_cancer_patients_synthetic.csv",
)

# Load the raw records; every later cell reads from `df`.
df = pd.read_csv(DATA_CSV)

#  Initial Data Exploration

In [29]:
# Peek at the first five records to sanity-check the load.
df.head(n=5)

Unnamed: 0,PatientID,Gender,Age,Province,Ethnicity,TumorType,CancerStage,DiagnosisDate,TumorSize,Metastasis,TreatmentType,SurgeryDate,ChemotherapySessions,RadiationSessions,SurvivalStatus,FollowUpMonths,SmokingStatus,AlcoholUse,GeneticMutation,Comorbidities
0,CHN-00001,Male,32,Hunan,Han,Lung,I,2010-07-24,8.2,No,Radiation,2010-08-24,0,23,Alive,44,Current,,,
1,CHN-00002,Male,66,Sichuan,Han,Lung,IV,2015-01-03,9.5,Yes,Chemotherapy,2015-03-24,11,0,Deceased,57,Never,Occasional,EGFR,"Hepatitis B, Hypertension"
2,CHN-00003,Other,26,Guangdong,Han,Lung,III,2022-03-26,9.3,Yes,Surgery,2022-07-14,0,0,Alive,40,Former,,,
3,CHN-00004,Other,35,Anhui,Uyghur,Breast,III,2019-06-19,7.7,No,Surgery,2019-08-27,0,0,Deceased,45,Never,Occasional,KRAS,
4,CHN-00005,Male,32,Guangdong,Han,Stomach,II,2012-02-07,7.3,No,Chemotherapy,,4,0,Alive,42,Never,,,


In [16]:
# Show the column labels of the loaded frame.
df.columns

Index(['PatientID', 'Gender', 'Age', 'Province', 'Ethnicity', 'TumorType',
       'CancerStage', 'DiagnosisDate', 'TumorSize', 'Metastasis',
       'TreatmentType', 'SurgeryDate', 'ChemotherapySessions',
       'RadiationSessions', 'SurvivalStatus', 'FollowUpMonths',
       'SmokingStatus', 'AlcoholUse', 'GeneticMutation', 'Comorbidities'],
      dtype='object')

In [7]:
# Print the frame's structure: index range plus, for each column, its
# non-null count and dtype (a quick first look at where values are missing).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   PatientID             10000 non-null  object 
 1   Gender                10000 non-null  object 
 2   Age                   10000 non-null  int64  
 3   Province              10000 non-null  object 
 4   Ethnicity             10000 non-null  object 
 5   TumorType             10000 non-null  object 
 6   CancerStage           10000 non-null  object 
 7   DiagnosisDate         10000 non-null  object 
 8   TumorSize             10000 non-null  float64
 9   Metastasis            10000 non-null  object 
 10  TreatmentType         10000 non-null  object 
 11  SurgeryDate           4327 non-null   object 
 12  ChemotherapySessions  10000 non-null  int64  
 13  RadiationSessions     10000 non-null  int64  
 14  SurvivalStatus        10000 non-null  object 
 15  FollowUpMonths      

<div style="border-radius:10px; border:1px solid #cc6666; padding: 15px; background-color: #ffe6e6; font-size:150%; text-align:left; margin-bottom:20px;">
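  <strong style="color:#cc0000;">📝 Describe:</strong> This section summarizes the dataset with descriptive statistics from <code>df.describe()</code>, first for the numeric columns and then for the text (object) columns.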
</div>

In [11]:
# Summary statistics for the numeric columns, transposed so that each
# column of the dataset becomes one row of the table.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,10000.0,51.6002,19.705608,18.0,35.0,51.0,69.0,85.0
TumorSize,10000.0,6.33904,2.234999,0.5,4.8,6.4,7.9,14.2
ChemotherapySessions,10000.0,3.999,6.2036,0.0,0.0,0.0,8.0,20.0
RadiationSessions,10000.0,3.0464,7.301505,0.0,0.0,0.0,0.0,30.0
FollowUpMonths,10000.0,30.4328,17.162545,1.0,16.0,31.0,45.0,60.0


In [21]:
# Count / unique / most frequent value / its frequency for the object-dtype
# columns, transposed so that each column of the dataset becomes one row.
df.describe(include="object").transpose()

Unnamed: 0,count,unique,top,freq
PatientID,10000,10000,CHN-00001,1
Gender,10000,3,Female,3371
Province,10000,13,Guangdong,1841
Ethnicity,10000,6,Han,9110
TumorType,10000,6,Lung,2561
CancerStage,10000,4,II,2971
DiagnosisDate,10000,4383,2016-11-08,8
Metastasis,10000,2,No,7301
TreatmentType,10000,5,Chemotherapy,2072
SurgeryDate,4327,2915,2012-09-23,5


<div style="border-radius:10px; border:1px solid #cc6666; padding: 15px; background-color: #ffe6e6; font-size:150%; text-align:left; margin-bottom:20px;">
  <strong style="color:#cc0000;">🚫 Missing Values:</strong> This section lists every column that contains missing values, with the count (from <code>df.isna().sum()</code>) and the percentage of rows affected.
</div>

In [73]:
# Count the missing cells in every column, then express each count as a
# percentage of the total number of rows.
isna = df.isna().sum()
percentna = isna.div(len(df)).mul(100)

# Put both measures side by side and keep only the columns that have at
# least one missing value.
x = pd.DataFrame({'isNa': isna, '% Na': percentna})
x = x.loc[x['isNa'] != 0]
x

Unnamed: 0,isNa,% Na
SurgeryDate,5673,56.73
AlcoholUse,5921,59.21
GeneticMutation,7200,72.0
Comorbidities,3715,37.15


<div style="border-radius:10px; border:1px solid #cc6666; padding: 15px; background-color: #ffe6e6; font-size:105%; text-align:left; margin-bottom:20px;">
  <strong style="color:#cc0000;">🚫 Missing Data Summary:</strong><br><br>
  <ul style="padding-left: 20px;">
    <li><code>SurgeryDate</code> – 5673 missing values (56.73%)</li>
    <li><code>AlcoholUse</code> – 5921 missing values (59.21%)</li>
    <li><code>GeneticMutation</code> – 7200 missing values (72.00%)</li>
    <li><code>Comorbidities</code> – 3715 missing values (37.15%)</li>
  </ul>
</div>