In [1]:
import pandas as pd
import numpy as np

# Read the raw synthetic patient table from disk into the working frame.
df = pd.read_csv(filepath_or_buffer="synthetic_raw.csv")

# Structure check: dimensions first, then per-column dtypes, then a preview
# of the first rows as the cell's displayed result.
print(df.shape, df.dtypes, sep="\n")
df.head()


(1000, 9)
Patient_ID     object
Name           object
Age           float64
Gender         object
WBC_Count      object
Tumor_Size    float64
Diagnosis      object
Stage          object
Treatment      object
dtype: object


Unnamed: 0,Patient_ID,Name,Age,Gender,WBC_Count,Tumor_Size,Diagnosis,Stage,Treatment
0,PID0001,Allison Hill,-9.0,Female,8515,0.0,No Cancer,,Observation
1,PID0002,Noah Rhodes,71.0,Female,6705,0.0,No Cancer,,Observation
2,PID0003,Angie Henderson,48.0,Male,5358,0.0,No Cancer,,Observation
3,PID0004,Daniel Wagner,,Female,3442,0.0,No Cancer,,Observation
4,PID0005,Cristian Santos,62.0,Male,6343,172.57,No Cancer,,Observation


In [2]:
# WBC_Count was read as object dtype; parse it as numbers and let anything
# unparseable become NaN instead of raising.
wbc_as_text = df['WBC_Count']
df['WBC_Count'] = pd.to_numeric(wbc_as_text, errors='coerce')

# Age and Tumor_Size were already loaded as float64; preview the three
# numeric columns side by side to confirm.
df.head()[['Age', 'Tumor_Size', 'WBC_Count']]


Unnamed: 0,Age,Tumor_Size,WBC_Count
0,-9.0,0.0,8515.0
1,71.0,0.0,6705.0
2,48.0,0.0,5358.0
3,,0.0,3442.0
4,62.0,172.57,6343.0


In [3]:
# Normalise Gender to exactly three labels: 'Male', 'Female', 'Unknown'.
#
# Whitespace AND case are normalised before the lookup, the same way the
# Diagnosis and Stage cells do it, so spellings such as 'FEMALE', 'MALE',
# 'f' or 'Femlae' are recognised instead of falling through to 'Unknown'.
gender_norm = df['Gender'].str.strip().str.lower()

# Lower-cased raw spelling -> canonical label (includes the known typo).
gender_map = {
    'female': 'Female',
    'femlae': 'Female',
    'f': 'Female',
    'male': 'Male',
    'm': 'Male',
    'unknown': 'Unknown',
}

# Series.map yields NaN for anything not in the table (missing values and
# unrecognised spellings alike); those all become 'Unknown'.
df['Gender'] = gender_norm.map(gender_map).fillna('Unknown')

df['Gender'].value_counts()


Gender
Female     492
Male       491
Unknown     17
Name: count, dtype: int64

In [4]:
# Normalise whitespace and case so every spelling variant collapses to one key.
diagnosis_norm = df['Diagnosis'].str.strip().str.lower()

# Re-label the two known keys with their canonical capitalisation; any other
# value is left in its normalised (lower-cased, stripped) form.
df['Diagnosis'] = diagnosis_norm.replace(
    ['no cancer', 'cancer'],
    ['No Cancer', 'Cancer'],
)

df['Diagnosis'].value_counts()


Diagnosis
No Cancer    676
Cancer       324
Name: count, dtype: int64

In [5]:
# Work on a stripped, lower-cased string view of Stage so the lookup below
# only has to list one spelling per variant.
stage_raw = df['Stage'].astype(str).str.strip().str.lower()

# Canonical label -> every raw spelling (including known typos) that means it.
stage_variants = {
    'Stage I': ('stage i', 'staeg 1', 'stage 1'),
    'Stage II': ('stage ii', 'staeg 2', 'stage 2'),
    'Stage III': ('stage iii', 'stag 3', 'stage 3'),
    'Stage IV': ('stage iv', 'stage 4'),
}

# Invert into the flat raw-spelling -> canonical-label table used by .map().
stage_map = {
    variant: label
    for label, variants in stage_variants.items()
    for variant in variants
}

# Any value not listed in the table (including originally missing entries)
# comes out as NaN, which is kept to mean "non-cancer / unknown stage".
df['Stage'] = stage_raw.map(stage_map)

df['Stage'].value_counts(dropna=False)


Stage
NaN          695
Stage I      114
Stage III     78
Stage II      78
Stage IV      35
Name: count, dtype: int64

In [6]:
# Keep only ages inside the plausible range [0, 100]; everything outside it
# (and anything already missing) becomes NaN.
df['Age'] = df['Age'].where(df['Age'].between(0, 100))

# Fill the gaps with the median of the remaining valid ages, then store the
# column as whole-number int64 values.
age_median = df['Age'].median()
df['Age'] = df['Age'].fillna(age_median).round().astype('int64')

df['Age'].describe()


count    1000.000000
mean       50.106000
std        16.307934
min        20.000000
25%        38.000000
50%        50.000000
75%        63.000000
max        79.000000
Name: Age, dtype: float64

In [7]:
# Invalid WBC_Count entries were already coerced to NaN in an earlier cell;
# replace them with the median of the valid readings.
wbc_median = df['WBC_Count'].median()
df = df.fillna({'WBC_Count': wbc_median})

df['WBC_Count'].describe()


count     1000.000000
mean      8564.532000
std       2989.562256
min       1972.000000
25%       6600.000000
50%       7795.500000
75%       9771.500000
max      20033.000000
Name: WBC_Count, dtype: float64

In [8]:
# Tumor_Size is only cleaned for rows diagnosed as Cancer.
# NOTE(review): non-Cancer rows are left untouched, so a positive Tumor_Size
# on a 'No Cancer' row passes through unchanged -- confirm this is intended.
mask_cancer = df['Diagnosis'].eq('Cancer')

# A zero or negative size on a Cancer row is treated as a missing measurement.
df.loc[mask_cancer & df['Tumor_Size'].le(0), 'Tumor_Size'] = np.nan

# Fill missing sizes on Cancer rows with the median over Cancer rows only.
tumor_median_cancer = df.loc[mask_cancer, 'Tumor_Size'].median()
df.loc[mask_cancer & df['Tumor_Size'].isna(), 'Tumor_Size'] = tumor_median_cancer

df['Tumor_Size'].describe()


count    1000.000000
mean       18.588960
std        35.095317
min         0.000000
25%         0.000000
50%         0.000000
75%        31.277500
max       199.110000
Name: Tumor_Size, dtype: float64

In [9]:
# Show the most frequent Patient_ID values so repeated IDs are visible.
# This must be printed explicitly: a notebook only displays a cell's LAST
# expression, so a bare expression at this position is silently discarded.
print(df['Patient_ID'].value_counts().head())

# Drop duplicate Patient_ID rows, keeping the first occurrence, and rebuild a
# contiguous 0..n-1 index.
# NOTE(review): this runs after the median imputations in the earlier cells,
# so those medians were computed with the duplicate rows still present --
# confirm that ordering is acceptable.
df = df.drop_duplicates(subset='Patient_ID', keep='first').reset_index(drop=True)

# Guard: every remaining row must carry a distinct Patient_ID.
assert df['Patient_ID'].is_unique, "duplicate Patient_ID values remain"

df.shape


(998, 9)

In [10]:
# Low-cardinality label columns are stored as pandas 'category' dtype
# (optional; done for efficiency).
cat_cols = ['Gender', 'Diagnosis', 'Stage', 'Treatment']
df = df.astype({name: 'category' for name in cat_cols})

# Final overview: dtypes as text, then summary statistics for every column.
print(df.dtypes)
df.describe(include='all')


Patient_ID      object
Name            object
Age              int64
Gender        category
WBC_Count      float64
Tumor_Size     float64
Diagnosis     category
Stage         category
Treatment     category
dtype: object


Unnamed: 0,Patient_ID,Name,Age,Gender,WBC_Count,Tumor_Size,Diagnosis,Stage,Treatment
count,998,998,998.0,998,998.0,998.0,998,304,998
unique,998,993,,3,,,2,4,5
top,PID0001,Michael Miller,,Female,,,No Cancer,Stage I,Observation
freq,1,2,,492,,,675,113,675
mean,,,50.098196,,8552.809619,18.58516,,,
std,,,16.323104,,2970.403825,35.118414,,,
min,,,20.0,,1972.0,0.0,,,
25%,,,38.0,,6600.0,0.0,,,
50%,,,50.0,,7795.5,0.0,,,
75%,,,63.0,,9766.75,31.215,,,


In [11]:
# Persist the cleaned frame as CSV without writing the row index.
df.to_csv(path_or_buf="synthetic_clean.csv", index=False)
print("Cleaned dataset saved as synthetic_clean.csv")


Cleaned dataset saved as synthetic_clean.csv


In [12]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,Patient_ID,Name,Age,Gender,WBC_Count,Tumor_Size,Diagnosis,Stage,Treatment
0,PID0001,Allison Hill,50,Female,8515.0,0.0,No Cancer,,Observation
1,PID0002,Noah Rhodes,71,Female,6705.0,0.0,No Cancer,,Observation
2,PID0003,Angie Henderson,48,Male,5358.0,0.0,No Cancer,,Observation
3,PID0004,Daniel Wagner,50,Female,3442.0,0.0,No Cancer,,Observation
4,PID0005,Cristian Santos,62,Male,6343.0,172.57,No Cancer,,Observation


In [13]:
# Sanity check: number of rows whose Patient_ID repeats an earlier row.
df['Patient_ID'].duplicated().sum()


0

In [14]:
# Sanity check: count of missing values remaining in each column.
df.isnull().sum()


Patient_ID      0
Name            0
Age             0
Gender          0
WBC_Count       0
Tumor_Size      0
Diagnosis       0
Stage         694
Treatment       0
dtype: int64

In [15]:
# Print the distinct values present in each categorical column, in turn.
for col in ('Gender', 'Diagnosis', 'Stage', 'Treatment'):
    print(df[col].unique())


['Female', 'Male', 'Unknown']
Categories (3, object): ['Female', 'Male', 'Unknown']
['No Cancer', 'Cancer']
Categories (2, object): ['Cancer', 'No Cancer']
[NaN, 'Stage I', 'Stage III', 'Stage II', 'Stage IV']
Categories (4, object): ['Stage I', 'Stage II', 'Stage III', 'Stage IV']
['Observation', 'Immunotherapy', 'Radiation', 'Surgery', 'Chemotherapy']
Categories (5, object): ['Chemotherapy', 'Immunotherapy', 'Observation', 'Radiation', 'Surgery']
