In [423]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Lung dataset

In [424]:
# https://www.kaggle.com/datasets/nancyalaswad90/lung-cancer
lung_dataset = pd.read_csv("dataset/lung.csv")

In [425]:
lung_dataset.columns

Index(['gender', 'age', 'smoking', 'yellow_fingers', 'anxiety',
       'peer_pressure', 'chronic disease', 'fatigue', 'allergy', 'wheezing',
       'alcohol consuming', 'coughing', 'shortness of breath',
       'swallowing difficulty', 'chest pain', 'lung_cancer'],
      dtype='object')

In [426]:
# formatting column names and remove column gender
lung_dataset.columns = [col.lower().replace(' ', '_') for col in lung_dataset.columns]
lung_dataset.drop(columns=['gender'], inplace=True)

# Convert all 1/2 binary columns to 0/1 excluding 'age', 'lung_cancer'
binary_cols = lung_dataset.columns.difference(['age', 'lung_cancer'])
for col in binary_cols:
    lung_dataset[col] = lung_dataset[col].map({1: 0, 2: 1})

# Convert 'lung_cancer' to numeric: YES = 1, NO = 0 ===
lung_dataset['target'] = lung_dataset['lung_cancer'].map({'YES': 1, 'NO': 0})
lung_dataset.drop(columns=['lung_cancer'], inplace=True)


In [427]:
# Step 1: Abbreviate feature names (excluding 'age' from PCA features)
abbr = {
    'smoking': 'smk',
    'yellow_fingers': 'yf',
    'anxiety': 'anx',
    'peer_pressure': 'pp',
    'chronic_disease': 'cd',
    'fatigue': 'ftg',
    'allergy': 'alg',
    'wheezing': 'whz',
    'alcohol_consuming': 'alc',
    'coughing': 'cgh',
    'shortness_of_breath': 'sob',
    'swallowing_difficulty': 'swd',
    'chest_pain': 'cp'
}

# Step 2: Separate age column and target
age = lung_dataset['age'].reset_index(drop=True)
y = lung_dataset['target'].reset_index(drop=True)

# Step 3: Drop 'age' and 'target' from features used in PCA
X = lung_dataset.drop(columns=['age', 'target'])

# Step 4: Scale features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 5: Perform PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_scaled)
loadings = pca.components_

# Step 6: Create PCA column names from top 2 contributing features
pca_column_names = []
for component in loadings:
    top_features_idx = np.argsort(np.abs(component))[-2:][::-1]
    top_features = X.columns[top_features_idx]
    abbr_name = '+'.join(abbr[f] for f in top_features) + '_pc'
    pca_column_names.append(abbr_name)

# Step 7: Final DataFrame with PCA components, age, and target
lung = pd.DataFrame(X_pca, columns=pca_column_names)
lung['age'] = age
lung['target'] = y

In [428]:
lung

Unnamed: 0,anx+yf_pc,alc+sob_pc,cgh+whz_pc,smk+cd_pc,cd+alg_pc,age,target
0,-0.164314,0.209473,2.017398,0.496745,-0.978387,69,1
1,-0.461568,-0.604066,-1.190478,0.886671,2.513362,74,1
2,-1.433580,-1.510590,0.995657,-0.556335,-0.694395,59,0
3,1.889671,1.513468,-1.362016,1.859020,-1.153526,63,0
4,-0.490144,-1.368847,0.127094,-1.006002,-2.326854,63,0
...,...,...,...,...,...,...,...
304,-0.270215,-0.764980,0.184340,-1.730935,0.482539,56,1
305,-2.962043,0.039448,0.272374,1.376482,-0.142275,70,1
306,-2.160440,1.822093,-1.022964,0.961885,-1.465290,58,1
307,-1.540513,-0.177603,-0.301546,2.077276,0.099836,67,1


# Kidney dataset

In [429]:
# https://www.kaggle.com/datasets/mansoordaku/ckdisease
kidney_dataset = pd.read_csv("dataset/kidney.csv")

In [430]:
kidney_dataset.columns

Index(['id', 'age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr',
       'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')

In [431]:
# formatting column names
kidney_dataset.columns = [col.lower().replace(' ', '_') for col in kidney_dataset.columns]

In [432]:
# drop id column
kidney_dataset.drop(columns=['id'], inplace=True)

# Fill missing values for numerical columns with mean
kidney_dataset.fillna(kidney_dataset.select_dtypes(include=['float64', 'int64']).mean(), inplace=True)

# Fill missing values for categorical columns with mode
kidney_dataset.fillna(kidney_dataset.select_dtypes(include=['object']).mode().iloc[0], inplace=True)


In [433]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# 1. Map classification to 0/1 and drop rows with undefined target
kidney_dataset['target'] = kidney_dataset['classification'].map({'ckd': 1, 'notckd': 0})
kidney_dataset.dropna(subset=['target'], inplace=True)
kidney_dataset['target'] = kidney_dataset['target'].astype(int)
kidney_dataset.drop(columns=['classification'], inplace=True)

# 2. Convert string numbers to numeric
for col in ['pcv', 'wc', 'rc']:
    kidney_dataset[col] = pd.to_numeric(kidney_dataset[col], errors='coerce')

# 3. Map binary/categorical columns
binary_map = {
    'normal': 0, 'abnormal': 1,
    'present': 1, 'notpresent': 0,
    'yes': 1, 'no': 0,
    'good': 0, 'poor': 1
}
binary_cols = ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
for col in binary_cols:
    kidney_dataset[col] = kidney_dataset[col].map(binary_map)

# 4. Drop rows with missing values
kidney_dataset.dropna(inplace=True)

# 5. Separate age and target
age_col = kidney_dataset['age']
target_col = kidney_dataset['target']
X = kidney_dataset.drop(columns=['age', 'target'])

# 6. Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 7. Apply PCA
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_scaled)

# 8. Generate meaningful abbreviated PCA column names
abbr = {col: col[:3].lower() for col in X.columns}
loadings = pca.components_
pca_column_names = []
for component in loadings:
    top_idx = np.argsort(np.abs(component))[-2:][::-1]
    top_features = [abbr[X.columns[idx]] for idx in top_idx]
    name = '+'.join(top_features) + '_pc'
    pca_column_names.append(name)

# 9. Final DataFrame
kidney = pd.DataFrame(X_pca, columns=pca_column_names)
kidney['age'] = age_col.values
kidney['target'] = target_col.values


In [434]:
kidney

Unnamed: 0,hem+pcv_pc,su+bgr_pc,ba+pc_pc,pot+app_pc,pot+sod_pc,age,target
0,0.896504,0.402904,-1.011083,-0.368969,-0.161388,48.0,1
1,1.023435,-0.156124,0.748049,0.374460,-0.367576,7.0,1
2,-2.646731,2.349933,-1.948036,-0.364223,1.218263,62.0,1
3,-4.380768,-1.366903,2.416938,-0.871165,-1.162340,48.0,1
4,0.444355,-0.355148,0.379378,-0.112557,0.017939,51.0,1
...,...,...,...,...,...,...,...
382,2.427147,0.056184,-0.007111,0.400297,0.701667,55.0,0
383,3.539647,-0.174080,0.168222,0.348331,-0.361857,42.0,0
384,2.640627,-0.285218,-0.094132,0.403087,-0.252487,12.0,0
385,2.867856,-0.515629,-0.100791,0.616806,-0.379341,17.0,0


# Diabetes

In [435]:
#
diabetes_dataset = pd.read_csv("dataset/diabetes.csv")

In [436]:
diabetes_dataset.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'smoking_history',
       'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes'],
      dtype='object')

In [437]:
# formatting column names
diabetes_dataset.columns = [col.lower().replace(' ', '_') for col in diabetes_dataset.columns]

In [438]:
# drop id column
diabetes_dataset.drop(columns=['gender'], inplace=True)
diabetes_dataset.isna().sum()

age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
hba1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [439]:
diabetes_dataset['smoking_history'].unique()

array(['never', 'No Info', 'current', 'former', 'ever', 'not current'],
      dtype=object)

In [440]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# 1. Map smoking_history to numeric values
smoking_map = {
    'never': 0,
    'No Info': 1,
    'former': 2,
    'not current': 2,
    'ever': 3,
    'current': 4
}
diabetes_dataset['smoking_history'] = diabetes_dataset['smoking_history'].map(smoking_map)

# 2. Drop rows with missing values
diabetes_dataset.dropna(inplace=True)

# 3. Separate target and 'age' column
age_col = diabetes_dataset['age']
target_col = diabetes_dataset['diabetes']
X = diabetes_dataset.drop(columns=['age', 'diabetes'])

# 4. Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 5. PCA to reduce to 5 components
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_scaled)

# 6. Generate abbreviated PCA column names from top contributors
abbr = {col: col[:3].lower() for col in X.columns}
loadings = pca.components_
pca_column_names = []
for i, component in enumerate(loadings):
    top_idx = np.argsort(np.abs(component))[-2:][::-1]
    top_features = [abbr[X.columns[idx]] for idx in top_idx]
    name = '+'.join(top_features) + '_pc'
    pca_column_names.append(name)

# 7. Create final DataFrame
diabetes = pd.DataFrame(X_pca, columns=pca_column_names)
diabetes['age'] = age_col.values
diabetes['target'] = target_col.values


# Heart Dataset

In [441]:
heart_dataset = pd.read_csv("dataset/heart.csv")

In [442]:
heart_dataset.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

In [443]:
# formatting column names
heart_dataset.columns = [col.lower().replace(' ', '_') for col in heart_dataset.columns]

In [444]:
heart_dataset.drop(columns=['sex'], inplace=True)

In [445]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Drop rows with missing values if any
heart_dataset.dropna(inplace=True)

# Separate columns
age_col = heart_dataset['age']
target_col = heart_dataset['target']
X = heart_dataset.drop(columns=['age', 'target'])

# One-hot encode categorical columns
X_encoded = pd.get_dummies(X, columns=['cp', 'restecg', 'slope', 'thal'], drop_first=True)

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

# Apply PCA
pca = PCA(n_components=6)
X_pca = pca.fit_transform(X_scaled)

# Generate meaningful PCA column names from top contributing features
abbr = {col: col[:3].lower() for col in X_encoded.columns}
loadings = pca.components_
pca_column_names = []
for i, component in enumerate(loadings):
    top_idx = np.argsort(np.abs(component))[-2:][::-1]
    top_features = [abbr[X_encoded.columns[idx]] for idx in top_idx]
    name = '+'.join(top_features) + '_pc'
    pca_column_names.append(name)

# Create PCA DataFrame
heart = pd.DataFrame(X_pca, columns=pca_column_names)

# Add age and target columns back
heart['age'] = age_col.values
heart['target'] = target_col.values


In [446]:
# data integration

In [447]:
lung.rename(columns={'target': 'target_lung'}, inplace=True)
heart.rename(columns={'target': 'target_heart'}, inplace=True)
diabetes.rename(columns={'target': 'target_diabetes'}, inplace=True)

In [448]:
import pandas as pd
import numpy as np

# Step 1: Ensure 'age' is in the correct format (integer)
lung['age'] = lung['age'].astype(int)
heart['age'] = heart['age'].astype(int)

# Step 2: Merge lung and heart datasets on 'age', keeping all columns from both datasets
merged_lung_heart = lung.merge(heart, on='age', how='outer', suffixes=('_lung', '_heart'))

# Step 3: Create a new column 'disease_type' based on the target column of each dataset
merged_lung_heart['disease_type'] = np.select(
    [
        merged_lung_heart['target_lung'] == 1,   # Lung disease present
        merged_lung_heart['target_heart'] == 1   # Heart disease present
    ],
    ['lung', 'heart'],  # 'lung' if lung disease, 'heart' if heart disease
    default='no_disease'  # No disease if neither target is 1
)

# Step 4: Drop the 'target_lung' and 'target_heart' columns (since they're no longer needed)
merged_lung_heart.drop(columns=['target_lung', 'target_heart'], inplace=True)

# Step 5: Handle duplicate 'age' columns
# Since 'age' is in both datasets, remove any column containing the suffix '_lung' or '_heart' for age.
merged_lung_heart = merged_lung_heart.loc[:, ~merged_lung_heart.columns.str.contains('age_')]

# Step 6: Ensure only one 'age' column exists in the dataset
final_dataset = merged_lung_heart[['age', 'disease_type'] + [col for col in merged_lung_heart.columns if col not in ['disease_type']]]

 '

In [449]:
final_dataset

Unnamed: 0,disease_type,anx+yf_pc,alc+sob_pc,cgh+whz_pc,smk+cd_pc,cd+alg_pc,age,slo+tha_pc,tha+tha_pc,tre+fbs_pc,cp_+cp__pc,cho+tha_pc,cp_+tha_pc
0,no_disease,-0.783397,-1.837239,-2.039021,-0.034920,1.611783,21,,,,,,
1,heart,,,,,,29,3.19242,0.203661,0.169817,-1.912714,0.296016,-0.186216
2,heart,,,,,,29,3.19242,0.203661,0.169817,-1.912714,0.296016,-0.186216
3,heart,,,,,,29,3.19242,0.203661,0.169817,-1.912714,0.296016,-0.186216
4,heart,,,,,,29,3.19242,0.203661,0.169817,-1.912714,0.296016,-0.186216
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9008,lung,0.839928,-0.658621,1.073428,-1.223263,1.372941,78,,,,,,
9009,lung,-1.624447,0.432825,-0.292099,0.888598,1.785343,79,,,,,,
9010,lung,-1.093447,0.251012,-1.070138,-1.897484,0.178748,81,,,,,,
9011,lung,0.649694,0.240743,2.499310,-0.839448,1.446150,81,,,,,,


In [450]:
import pandas as pd
import numpy as np

# Ensure 'age' is in the correct format (integer)
lung['age'] = lung['age'].astype(int)
heart['age'] = heart['age'].astype(int)
diabetes['age'] = diabetes['age'].astype(int)

# Merge lung and heart datasets on 'age', keeping all columns from both datasets
merged_lung_heart = lung.merge(heart, on='age', how='outer', suffixes=('_lung', '_heart'))

# Create a new column 'disease_type' based on the target column of each dataset
merged_lung_heart['disease_type'] = np.select(
    [
        merged_lung_heart['target_lung'] == 1,   # Lung disease present
        merged_lung_heart['target_heart'] == 1   # Heart disease present
    ],
    ['lung', 'heart'],  # 'lung' if lung disease, 'heart' if heart disease
    default='no_disease'  # No disease if neither target is 1
)

# Drop the 'target_lung' and 'target_heart' columns (since they're no longer needed)
merged_lung_heart.drop(columns=['target_lung', 'target_heart'], inplace=True)

# Drop any columns with 'age_' prefix to keep only one age column
merged_lung_heart = merged_lung_heart.loc[:, ~merged_lung_heart.columns.str.contains('age_')]

# Finalize the dataset for lung and heart data
final_lung_heart_dataset = merged_lung_heart[['age', 'disease_type'] + [col for col in merged_lung_heart.columns if col not in ['disease_type']]]

# Now merge with the diabetes dataset
# First, rename the target column in diabetes dataset to make sure it doesn't conflict
diabetes.rename(columns={'target': 'target_diabetes'}, inplace=True)

# Merge the lung+heart dataset with the diabetes dataset on 'age'
merged_final_diabetes = final_lung_heart_dataset.merge(diabetes, on='age', how='outer', suffixes=('_merged', '_diabetes'))

# Create a new 'disease_type' column for the final merged dataset, considering all diseases
merged_final_diabetes['disease_type'] = np.select(
    [
        merged_final_diabetes['target_lung'] == 1,  # Lung disease
        merged_final_diabetes['target_heart'] == 1,  # Heart disease
        merged_final_diabetes['target_diabetes'] == 1  # Diabetes disease
    ],
    ['lung', 'heart', 'diabetes'],  # Assign corresponding disease names
    default='no_disease'  # No disease if none of the targets are 1
)

# Drop the target columns after creating the 'disease_type' column
merged_final_diabetes.drop(columns=['target_lung', 'target_heart', 'target_diabetes'], inplace=True)

# Finalize the merged dataset
final_merged_dataset = merged_final_diabetes[['age', 'disease_type'] + [col for col in merged_final_diabetes.columns if col not in ['disease_type']]]

# Display the final merged dataset
print(final_merged_dataset.head())


ValueError: The column label 'age' is not unique.