Stage 1: Data Preprocessing

In [4]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

# --- 1. Load and Clean Data ---

# Load the raw dataset
try:
    df = pd.read_csv('kidney_disease.csv')
except FileNotFoundError:
    print("Error: 'kidney_disease.csv' not found. Please make sure it's in the same directory.")
    # As a fallback for demonstration, I will load the first few rows you provided.
    # In your real run, this 'except' block shouldn't be needed.
    from io import StringIO
    fallback_data = """id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,48,80,1.02,1,0,,normal,notpresent,notpresent,121,36,1.2,,,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7,50,1.02,4,0,,normal,notpresent,notpresent,,18,0.8,,,11.3,38,6000,,no,no,no,good,no,no,ckd
2,62,80,1.01,2,3,normal,normal,notpresent,notpresent,423,53,1.8,,,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,48,70,1.005,4,0,normal,abnormal,present,notpresent,117,56,3.8,111,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51,80,1.01,2,0,normal,normal,notpresent,notpresent,106,26,1.4,,,11.6,35,7200,4.6,no,no,no,good,no,no,ckd
"""
    print("Loading fallback data as 'kidney_disease.csv' was not found.")
    df = pd.read_csv(StringIO(fallback_data))


print(f"Original shape of the data: {df.shape}")

# Drop the 'id' column as it's just an identifier
if 'id' in df.columns:
    df = df.drop('id', axis=1)

# Replace special characters and incorrect values with NaN
df.replace('?', np.nan, inplace=True)
df.replace('\t?', np.nan, inplace=True) # Handle specific cases if any
df.replace('\t', np.nan, inplace=True)  # Handle tabs if they exist as values

# --- 2. Correct Data Types ---

# List of columns that should be numeric
numeric_cols = ['age', 'bp', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc']

for col in numeric_cols:
    if col in df.columns:
        # 'coerce' will turn any non-numeric values (like '?') into NaN
        df[col] = pd.to_numeric(df[col], errors='coerce')

# --- 3. Impute Missing Values ---

# Separate into numerical and categorical columns for imputation
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
# Remove the target variable from this list
if 'classification' in categorical_cols:
    categorical_cols.remove('classification')

numerical_cols_with_data = df.select_dtypes(include=np.number).columns.tolist()

# Impute numerical columns with MEDIAN
num_imputer = SimpleImputer(strategy='median')
df[numerical_cols_with_data] = num_imputer.fit_transform(df[numerical_cols_with_data])

# Impute categorical columns with MODE (most frequent)
cat_imputer = SimpleImputer(strategy='most_frequent')
df[categorical_cols] = cat_imputer.fit_transform(df[categorical_cols])

print("\n--- Missing values after imputation ---")
print(df.isnull().sum())

# --- 4. Encode Categorical Data ---

le = LabelEncoder()

# Encode all categorical features
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

# --- 5. Encode Target Variable ---

# Handle the target variable 'classification'
# The paper likely maps 'ckd' to 1 and 'notckd' to 0
df['classification'] = df['classification'].map({'ckd': 1, 'notckd': 0})
# Handle any potential variations like whitespace
df['classification'] = df['classification'].replace(r'\s*ckd\s*', 1, regex=True)
df['classification'] = df['classification'].replace(r'\s*notckd\s*', 0, regex=True)

# Drop any rows where 'classification' might still be NaN (e.g., if original file had bad data)
df.dropna(subset=['classification'], inplace=True)
df['classification'] = df['classification'].astype(int)

print("\n--- Target variable 'classification' counts ---")
print(df['classification'].value_counts())

# --- 6. Separate Features (X) and Target (y) ---
X = df.drop('classification', axis=1)
y = df['classification']

# --- 7. Feature Scaling ---
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Convert back to a DataFrame for easier handling in the next stage
X_processed = pd.DataFrame(X_scaled, columns=X.columns)

print("\n--- Preprocessing Complete ---")
print("Processed Features (X_processed) head:")
print(X_processed.head())
print("\nProcessed Target (y) head:")
print(y.head())
print(f"\nShape of X_processed: {X_processed.shape}")
print(f"Shape of y: {y.shape}")import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

# --- 1. Load and Clean Data ---

# Load the raw dataset
try:
    df = pd.read_csv('kidney_disease.csv')
except FileNotFoundError:
    print("Error: 'kidney_disease.csv' not found. Please make sure it's in the same directory.")
    # As a fallback for demonstration, I will load the first few rows you provided.
    # In your real run, this 'except' block shouldn't be needed.
    from io import StringIO
    fallback_data = """id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,48,80,1.02,1,0,,normal,notpresent,notpresent,121,36,1.2,,,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7,50,1.02,4,0,,normal,notpresent,notpresent,,18,0.8,,,11.3,38,6000,,no,no,no,good,no,no,ckd
2,62,80,1.01,2,3,normal,normal,notpresent,notpresent,423,53,1.8,,,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,48,70,1.005,4,0,normal,abnormal,present,notpresent,117,56,3.8,111,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51,80,1.01,2,0,normal,normal,notpresent,notpresent,106,26,1.4,,,11.6,35,7200,4.6,no,no,no,good,no,no,ckd
"""
    print("Loading fallback data as 'kidney_disease.csv' was not found.")
    df = pd.read_csv(StringIO(fallback_data))


print(f"Original shape of the data: {df.shape}")

# Drop the 'id' column as it's just an identifier
if 'id' in df.columns:
    df = df.drop('id', axis=1)

# Replace special characters and incorrect values with NaN
df.replace('?', np.nan, inplace=True)
df.replace('\t?', np.nan, inplace=True) # Handle specific cases if any
df.replace('\t', np.nan, inplace=True)  # Handle tabs if they exist as values

# --- 2. Correct Data Types ---

# List of columns that should be numeric
numeric_cols = ['age', 'bp', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc']

for col in numeric_cols:
    if col in df.columns:
        # 'coerce' will turn any non-numeric values (like '?') into NaN
        df[col] = pd.to_numeric(df[col], errors='coerce')

# --- 3. Impute Missing Values ---

# Separate into numerical and categorical columns for imputation
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
# Remove the target variable from this list
if 'classification' in categorical_cols:
    categorical_cols.remove('classification')

numerical_cols_with_data = df.select_dtypes(include=np.number).columns.tolist()

# Impute numerical columns with MEDIAN
num_imputer = SimpleImputer(strategy='median')
df[numerical_cols_with_data] = num_imputer.fit_transform(df[numerical_cols_with_data])

# Impute categorical columns with MODE (most frequent)
cat_imputer = SimpleImputer(strategy='most_frequent')
df[categorical_cols] = cat_imputer.fit_transform(df[categorical_cols])

print("\n--- Missing values after imputation ---")
print(df.isnull().sum())

# --- 4. Encode Categorical Data ---

le = LabelEncoder()

# Encode all categorical features
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

# --- 5. Encode Target Variable ---

# Handle the target variable 'classification'
# The paper likely maps 'ckd' to 1 and 'notckd' to 0
df['classification'] = df['classification'].map({'ckd': 1, 'notckd': 0})
# Handle any potential variations like whitespace
df['classification'] = df['classification'].replace(r'\s*ckd\s*', 1, regex=True)
df['classification'] = df['classification'].replace(r'\s*notckd\s*', 0, regex=True)

# Drop any rows where 'classification' might still be NaN (e.g., if original file had bad data)
df.dropna(subset=['classification'], inplace=True)
df['classification'] = df['classification'].astype(int)

print("\n--- Target variable 'classification' counts ---")
print(df['classification'].value_counts())

# --- 6. Separate Features (X) and Target (y) ---
X = df.drop('classification', axis=1)
y = df['classification']

# --- 7. Feature Scaling ---
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Convert back to a DataFrame for easier handling in the next stage
X_processed = pd.DataFrame(X_scaled, columns=X.columns)

print("\n--- Preprocessing Complete ---")
print("Processed Features (X_processed) head:")
print(X_processed.head())
print("\nProcessed Target (y) head:")
print(y.head())
print(f"\nShape of X_processed: {X_processed.shape}")
print(f"Shape of y: {y.shape}")

# You now have two crucial variables for the next steps:
# 1. X_processed (your clean, scaled features)
# 2. y (your clean, encoded target variable)

# You now have two crucial variables for the next steps:
# 1. X_processed (your clean, scaled features)
# 2. y (your clean, encoded target variable)

Original shape of the data: (400, 26)

--- Missing values after imputation ---
age               0
bp                0
sg                0
al                0
su                0
rbc               0
pc                0
pcc               0
ba                0
bgr               0
bu                0
sc                0
sod               0
pot               0
hemo              0
pcv               0
wc                0
rc                0
htn               0
dm                0
cad               0
appet             0
pe                0
ane               0
classification    0
dtype: int64

--- Target variable 'classification' counts ---
classification
1    248
0    150
Name: count, dtype: int64

--- Preprocessing Complete ---
Processed Features (X_processed) head:
        age        bp    sg   al   su  rbc   pc  pcc   ba       bgr  ...  \
0  0.522727  0.230769  0.75  0.2  0.0  1.0  1.0  0.0  0.0  0.211538  ...   
1  0.056818  0.000000  0.75  0.8  0.0  1.0  1.0  0.0  0.0  0.211538  ...   
2

Stage 2: Feature Selection

1. Pearson Correlation

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# --- Re-create X_processed and y if not in memory ---
# (You can skip this part if X_processed and y are already in your environment)
try:
    # This is to make sure X_processed and y exist
    X_processed.shape
    y.shape
    print("X_processed and y are already in memory. Proceeding...")
except NameError:
    print("X_processed and y not found in memory. Re-running essential parts of Stage 1...")
    # A minimal re-run to get X_processed and y
    df = pd.read_csv('kidney_disease.csv')
    df = df.drop('id', axis=1)
    df.replace(['?', '\t?', '\t'], np.nan, inplace=True)
    
    numeric_cols = ['age', 'bp', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc']
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')
        
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    categorical_cols.remove('classification')
    numerical_cols_with_data = df.select_dtypes(include=np.number).columns.tolist()
    
    from sklearn.impute import SimpleImputer
    num_imputer = SimpleImputer(strategy='median')
    df[numerical_cols_with_data] = num_imputer.fit_transform(df[numerical_cols_with_data])
    cat_imputer = SimpleImputer(strategy='most_frequent')
    df[categorical_cols] = cat_imputer.fit_transform(df[categorical_cols])
    
    le = LabelEncoder()
    for col in categorical_cols:
        df[col] = le.fit_transform(df[col])
        
    df['classification'] = df['classification'].map({'ckd': 1, 'notckd': 0})
    df['classification'] = df['classification'].replace(r'\s*ckd\s*', 1, regex=True)
    df['classification'] = df['classification'].replace(r'\s*notckd\s*', 0, regex=True)
    df.dropna(subset=['classification'], inplace=True)
    df['classification'] = df['classification'].astype(int)
    
    X = df.drop('classification', axis=1)
    y = df['classification']
    
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)
    X_processed = pd.DataFrame(X_scaled, columns=X.columns)
    print("Re-creation complete.")
# --- End of re-creation block ---


# 1. Pearson Correlation
print("\n--- 1. Running Pearson Correlation ---")
# Calculate Pearson correlation
cor = X_processed.corrwith(y)
cor_target = abs(cor)

# Select the top 22 features
relevant_features_pearson = cor_target.nlargest(22).index

# Create the new dataset
X_pearson = X_processed[relevant_features_pearson]
Dataset_1 = X_pearson.copy()
Dataset_1['label'] = y.values

print(f"Selected {X_pearson.shape[1]} features: {list(relevant_features_pearson)}")
Dataset_1.to_csv('Dataset_1.csv', index=False)
print("Saved Dataset_1.csv")

X_processed and y are already in memory. Proceeding...

--- 1. Running Pearson Correlation ---
Selected 22 features: ['hemo', 'pcv', 'sg', 'htn', 'rc', 'al', 'dm', 'appet', 'bgr', 'pe', 'pc', 'bu', 'sod', 'ane', 'bp', 'su', 'sc', 'rbc', 'pcc', 'cad', 'age', 'ba']
Saved Dataset_1.csv


2. CHI-SQUARE (SelectKBest)

In [6]:
from sklearn.feature_selection import SelectKBest, chi2

# 2. CHI-SQUARE
print("\n--- 2. Running Chi-Square (SelectKBest) ---")
# Select top 18 features
chi_selector = SelectKBest(chi2, k=18)
X_chi2_arr = chi_selector.fit_transform(X_processed, y)
relevant_features_chi2 = X_processed.columns[chi_selector.get_support()]

# Create the new dataset
X_chi2 = pd.DataFrame(X_chi2_arr, columns=relevant_features_chi2)
Dataset_2 = X_chi2.copy()
Dataset_2['label'] = y.values

print(f"Selected {X_chi2.shape[1]} features: {list(relevant_features_chi2)}")
Dataset_2.to_csv('Dataset_2.csv', index=False)
print("Saved Dataset_2.csv")


--- 2. Running Chi-Square (SelectKBest) ---
Selected 18 features: ['sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu', 'sc', 'hemo', 'pcv', 'rc', 'htn', 'dm', 'appet', 'pe', 'ane']
Saved Dataset_2.csv


3. VARIANCE THRESHOLD

In [7]:
from sklearn.feature_selection import VarianceThreshold

# 3. VARIANCE THRESHOLD
print("\n--- 3. Running Variance Threshold ---")
# Calculate variances
variances = X_processed.var()

# Get the names of the top 7 features with the highest variance
relevant_features_var = variances.nlargest(7).index

# Create the new dataset
X_var_thresh = X_processed[relevant_features_var]
Dataset_3 = X_var_thresh.copy()
Dataset_3['label'] = y.values

print(f"Selected {X_var_thresh.shape[1]} features: {list(relevant_features_var)}")
Dataset_3.to_csv('Dataset_3.csv', index=False)
print("Saved Dataset_3.csv")


--- 3. Running Variance Threshold ---
Selected 7 features: ['htn', 'appet', 'pe', 'pc', 'ane', 'rbc', 'pcc']
Saved Dataset_3.csv


4. RECURSIVE FEATURE ELIMINATION (RFE)

In [8]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# 4. RFE (Recursive Feature Elimination)
print("\n--- 4. Running RFE ---")
# Initialize the model and RFE
# We use max_iter=1000 to ensure convergence
model_lr = LogisticRegression(max_iter=1000, random_state=42)
rfe = RFE(estimator=model_lr, n_features_to_select=11)

# Fit RFE
X_rfe_arr = rfe.fit_transform(X_processed, y)
relevant_features_rfe = X_processed.columns[rfe.get_support()]

# Create the new dataset
X_rfe = pd.DataFrame(X_rfe_arr, columns=relevant_features_rfe)
Dataset_4 = X_rfe.copy()
Dataset_4['label'] = y.values

print(f"Selected {X_rfe.shape[1]} features: {list(relevant_features_rfe)}")
Dataset_4.to_csv('Dataset_4.csv', index=False)
print("Saved Dataset_4.csv")


--- 4. Running RFE ---
Selected 11 features: ['sg', 'al', 'su', 'pc', 'hemo', 'pcv', 'rc', 'htn', 'dm', 'appet', 'pe']
Saved Dataset_4.csv


In [None]:
5. SEQUENTIAL FEATURE SELECTION (SFS)

In [9]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression

# 5. SFS (Sequential Feature Selection)
print("\n--- 5. Running SFS ---")
# Note: SFS can be slow. n_jobs=-1 uses all available CPU cores.
model_lr_sfs = LogisticRegression(max_iter=1000, random_state=42)
sfs = SequentialFeatureSelector(estimator=model_lr_sfs, 
                              n_features_to_select=11, 
                              direction='forward', 
                              cv=5, 
                              n_jobs=-1)
# Fit SFS
X_sfs_arr = sfs.fit_transform(X_processed, y)
relevant_features_sfs = X_processed.columns[sfs.get_support()]

# Create the new dataset
X_sfs = pd.DataFrame(X_sfs_arr, columns=relevant_features_sfs)
Dataset_5 = X_sfs.copy()
Dataset_5['label'] = y.values

print(f"Selected {X_sfs.shape[1]} features: {list(relevant_features_sfs)}")
Dataset_5.to_csv('Dataset_5.csv', index=False)
print("Saved Dataset_5.csv")


--- 5. Running SFS ---
Selected 11 features: ['bp', 'sg', 'al', 'su', 'bu', 'hemo', 'pcv', 'rc', 'htn', 'dm', 'pe']
Saved Dataset_5.csv


6. LASSO REGRESSION (L1 Regularization)

In [10]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

# 6. LASSO REGRESSION
print("\n--- 6. Running Lasso Regression ---")
# Use LassoCV to find the best alpha (regularization strength)
# We set a high max_iter for convergence
lasso_cv = LassoCV(cv=5, max_iter=10000, random_state=42)
lasso_cv.fit(X_processed, y)

# SelectFromModel will pick features whose coefficients are non-zero
sfm_lasso = SelectFromModel(lasso_cv, threshold=1e-5) # Use a small threshold
sfm_lasso.fit(X_processed, y)

relevant_features_lasso = X_processed.columns[sfm_lasso.get_support()]

# Create the new dataset
X_lasso = X_processed[relevant_features_lasso]
Dataset_6 = X_lasso.copy()
Dataset_6['label'] = y.values

print(f"Selected {X_lasso.shape[1]} features: {list(relevant_features_lasso)}")
Dataset_6.to_csv('Dataset_6.csv', index=False)
print("Saved Dataset_6.csv")


--- 6. Running Lasso Regression ---
Selected 16 features: ['bp', 'sg', 'al', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu', 'hemo', 'pcv', 'htn', 'dm', 'appet', 'pe', 'ane']
Saved Dataset_6.csv


In [None]:
7. RIDGE REGRESSION (L2 Regularization)

In [11]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import RidgeCV

# 7. RIDGE REGRESSION (used with RFE)
print("\n--- 7. Running Ridge Regression (with RFE) ---")
# Use RidgeCV to find the best alpha during cross-validation
ridge_cv = RidgeCV(alphas=np.logspace(-6, 6, 13), cv=5)

# Use RFE to select the 11 most important features as ranked by Ridge
rfe_ridge = RFE(estimator=ridge_cv, n_features_to_select=11)
X_ridge_arr = rfe_ridge.fit_transform(X_processed, y)
relevant_features_ridge = X_processed.columns[rfe_ridge.get_support()]

# Create the new dataset
X_ridge = pd.DataFrame(X_ridge_arr, columns=relevant_features_ridge)
Dataset_7 = X_ridge.copy()
Dataset_7['label'] = y.values

print(f"Selected {X_ridge.shape[1]} features: {list(relevant_features_ridge)}")
Dataset_7.to_csv('Dataset_7.csv', index=False)
print("Saved Dataset_7.csv")


--- 7. Running Ridge Regression (with RFE) ---
Selected 11 features: ['bp', 'sg', 'al', 'bgr', 'bu', 'sc', 'sod', 'hemo', 'htn', 'dm', 'cad']
Saved Dataset_7.csv


Stage 3 (Part A): K-Fold Classification & Classifier 1

Random Forest

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Import all classifiers we will use
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import warnings

warnings.filterwarnings('ignore')

# --- 1. Load All 7 Datasets ---

datasets = {}
for i in range(1, 8):
    try:
        filename = f'Dataset_{i}.csv'
        df = pd.read_csv(filename)
        datasets[filename] = {
            'X': df.drop('label', axis=1),
            'y': df['label']
        }
        print(f"Loaded {filename} with {datasets[filename]['X'].shape[1]} features.")
    except FileNotFoundError:
        print(f"Error: {filename} not found.")

# --- 2. Define the K-Fold Evaluation Function ---

def evaluate_model_kfold(model, model_name, dataset_name, X, y):
    """
    Performs k-fold cross-validation for k=10, 20, 30 and prints results.
    """
    print(f"\n--- Evaluating {model_name} on {dataset_name} ---")
    
    # We use a pipeline for consistency, especially for models like SVM
    # that are sensitive to scaling (even though our data is already scaled).
    # For SVC, we need probability=True to calculate AUC.
    if model_name == "Support Vector Machine":
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('model', SVC(probability=True, random_state=42))
        ])
    else:
         pipeline = Pipeline([
            ('model', model)
        ])

    # Define the k-values
    k_values = [10, 20, 30]
    
    for k in k_values:
        try:
            # Set up the K-Fold
            kfold = KFold(n_splits=k, shuffle=True, random_state=42)
            
            # Get cross-validation scores for accuracy
            acc_scores = cross_val_score(pipeline, X, y, cv=kfold, scoring='accuracy')
            
            # Get cross-validation scores for AUC
            # Some models/data might struggle with AUC in some folds
            try:
                auc_scores = cross_val_score(pipeline, X, y, cv=kfold, scoring='roc_auc')
                auc_mean = np.mean(auc_scores)
            except ValueError as e:
                # print(f"  Warning (k={k}): Could not calculate AUC. {e}")
                auc_mean = np.nan # Report as Not a Number if it fails
            
            print(f"  K = {k}:")
            print(f"    Accuracy: {np.mean(acc_scores):.4f} (± {np.std(acc_scores):.4f})")
            print(f"    AUC Score: {auc_mean:.4f} (± {np.std(auc_scores):.4f})")
            
        except Exception as e:
            print(f"  Error during k={k} evaluation: {e}")
            
    print("-" * (20 + len(model_name) + len(dataset_name)))


# --- 3. Run Classifier 1: Random Forest ---

print("\n\n{'=' * 20} STARTING CLASSIFIER 1: RANDOM FOREST {'=' * 20}")

# Initialize the model
rf_model = RandomForestClassifier(random_state=42)

# Loop through all 7 datasets and evaluate the model
for name, data in datasets.items():
    evaluate_model_kfold(rf_model, "Random Forest", name, data['X'], data['y'])

print("\n{'=' * 20} COMPLETED CLASSIFIER 1: RANDOM FOREST {'=' * 20}")

Loaded Dataset_1.csv with 22 features.
Loaded Dataset_2.csv with 18 features.
Loaded Dataset_3.csv with 7 features.
Loaded Dataset_4.csv with 11 features.
Loaded Dataset_5.csv with 11 features.
Loaded Dataset_6.csv with 16 features.
Loaded Dataset_7.csv with 11 features.


{'=' * 20} STARTING CLASSIFIER 1: RANDOM FOREST {'=' * 20}

--- Evaluating Random Forest on Dataset_1.csv ---
  K = 10:
    Accuracy: 0.9924 (± 0.0161)
    AUC Score: 1.0000 (± 0.0000)
  K = 20:
    Accuracy: 0.9950 (± 0.0218)
    AUC Score: 1.0000 (± 0.0000)
  K = 30:
    Accuracy: 0.9923 (± 0.0304)
    AUC Score: 1.0000 (± 0.0000)
----------------------------------------------

--- Evaluating Random Forest on Dataset_2.csv ---
  K = 10:
    Accuracy: 0.9924 (± 0.0161)
    AUC Score: 1.0000 (± 0.0000)
  K = 20:
    Accuracy: 0.9925 (± 0.0238)
    AUC Score: 1.0000 (± 0.0000)
  K = 30:
    Accuracy: 0.9923 (± 0.0304)
    AUC Score: 1.0000 (± 0.0000)
----------------------------------------------

--- Evaluating Rando

Gradient Boosting, AdaBoost, and XGBoost

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import warnings

# Import the classifiers for this stage
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC # Need this for the evaluation function logic
warnings.filterwarnings('ignore')


# --- Check if datasets and function exist from previous step ---
try:
    datasets
    evaluate_model_kfold
    print("Dependencies from Stage 3 (Part A) found. Proceeding...")
except NameError:
    print("Error: 'datasets' or 'evaluate_model_kfold' not found.")
    print("Please re-run the code from Stage 3 (Part A) first.")
    # Stop execution if dependencies are missing
    raise

# --- 4. Run Classifier 2: Gradient Boosting ---

print("\n\n{'=' * 20} STARTING CLASSIFIER 2: GRADIENT BOOSTING {'=' * 20}")

# Initialize the model
gb_model = GradientBoostingClassifier(random_state=42)

# Loop through all 7 datasets and evaluate the model
for name, data in datasets.items():
    evaluate_model_kfold(gb_model, "Gradient Boosting", name, data['X'], data['y'])

print("\n{'=' * 20} COMPLETED CLASSIFIER 2: GRADIENT BOOSTING {'=' * 20}")


# --- 5. Run Classifier 3: AdaBoost (Adaptive Boosting) ---

print("\n\n{'=' * 20} STARTING CLASSIFIER 3: ADABOOST {'=' * 20}")

# Initialize the model
# AdaBoost often works well with a simple Decision Tree as its base estimator
from sklearn.tree import DecisionTreeClassifier
ada_model = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1), random_state=42)


# Loop through all 7 datasets and evaluate the model
for name, data in datasets.items():
    evaluate_model_kfold(ada_model, "AdaBoost", name, data['X'], data['y'])

print("\n{'=' * 20} COMPLETED CLASSIFIER 3: ADABOOST {'=' * 20}")


# --- 6. Run Classifier 4: XGBoost (eXtreme Gradient Boosting) ---

print("\n\n{'=' * 20} STARTING CLASSIFIER 4: XGBOOST {'=' * 20}")

# Initialize the model
# eval_metric='logloss' is common for binary classification
xgb_model = XGBClassifier(random_state=42, eval_metric='logloss', use_label_encoder=False)

# Loop through all 7 datasets and evaluate the model
for name, data in datasets.items():
    evaluate_model_kfold(xgb_model, "XGBoost", name, data['X'], data['y'])

print("\n{'=' * 20} COMPLETED CLASSIFIER 4: XGBOOST {'=' * 20}")

Dependencies from Stage 3 (Part A) found. Proceeding...


{'=' * 20} STARTING CLASSIFIER 2: GRADIENT BOOSTING {'=' * 20}

--- Evaluating Gradient Boosting on Dataset_1.csv ---
  K = 10:
    Accuracy: 0.9874 (± 0.0168)
    AUC Score: 0.9997 (± 0.0008)
  K = 20:
    Accuracy: 0.9875 (± 0.0268)
    AUC Score: 1.0000 (± 0.0000)
  K = 30:
    Accuracy: 0.9850 (± 0.0360)
    AUC Score: 1.0000 (± 0.0000)
--------------------------------------------------

--- Evaluating Gradient Boosting on Dataset_2.csv ---
  K = 10:
    Accuracy: 0.9899 (± 0.0123)
    AUC Score: 0.9995 (± 0.0016)
  K = 20:
    Accuracy: 0.9925 (± 0.0179)
    AUC Score: 0.9995 (± 0.0022)
  K = 30:
    Accuracy: 0.9899 (± 0.0257)
    AUC Score: 0.9992 (± 0.0043)
--------------------------------------------------

--- Evaluating Gradient Boosting on Dataset_3.csv ---
  K = 10:
    Accuracy: 0.8996 (± 0.0446)
    AUC Score: 0.9199 (± 0.0341)
  K = 20:
    Accuracy: 0.8995 (± 0.0613)
    AUC Score: 0.9204 (± 0.0474)
  K = 30:
  

Support Vector Machine (SVM), Decision Tree, and Logistic Regression.

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import warnings

# Import the classifiers for this stage
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Import other classifiers needed for the function logic
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')


# --- Check if datasets and function exist from previous step ---
try:
    datasets
    evaluate_model_kfold
    print("Dependencies from Stage 3 (Part A) found. Proceeding...")
except NameError:
    print("Error: 'datasets' or 'evaluate_model_kfold' not found.")
    print("Please re-run the code from Stage 3 (Part A) first.")
    # Stop execution if dependencies are missing
    raise

# --- 7. Run Classifier 5: Support Vector Machine (SVM) ---

print("\n\n{'=' * 20} STARTING CLASSIFIER 5: SUPPORT VECTOR MACHINE {'=' * 20}")

# Initialize the model
# We set probability=True to be able to calculate ROC AUC scores
svm_model = SVC(probability=True, random_state=42)

# Loop through all 7 datasets and evaluate the model
for name, data in datasets.items():
    # The 'evaluate_model_kfold' function has special handling
    # to put SVM in a pipeline with a StandardScaler,
    # as SVMs are very sensitive to feature scales.
    evaluate_model_kfold(svm_model, "Support Vector Machine", name, data['X'], data['y'])

print("\n{'=' * 20} COMPLETED CLASSIFIER 5: SUPPORT VECTOR MACHINE {'=' * 20}")


# --- 8. Run Classifier 6: Decision Tree ---

print("\n\n{'=' * 20} STARTING CLASSIFIER 6: DECISION TREE {'=' * 20}")

# Initialize the model
dt_model = DecisionTreeClassifier(random_state=42)

# Loop through all 7 datasets and evaluate the model
for name, data in datasets.items():
    evaluate_model_kfold(dt_model, "Decision Tree", name, data['X'], data['y'])

print("\n{'=' * 20} COMPLETED CLASSIFIER 6: DECISION TREE {'=' * 20}")


# --- 9. Run Classifier 7: Logistic Regression ---

print("\n\n{'=' * 20} STARTING CLASSIFIER 7: LOGISTIC REGRESSION {'=' * 20}")

# Initialize the model
# We set max_iter=1000 to ensure convergence
lr_model = LogisticRegression(random_state=42, max_iter=1000)

# Loop through all 7 datasets and evaluate the model
for name, data in datasets.items():
    evaluate_model_kfold(lr_model, "Logistic Regression", name, data['X'], data['y'])

print("\n{'=' * 20} COMPLETED CLASSIFIER 7: LOGISTIC REGRESSION {'=' * 20}")

print("\n\n*** All 7 classifiers have been evaluated on all 7 datasets. ***")

Dependencies from Stage 3 (Part A) found. Proceeding...


{'=' * 20} STARTING CLASSIFIER 5: SUPPORT VECTOR MACHINE {'=' * 20}

--- Evaluating Support Vector Machine on Dataset_1.csv ---
  K = 10:
    Accuracy: 0.9950 (± 0.0100)
    AUC Score: 1.0000 (± 0.0000)
  K = 20:
    Accuracy: 0.9950 (± 0.0150)
    AUC Score: 1.0000 (± 0.0000)
  K = 30:
    Accuracy: 0.9951 (± 0.0185)
    AUC Score: 1.0000 (± 0.0000)
-------------------------------------------------------

--- Evaluating Support Vector Machine on Dataset_2.csv ---
  K = 10:
    Accuracy: 0.9975 (± 0.0075)
    AUC Score: 1.0000 (± 0.0000)
  K = 20:
    Accuracy: 0.9975 (± 0.0109)
    AUC Score: 1.0000 (± 0.0000)
  K = 30:
    Accuracy: 0.9976 (± 0.0128)
    AUC Score: 1.0000 (± 0.0000)
-------------------------------------------------------

--- Evaluating Support Vector Machine on Dataset_3.csv ---
  K = 10:
    Accuracy: 0.8996 (± 0.0446)
    AUC Score: 0.9199 (± 0.0341)
  K = 20:
    Accuracy: 0.8995 (± 0.0613)
    AUC Score: 

Stage 4: Model Blending (Stacking)

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import warnings

# --- 1. Import All Classifiers for Base Models ---
from sklearn.ensemble import (
    RandomForestClassifier, 
    GradientBoostingClassifier, 
    AdaBoostClassifier,
    StackingClassifier
)
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

warnings.filterwarnings('ignore')

# --- 2. Check if datasets and function exist from previous step ---
try:
    datasets
    evaluate_model_kfold
    print("Dependencies from Stage 3 found. Proceeding with Blending...")
except NameError:
    print("Error: 'datasets' or 'evaluate_model_kfold' not found.")
    print("Please re-run the code from Stage 3 (Part A) first.")
    # Stop execution if dependencies are missing
    raise

# --- 3. Define the Base Models (Level 0) ---
# We use the same settings as before for consistency
base_models = [
    ('RandomForest', RandomForestClassifier(random_state=42)),
    ('GradientBoosting', GradientBoostingClassifier(random_state=42)),
    ('AdaBoost', AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1), random_state=42)),
    ('XGBoost', XGBClassifier(random_state=42, eval_metric='logloss')),
    ('SVM', SVC(probability=True, random_state=42)),
    ('DecisionTree', DecisionTreeClassifier(random_state=42)),
    ('LogisticRegression', LogisticRegression(random_state=42, max_iter=1000))
]

# --- 4. Define the Meta-Model (Level 1) ---
# A Logistic Regression is a common and effective meta-model
meta_model = LogisticRegression(random_state=42, max_iter=1000)

# --- 5. Create the Stacking (Blending) Classifier ---
# 'passthrough=True' means the meta-model gets both the
# original features AND the predictions from the base models.
# 'cv=5' means the base models' predictions are generated
# using 5-fold cross-validation to prevent data leakage.
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    passthrough=False, # Set to True if you want meta-model to also see original features
    cv=5,
    n_jobs=-1
)

# --- 6. Run the Blended Model Evaluation ---

print("\n\n{'=' * 20} STARTING STAGE 4: MODEL BLENDING (STACKING) {'=' * 20}")

# Loop through all 7 datasets and evaluate the blended model
for name, data in datasets.items():
    evaluate_model_kfold(stacking_model, "Blended (Stacking) Model", name, data['X'], data['y'])

print("\n{'=' * 20} COMPLETED STAGE 4: MODEL BLENDING (STACKING) {'=' * 20}")

Dependencies from Stage 3 found. Proceeding with Blending...


{'=' * 20} STARTING STAGE 4: MODEL BLENDING (STACKING) {'=' * 20}

--- Evaluating Blended (Stacking) Model on Dataset_1.csv ---
  K = 10:
    Accuracy: 0.9925 (± 0.0160)
    AUC Score: 1.0000 (± 0.0000)
  K = 20:
    Accuracy: 0.9925 (± 0.0238)
    AUC Score: 1.0000 (± 0.0000)
  K = 30:
    Accuracy: 0.9925 (± 0.0300)
    AUC Score: 1.0000 (± 0.0000)
---------------------------------------------------------

--- Evaluating Blended (Stacking) Model on Dataset_2.csv ---
  K = 10:
    Accuracy: 1.0000 (± 0.0000)
    AUC Score: 1.0000 (± 0.0000)
  K = 20:
    Accuracy: 0.9975 (± 0.0109)
    AUC Score: 1.0000 (± 0.0000)
  K = 30:
    Accuracy: 0.9974 (± 0.0138)
    AUC Score: 1.0000 (± 0.0000)
---------------------------------------------------------

--- Evaluating Blended (Stacking) Model on Dataset_3.csv ---
  K = 10:
    Accuracy: 0.8996 (± 0.0446)
    AUC Score: 0.9199 (± 0.0341)
  K = 20:
    Accuracy: 0.8995 (± 0.0613)
  