In [16]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.inspection import permutation_importance
import shap
import matplotlib.pyplot as plt

# Load the dataset

try:
    df = pd.read_csv('diabetes_dataset.csv')
    print("Dataset loaded successfully.")
    print("First 5 rows of the dataset:")
    print(df.head())
    print("\nDataset Info:")
    df.info()
except FileNotFoundError:
    print("Error: 'your_dataset.csv' not found. Please make sure the file is in the correct directory.")


Dataset loaded successfully.
First 5 rows of the dataset:
   gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0           never  27.32   
3  Female  36.0             0              0         current  23.45   
4    Male  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2          5.7                  158         0  
3          5.0                  155         0  
4          4.8                  155         0  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 

In [17]:
from sklearn.preprocessing import LabelEncoder
# Only encode if not already encoded
if 'smoking_history' in df.columns:
    df = pd.get_dummies(df, columns=['smoking_history'])
df.head()   

Unnamed: 0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,Female,80.0,0,1,25.19,6.6,140,0,0,0,0,0,1,0
1,Female,54.0,0,0,27.32,6.6,80,0,1,0,0,0,0,0
2,Male,28.0,0,0,27.32,5.7,158,0,0,0,0,0,1,0
3,Female,36.0,0,0,23.45,5.0,155,0,0,1,0,0,0,0
4,Male,76.0,1,1,20.14,4.8,155,0,0,1,0,0,0,0


In [18]:
## Splitting the data into features and target variable
X = df.drop('diabetes', axis=1)
y = df['diabetes']

In [19]:
# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Scale the features
# # StandardScaler is used as it's effective for both Logistic Regression and SVM
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# print("\nData splitting and scaling complete.")
# print(f"Original shape: {X.shape}")
# print(f"Training set shape: {X_train.shape}")
# print(f"Testing set shape: {X_test.shape}")

In [21]:
# Assuming X is your feature DataFrame
# Identify categorical columns

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
categorical_cols = ['Gender']  # Add other categorical columns if any
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Create a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

print("\nData splitting and scaling complete.")
print(f"Original shape: {X.shape}")
print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")


Data splitting and scaling complete.
Original shape: (100000, 13)
Training set shape: (80000, 13)
Testing set shape: (20000, 13)


In [None]:
# Define the base models
# Note: Random Forest doesn't require scaled data
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
lr_model = LogisticRegression(solver='liblinear', random_state=42)
svm_model = SVC(probability=True, random_state=42)

# Create a list of estimators for stacking
estimators = [
    ('rf', rf_model),
    ('lr', lr_model),
    ('svm', svm_model)
]

# Define the final estimator (meta-model). A simple Logistic Regression is a good choice.
final_estimator = LogisticRegression()

# Create the stacking classifier
stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=final_estimator,
    cv=5 # Cross-validation folds
)

# Train the ensemble model
# Note: Stacking model trains on the scaled data for all estimators internally
stacking_model.fit(X_train_scaled, y_train)

print("\nStacking ensemble model trained successfully!")

NameError: name 'X_train_scaled' is not defined