In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# 1. LOAD AND EXPLORE DATA
# ============================================================================

# Load the dataset
data = pd.read_csv('diabetes.csv')

print("Dataset Shape:", data.shape)
print("\\nFirst Few Rows:")
print(data.head())
print("\\nDataset Info:")
print(data.info())
print("\\nBasic Statistics:")
print(data.describe())
print("\\nMissing Values:")
print(data.isnull().sum())

# ============================================================================
# 2. DATA PREPROCESSING
# ============================================================================

print("\\n" + "="*80)
print("DATA PREPROCESSING")
print("="*80)

# Check for missing or 'No Info' values
print("\\nUnique values in each column:")
for col in data.columns:
    print(f"{col}: {data[col].unique()}")

# Handle 'No Info' values - replace with most frequent value or mode
data_clean = data.copy()

# For smoking_history 'No Info' - this is categorical, we'll keep it as a separate category
# For heart_disease 'No Info' - replace with mode (0)
if 'No Info' in data_clean['heart_disease'].values:
    data_clean['heart_disease'] = data_clean['heart_disease'].replace('No Info', 
                                   data_clean['heart_disease'].mode()[0])

# Convert data types
data_clean['heart_disease'] = data_clean['heart_disease'].astype(int)
data_clean['hypertension'] = data_clean['hypertension'].astype(int)
data_clean['diabetes'] = data_clean['diabetes'].astype(int)

print("\\nData types after conversion:")
print(data_clean.dtypes)

# ============================================================================
# 3. ONE-HOT ENCODING FOR NON-NUMERIC COLUMNS
# ============================================================================

print("\\n" + "="*80)
print("ONE-HOT ENCODING")
print("="*80)

# Identify categorical columns
categorical_cols = data_clean.select_dtypes(include=['object']).columns.tolist()
print(f"\\nCategorical columns: {categorical_cols}")

# Apply one-hot encoding
data_encoded = pd.get_dummies(data_clean, columns=categorical_cols, drop_first=True)

print(f"\\nShape after encoding: {data_encoded.shape}")
print("\\nColumn names after encoding:")
print(data_encoded.columns.tolist())

# ============================================================================
# 4. CHECK DATA IMBALANCE
# ============================================================================

print("\\n" + "="*80)
print("DATA IMBALANCE CHECK")
print("="*80)

target_dist = data_encoded['diabetes'].value_counts()
print("\\nTarget Variable Distribution:")
print(target_dist)
print(f"\\nPercentage Distribution:")
print(data_encoded['diabetes'].value_counts(normalize=True) * 100)

imbalance_ratio = target_dist.max() / target_dist.min()
print(f"\\nImbalance Ratio: {imbalance_ratio:.2f}")

if imbalance_ratio > 1.5:
    print("⚠️  Dataset is IMBALANCED. SMOTE will be applied.")
else:
    print("✓ Dataset is relatively balanced.")

# ============================================================================
# 5. SPLIT DATA AND APPLY SMOTE
# ============================================================================

print("\\n" + "="*80)
print("TRAIN-TEST SPLIT & SMOTE")
print("="*80)

# Separate features and target
X = data_encoded.drop('diabetes', axis=1)
y = data_encoded['diabetes']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                      random_state=42, stratify=y)

print(f"\\nTraining set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")
print(f"\\nTraining set class distribution:")
print(y_train.value_counts())

# Apply SMOTE to training data only
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print(f"\\nAfter SMOTE - Training set size: {X_train_balanced.shape}")
print(f"After SMOTE - Class distribution:")
print(pd.Series(y_train_balanced).value_counts())

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test)

# ============================================================================
# 6. TRAIN MULTIPLE CLASSIFICATION MODELS
# ============================================================================

print("\\n" + "="*80)
print("MODEL TRAINING & EVALUATION")
print("="*80)

# Define models
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42, n_estimators=100),
    'SVM': SVC(random_state=42, probability=True),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB()
}

# Train and evaluate all models
results = {}

for name, model in models.items():
    print(f"\\n{'='*60}")
    print(f"Training: {name}")
    print('='*60)
    
    # Train model
    model.fit(X_train_scaled, y_train_balanced)
    
    # Predictions
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1] if hasattr(model, 'predict_proba') else None
    
    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    
    # Cross-validation score on training data
    cv_scores = cross_val_score(model, X_train_scaled, y_train_balanced, cv=5)
    cv_mean = cv_scores.mean()
    
    results[name] = {
        'model': model,
        'accuracy': accuracy,
        'cv_score': cv_mean,
        'predictions': y_pred
    }
    
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Cross-Validation Score (mean): {cv_mean:.4f}")
    print(f"\\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print(f"\\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

# ============================================================================
# 7. SELECT BEST 3 MODELS
# ============================================================================

print("\\n" + "="*80)
print("MODEL SELECTION - TOP 3 MODELS")
print("="*80)

# Sort models by accuracy
sorted_models = sorted(results.items(), key=lambda x: x[1]['accuracy'], reverse=True)

print("\\nAll Models Ranked by Accuracy:")
for i, (name, metrics) in enumerate(sorted_models, 1):
    print(f"{i}. {name}: Accuracy = {metrics['accuracy']:.4f}, CV Score = {metrics['cv_score']:.4f}")

# Select top 3
top_3_names = [name for name, _ in sorted_models[:3]]
print(f"\\n✓ Selected TOP 3 Models: {top_3_names}")

# ============================================================================
# 8. CREATE VOTING ENSEMBLE FROM BEST 3 MODELS
# ============================================================================

print("\\n" + "="*80)
print("VOTING ENSEMBLE USING TOP 3 MODELS")
print("="*80)

# Get predictions from top 3 models
top_3_predictions = []
for name in top_3_names:
    top_3_predictions.append(results[name]['predictions'])

# Convert to numpy array for easier manipulation
predictions_array = np.array(top_3_predictions)

# Hard Voting - majority vote
ensemble_predictions_hard = np.apply_along_axis(
    lambda x: np.bincount(x).argmax(), 
    axis=0, 
    arr=predictions_array
)

# Calculate ensemble accuracy
ensemble_accuracy = accuracy_score(y_test, ensemble_predictions_hard)

print(f"\\nVoting Ensemble Performance:")
print(f"Accuracy: {ensemble_accuracy:.4f}")
print(f"\\nClassification Report:")
print(classification_report(y_test, ensemble_predictions_hard))
print(f"\\nConfusion Matrix:")
print(confusion_matrix(y_test, ensemble_predictions_hard))

# Compare with individual models
print(f"\\n{'='*60}")
print("COMPARISON: Individual Models vs Ensemble")
print('='*60)
for name in top_3_names:
    print(f"{name}: {results[name]['accuracy']:.4f}")
print(f"Voting Ensemble: {ensemble_accuracy:.4f}")

improvement = ensemble_accuracy > max([results[name]['accuracy'] for name in top_3_names])
if improvement:
    print(f"\\n✓ Ensemble IMPROVED over individual models!")
else:
    print(f"\\n→ Best individual model performs as well or better than ensemble.")

# ============================================================================
# 9. VISUALIZATION
# ============================================================================

print("\\n" + "="*80)
print("GENERATING VISUALIZATIONS")
print("="*80)

# Create visualizations
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Model Comparison
ax1 = axes[0, 0]
model_names = [name for name, _ in sorted_models]
accuracies = [metrics['accuracy'] for _, metrics in sorted_models]
colors = ['#2ecc71' if name in top_3_names else '#95a5a6' for name in model_names]
ax1.barh(model_names, accuracies, color=colors)
ax1.set_xlabel('Accuracy')
ax1.set_title('Model Performance Comparison\\n(Green = Top 3 Selected)')
ax1.set_xlim([0, 1])

# 2. Class Distribution Before and After SMOTE
ax2 = axes[0, 1]
x_pos = np.arange(2)
before_smote = y_train.value_counts().sort_index().values
after_smote = pd.Series(y_train_balanced).value_counts().sort_index().values
width = 0.35
ax2.bar(x_pos - width/2, before_smote, width, label='Before SMOTE', color='#e74c3c')
ax2.bar(x_pos + width/2, after_smote, width, label='After SMOTE', color='#2ecc71')
ax2.set_xlabel('Class')
ax2.set_ylabel('Count')
ax2.set_title('Class Distribution: Before vs After SMOTE')
ax2.set_xticks(x_pos)
ax2.set_xticklabels(['No Diabetes (0)', 'Diabetes (1)'])
ax2.legend()

# 3. Confusion Matrix for Ensemble
ax3 = axes[1, 0]
cm = confusion_matrix(y_test, ensemble_predictions_hard)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax3)
ax3.set_xlabel('Predicted')
ax3.set_ylabel('Actual')
ax3.set_title('Confusion Matrix - Voting Ensemble')

# 4. Top 3 Models Accuracy Comparison
ax4 = axes[1, 1]
top_3_acc = [results[name]['accuracy'] for name in top_3_names]
top_3_acc.append(ensemble_accuracy)
labels = top_3_names + ['Voting\\nEnsemble']
colors_comp = ['#3498db', '#3498db', '#3498db', '#e74c3c']
ax4.bar(labels, top_3_acc, color=colors_comp)
ax4.set_ylabel('Accuracy')
ax4.set_title('Top 3 Models vs Voting Ensemble')
ax4.set_ylim([0, 1])
ax4.axhline(y=max(top_3_acc), color='green', linestyle='--', alpha=0.3, label='Best Score')

plt.tight_layout()
plt.savefig('diabetes_classification_results.png', dpi=300, bbox_inches='tight')
print("\\n✓ Visualization saved as 'diabetes_classification_results.png'")
plt.show()

# ============================================================================
# 10. SUMMARY
# ============================================================================

print("\\n" + "="*80)
print("ANALYSIS SUMMARY")
print("="*80)

print(f"""
Dataset Information:
- Total samples: {data.shape[0]}
- Features: {data.shape[1] - 1}
- Target: diabetes (binary classification)

Preprocessing:
- One-hot encoding applied to: {categorical_cols}
- Features after encoding: {X.shape[1]}
- SMOTE applied to balance training data

Model Performance:
- Total models trained: {len(models)}
- Best individual model: {sorted_models[0][0]} ({sorted_models[0][1]['accuracy']:.4f})
- Top 3 models: {', '.join(top_3_names)}
- Voting Ensemble accuracy: {ensemble_accuracy:.4f}

Conclusion:
The voting ensemble combines predictions from the top 3 performing models
using hard voting (majority vote) to produce the final classification.
""")

print("\\n" + "="*80)
print("ANALYSIS COMPLETE")
print("="*80)

Dataset Shape: (18512, 9)
\nFirst Few Rows:
   gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0           never  27.32   
3  Female  36.0             0              0         current  23.45   
4    Male  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                140.0         0  
1          6.6                 80.0         0  
2          5.7                158.0         0  
3          5.0                155.0         0  
4          4.8                155.0         0  
\nDataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18512 entries, 0 to 18511
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   gender      

ValueError: Input X contains NaN.
SMOTE does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values