<a href="https://colab.research.google.com/github/ucheabaco/500-AI-Machine-learning-Deep-learning-Computer-vision-NLP-Projects-with-code/blob/main/Uche_Agwu_Batch_28_ML_Captstone_Pjt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
from google.colab import files
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [11]:
# Step 1: Upload Dataset from Local Machine
# The result of files.upload() is kept in `uploaded`; the next cell reads the
# uploaded file's name from its keys.
print("Upload your dataset (CSV file):")
uploaded = files.upload()

Upload your dataset (CSV file):


Saving Diabetes.csv to Diabetes.csv


In [12]:

# Extract the filename of the uploaded file.
# `uploaded` is keyed by filename. If nothing was uploaded (e.g. the dialog was
# cancelled) it is empty, so fail with a clear message instead of the opaque
# IndexError that indexing an empty key list would raise.
if not uploaded:
    raise ValueError("No file was uploaded; re-run the upload cell and select a CSV file.")

# First uploaded filename; the variable name is kept because the loading cell reads it.
Diabetes = next(iter(uploaded))

In [13]:
# Load the dataset
# `Diabetes` holds the name of the uploaded file (taken from `uploaded`'s keys).
df = pd.read_csv(Diabetes)

In [26]:
# Preview the first rows of the DataFrame (rich display as the cell's output).
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,125.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,125.0,26.6,0.351,31,0
2,8,183.0,64.0,29.0,125.0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [14]:
# Display basic info
print("\nDataset Loaded Successfully!")
# df.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.head())


Dataset Loaded Successfully!
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        

In [16]:
# Step 2: Data Preprocessing
# A zero in these measurement columns is treated as a missing reading: it is
# replaced with NaN and then imputed with that column's median.
zero_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[zero_columns] = df[zero_columns].replace(0, np.nan)

# Impute only the columns listed above. Filling the whole frame would also
# overwrite a missing value in any other column (including the 'Outcome'
# target) with a median, silently fabricating data.
df[zero_columns] = df[zero_columns].fillna(df[zero_columns].median())

# NOTE(review): the medians are computed on the full dataset, before the
# train/test split, so test rows influence the imputed values. Consider
# imputing after the split using training-set medians only.

In [18]:
# Step 3: Separate the target from the features, then hold out a test set.
y = df['Outcome']                 # target variable
X = df.drop(columns='Outcome')    # every remaining column is a feature

# 20% of the rows go to the test set; the split is stratified on the target
# and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [19]:

# Step 4: Handle Class Imbalance using SMOTE (adjusted for efficiency)
# Only the training split is resampled; X_test / y_test are left untouched.
# NOTE(review): this runs before the feature-scaling cell, so SMOTE operates on
# unscaled features here — confirm this ordering is intended.
smote = SMOTE(sampling_strategy=0.75, random_state=42)  # Balances the dataset up to 75% of the majority class
X_train, y_train = smote.fit_resample(X_train, y_train)

In [20]:
# Step 5: Feature Scaling
# The scaler is fitted on the training data only, then the same fitted scaler
# is applied to both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [21]:
# Step 6: Feature Selection using RandomForestClassifier
# Fit a 100-tree forest (fixed seed) on the training data; the fitted model is
# passed to SelectFromModel in the following cell.
rf_selector = RandomForestClassifier(n_estimators=100, random_state=42)
rf_selector.fit(X_train, y_train)

In [22]:
# Select features automatically from the already-fitted forest
# (prefit=True: the selector reuses rf_selector as-is) and apply the same
# selection to both splits.
selector = SelectFromModel(estimator=rf_selector, prefit=True)
X_train, X_test = selector.transform(X_train), selector.transform(X_test)

In [23]:
# Step 7: Model Training and Evaluation
# Tune the random forest with a randomized search (10 sampled settings,
# 5-fold CV, accuracy as the score) rather than an exhaustive grid search.
param_dist = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5, 10],
}
rf_random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    random_state=42,
)
rf_random_search.fit(X_train, y_train)

# Best forest found by the search
best_rf = rf_random_search.best_estimator_

# Remaining base learners for the ensemble
lr = LogisticRegression()
xgb = XGBClassifier(n_estimators=150, learning_rate=0.1, random_state=42)
svm = SVC(kernel='rbf', probability=True, random_state=42)  # RBF support vector machine with probability estimates

# Soft-voting ensemble over the four models
voting_clf = VotingClassifier(
    estimators=[
        ('rf', best_rf),
        ('xgb', xgb),
        ('lr', lr),
        ('svm', svm),
    ],
    voting='soft',
)

voting_clf.fit(X_train, y_train)

In [24]:
# Step 8: Model Evaluation
# Predict on the held-out test split and report accuracy, the per-class
# classification report, and the confusion matrix.
y_pred = voting_clf.predict(X_test)

print("\n✅ Model Evaluation:")

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score:", accuracy)

report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", conf_matrix)


✅ Model Evaluation:
Accuracy Score: 0.7012987012987013

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.78      0.77       100
           1       0.58      0.56      0.57        54

    accuracy                           0.70       154
   macro avg       0.67      0.67      0.67       154
weighted avg       0.70      0.70      0.70       154


Confusion Matrix:
 [[78 22]
 [24 30]]


In [25]:
# Step 9: Save results and Convert to CSV
# Pair each test-set label with the ensemble's prediction and write the table
# to a CSV file without the index column.
df_results = pd.DataFrame(data={"Actual": y_test, "Predicted": y_pred})
df_results.to_csv(path_or_buf="diabetes_predictions_optimized.csv", index=False)

print("\n📂 Predictions saved as 'diabetes_predictions_optimized.csv'!")


📂 Predictions saved as 'diabetes_predictions_optimized.csv'!
