In [2]:
# Ensemble Learning

#Step 1: Load the Dataset
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook only runs where this exact location exists. Consider a path
# relative to the notebook or a configurable data directory.
file_path = "F:\\AWFERA\\Machine learning\\AwferaMachineLearningProjects\\diabetes.csv"
df = pd.read_csv(file_path)

# Display basic information.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line after
# the summary.
print("Dataset Information:")
df.info()
print("\nFirst 5 rows: ")
print(df.head())

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None

First 5 rows: 
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29 

In [3]:
#Step 2: Handle Missing Values
# Report the number of null entries per column before any imputation.
print("\nChecking for missing values:")
print(df.isnull().sum())

# Fill missing numerical values with each numeric column's median.
# This is a frame-level in-place call (not a chained one), so `df` itself is
# updated.
df.fillna(df.median(numeric_only=True), inplace=True)

# Fill missing categorical (object-dtype) values with the column's mode, if any.
for col in df.select_dtypes(include=['object']):
    modes = df[col].mode()
    if modes.empty:
        # Column holds no non-null value at all, so there is no mode to
        # impute with; leave it as is instead of raising on an empty result.
        continue
    # Assign the filled column back instead of calling
    # df[col].fillna(..., inplace=True): the in-place call on a column
    # selection acts on an intermediate object and does not reliably update
    # `df` (deprecated chained assignment in pandas 2.x).
    df[col] = df[col].fillna(modes.iloc[0])



Checking for missing values:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [9]:
 #Step 3: Prepare Data
 # Separate features and target variable
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

x = df.drop(columns=['Outcome'])
y = df['Outcome']

# Step 4: Split Data into Training and Testing Sets
# The split happens BEFORE scaling so that the scaler never sees the test
# rows; fitting it on the full dataset would leak test-set statistics into
# training. Same test_size / random_state as before, so the same rows end up
# in each partition.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

# Step 5: Apply Standard Scaling
# Mean and standard deviation are learned from the training rows only and
# then applied unchanged to the test rows.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train_raw)
x_test = scaler.transform(x_test_raw)
# Kept so that any later cell still referring to `x_scaled` keeps working;
# it is the full feature matrix transformed with the train-fitted scaler.
x_scaled = scaler.transform(x)

# Step 6: Train a Bagging classifier built from decision trees
# random_state is fixed on both the ensemble and its base estimator so the
# reported accuracy is reproducible from run to run.
bagging_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(random_state=42),
    n_estimators=10,
    random_state=42,
)
bagging_model.fit(x_train, y_train)

# Prediction and accuracy on the held-out test set
y_pred_bag = bagging_model.predict(x_test)
print("Bagging Accuracy: ", accuracy_score(y_test, y_pred_bag))

Bagging Accuracy:  0.7575757575757576


In [11]:
from sklearn.ensemble import AdaBoostClassifier

# Boosting model
# random_state is fixed so repeated runs of this cell report the same
# accuracy; 42 matches the seed used for the train/test split.
boosting_model = AdaBoostClassifier(n_estimators=10, random_state=42)
boosting_model.fit(x_train, y_train)

# Prediction and accuracy on the held-out test set
y_pred_boost = boosting_model.predict(x_test)
print("Boosting Accuracy: ", accuracy_score(y_test, y_pred_boost))



Boosting Accuracy:  0.7489177489177489


In [15]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
# Imported here as well so this cell does not silently depend on an earlier
# cell having imported it.
from sklearn.tree import DecisionTreeClassifier

# Define base models
# Both base estimators are seeded so the stacked result is reproducible:
# the tree is otherwise unseeded, and SVC(probability=True) uses random
# shuffling when producing its probability estimates.
base_models = [
    ('tree', DecisionTreeClassifier(random_state=42)),
    ('svm', SVC(probability=True, random_state=42)),
]

# Meta-model is logistic regression, fitted on the base models' predictions
stacking_model = StackingClassifier(estimators=base_models, final_estimator=LogisticRegression())
stacking_model.fit(x_train, y_train)

# Prediction and accuracy on the held-out test set
y_pred_stack = stacking_model.predict(x_test)
print("Stacking Accuracy: ", accuracy_score(y_test, y_pred_stack))

Stacking Accuracy:  0.7316017316017316
