In [33]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [34]:
df = pd.read_csv('diabetes.csv')

In [35]:
df. head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [36]:
# Split features and target variable
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

In [37]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [39]:
# Model Initialization
model = RandomForestClassifier(n_estimators=10, max_depth=1000, random_state=42)

In [40]:
# Train the model
model.fit(X_train_scaled, y_train)

In [41]:
# Make predictions
y_pred = model.predict(X_test_scaled)

In [42]:
# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

Model Accuracy: 0.76


In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Load the dataset
df = pd.read_csv("diabetes.csv")  # Replace with the path to your CSV file

# Handle any missing values (optional, depends on dataset)
df.fillna(df.mean(), inplace=True)

# Split features and target variable
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 15, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

# Fit the grid search to the training data
grid_search.fit(X_train_scaled, y_train)

# Retrieve the best model
best_model = grid_search.best_estimator_

# Make predictions
y_pred = best_model.predict(X_test_scaled)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Optimized Model Accuracy: {accuracy:.2f}")
print(f"Best Parameters: {grid_search.best_params_}")


Fitting 5 folds for each of 216 candidates, totalling 1080 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Optimized Model Accuracy: 0.75
Best Parameters: {'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 300}


In [44]:
pip install xgboost




In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# Load the dataset
df = pd.read_csv("diabetes.csv")  # Replace with the path to your CSV file

# Handle any missing values
df.fillna(df.mean(), inplace=True)

# Split features and target variable
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter tuning for XGBoost
param_grid = {
    'n_estimators': [1000, 2000, 3000],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0]
}

grid_search = GridSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

# Fit the grid search to the training data
grid_search.fit(X_train_scaled, y_train)

# Retrieve the best model
best_model = grid_search.best_estimator_

# Make predictions with the best model
y_pred = best_model.predict(X_test_scaled)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Optimized Model Accuracy: {accuracy:.2f}")
print(f"Best Parameters: {grid_search.best_params_}")


Fitting 5 folds for each of 243 candidates, totalling 1215 fits


Parameters: { "use_label_encoder" } are not used.



Optimized Model Accuracy: 0.75
Best Parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 1000, 'subsample': 0.7}


In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# Load the dataset
df = pd.read_csv("diabetes.csv")  # Replace with the path to your CSV file

# Handle any missing values
df.fillna(df.mean(), inplace=True)

# Split features and target variable
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Address class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter tuning for RandomForest and XGBoost
rf = RandomForestClassifier(random_state=42)
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Set parameter grids for tuning
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 15],
    'min_samples_split': [2, 5]
}

param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8, 1.0]
}

# Perform GridSearchCV on individual models
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_xgb = GridSearchCV(xgb, param_grid_xgb, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the models
grid_search_rf.fit(X_train_scaled, y_train)
grid_search_xgb.fit(X_train_scaled, y_train)

# Retrieve the best estimators
best_rf = grid_search_rf.best_estimator_
best_xgb = grid_search_xgb.best_estimator_

# Ensemble Model using VotingClassifier (with tuned models)
voting_clf = VotingClassifier(
    estimators=[('rf', best_rf), ('xgb', best_xgb)],
    voting='soft'
)

# Train the ensemble model
voting_clf.fit(X_train_scaled, y_train)

# Make predictions
y_pred = voting_clf.predict(X_test_scaled)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Ensemble Model Accuracy after Hyperparameter Tuning: {accuracy:.2f}")
print(f"Best Parameters for Random Forest: {grid_search_rf.best_params_}")
print(f"Best Parameters for XGBoost: {grid_search_xgb.best_params_}")


Parameters: { "use_label_encoder" } are not used.



Ensemble Model Accuracy after Hyperparameter Tuning: 0.73
Best Parameters for Random Forest: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}
Best Parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 1.0}


Parameters: { "use_label_encoder" } are not used.



In [49]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Load the diabetes dataset
diabetes_dataset = pd.read_csv('diabetes.csv')

# Data Preprocessing - Replace zero values with NaN, then fill with column mean
for column in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']:
    diabetes_dataset[column].replace(0, np.nan, inplace=True)
    diabetes_dataset[column].fillna(diabetes_dataset[column].mean(), inplace=True)

# Separate data and labels
X = diabetes_dataset.drop(columns='Outcome', axis=1)
Y = diabetes_dataset['Outcome']

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Hyperparameter tuning using GridSearchCV for SVM
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

grid = GridSearchCV(svm.SVC(), param_grid, refit=True, verbose=1, cv=5)
grid.fit(X_train, Y_train)

# Best estimator after tuning
classifier = grid.best_estimator_

# Model evaluation
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Accuracy score of the training data: ', training_data_accuracy)

X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print('Accuracy score of the test data: ', test_data_accuracy)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  diabetes_dataset[column].replace(0, np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  diabetes_dataset[column].fillna(diabetes_dataset[column].mean(), inplace=True)


Fitting 5 folds for each of 16 candidates, totalling 80 fits
Accuracy score of the training data:  0.7736156351791531
Accuracy score of the test data:  0.7597402597402597


In [50]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

# Load the diabetes dataset
diabetes_dataset = pd.read_csv('/content/diabetes.csv')

# Data Preprocessing - Replace zero values with NaN, then fill with column median
for column in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']:
    diabetes_dataset[column].replace(0, np.nan, inplace=True)
    diabetes_dataset[column].fillna(diabetes_dataset[column].median(), inplace=True)

# Separate data and labels
X = diabetes_dataset.drop(columns='Outcome', axis=1)
Y = diabetes_dataset['Outcome']

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

# Handle class imbalance using SMOTE (Synthetic Minority Oversampling Technique)
smote = SMOTE(random_state=2)
X_train, Y_train = smote.fit_resample(X_train, Y_train)

# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Using RandomForestClassifier with hyperparameter tuning
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10]
}

grid_rf = GridSearchCV(RandomForestClassifier(random_state=2), param_grid_rf, refit=True, cv=5, verbose=1)
grid_rf.fit(X_train, Y_train)

# Best Random Forest model after tuning
classifier_rf = grid_rf.best_estimator_

# Model evaluation with Random Forest
X_train_prediction_rf = classifier_rf.predict(X_train)
training_data_accuracy_rf = accuracy_score(X_train_prediction_rf, Y_train)
print('Random Forest - Accuracy score of the training data: ', training_data_accuracy_rf)

X_test_prediction_rf = classifier_rf.predict(X_test)
test_data_accuracy_rf = accuracy_score(X_test_prediction_rf, Y_test)
print('Random Forest - Accuracy score of the test data: ', test_data_accuracy_rf)

# Alternatively, try Gradient Boosting Classifier with hyperparameter tuning if Random Forest is still not satisfactory
param_grid_gb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

grid_gb = GridSearchCV(GradientBoostingClassifier(random_state=2), param_grid_gb, refit=True, cv=5, verbose=1)
grid_gb.fit(X_train, Y_train)

# Best Gradient Boosting model after tuning
classifier_gb = grid_gb.best_estimator_

# Model evaluation with Gradient Boosting
X_train_prediction_gb = classifier_gb.predict(X_train)
training_data_accuracy_gb = accuracy_score(X_train_prediction_gb, Y_train)
print('Gradient Boosting - Accuracy score of the training data: ', training_data_accuracy_gb)

X_test_prediction_gb = classifier_gb.predict(X_test)
test_data_accuracy_gb = accuracy_score(X_test_prediction_gb, Y_test)
print('Gradient Boosting - Accuracy score of the test data: ', test_data_accuracy_gb)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  diabetes_dataset[column].replace(0, np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  diabetes_dataset[column].fillna(diabetes_dataset[column].median(), inplace=True)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Random Forest - Accuracy score of the training data:  1.0
Random Forest - Accuracy score of the test data:  0.7987012987012987
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Gradient Boosting - Accuracy score of the training data:  1.0
Gradient Boosting - Accuracy score of the test data:  0.7142857142857143


In [51]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Load the diabetes dataset
diabetes_dataset = pd.read_csv('diabetes.csv')

# Data Preprocessing - Replace zero values with NaN, then fill with column median
for column in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']:
    diabetes_dataset[column].replace(0, np.nan, inplace=True)
    diabetes_dataset[column].fillna(diabetes_dataset[column].median(), inplace=True)

# Separate data and labels
X = diabetes_dataset.drop(columns='Outcome', axis=1)
Y = diabetes_dataset['Outcome']

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

# Handle class imbalance using SMOTE (Synthetic Minority Oversampling Technique)
smote = SMOTE(random_state=2)
X_train, Y_train = smote.fit_resample(X_train, Y_train)

# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Using RandomForestClassifier with hyperparameter tuning
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10]
}

grid_rf = GridSearchCV(RandomForestClassifier(random_state=2), param_grid_rf, refit=True, cv=5, verbose=1)
grid_rf.fit(X_train, Y_train)

# Best Random Forest model after tuning
classifier_rf = grid_rf.best_estimator_

# Model evaluation with Random Forest
X_train_prediction_rf = classifier_rf.predict(X_train)
training_data_accuracy_rf = accuracy_score(X_train_prediction_rf, Y_train)
print('Random Forest - Accuracy score of the training data: ', training_data_accuracy_rf)

X_test_prediction_rf = classifier_rf.predict(X_test)
test_data_accuracy_rf = accuracy_score(X_test_prediction_rf, Y_test)
print('Random Forest - Accuracy score of the test data: ', test_data_accuracy_rf)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  diabetes_dataset[column].replace(0, np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  diabetes_dataset[column].fillna(diabetes_dataset[column].median(), inplace=True)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Random Forest - Accuracy score of the training data:  1.0
Random Forest - Accuracy score of the test data:  0.7987012987012987


In [52]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Load the diabetes dataset
diabetes_dataset = pd.read_csv('/content/diabetes.csv')

# Data Preprocessing - Replace zero values with NaN, then fill with column median
for column in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']:
    diabetes_dataset[column].replace(0, np.nan, inplace=True)
    diabetes_dataset[column].fillna(diabetes_dataset[column].median(), inplace=True)

# Separate data and labels
X = diabetes_dataset.drop(columns='Outcome', axis=1)
Y = diabetes_dataset['Outcome']

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

# Handle class imbalance using SMOTE (Synthetic Minority Oversampling Technique)
smote = SMOTE(random_state=2)
X_train, Y_train = smote.fit_resample(X_train, Y_train)

# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Hyperparameter tuning for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False]
}

grid_rf = GridSearchCV(RandomForestClassifier(random_state=2), param_grid_rf, refit=True, cv=5, verbose=1)
grid_rf.fit(X_train, Y_train)

# Best Random Forest model after tuning
classifier_rf = grid_rf.best_estimator_

# Model evaluation with Random Forest
X_train_prediction_rf = classifier_rf.predict(X_train)
training_data_accuracy_rf = accuracy_score(X_train_prediction_rf, Y_train)
print('Random Forest - Accuracy score of the training data: ', training_data_accuracy_rf)

X_test_prediction_rf = classifier_rf.predict(X_test)
test_data_accuracy_rf = accuracy_score(X_test_prediction_rf, Y_test)
print('Random Forest - Accuracy score of the test data: ', test_data_accuracy_rf)

# Trying Gradient Boosting for comparison
gb_classifier = GradientBoostingClassifier(random_state=2)
gb_classifier.fit(X_train, Y_train)

X_train_prediction_gb = gb_classifier.predict(X_train)
training_data_accuracy_gb = accuracy_score(X_train_prediction_gb, Y_train)
print('Gradient Boosting - Accuracy score of the training data: ', training_data_accuracy_gb)

X_test_prediction_gb = gb_classifier.predict(X_test)
test_data_accuracy_gb = accuracy_score(X_test_prediction_gb, Y_test)
print('Gradient Boosting - Accuracy score of the test data: ', test_data_accuracy_gb)

# Ensemble model: VotingClassifier combining RandomForest, GradientBoosting, and LogisticRegression
voting_classifier = VotingClassifier(
    estimators=[
        ('rf', classifier_rf),
        ('gb', gb_classifier),
        ('lr', LogisticRegression(max_iter=1000))
    ], voting='hard')

voting_classifier.fit(X_train, Y_train)

X_train_prediction_vc = voting_classifier.predict(X_train)
training_data_accuracy_vc = accuracy_score(X_train_prediction_vc, Y_train)
print('Voting Classifier - Accuracy score of the training data: ', training_data_accuracy_vc)

X_test_prediction_vc = voting_classifier.predict(X_test)
test_data_accuracy_vc = accuracy_score(X_test_prediction_vc, Y_test)
print('Voting Classifier - Accuracy score of the test data: ', test_data_accuracy_vc)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  diabetes_dataset[column].replace(0, np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  diabetes_dataset[column].fillna(diabetes_dataset[column].median(), inplace=True)


Fitting 5 folds for each of 648 candidates, totalling 3240 fits


1080 fits failed out of a total of 3240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1080 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(

Random Forest - Accuracy score of the training data:  1.0
Random Forest - Accuracy score of the test data:  0.7402597402597403
Gradient Boosting - Accuracy score of the training data:  0.905
Gradient Boosting - Accuracy score of the test data:  0.7467532467532467
Voting Classifier - Accuracy score of the training data:  0.92625
Voting Classifier - Accuracy score of the test data:  0.7662337662337663


In [53]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Load the diabetes dataset
diabetes_dataset = pd.read_csv('diabetes.csv')

# Data Preprocessing - Replace zero values with NaN, then fill with column mean
for column in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']:
    diabetes_dataset[column].replace(0, np.nan, inplace=True)
    diabetes_dataset[column].fillna(diabetes_dataset[column].mean(), inplace=True)

# Separate data and labels
X = diabetes_dataset.drop(columns='Outcome', axis=1)
Y = diabetes_dataset['Outcome']

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Hyperparameter tuning using GridSearchCV for Logistic Regression
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization strength
    'solver': ['liblinear', 'saga'],  # Solvers to use for optimization
    'max_iter': [100, 200, 300]  # Number of iterations for convergence
}

grid = GridSearchCV(LogisticRegression(), param_grid, refit=True, verbose=1, cv=5)
grid.fit(X_train, Y_train)

# Best estimator after tuning
classifier = grid.best_estimator_

# Model evaluation
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Logistic Regression - Accuracy score of the training data: ', training_data_accuracy)

X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print('Logistic Regression - Accuracy score of the test data: ', test_data_accuracy)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  diabetes_dataset[column].replace(0, np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  diabetes_dataset[column].fillna(diabetes_dataset[column].mean(), inplace=True)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Logistic Regression - Accuracy score of the training data:  0.7785016286644951
Logistic Regression - Accuracy score of the test data:  0.7207792207792207
