In [1]:
import pandas as pd
df=pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
import numpy as np
df['Glucose']=np.where(df['Glucose']==0,df['Glucose'].median(),df['Glucose'])
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35,0,33.6,0.627,50,1
1,1,85.0,66,29,0,26.6,0.351,31,0
2,8,183.0,64,0,0,23.3,0.672,32,1
3,1,89.0,66,23,94,28.1,0.167,21,0
4,0,137.0,40,35,168,43.1,2.288,33,1


In [3]:
X=df.drop('Outcome',axis=1)
y=df['Outcome']

In [4]:
pd.DataFrame(X,columns=df.columns[:-1])

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72,35,0,33.6,0.627,50
1,1,85.0,66,29,0,26.6,0.351,31
2,8,183.0,64,0,0,23.3,0.672,32
3,1,89.0,66,23,94,28.1,0.167,21
4,0,137.0,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101.0,76,48,180,32.9,0.171,63
764,2,122.0,70,27,0,36.8,0.340,27
765,5,121.0,72,23,112,26.2,0.245,30
766,1,126.0,60,0,0,30.1,0.349,47


In [5]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=0)

In [8]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier=RandomForestClassifier(n_estimators=10).fit(X_train,y_train)
prediction=rf_classifier.predict(X_test)

In [9]:
y.value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [10]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
print(confusion_matrix(y_test,prediction))
print(accuracy_score(y_test,prediction))
print(classification_report(y_test,prediction))

[[92 15]
 [23 24]]
0.7532467532467533
              precision    recall  f1-score   support

           0       0.80      0.86      0.83       107
           1       0.62      0.51      0.56        47

    accuracy                           0.75       154
   macro avg       0.71      0.69      0.69       154
weighted avg       0.74      0.75      0.75       154



In [6]:
import optuna
import sklearn.svm
def objective(trial):

    classifier = trial.suggest_categorical('classifier', ['RandomForest', 'SVC'])

    if classifier == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 200, 2000,10)
        max_depth = int(trial.suggest_float('max_depth', 10, 100, log=True))

        clf = sklearn.ensemble.RandomForestClassifier(
            n_estimators=n_estimators, max_depth=max_depth)
    else:
        c = trial.suggest_float('svc_c', 1e-10, 1e10, log=True)

        clf = sklearn.svm.SVC(C=c, gamma='auto')

    return sklearn.model_selection.cross_val_score(
        clf,X_train,y_train, n_jobs=-1, cv=3).mean()

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

trial = study.best_trial

print('Accuracy: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

[I 2024-03-18 16:12:44,341] A new study created in memory with name: no-name-0b3206c4-cff9-4713-958c-3a7751f34ce9
[I 2024-03-18 16:12:44,414] Trial 0 finished with value: 0.640068547744301 and parameters: {'classifier': 'SVC', 'svc_c': 56244.112816314024}. Best is trial 0 with value: 0.640068547744301.
  n_estimators = trial.suggest_int('n_estimators', 200, 2000,10)
[I 2024-03-18 16:12:45,684] Trial 1 finished with value: 0.7475370636059302 and parameters: {'classifier': 'RandomForest', 'n_estimators': 240, 'max_depth': 28.757235814059086}. Best is trial 1 with value: 0.7475370636059302.
  n_estimators = trial.suggest_int('n_estimators', 200, 2000,10)
[I 2024-03-18 16:12:52,045] Trial 2 finished with value: 0.7459030766778256 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1360, 'max_depth': 67.06455488371572}. Best is trial 1 with value: 0.7475370636059302.
[I 2024-03-18 16:12:52,116] Trial 3 finished with value: 0.640068547744301 and parameters: {'classifier': 'SVC', '

Accuracy: 0.7508050374621393
Best hyperparameters: {'classifier': 'RandomForest', 'n_estimators': 1920, 'max_depth': 24.52315492763393}


In [12]:
study.best_params

{'classifier': 'RandomForest',
 'n_estimators': 1920,
 'max_depth': 24.52315492763393}

In [13]:
trial

FrozenTrial(number=9, state=1, values=[0.7508050374621393], datetime_start=datetime.datetime(2024, 3, 18, 16, 12, 54, 710171), datetime_complete=datetime.datetime(2024, 3, 18, 16, 13, 3, 653918), params={'classifier': 'RandomForest', 'n_estimators': 1920, 'max_depth': 24.52315492763393}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'classifier': CategoricalDistribution(choices=('RandomForest', 'SVC')), 'n_estimators': IntDistribution(high=2000, log=False, low=200, step=10), 'max_depth': FloatDistribution(high=100.0, log=True, low=10.0, step=None)}, trial_id=9, value=None)

In [14]:
rf=RandomForestClassifier(n_estimators=330,max_depth=30)
rf.fit(X_train,y_train)
y_pred=rf.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[94 13]
 [15 32]]
0.8181818181818182
              precision    recall  f1-score   support

           0       0.86      0.88      0.87       107
           1       0.71      0.68      0.70        47

    accuracy                           0.82       154
   macro avg       0.79      0.78      0.78       154
weighted avg       0.82      0.82      0.82       154

