In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df= pd.read_csv('Diabetes.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
Pregnancies                 768 non-null int64
Glucose                     768 non-null int64
BloodPressure               768 non-null int64
SkinThickness               768 non-null int64
Insulin                     768 non-null int64
BMI                         768 non-null float64
DiabetesPedigreeFunction    768 non-null float64
Age                         768 non-null int64
Outcome                     768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [7]:
df['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import classification_report

In [9]:
x = df.drop(['Outcome'],axis=1)
y = df['Outcome']

In [14]:
def class_report_by_algo(x,y):
    X_train,X_test,y_train,y_test = train_test_split(x,y)
    model=[LogisticRegression,DecisionTreeClassifier,RandomForestClassifier,GradientBoostingClassifier]
    for item in model:
        clf=item()
        clf.fit(X_train,y_train)
        prediction_test = clf.predict(X_test)
        prediction_train = clf.predict(X_train)
        print(item)
        print('class report data test')
        print(classification_report(y_test,prediction_test))
        print('============================================')
        print('class report data train')
        print(classification_report(y_train,prediction_train))
        print('\n')

In [15]:
class_report_by_algo(x,y)

<class 'sklearn.linear_model.logistic.LogisticRegression'>
class report data test
              precision    recall  f1-score   support

           0       0.77      0.88      0.82       126
           1       0.68      0.48      0.57        66

    accuracy                           0.74       192
   macro avg       0.72      0.68      0.69       192
weighted avg       0.74      0.74      0.73       192

class report data train
              precision    recall  f1-score   support

           0       0.80      0.89      0.84       374
           1       0.74      0.57      0.65       202

    accuracy                           0.78       576
   macro avg       0.77      0.73      0.74       576
weighted avg       0.78      0.78      0.77       576



<class 'sklearn.tree.tree.DecisionTreeClassifier'>
class report data test
              precision    recall  f1-score   support

           0       0.80      0.80      0.80       126
           1       0.62      0.61      0.61        66





# Clear Imbalance

## Under Sample

In [16]:
df.Outcome.value_counts(normalize=True)

0    0.651042
1    0.348958
Name: Outcome, dtype: float64

In [17]:
minority_data=df[df['Outcome']==1]
mayority_data=df[df['Outcome']==0]

In [18]:
index_for_mayority=np.random.choice(mayority_data.index,len(minority_data))
df_class_0=df.loc[index_for_mayority]
undersampling_df=pd.concat([df_class_0,minority_data])

In [20]:
undersampling_df['Outcome'].value_counts()

1    268
0    268
Name: Outcome, dtype: int64

In [21]:
x_u = undersampling_df.drop(['Outcome'],axis=1)
y_u = undersampling_df['Outcome']

In [25]:
class_report_by_algo(x_u,y_u)

<class 'sklearn.linear_model.logistic.LogisticRegression'>
class report data test
              precision    recall  f1-score   support

           0       0.67      0.87      0.76        61
           1       0.85      0.64      0.73        73

    accuracy                           0.75       134
   macro avg       0.76      0.76      0.75       134
weighted avg       0.77      0.75      0.74       134

class report data train
              precision    recall  f1-score   support

           0       0.76      0.80      0.78       207
           1       0.78      0.74      0.76       195

    accuracy                           0.77       402
   macro avg       0.77      0.77      0.77       402
weighted avg       0.77      0.77      0.77       402



<class 'sklearn.tree.tree.DecisionTreeClassifier'>
class report data test
              precision    recall  f1-score   support

           0       0.64      0.70      0.67        61
           1       0.73      0.67      0.70        73





## OverSampling

In [26]:
from imblearn.over_sampling import RandomOverSampler,SMOTE

In [27]:
def class_report_by_algo_ROS(x,y):
    X_train,X_test,y_train,y_test = train_test_split(x,y)
    ros=RandomOverSampler()
    X_ros,y_ros=ros.fit_sample(X_train,y_train)
    X_ros=pd.DataFrame(X_ros,columns=x.columns)
    model=[LogisticRegression,DecisionTreeClassifier,RandomForestClassifier,GradientBoostingClassifier]
    for item in model:
        clf=item()
        clf.fit(X_ros,y_ros)
        prediction_test = clf.predict(X_test)
        prediction_train = clf.predict(X_ros)
        print(item)
        print('class report data test')
        print(classification_report(y_test,prediction_test))
        print('============================================')
        print('class report data train')
        print(classification_report(y_ros,prediction_train))
        print('\n')

In [28]:
class_report_by_algo_ROS(x,y)

<class 'sklearn.linear_model.logistic.LogisticRegression'>
class report data test
              precision    recall  f1-score   support

           0       0.82      0.75      0.78       124
           1       0.60      0.69      0.64        68

    accuracy                           0.73       192
   macro avg       0.71      0.72      0.71       192
weighted avg       0.74      0.73      0.73       192

class report data train
              precision    recall  f1-score   support

           0       0.79      0.80      0.79       376
           1       0.80      0.78      0.79       376

    accuracy                           0.79       752
   macro avg       0.79      0.79      0.79       752
weighted avg       0.79      0.79      0.79       752



<class 'sklearn.tree.tree.DecisionTreeClassifier'>
class report data test
              precision    recall  f1-score   support

           0       0.73      0.73      0.73       124
           1       0.51      0.51      0.51        68





## SMOTE

In [33]:
def class_report_by_algo_smote(x,y):
    X_train,X_test,y_train,y_test = train_test_split(x,y)
    sm=SMOTE(random_state=101)
    X_sm,y_sm=sm.fit_sample(X_train,y_train)
    model=[LogisticRegression,DecisionTreeClassifier,RandomForestClassifier,GradientBoostingClassifier]
    for item in model:
        clf=item()
        clf.fit(X_sm,y_sm)
        prediction_test = clf.predict(X_test)
        prediction_train = clf.predict(X_sm)
        print(item)
        print('class report data test')
        print(classification_report(y_test,prediction_test))
        print('============================================')
        print('class report data train')
        print(classification_report(y_sm,prediction_train))
        print('\n')

In [34]:
class_report_by_algo_smote(x,y)

<class 'sklearn.linear_model.logistic.LogisticRegression'>
class report data test
              precision    recall  f1-score   support

           0       0.77      0.79      0.78       119
           1       0.64      0.62      0.63        73

    accuracy                           0.72       192
   macro avg       0.71      0.70      0.70       192
weighted avg       0.72      0.72      0.72       192

class report data train
              precision    recall  f1-score   support

           0       0.78      0.79      0.79       381
           1       0.79      0.78      0.78       381

    accuracy                           0.78       762
   macro avg       0.78      0.78      0.78       762
weighted avg       0.78      0.78      0.78       762



<class 'sklearn.tree.tree.DecisionTreeClassifier'>
class report data test
              precision    recall  f1-score   support

           0       0.72      0.76      0.74       119
           1       0.57      0.51      0.54        73





# CLASS WEIGHT 

### Logistic Regressions

In [41]:
def class_Weight_by_LogisticRegression(x,y,weight):
    X_train,X_test,y_train,y_test = train_test_split(x,y,random_state=101)
    clf=LogisticRegression(random_state = 101, class_weight = weight)
    clf.fit(X_train,y_train)
    prediction_test = clf.predict(X_test)
    prediction_train = clf.predict(X_train)
    print('class report data test')
    print(classification_report(y_test,prediction_test))
    print('============================================')
    print('class report data train')
    print(classification_report(y_train,prediction_train))
    print('\n')

In [60]:
class_Weight_by_LogisticRegression(x,y,{0:1,1:3})

class report data test
              precision    recall  f1-score   support

           0       0.89      0.60      0.72       124
           1       0.55      0.87      0.67        68

    accuracy                           0.70       192
   macro avg       0.72      0.74      0.70       192
weighted avg       0.77      0.70      0.70       192

class report data train
              precision    recall  f1-score   support

           0       0.89      0.62      0.73       376
           1       0.54      0.86      0.67       200

    accuracy                           0.70       576
   macro avg       0.72      0.74      0.70       576
weighted avg       0.77      0.70      0.71       576







### DecisionsTree

In [46]:
def class_Weight_by_dTree(x,y,weight,depth):
    X_train,X_test,y_train,y_test = train_test_split(x,y,random_state=101)
    clf=DecisionTreeClassifier(random_state = 101, class_weight = weight,max_depth=depth)
    clf.fit(X_train,y_train)
    prediction_test = clf.predict(X_test)
    prediction_train = clf.predict(X_train)
    print('class report data test')
    print(classification_report(y_test,prediction_test))
    print('============================================')
    print('class report data train')
    print(classification_report(y_train,prediction_train))
    print('\n')

In [63]:
class_Weight_by_dTree(x,y,{0:1,1:2},3)

class report data test
              precision    recall  f1-score   support

           0       0.89      0.58      0.70       124
           1       0.53      0.87      0.66        68

    accuracy                           0.68       192
   macro avg       0.71      0.72      0.68       192
weighted avg       0.76      0.68      0.69       192

class report data train
              precision    recall  f1-score   support

           0       0.93      0.62      0.75       376
           1       0.56      0.91      0.69       200

    accuracy                           0.72       576
   macro avg       0.74      0.77      0.72       576
weighted avg       0.80      0.72      0.73       576





### RANDOM FOREST

In [49]:
def class_Weight_by_Randomforest(x,y,weight,depth):
    X_train,X_test,y_train,y_test = train_test_split(x,y,random_state=101)
    clf=RandomForestClassifier(random_state = 101, class_weight = weight,max_depth=depth)
    clf.fit(X_train,y_train)
    prediction_test = clf.predict(X_test)
    prediction_train = clf.predict(X_train)
    print('class report data test')
    print(classification_report(y_test,prediction_test))
    print('============================================')
    print('class report data train')
    print(classification_report(y_train,prediction_train))
    print('\n')

In [56]:
class_Weight_by_Randomforest(x,y,{0:1,1:2},3)

class report data test
              precision    recall  f1-score   support

           0       0.87      0.66      0.75       124
           1       0.57      0.82      0.67        68

    accuracy                           0.72       192
   macro avg       0.72      0.74      0.71       192
weighted avg       0.77      0.72      0.72       192

class report data train
              precision    recall  f1-score   support

           0       0.90      0.66      0.76       376
           1       0.57      0.85      0.69       200

    accuracy                           0.73       576
   macro avg       0.73      0.76      0.72       576
weighted avg       0.78      0.73      0.73       576







### DARI MODEL Diatas didapat model terbaik menggunakan DecisionTree dengan class_weight={0:1,1:2} dan Max_depth = 3