In [148]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [149]:
df = pd.read_csv("data/diabetes.csv")

In [150]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [151]:
df["AGEs"] = 0

In [152]:
df.loc[(df["Age"]>=30) & (df["Age"]<40),"AGEs"] = "H1"

In [153]:
df.loc[(df["Age"]>=40) & (df["Age"]<50),"AGEs"] = "H2"

In [154]:
df.loc[(df["Age"]>=50) & (df["Age"]<60),"AGEs"] = "H3"

In [155]:
df.loc[(df["Age"]>=60) & (df["Age"]<70),"AGEs"] = "H4"

In [156]:
df.loc[(df["Age"]>=70) & (df["Age"]<80),"AGEs"] = "H5"

In [157]:
df.loc[(df["Age"]<30) | (df["Age"]>=80),"AGEs"] = "H6"

In [158]:
df["AGEs"].unique()

array(['H3', 'H1', 'H6', 'H2', 'H4', 'H5'], dtype=object)

In [159]:
df.groupby("AGEs").sample(frac=0.8)

AttributeError: Cannot access callable attribute 'sample' of 'DataFrameGroupBy' objects, try using the 'apply' method

In [160]:
df.sample(frac=0.8)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,AGEs
494,3,80,0,0,0,0.0,0.174,22,0,H6
438,1,97,70,15,0,18.2,0.147,21,0,H6
22,7,196,90,0,0,39.8,0.451,41,1,H2
167,4,120,68,0,0,29.6,0.709,34,0,H1
555,7,124,70,33,215,25.5,0.161,37,0,H1
...,...,...,...,...,...,...,...,...,...,...
588,3,176,86,27,156,33.3,1.154,52,1,H3
697,0,99,0,0,0,25.0,0.253,22,0,H6
695,7,142,90,24,480,30.4,0.128,43,1,H2
727,0,141,84,26,0,32.4,0.433,22,0,H6


In [161]:
df.shape

(768, 10)

In [162]:
df_train = df.groupby("AGEs").apply(func=pd.DataFrame.sample, frac=0.8, random_state=123)

In [163]:
df_train.shape

(615, 10)

In [164]:
df_train = df_train.reset_index(drop=True)

In [165]:
df_test = df.drop(index=df_train.index)

In [166]:
df_test.shape

(153, 10)

In [167]:
from sklearn.model_selection import train_test_split

In [168]:
print(pd.__version__)

0.25.1


In [169]:
col_x = ["Glucose","BloodPressure","BMI"]

In [170]:
col_x_S =[x+"_S" for x in col_x]

In [171]:
col_x_S

['Glucose_S', 'BloodPressure_S', 'BMI_S']

In [172]:
for param in col_x_S:
    df.loc[:,param]=0

In [173]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,AGEs,Glucose_S,BloodPressure_S,BMI_S
0,6,148,72,35,0,33.6,0.627,50,1,H3,0,0,0
1,1,85,66,29,0,26.6,0.351,31,0,H1,0,0,0
2,8,183,64,0,0,23.3,0.672,32,1,H1,0,0,0
3,1,89,66,23,94,28.1,0.167,21,0,H6,0,0,0
4,0,137,40,35,168,43.1,2.288,33,1,H1,0,0,0


In [174]:
df_scaled = StandardScaler().fit(X=df[col_x])

In [175]:
df.loc[:,col_x_S]=df_scaled.transform(X=df[col_x])

In [176]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,AGEs,Glucose_S,BloodPressure_S,BMI_S
0,6,148,72,35,0,33.6,0.627,50,1,H3,0.848324,0.149641,0.204013
1,1,85,66,29,0,26.6,0.351,31,0,H1,-1.123396,-0.160546,-0.684422
2,8,183,64,0,0,23.3,0.672,32,1,H1,1.943724,-0.263941,-1.103255
3,1,89,66,23,94,28.1,0.167,21,0,H6,-0.998208,-0.160546,-0.494043
4,0,137,40,35,168,43.1,2.288,33,1,H1,0.504055,-1.504687,1.409746


In [177]:
glu_mean = df["Glucose_S"].mean()

In [178]:
glu_std = df["Glucose_S"].std()

In [179]:
upper = glu_mean+1.5*glu_std

In [180]:
lower = glu_mean-1.5*glu_std

In [181]:
upper

1.5009775172102993

In [182]:
lower

-1.5009775172102993

In [183]:
glu_mean

3.614007241618348e-18

In [184]:
glu_std

1.0006516781401995

In [185]:
df_sub = df.loc[(df["Glucose_S"]>lower)&(df["Glucose_S"]<upper)]

In [186]:
df_sub

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,AGEs,Glucose_S,BloodPressure_S,BMI_S
0,6,148,72,35,0,33.6,0.627,50,1,H3,0.848324,0.149641,0.204013
1,1,85,66,29,0,26.6,0.351,31,0,H1,-1.123396,-0.160546,-0.684422
3,1,89,66,23,94,28.1,0.167,21,0,H6,-0.998208,-0.160546,-0.494043
4,0,137,40,35,168,43.1,2.288,33,1,H1,0.504055,-1.504687,1.409746
5,5,116,74,0,0,25.6,0.201,30,0,H1,-0.153185,0.253036,-0.811341
...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0,H4,-0.622642,0.356432,0.115169
764,2,122,70,27,0,36.8,0.340,27,0,H6,0.034598,0.046245,0.610154
765,5,121,72,23,112,26.2,0.245,30,0,H1,0.003301,0.149641,-0.735190
766,1,126,60,0,0,30.1,0.349,47,1,H2,0.159787,-0.470732,-0.240205


In [187]:
df_train = df_sub.groupby(["AGEs"]).apply(func=pd.DataFrame.sample, frac=0.8, random_state=1234)

In [188]:
df_train

Unnamed: 0_level_0,Unnamed: 1_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,AGEs,Glucose_S,BloodPressure_S,BMI_S
AGEs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
H1,16,0,118,84,47,230,45.8,0.551,31,1,H1,-0.090591,0.770014,1.752428
H1,468,8,120,0,0,0,30.0,0.183,38,1,H1,-0.027996,-3.572597,-0.252897
H1,443,8,108,70,0,0,30.5,0.955,33,1,H1,-0.403562,0.046245,-0.189437
H1,463,5,88,78,30,0,27.6,0.258,37,0,H1,-1.029505,0.459827,-0.557503
H1,131,9,122,56,0,0,33.3,1.114,33,1,H1,0.034598,-0.677523,0.165937
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
H6,144,4,154,62,31,284,32.8,0.237,23,0,H6,1.036107,-0.367337,0.102477
H6,726,1,116,78,29,180,36.1,0.496,25,0,H6,-0.153185,0.459827,0.521311
H6,328,2,102,86,36,120,45.5,0.127,23,1,H6,-0.591345,0.873409,1.714352
H6,497,2,81,72,15,76,30.1,0.547,25,0,H6,-1.248585,0.149641,-0.240205


In [189]:
df_train = df_train.droplevel(level=0)

In [190]:
df_test = df_sub.drop(index=df_train.index, axis=0)

In [191]:
df_test

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,AGEs,Glucose_S,BloodPressure_S,BMI_S
1,1,85,66,29,0,26.6,0.351,31,0,H1,-1.123396,-0.160546,-0.684422
20,3,126,88,41,235,39.3,0.704,27,0,H6,0.159787,0.976805,0.927452
23,9,119,80,35,0,29.0,0.263,29,1,H6,-0.059293,0.563223,-0.379816
35,4,103,60,33,192,24.0,0.966,33,0,H1,-0.560048,-0.470732,-1.014412
46,1,146,56,0,0,29.7,0.564,29,0,H6,0.785730,-0.677523,-0.290972
...,...,...,...,...,...,...,...,...,...,...,...,...,...
762,9,89,62,0,0,22.5,0.142,33,0,H1,-0.998208,-0.367337,-1.204791
763,10,101,76,48,180,32.9,0.171,63,0,H4,-0.622642,0.356432,0.115169
765,5,121,72,23,112,26.2,0.245,30,0,H1,0.003301,0.149641,-0.735190
766,1,126,60,0,0,30.1,0.349,47,1,H2,0.159787,-0.470732,-0.240205


In [192]:
GB_model = GaussianNB().fit(X=df_train[col_x_S], y=df_train["Outcome"])

In [193]:
df_test["pred"] = GB_model.predict(X=df_test[col_x_S])

In [194]:
df_test["pred"].unique()

array([0, 1], dtype=int64)

In [195]:
GB_model.predict_proba(X=df_test[col_x])

array([[0.0000e+000, 1.0000e+000],
       [0.0000e+000, 1.0000e+000],
       [0.0000e+000, 1.0000e+000],
       [0.0000e+000, 1.0000e+000],
       [0.0000e+000, 1.0000e+000],
       [0.0000e+000, 1.0000e+000],
       [0.0000e+000, 1.0000e+000],
       [0.0000e+000, 1.0000e+000],
       [0.0000e+000, 1.0000e+000],
       [0.0000e+000, 1.0000e+000],
       [0.0000e+000, 1.0000e+000],
       [0.0000e+000, 1.0000e+000],
       [0.0000e+000, 1.0000e+000],
       [0.0000e+000, 1.0000e+000],
       [0.0000e+000, 1.0000e+000],
       [0.0000e+000, 1.0000e+000],
       [0.0000e+000, 1.0000e+000],
       [0.0000e+000, 1.0000e+000],
       [0.0000e+000, 1.0000e+000],
       [0.0000e+000, 1.0000e+000],
       [0.0000e+000, 1.0000e+000],
       [0.0000e+000, 1.0000e+000],
       [0.0000e+000, 1.0000e+000],
       [0.0000e+000, 1.0000e+000],
       [0.0000e+000, 1.0000e+000],
       [0.0000e+000, 1.0000e+000],
       [0.0000e+000, 1.0000e+000],
       [0.0000e+000, 1.0000e+000],
       [0.0000e+000,

In [196]:
GB_f1 = f1_score(y_pred=df_test["pred"], y_true=df_test["Outcome"])

In [197]:
GB_f1

0.5333333333333333

In [198]:
RF_model = RandomForestClassifier(n_estimators=10, min_samples_leaf=10, random_state=1234).fit(X=df_train[col_x_S], y=df_train["Outcome"])

In [199]:
df_test["pred"] = RF_model.predict(X=df_test[col_x_S])

In [200]:
RF_model.predict_proba(X=df_test[col_x_S])

array([[0.82426151, 0.17573849],
       [0.65898628, 0.34101372],
       [0.80802923, 0.19197077],
       [0.97348777, 0.02651223],
       [0.68920657, 0.31079343],
       [0.96960623, 0.03039377],
       [0.94011905, 0.05988095],
       [0.97679739, 0.02320261],
       [0.57651246, 0.42348754],
       [0.82193778, 0.17806222],
       [0.62643469, 0.37356531],
       [0.76253932, 0.23746068],
       [0.8357004 , 0.1642996 ],
       [0.4348041 , 0.5651959 ],
       [0.84936637, 0.15063363],
       [0.25061228, 0.74938772],
       [0.65428328, 0.34571672],
       [0.80537145, 0.19462855],
       [0.88503823, 0.11496177],
       [0.9622549 , 0.0377451 ],
       [0.4834002 , 0.5165998 ],
       [0.68492913, 0.31507087],
       [0.70627684, 0.29372316],
       [0.48665225, 0.51334775],
       [0.51335489, 0.48664511],
       [0.975     , 0.025     ],
       [0.6753991 , 0.3246009 ],
       [0.96805556, 0.03194444],
       [0.97463768, 0.02536232],
       [0.98333333, 0.01666667],
       [0.

In [201]:
RF_f1 = f1_score(y_pred=df_test["pred"], y_true=df_test["Outcome"])

In [202]:
RF_f1

0.4657534246575343

In [203]:
abs(GB_f1-RF_f1).round(3)

0.068