In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [9]:
# Location of the diabetes CSV, relative to the notebook's working directory.
diabetes_csv_path = "data/data.csv"

# Read the CSV into a DataFrame and preview its first rows as the cell output.
df = pd.read_csv(filepath_or_buffer=diabetes_csv_path)
df.head(n=5)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [28]:
# Re-read the raw CSV so this cell starts from an unmodified frame.
df = pd.read_csv(filepath_or_buffer=diabetes_csv_path)

# Feature columns fed to the model, in the order the model will see them.
predictors = [
    'Glucose',
    'BloodPressure',
    'BMI',
    'Age',
    'SkinThickness',
    'Insulin',
    'DiabetesPedigreeFunction',
]

# Target column to predict.
label = 'Outcome'


In [33]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
df_train, df_test, y_train, y_test = train_test_split(
    df[predictors],
    df[label],
    random_state=42,
    test_size=0.20,
)

In [34]:
# Label-encode every object-dtype feature column so the classifier only
# receives numeric input. Each encoder is fitted on the training split and
# stored in `le` (keyed by column name) so the test split is transformed with
# the same label -> integer mapping.
le = dict()
for column in df_train.columns:
    # `np.object` was only an alias for the builtin `object`; NumPy deprecated
    # it in 1.20 and removed it in 1.24, so the dtype is checked via pandas.
    if pd.api.types.is_object_dtype(df_train[column]):
        le[column] = LabelEncoder()
        df_train[column] = le[column].fit_transform(df_train[column])

# Drive the test-split transform from the fitted encoders rather than from a
# second dtype check, so a column is encoded on the test split exactly when an
# encoder was fitted for it on the training split (no KeyError on a mismatch).
for column, encoder in le.items():
    df_test[column] = encoder.transform(df_test[column])

In [32]:
# Print each training feature's name followed by its distinct values.
for name, values in df_train.items():
    print(name, values.unique())

Glucose [ 84 112 139 161 134 130 132 108  80  87 171  88 146 191 122 159 100 123
 155 102 125 104 131 145 152 147 128 194  78 106 129  61  81  99 107 170
  85  97 127 133  94  83 187 111 143 124 165 137 179 101 120  89 119 109
 105  77  95 135 103  82 138 117 110   0 115 142 116 126  79  90 160 162
 144 114 180 168 195  76  86 189  73 196 158 121 151 141 153 181 136 164
 148 190  68 197 186  91 154 173 169 118 188  93 157 163 150 113 174 166
 140 193 184 183  65  57  56  92 156 176  72  96  67 182  98  75 149  71
  44 175 198 178 167 199  74]
BloodPressure [  0  82  46  50  80  70  68  60  55  72  74  85  56  64  66  62 100  84
  86  88  76  94  78  75  54  58  90  65  30  52  92  38  98 110  96 104
 102  48 106  44  24  40  61 122]
BMI [ 0.  28.2 28.7 21.9 46.2 25.9 32.9 25.5 35.5 19.1 37.2 43.6 29.  28.9
 30.9 33.3 27.4 32.  57.3 38.7 29.3 33.8 30.  29.9 31.6 22.2 41.5 49.3
 43.3 30.8 36.9 23.5 33.6 31.  25.8 39.9 35.1 34.4 30.1 25.6 36.6 34.5
 35.6 27.7 40.2 34.3 35.2 37.7 26.8 33.2

In [35]:
# Fit a 25-tree random forest on the training split with a fixed seed.
# The fitted estimator is the cell's last expression, so its repr is shown
# as the cell output.
# TODO: evaluate on the held-out split, e.g. confusion_matrix(y_test, y_pred)
# and f1_score(y_test, y_pred) once y_pred has been computed.
model = RandomForestClassifier(random_state=42, n_estimators=25)
model.fit(df_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [36]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23; joblib is installed alongside scikit-learn as its own package, so
# import it directly.
import joblib

In [37]:
# Train the same 25-tree, fixed-seed random forest and pickle it to disk.
# joblib.dump returns the list of written file names, which is displayed as
# the cell output.
model = RandomForestClassifier(random_state=42, n_estimators=25)
model.fit(df_train, y_train)
joblib.dump(value=model, filename='model.pkl')

['model.pkl']

In [38]:
# Preview the first five rows of the held-out feature frame.
df_test.head(n=5)

Unnamed: 0,Glucose,BloodPressure,BMI,Age,SkinThickness,Insulin,DiabetesPedigreeFunction
668,98,58,34.0,43,33,190,0.43
324,112,75,35.7,21,32,0,0.148
624,108,64,30.8,21,0,0,0.158
690,107,80,24.6,34,0,0,0.856
473,136,90,29.9,50,0,0,0.21


In [39]:
# Build a single-row (1 x 7) request and score it.
# NOTE(review): the seven values are presumably given in `predictors` order,
# i.e. the column order the model was fitted with -- confirm.
predict_request = np.array([[3, 0, 6.0, 0, 1, 33.0, 2]])

# Second entry of the first (only) row of predict_proba's output.
model.predict_proba(predict_request)[0, 1]

0.2

In [40]:
# Preview the first five held-out target values.
y_test.head(n=5)

668    0
324    0
624    0
690    0
473    0
Name: Outcome, dtype: int64

In [41]:
# Per-class probabilities for every row of the test split, one row per sample.
y_pred_prob = model.predict_proba(X=df_test)
y_pred_prob

array([[0.56, 0.44],
       [0.72, 0.28],
       [0.92, 0.08],
       [0.64, 0.36],
       [0.68, 0.32],
       [0.4 , 0.6 ],
       [1.  , 0.  ],
       [0.16, 0.84],
       [0.32, 0.68],
       [0.32, 0.68],
       [0.96, 0.04],
       [0.32, 0.68],
       [0.8 , 0.2 ],
       [0.52, 0.48],
       [0.96, 0.04],
       [0.76, 0.24],
       [0.84, 0.16],
       [1.  , 0.  ],
       [0.4 , 0.6 ],
       [0.48, 0.52],
       [0.72, 0.28],
       [1.  , 0.  ],
       [0.72, 0.28],
       [0.96, 0.04],
       [0.6 , 0.4 ],
       [0.  , 1.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.92, 0.08],
       [0.92, 0.08],
       [0.24, 0.76],
       [0.28, 0.72],
       [0.2 , 0.8 ],
       [0.12, 0.88],
       [0.28, 0.72],
       [0.36, 0.64],
       [0.24, 0.76],
       [0.44, 0.56],
       [0.8 , 0.2 ],
       [0.24, 0.76],
       [0.92, 0.08],
       [0.72, 0.28],
       [0.36, 0.64],
       [0.32, 0.68],
       [1.  , 0.  ],
       [0.6 , 0.4 ],
       [0.64, 0.36],
       [0.88,

In [42]:
# Hard class predictions for the test split; show only the first ten.
y_pred = model.predict(X=df_test)
y_pred[0:10]

array([0, 0, 0, 0, 0, 1, 0, 1, 1, 1], dtype=int64)

In [49]:
# Persist the training column names, in order, to their own pickle file.
model_columns = df_train.columns.tolist()
joblib.dump(value=model_columns, filename='model_columns.pkl')

['model_columns.pkl']