In [37]:
import pandas as pd
diabetes_df = pd.read_csv('diabetes.csv')
diabetes_df.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [38]:
diabetes_df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [39]:
sensible_col = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
diabetes_df[sensible_col].eq(0).sum()

Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
dtype: int64

# Magic

====

In [40]:
# Replace all 0 with median + watch out for Outcome value
masks = [ diabetes_df['Outcome'] == 0, diabetes_df['Outcome'] == 1 ]

for mask in masks:
    diabetes_df.loc[mask, sensible_col] = diabetes_df.loc[mask, sensible_col].replace(0, diabetes_df.loc[mask].mean())

====

In [41]:
diabetes_df.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,100.335821,33.6,0.627,50,1
1,1,85.0,66.0,29.0,68.792,26.6,0.351,31,0
2,8,183.0,64.0,22.164179,100.335821,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
5,5,116.0,74.0,19.664,68.792,25.6,0.201,30,0
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
7,10,115.0,68.184,19.664,68.792,35.3,0.134,29,0
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
9,8,125.0,96.0,22.164179,100.335821,35.142537,0.232,54,1


# Test Accuracy

In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X = diabetes_df.drop('Outcome', axis=1)
y = diabetes_df['Outcome']
clf = RandomForestClassifier(max_depth = 50, n_estimators=200, random_state=69)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=420)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.8700787401574803

Result is better and dataframe is much more beautiful!