In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score


In [2]:
# Load the diabetes dataset from a CSV in the working directory. The path is
# kept in a named constant so it is easy to spot and change.
DATASET_CSV = 'dataset_37_diabetes.csv'
train = pd.read_csv(DATASET_CSV)

In [3]:
# Count, per column, how many entries are exactly 0. For the measurement
# columns a 0 is treated later in this notebook as a missing-value placeholder.
zeros = train.eq(0).sum()
zeros

preg     111
plas       5
pres      35
skin     227
insu     374
mass      11
pedi       0
age        0
class      0
dtype: int64

In [4]:
# Turn the 0 placeholders into NaN so they can be imputed as missing values.
# Only these three columns are converted.
# NOTE(review): 'skin' and 'insu' also contain many zeros (see the zero counts
# above) but are left untouched here — confirm that this is intentional.
cols = ['mass', 'pres', 'plas']
train[cols] = train[cols].replace(0, np.nan)

In [5]:
# Impute the NaNs with group means, one column at a time.
#
# Each column is reassigned explicitly instead of calling
# `train[col].fillna(..., inplace=True)`: the in-place form operates on a
# column selection (chained assignment), which raises a FutureWarning on
# pandas 2.x and does not modify `train` at all under Copy-on-Write.
#
# Order matters: 'mass' is filled first because it is then used as the
# grouping key for 'plas' and 'pres'. A group whose values are all NaN has a
# NaN mean, so some gaps can survive a pass; 'pres' therefore gets a second
# pass grouped by 'age'.
train['mass'] = train['mass'].fillna(train.groupby('age')['mass'].transform('mean'))
train['plas'] = train['plas'].fillna(train.groupby('mass')['plas'].transform('mean'))
train['pres'] = train['pres'].fillna(train.groupby('mass')['pres'].transform('mean'))
train['pres'] = train['pres'].fillna(train.groupby('age')['pres'].transform('mean'))

In [6]:
# Count the NaNs left in each column after the imputation pass.
remaining_nans = train.isna().sum()
remaining_nans

preg     0
plas     1
pres     0
skin     0
insu     0
mass     0
pedi     0
age      0
class    0
dtype: int64

In [7]:
# Fallback pass for 'plas': fill whatever is still NaN with the mean 'plas'
# of rows sharing the same 'age'.
# The column is reassigned rather than filled with `inplace=True`, because the
# in-place call on `train['plas']` is chained assignment: it warns on
# pandas 2.x and leaves `train` unchanged under Copy-on-Write.
train['plas'] = train['plas'].fillna(train.groupby('age')['plas'].transform('mean'))

In [8]:
# Re-count the NaNs per column to confirm that the imputation is complete.
final_nans = train.isna().sum()
final_nans

preg     0
plas     0
pres     0
skin     0
insu     0
mass     0
pedi     0
age      0
class    0
dtype: int64

In [9]:
# Random forest with 500 trees, built in parallel on all available cores
# (n_jobs=-1).
# random_state is pinned so the forest's randomness is fixed and the scores
# computed from this estimator are reproducible across kernel restarts; the
# seed matches the one already used for train_test_split in this notebook.
rf_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=3)

In [10]:
# Separate the target from the features: y_train is the 'class' column,
# x_train is every other column.
y_train = train['class']
x_train = train.drop(columns=['class'])

In [11]:
# Display the feature matrix (all columns except 'class').
x_train

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age
0,6,148.0,72.0,35,0,33.6,0.627,50
1,1,85.0,66.0,29,0,26.6,0.351,31
2,8,183.0,64.0,0,0,23.3,0.672,32
3,1,89.0,66.0,23,94,28.1,0.167,21
4,0,137.0,40.0,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48,180,32.9,0.171,63
764,2,122.0,70.0,27,0,36.8,0.340,27
765,5,121.0,72.0,23,112,26.2,0.245,30
766,1,126.0,60.0,0,0,30.1,0.349,47


In [12]:
# Display the target column; at this point it still holds the original
# string labels ('tested_positive' / 'tested_negative').
y_train

0      tested_positive
1      tested_negative
2      tested_positive
3      tested_negative
4      tested_positive
            ...       
763    tested_negative
764    tested_negative
765    tested_negative
766    tested_positive
767    tested_negative
Name: class, Length: 768, dtype: object

In [13]:
# Encode the string labels as integers.
# NOTE(review): 'tested_positive' is encoded as 0 and 'tested_negative' as 1,
# the reverse of the usual "positive = 1" convention — confirm this is
# intended before computing metrics that depend on which label is positive.
# NOTE(review): this cell is not re-runnable: a second run maps the integer
# labels through a dict with string keys, turning every value into NaN.
class_rename = dict(tested_negative=1, tested_positive=0)
encoded_labels = train['class'].map(class_rename)
train['class'] = encoded_labels

In [14]:
# Display the full frame; 'class' now holds the integer-encoded labels.
train

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6,148.0,72.0,35,0,33.6,0.627,50,0
1,1,85.0,66.0,29,0,26.6,0.351,31,1
2,8,183.0,64.0,0,0,23.3,0.672,32,0
3,1,89.0,66.0,23,94,28.1,0.167,21,1
4,0,137.0,40.0,35,168,43.1,2.288,33,0
...,...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48,180,32.9,0.171,63,1
764,2,122.0,70.0,27,0,36.8,0.340,27,1
765,5,121.0,72.0,23,112,26.2,0.245,30,1
766,1,126.0,60.0,0,0,30.1,0.349,47,0


In [15]:
# Re-bind y_train to the encoded column. The earlier y_train still refers to
# the original string-label Series, because the encoding step assigned a new
# Series to train['class'] instead of modifying the old one.
y_train = train['class']

In [16]:
# Display the feature matrix again; it never contained the 'class' column.
x_train

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age
0,6,148.0,72.0,35,0,33.6,0.627,50
1,1,85.0,66.0,29,0,26.6,0.351,31
2,8,183.0,64.0,0,0,23.3,0.672,32
3,1,89.0,66.0,23,94,28.1,0.167,21
4,0,137.0,40.0,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48,180,32.9,0.171,63
764,2,122.0,70.0,27,0,36.8,0.340,27
765,5,121.0,72.0,23,112,26.2,0.245,30
766,1,126.0,60.0,0,0,30.1,0.349,47


In [17]:
# Display the target after re-binding: integer labels instead of strings.
y_train

0      0
1      1
2      0
3      1
4      0
      ..
763    1
764    1
765    1
766    0
767    1
Name: class, Length: 768, dtype: int64

In [18]:
# Fit the forest on the full dataset, then estimate its accuracy with 10-fold
# cross-validation over the same data. cross_val_score fits its own clones of
# the estimator per fold, so the reported score does not come from the fit
# call on the first line.
rf_clf.fit(x_train, y_train)
fold_accuracies = cross_val_score(rf_clf, x_train, y_train, cv=10, scoring='accuracy')
rf_clf_cvs = fold_accuracies.mean()
rf_clf_cvs

0.7655673274094328

In [19]:
# Hold out 20% of the rows as a test set, with a fixed seed for a repeatable
# split.
# Caution: the upper-case X_train / Y_train are the 80% training portion,
# while the lower-case x_train / y_train remain the FULL dataset. The names
# differ only by case, so take care not to mix them up.
X_train, x_test, Y_train, y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=3)

In [20]:
# Second random forest (500 trees, all cores) for the train/test-split
# experiment.
# random_state is pinned so the model and every score or prediction derived
# from it are reproducible; the seed matches the one used for
# train_test_split.
rf_clf2 = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=3)

In [21]:
# Fit on the training portion only, then estimate accuracy with 10-fold
# cross-validation restricted to that same portion (the held-out rows are not
# touched here).
rf_clf2.fit(X_train, Y_train)
split_fold_accuracies = cross_val_score(rf_clf2, X_train, Y_train, cv=10, scoring='accuracy')
rf_clf2_cvs = split_fold_accuracies.mean()
rf_clf2_cvs

0.7767847699629826

In [24]:
# Predict class labels for the held-out rows (x_test) with the forest that
# was fit on the training portion.
y_pred = rf_clf2.predict(x_test)