In [1]:
# Import Libraries needed
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import  RandomForestClassifier
from sklearn.metrics import accuracy_score

In [3]:
# Load the diabetes dataset from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer="diabetes.csv")
# Preview the first five rows to sanity-check the parsed columns
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,138,62,35,0,33.6,0.127,47,1
1,0,84,82,31,125,38.2,0.233,23,0
2,0,145,0,0,0,44.2,0.63,31,1
3,0,135,68,42,250,42.3,0.365,24,1
4,1,139,62,41,480,40.7,0.536,21,0


In [4]:
# Count null (NaN/None) entries per column
# NOTE(review): the preview above shows 0 for BloodPressure, SkinThickness and
# Insulin in some rows; presumably these are placeholders for missing
# measurements, which a null check does not detect -- confirm with the data source.
missing_values = df.isna().sum(axis=0)
print(missing_values)

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [5]:
# Pairwise Pearson correlation between every pair of columns (Outcome included)
corr = df.corr(method="pearson")
print(corr)

                          Pregnancies   Glucose  BloodPressure  SkinThickness  \
Pregnancies                  1.000000  0.120405       0.149672      -0.063375   
Glucose                      0.120405  1.000000       0.138044       0.062368   
BloodPressure                0.149672  0.138044       1.000000       0.198800   
SkinThickness               -0.063375  0.062368       0.198800       1.000000   
Insulin                     -0.076600  0.320371       0.087384       0.448859   
BMI                          0.019475  0.226864       0.281545       0.393760   
DiabetesPedigreeFunction    -0.025453  0.123243       0.051331       0.178299   
Age                          0.539457  0.254496       0.238375      -0.111034   
Outcome                      0.224437  0.458421       0.075958       0.076040   

                           Insulin       BMI  DiabetesPedigreeFunction  \
Pregnancies              -0.076600  0.019475                 -0.025453   
Glucose                   0.320371  0.226

In [6]:
# Separating the data into model inputs (X) and the target label (Y).
# Only four of the available columns are used as features; the column order
# here fixes the feature order the scaler and the model are fitted on.
X = df.loc[:, ["Glucose", "BMI", "Age", "Pregnancies"]]
Y = df.loc[:, "Outcome"]

In [7]:
# Data standardization: scaler that removes each feature's mean and scales it
# to unit variance (library defaults, spelled out explicitly)
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [8]:
# Learn the per-feature statistics from X and apply the scaling in one step
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split performed further down, so test rows contribute to the scaling statistics.
standardized_data = scaler.fit_transform(X)
print(standardized_data)

[[ 0.52455322  0.17268332  1.18042417 -0.5153943 ]
 [-1.1597562   0.73724853 -0.85632626 -1.12049474]
 [ 0.74288962  1.47363794 -0.17740945 -1.12049474]
 ...
 [-1.12856529 -0.12187245  0.75610116  0.69480658]
 [ 0.24383498  4.28419085 -0.60173245 -1.12049474]
 [-1.25332895 -0.25687717 -0.68659705 -0.5153943 ]]


In [9]:
# Rebind X to the standardized feature array and Y to the Outcome column.
# NOTE(review): X is overwritten here, so re-running the scaling cell after this
# one would fit the scaler on already-standardized values instead of the raw columns.
X, Y = standardized_data, df["Outcome"]

In [10]:
# Train/test split: hold out 20% of the rows for evaluation, with a fixed seed
# so the same partition is produced on every run
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=10,
)
# Report the shapes of the full, training and test feature arrays
print(*(part.shape for part in (X, X_train, X_test)))

(2000, 4) (1600, 4) (400, 4)


In [11]:
# Model: random forest classifier.
# A fixed random_state makes the bootstrap sampling and feature selection
# deterministic, so the accuracy figures below are reproducible on a fresh
# Restart & Run All (the seed value matches the one used for the split).
classifier = RandomForestClassifier(random_state=10)

In [12]:
# Fit the random forest classifier on the training split
classifier.fit(X=X_train, y=Y_train)

RandomForestClassifier()

In [13]:
# Accuracy score on the training data (the same rows the model was fitted on)
X_train_prediction = classifier.predict(X_train)
# accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions
# second. Accuracy is symmetric, but keeping the documented order avoids wrong
# results if this pattern is reused with asymmetric metrics (precision, recall).
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print("The accuracy score of the training data:",training_data_accuracy)

The accuracy score of the training data: 1.0


In [14]:
# Accuracy score on the held-out test data
X_test_prediction = classifier.predict(X_test)
# accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions
# second. Accuracy is symmetric, but keeping the documented order avoids wrong
# results if this pattern is reused with asymmetric metrics (precision, recall).
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print("The accuracy score of the test data:",test_data_accuracy)

The accuracy score of the test data: 0.975


In [None]:
# Persist the trained classifier to disk with pickle
import pickle

# Use a context manager so the file handle is flushed and closed even if
# pickling raises; the bare open(...) call previously left the handle open.
with open("model.pkl", 'wb') as model_file:
    pickle.dump(classifier, model_file)
# NOTE(review): the classifier was trained on features transformed by `scaler`,
# but only the classifier is saved here. Whatever loads model.pkl needs the same
# fitted scaler to prepare raw inputs -- consider persisting it as well.