# Imports

In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Reading Data from CSV

In [2]:
data = pd.read_csv("./data.csv")

# Checking Data Shape (Rows/Columns)

In [3]:
data.shape

(768, 9)

# First 5 Data Entries

In [4]:
data.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Number of Diabetics/Non-Diabetics

In [5]:
diabetes_true = len(data.loc[data['Outcome'] == True])
diabetes_false = len(data.loc[data['Outcome'] == False])
(diabetes_true,diabetes_false)

(268, 500)

# Test Split Model

In [6]:
from sklearn.model_selection import train_test_split
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
predicted_class = ['Outcome']

X = data[feature_columns].values
y = data[predicted_class].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=10)

# Number of Rows with 0 Value

In [7]:
print("number of data rows : {0}".format(len(data)))
print("number of rows missing Pregnancies: {0}".format(len(data.loc[data['Pregnancies'] == 0])))
print("number of rows missing Glucose: {0}".format(len(data.loc[data['Glucose'] == 0])))
print("number of rows missing BloodPressure: {0}".format(len(data.loc[data['BloodPressure'] == 0])))
print("number of rows missing SkinThickness: {0}".format(len(data.loc[data['SkinThickness'] == 0])))
print("number of rows missing Insulin: {0}".format(len(data.loc[data['Insulin'] == 0])))
print("number of rows missing BMI: {0}".format(len(data.loc[data['BMI'] == 0])))
print("number of rows missing DiabetesPedigreeFunction: {0}".format(len(data.loc[data['DiabetesPedigreeFunction'] == 0])))
print("number of rows missing Age: {0}".format(len(data.loc[data['Age'] == 0])))

number of data rows : 768
number of rows missing Pregnancies: 111
number of rows missing Glucose: 5
number of rows missing BloodPressure: 35
number of rows missing SkinThickness: 227
number of rows missing Insulin: 374
number of rows missing BMI: 11
number of rows missing DiabetesPedigreeFunction: 0
number of rows missing Age: 0


# Filling Values with Mean Data

In [8]:
from sklearn.preprocessing import Imputer
fill_values = Imputer(missing_values=0, strategy="mean", axis=0)

X_train = fill_values.fit_transform(X_train)
X_test = fill_values.fit_transform(X_test)



# Using RandomForestClassifier

In [9]:
from sklearn.ensemble import RandomForestClassifier
random_forest_model = RandomForestClassifier(random_state=10)
random_forest_model.fit(X_train, y_train.ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=10, verbose=0,
                       warm_start=False)

# Predicting and Checking Model Accuracy

In [10]:
predict_train_data = random_forest_model.predict(X_test)
from sklearn import metrics
print("Accuracy = {0:.3f}".format(metrics.accuracy_score(y_test, predict_train_data)))

Accuracy = 0.727


In [None]:
pregnancy = int(input("Enter Pregnancy:"))
glucose = int(input("Enter Glucose:"))
bloodpressure = int(input("Enter Blood Pressue:"))
skinthickness = int(input("Enter Skin Thickness:"))
insulin = int(input("Enter Insulin:"))
bmi = float(input("Enter BMI:"))
DiabetesPedigreeFunction = float(input("Enter DiabetesPedigreeFunction:"))
age = int(input("Enter Age:"))
user_input = [pregnancy, glucose, bloodpressure, skinthickness, insulin, bmi, DiabetesPedigreeFunction, age]
result = random_forest_model.predict([user_input])[0]
print("Outcome:", result)