In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

pip install mlxtend
pip install missingno

from mlxtend.plotting import plot_decision_regions
import missingno as msno
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

Collecting missingno
  Using cached missingno-0.5.2-py3-none-any.whl (8.7 kB)
Installing collected packages: missingno
Successfully installed missingno-0.5.2


In [4]:
# Read the CSV from the working directory and preview its first rows.
csv_path = 'diabetes.csv'
diabetes_df = pd.read_csv(csv_path)
diabetes_df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'diabetes.csv'

Exploratory Data Analysis (EDA)

In [None]:
# Display the column labels of the loaded frame.
diabetes_df.columns

Information about the dataset

In [None]:
# Print the frame summary: column dtypes, non-null counts and memory usage.
diabetes_df.info()

Summary statistics of the dataset

In [None]:
# Descriptive statistics for each column.
diabetes_df.describe()

In [None]:
# Same statistics, transposed so that each feature becomes one row.
diabetes_df.describe().transpose()

In [None]:
# Preview the boolean missing-value mask for the first ten rows.
diabetes_df.isna().head(10)

In [None]:
# Count the missing (NaN) entries in each column.
diabetes_df.isna().sum()

In [None]:
# The isnull().sum() check above reports no missing values, but that is
# misleading: in this dataset the missing readings were recorded as 0.
# Replace those zeros with NaN first, so they can be imputed afterwards.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

diabetes_df_copy = diabetes_df.copy(deep=True)
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
diabetes_df_copy[zero_as_missing_cols] = diabetes_df_copy[zero_as_missing_cols].replace(0, np.nan)

# Showing the count of NaNs per column
print(diabetes_df_copy.isnull().sum())

Data Visualization

In [None]:
# Histogram of every column of the original frame, i.e. before any imputation
# (missing readings are still stored as 0 here).
p = diabetes_df.hist(figsize = (20,20))

In [None]:
# Impute each missing value with a per-column statistic: the mean for Glucose
# and BloodPressure, the median for SkinThickness, Insulin and BMI.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# a column selection: that is chained assignment, which does not reliably
# update the parent frame (it has no effect under pandas Copy-on-Write).
for col in ('Glucose', 'BloodPressure'):
    diabetes_df_copy[col] = diabetes_df_copy[col].fillna(diabetes_df_copy[col].mean())
for col in ('SkinThickness', 'Insulin', 'BMI'):
    diabetes_df_copy[col] = diabetes_df_copy[col].fillna(diabetes_df_copy[col].median())

# Plotting the distributions after filling the NaN values.
p = diabetes_df_copy.hist(figsize=(20, 20))

In [None]:
# Plotting Null Count Analysis Plot
# NOTE(review): this is drawn from the original frame, in which missing
# readings are stored as 0 rather than NaN, so it will not reflect them.
# Confirm which frame was intended.
p = msno.bar(diabetes_df)

In [None]:
# Check how evenly the two Outcome classes are represented.
color_wheel = {1: "#0392cf", 2: "#7bc043"}
# Per-row colour lookup (Outcome + 1 is the key); not consumed by the bar plot below.
colors = diabetes_df["Outcome"].map(lambda outcome: color_wheel.get(outcome + 1))

outcome_counts = diabetes_df.Outcome.value_counts()
print(outcome_counts)
p = outcome_counts.plot(kind="bar")

In [None]:
# Distribution (left) and box plot (right) of Insulin, side by side on one figure.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale is
# its documented replacement. The axes are created explicitly so the figure
# size is set once, instead of being passed to the second plot only.
fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize=(16, 5))
sns.histplot(diabetes_df['Insulin'], kde=True, stat='density', ax=ax_dist)
diabetes_df['Insulin'].plot.box(ax=ax_box)
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations in the original (uncleaned) frame.
corr_matrix = diabetes_df.corr()
plt.figure(figsize=(12, 10))
p = sns.heatmap(corr_matrix, annot=True, cmap='RdYlGn')

Scaling the Data

In [None]:
# Preview the imputed frame before it is standardised.

diabetes_df_copy.head()

In [None]:
# Standardise every feature column (everything except the Outcome target).
sc_X = StandardScaler()
features_unscaled = diabetes_df_copy.drop(columns=["Outcome"])
# Take the column labels and index from the frame itself instead of repeating a
# hard-coded list, so the labels cannot drift out of step with the data.
X = pd.DataFrame(
    sc_X.fit_transform(features_unscaled),
    columns=features_unscaled.columns,
    index=features_unscaled.index,
)
X.head()

In [None]:
# Model Building
# Splitting the dataset into features (X) and target (y).
# The features are taken from diabetes_df_copy (zeros replaced by NaN, then
# imputed) rather than from the raw diabetes_df, whose 0 placeholders for
# missing readings would otherwise be fed to the models as real measurements.
# They stay in their original units, so unscaled patient values can still be
# passed to predict(). Note that this rebinds X.

X = diabetes_df_copy.drop('Outcome', axis=1)
y = diabetes_df_copy['Outcome']

In [None]:
# Split the data into training and testing sets (33% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=7
)

# Random Forest
# Building the model using RandomForest
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the fitted forest, and every score derived from
# it, reproducible across kernel restarts.
rfc = RandomForestClassifier(n_estimators=200, random_state=7)
rfc.fit(X_train, y_train)

# Accuracy on the same data the model was fitted on.
rfc_train = rfc.predict(X_train)
print("Accuracy_Score =", format(metrics.accuracy_score(y_train, rfc_train)))

In [None]:
# The training-set score above suggests the model is overfitted, so score the
# random forest on the held-out test split as well.
predictions = rfc.predict(X_test)
rfc_test_accuracy = metrics.accuracy_score(y_test, predictions)
print("Accuracy_Score =", format(rfc_test_accuracy))

In [None]:
# Decision Tree
# Building the model using DecisionTree
from sklearn.tree import DecisionTreeClassifier

# Fixed random_state so the fitted tree and its scores are reproducible.
dtree = DecisionTreeClassifier(random_state=7)
dtree.fit(X_train, y_train)

# Score the tree directly on the testing data.
predictions = dtree.predict(X_test)
print("Accuracy Score =", format(metrics.accuracy_score(y_test, predictions)))

In [None]:
# Confusion matrix and per-class metrics for the decision tree's test predictions.
dtree_confusion = confusion_matrix(y_test, predictions)
dtree_report = classification_report(y_test, predictions)
print(dtree_confusion)
print(dtree_report)

In [None]:
# Decision Tree
#  XgBoost classifier
# Building model using XGBoost

pip install xgboost
from xgboost import XGBClassifier

xgb_model = XGBClassifier(gamma=0)
xgb_model.fit(X_train, y_train)

In [None]:
# Accuracy of the XGBoost classifier on the test split.
xgb_pred = xgb_model.predict(X_test)
xgb_accuracy = metrics.accuracy_score(y_test, xgb_pred)
print("Accuracy Score =", format(xgb_accuracy))

In [None]:
# Support Vector Machine (SVM)
# Building the model using Support Vector Machine (SVM)
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

# SVC's default RBF kernel is distance based, so features on very different
# scales distort it. Standardising inside a pipeline fits the scaler on the
# training split only and lets predict() keep accepting unscaled rows.
svc_model = make_pipeline(StandardScaler(), SVC())
svc_model.fit(X_train, y_train)

# Prediction from support vector machine model on the testing data
svc_pred = svc_model.predict(X_test)

# Accuracy score for SVM model
print("Accuracy Score =", format(metrics.accuracy_score(y_test, svc_pred)))

In [None]:
# Confusion matrix and per-class metrics for the SVM's test predictions.
svc_confusion = confusion_matrix(y_test, svc_pred)
svc_report = classification_report(y_test, svc_pred)
print(svc_confusion)
print(svc_report)

In [None]:
# Pair the random forest's importances with the feature names, then draw them
# as a horizontal bar chart.
importances = pd.Series(rfc.feature_importances_, index=X.columns)
importances.plot(kind='barh')

In [None]:
# Serialising the model (Random Forest) with pickle
import pickle

# pickle.dumps() serialises the fitted model to a bytes object in memory;
# nothing is written to disk here.
saved_model = pickle.dumps(rfc)

# pickle.loads() rebuilds a model object from those bytes.
# Only unpickle data from a trusted source: loading a pickle can execute arbitrary code.
rfc_from_pickle = pickle.loads(saved_model)

# Use the restored model to make predictions on the test split.
rfc_from_pickle.predict(X_test)

In [None]:
# First rows of the original frame.
diabetes_df.head()

In [None]:
# Last rows of the original frame.
diabetes_df.tail()

In [None]:
# Predict the Outcome class (0 or 1) for a single patient's measurements.
# The row is wrapped in a DataFrame labelled with the training columns, so the
# feature names are validated against those the model was fitted with instead
# of relying on an unlabelled positional list.
patient_row = pd.DataFrame([[0, 137, 40, 35, 168, 43.1, 2.228, 33]], columns=X.columns)
rfc.predict(patient_row)  # 4th patient

In [None]:
# Another single-patient prediction, again passed as a DataFrame labelled with
# the training columns so the feature names are validated by the model.
other_patient_row = pd.DataFrame([[10, 101, 76, 48, 180, 32.9, 0.171, 63]], columns=X.columns)
rfc.predict(other_patient_row)  # 763 th patient