In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)



## Let us start by importing our dataset into a pandas dataframe.

In [None]:
# Read the diabetes CSV from the Kaggle input directory into `data`,
# then display its first rows as the cell output.
data=pd.read_csv('/kaggle/input/review/diabetes.csv')
data.head()

### We have 9 Columns and 768 Rows of data. Let's quickly check for null values in our dataset

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Print the column names, non-null counts and dtypes of the dataframe.
data.info()

### We do not have any null values and all the columns are numerical. Lucky us!

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
data.describe()

### Here we see some statistics of the dataset columns. For example the mean age of our sample is 33.2.
### We see that there are some nonsensical outliers in the columns "Glucose", "BloodPressure", "BMI" and "Insulin". Their minimum value is 0, which is not a clinically plausible value.
### So we need to clean the data.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# One boxplot per feature (every column except the target) so that outliers
# are easy to spot. The number of panels is derived from the columns rather
# than hard-coded, and each axis is paired with its column via zip instead of
# a manually incremented counter.
feature_cols = data.columns.drop('Outcome')
fig, axes = plt.subplots(1, len(feature_cols), figsize=(25, 5), squeeze=False)
for ax, col in zip(axes.flat, feature_cols):
    sns.boxplot(data=data, x=col, ax=ax)


### Here we see the outliers clearly.

In [None]:
# Keep only rows with a positive Glucose, BloodPressure and BMI reading and a
# SkinThickness below 90.
# NOTE(review): zero Insulin values are not filtered here — confirm that
# leaving them in is intentional.
valid_rows = (
    data['Glucose'].gt(0)
    & data['BloodPressure'].gt(0)
    & data['SkinThickness'].lt(90)
    & data['BMI'].gt(0)
)
data = data.loc[valid_rows]

### In the above code cell we removed the rows with Glucose=0, BloodPressure=0, SkinThickness>=90 and BMI=0.

In [None]:
# Display the first rows of `data`.
data.head()

In [None]:
# Redraw one boxplot per feature (every column except the target). The panel
# count follows the number of feature columns instead of a hard-coded 8, and
# zip pairs each axis with its column, replacing the manual counter.
feature_cols = data.columns.drop('Outcome')
fig, axes = plt.subplots(1, len(feature_cols), figsize=(25, 5), squeeze=False)
for ax, col in zip(axes.flat, feature_cols):
    sns.boxplot(data=data, x=col, ax=ax)

## After removing the extreme outliers, we go for EDA

## EDA

In [None]:
# Histogram + KDE of every feature, coloured by Outcome, laid out on a grid
# that is four panels wide. The number of rows is computed from the number of
# features, so the grid no longer overflows (IndexError) if a column is added,
# and axes.flat replaces the hand-maintained row/column counters.
feature_cols = data.columns.drop('Outcome')
n_cols = 4
n_rows = -(-len(feature_cols) // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 15), squeeze=False)
for ax, col in zip(axes.flat, feature_cols):
    sns.histplot(data, x=col, ax=ax, kde=True, hue='Outcome')
# Hide any panels left over when the feature count is not a multiple of n_cols.
for ax in axes.flat[len(feature_cols):]:
    ax.set_visible(False)
    
        

### The figures above tell us a lot about diabetic trends.

* The PDF of Glucose for diabetic patients is shifted towards the right. This is expected, because glucose levels in diabetic patients tend to be higher.
* We see that Blood Pressure, Skin Thickness and Insulin have an identical distribution for both diabetic and healthy patients.
* BMI has a slightly right-shifted distribution for the diabetic patients, so a higher BMI is associated with diabetes.

In [None]:

 # BMI values plotted separately for each Outcome class.
 sns.catplot(data=data, x='Outcome', y='BMI')

In [None]:

 # Age values plotted separately for each Outcome class.
 sns.catplot(data=data, x='Outcome', y='Age')

### No visible trend, other than that many of the younger women do not have diabetes.

In [None]:

 # Pregnancies values plotted separately for each Outcome class.
 sns.catplot(data=data, x='Outcome', y='Pregnancies')

In [None]:
 # Glucose values plotted separately for each Outcome class.
 sns.catplot(data=data, x='Outcome', y='Glucose')

In [None]:
# DiabetesPedigreeFunction values plotted separately for each Outcome class.
sns.catplot(data=data, x='Outcome', y='DiabetesPedigreeFunction')

In [None]:
# Heatmap of the pairwise correlations between the features (target excluded).
feature_corr = data.drop(columns='Outcome').corr()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(feature_corr, cmap='Blues', ax=ax)

### Here we see some interesting results:
* Age and Blood Pressure are somewhat positively correlated, as is expected from general knowledge.
* BMI and Skin Thickness have a positive correlation. A higher BMI means more weight per height squared, so Skin Thickness is also higher.
* BMI and Blood Pressure also have a positive correlation. The higher the BMI, the harder the heart needs to work, and the higher the Blood Pressure.
* Insulin and Skin Thickness also have a high positive correlation.

In [None]:
# Display the first rows of `data`.
data.head()

## Let us now do some Predictions! Let's train Machine Learning Models to predict the Outcome(Diabetic or not)

In [None]:
# Separate the predictor columns (x) from the target label (y).
x = data.drop(columns='Outcome')
y = data['Outcome']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as a test set.
# random_state pins the shuffle so the same rows land in each split on every
# Restart & Run All; without it, every run reports different accuracies.
# stratify=y keeps the proportion of each Outcome class the same in the
# training and test sets.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.25, random_state=42, stratify=y
)


### Splitting into Training set and Test set

### Model 1 - Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees. random_state fixes the forest's internal
# randomness so that refitting on the same training data yields the same model.
model = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# Fit on the training split, then predict labels for the held-out test split.
ypred = model.fit(xtrain, ytrain).predict(xtest)


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix of the random forest's test-set predictions.
confusion_matrix(y_true=ytest, y_pred=ypred)


In [None]:
# Fraction of test rows the random forest labelled correctly.
accuracy_score(y_true=ytest, y_pred=ypred)

### From the above accuracy score and confusion matrix,
### we see that the model predicted our test set correctly 80.66% of the time in the recorded run, which is good!

### Model 2 - Support Vector Classifier

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Polynomial-kernel SVM. SVMs are not scale-invariant and the raw clinical
# columns are measured in different units and ranges, so the features are
# standardised first. Putting the scaler inside the pipeline means it is
# fitted on the training split only and then re-applied to the test split,
# which avoids leaking test statistics into training.
model2 = make_pipeline(StandardScaler(), SVC(kernel='poly'))
model2.fit(xtrain, ytrain)
ypred2 = model2.predict(xtest)

In [None]:
# Confusion matrix of the SVC's test-set predictions.
confusion_matrix(y_true=ytest, y_pred=ypred2)

In [None]:
# Fraction of test rows the SVC labelled correctly.
accuracy_score(y_true=ytest, y_pred=ypred2)

### In the recorded run the accuracy score of this model was 82.87%, which is better than the RFC.

### Model 3 - K Neighbours Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# 10-nearest-neighbours classifier. Neighbours are chosen by distance between
# feature vectors, so columns with large numeric ranges would dominate the
# vote if left unscaled. Standardising inside a pipeline fits the scaler on
# the training split only and re-applies it to the test split.
model3 = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=10))
model3.fit(xtrain, ytrain)
ypred3 = model3.predict(xtest)

In [None]:
# Confusion matrix of the KNN classifier's test-set predictions.
confusion_matrix(y_true=ytest, y_pred=ypred3)

In [None]:
# Fraction of test rows the KNN classifier labelled correctly.
accuracy_score(y_true=ytest, y_pred=ypred3)

## In the recorded run this model gave us a 78.45% accuracy score. This is the worst of the three models, but an accuracy score of 78% is still good.

# The best choice for our model would be the polynomial-kernel Support Vector Classifier, with an accuracy score of 82.87% in the recorded run!