# Pima Indian Diabetes Data Classification

## Reading and exploring the data 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Read the comma-separated file "diabetes.csv" into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="diabetes.csv", sep=",")
# Preview the first five rows as a quick check of the columns and values.
df.head(n=5)

In [None]:
# Summary statistics for every numeric column, followed by a blank line.
print(df.describe())
print()
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print(str(...)) -- that would emit a stray "None".
df.info()
print()
# Per-column count of cells that are exactly 0.
print((df == 0).sum(axis=0))

In [None]:
# Keep a feature-only copy taken BEFORE the zero replacement below, so X0
# still contains the original 0 values and serves as the "zeros" baseline.
X0 = df.drop(columns="Outcome")

# In these five columns every 0 is replaced with NaN, i.e. treated as missing.
for feature in ('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI'):
    without_zeros = df[feature].replace(0, np.nan)
    df[feature] = without_zeros

# Number of missing entries per column after the replacement.
df.isna().sum()

## Visualizing the data 

In [None]:
# Pairwise correlation between all columns of df.
corr = df.corr()

# Draw the matrix into an explicitly sized figure.
fig = plt.figure(figsize=(15, 10))
plt.matshow(corr, fignum=fig.number)

# Label both axes with the column names; one tick per column.
tick_positions = range(len(corr.columns))
plt.xticks(tick_positions, corr.columns, fontsize=10, rotation=45)
plt.yticks(tick_positions, corr.columns, fontsize=10)

# Colour scale legend with the same label size as the axis ticks.
cbar = plt.colorbar()
cbar.ax.tick_params(labelsize=10)

plt.title('Correlation Matrix', fontsize=18)

In [None]:
# One histogram per column, stacked vertically, to inspect each distribution.
hist_columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]

# The number of subplot rows is derived from the column list so the two
# cannot drift apart (previously a hard-coded 9 plus nine hand-numbered calls).
figure, axis = plt.subplots(len(hist_columns), 1, figsize=(10, 20))
figure.tight_layout(pad=10, w_pad=10, h_pad=10)

for column, ax in zip(hist_columns, axis):
    sns.histplot(x=df[column], ax=ax)

## Using different types of data imputation 

In [None]:
# Split off the target: pop() returns the 'Outcome' column and removes it
# from df in place, leaving only the feature columns.
y = df.pop('Outcome')

# NOTE: X1 is the same object as df (no copy is made), so the median fill
# below also modifies df.
X1 = df
for feature in ('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI'):
    # Replace every missing value in the column with that column's median.
    column_median = X1[feature].median()
    X1[feature] = X1[feature].fillna(column_median)

In [None]:
from sklearn.impute import KNNImputer
# KNN-based imputation of the missing measurements.
#
# df cannot be the starting point here: the median-imputation step binds
# X1 to df itself and fills the NaNs in place, so df has no missing values
# left and X2 = df would just be a second name for the median-imputed data.
# Start instead from X0 (the feature copy that still holds the raw zeros)
# and mark the zeros as missing again.
X2 = X0.copy()
for col in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']:
    X2[col] = X2[col].replace(0, np.nan)

# fit_transform returns a new array and leaves its input untouched, so the
# result has to be kept; wrap it back into a DataFrame with the original
# column names and row index so it stays aligned with y.
X2 = pd.DataFrame(
    KNNImputer().fit_transform(X2),
    columns=X2.columns,
    index=X2.index,
)
X2.head()

## Testing and evaluating the models 

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
def useModel(X,y,variation):
    """Train and score three classifiers on one version of the features.

    Prints ``variation`` as a heading, holds out 30% of the rows as a test
    set, and then, for a random forest, a logistic regression and a
    k-nearest-neighbours classifier, prints the model, its test accuracy and
    its confusion matrix.

    Args:
        X: Feature matrix accepted by ``train_test_split`` and
            ``StandardScaler``.
        y: Target labels, one per row of ``X``.
        variation: Heading text printed before the results.

    Returns:
        None. All results are printed.
    """
    print(variation)

    # Split BEFORE scaling: fitting the scaler on all rows would let the
    # test rows influence the mean/variance used to transform the training
    # data (data leakage) and make the reported scores optimistic.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # The forest is seeded like the split so repeated runs, and therefore the
    # comparison between imputation variants, are reproducible.
    models = [
        RandomForestClassifier(random_state=42),
        LogisticRegression(),
        KNeighborsClassifier(),
    ]
    for model in models:
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        acc = accuracy_score(y_test, pred)
        print(str(model))
        print("Accuracy: {}".format(acc))
        print("Confusion Matrix:  \n {} \n".format(confusion_matrix(y_test, pred)))

In [None]:
# Evaluate the same three models on each imputation variant, in order:
# raw zeros, median-filled, KNN-imputed.
for features, banner in (
    (X0, "Using zeros for all missing values \n -------------------- \n"),
    (X1, "Using median to replace all \n -------------------- \n"),
    (X2, "Using KNNImputer to replace all \n -------------------- \n"),
):
    useModel(features, y, banner)