In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix
%matplotlib inline

In [2]:
def about_this_file():
  """

  This dataset describes the medical records for Pima Indians
  and whether or not each patient will have an onset of diabetes within ve years.

  Fields description follow:

  preg = Number of times pregnant #didnt change 

  plas = Plasma glucose concentration a 2 hours in an oral glucose tolerance test

  pres = Diastolic blood pressure (mm Hg)

  skin = Triceps skin fold thickness (mm)

  test = 2-Hour serum insulin (mu U/ml)

  mass = Body mass index (weight in kg/(height in m)^2)

  pedi = Diabetes pedigree function #didnt change 

  age = Age (years)

  class = Class variable (1:tested positive for diabetes, 0: tested negative for diabetes)
  """

In [3]:
df = pd.read_csv('/content/drive/MyDrive/IST347/Week_7/pima-indians-diabetes.csv', names = ['preg','plas','pres','skin','test','mess','pedi','age','class'])

In [4]:
df.head(10) #check first 10 rows  

Unnamed: 0,preg,plas,pres,skin,test,mess,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [5]:
df.columns

Index(['preg', 'plas', 'pres', 'skin', 'test', 'mess', 'pedi', 'age', 'class'], dtype='object')

In [6]:
# when list.pop, dont assign to a variable because it will save the popped data
cols = list(df.columns)
cols.pop(0) # exclude preg because preg could be 0
cols

['plas', 'pres', 'skin', 'test', 'mess', 'pedi', 'age', 'class']

In [7]:
for col in cols[:-1]: 
  df[col] = df[col].mask(df[col] == 0, df[col].mean())

In [8]:
df.head(10) # check 0 has been replaced with mean

Unnamed: 0,preg,plas,pres,skin,test,mess,pedi,age,class
0,6,148.0,72.0,35.0,79.799479,33.6,0.627,50,1
1,1,85.0,66.0,29.0,79.799479,26.6,0.351,31,0
2,8,183.0,64.0,20.536458,79.799479,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
5,5,116.0,74.0,20.536458,79.799479,25.6,0.201,30,0
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
7,10,115.0,69.105469,20.536458,79.799479,35.3,0.134,29,0
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
9,8,125.0,96.0,20.536458,79.799479,31.992578,0.232,54,1


In [9]:
df.describe() # check the statistic 

Unnamed: 0,preg,plas,pres,skin,test,mess,pedi,age,class
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,121.681605,72.254807,26.606479,118.660163,32.450805,0.471876,33.240885,0.348958
std,3.369578,30.436016,12.115932,9.631241,93.080358,6.875374,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.75,64.0,20.536458,79.799479,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,79.799479,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [10]:
X = df.drop('class', axis = 1)
y = df['class']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [12]:
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [13]:
model = GaussianNB()
clf = model.fit(X_train, y_train)

In [14]:
y_pred = clf.predict(X_test)

In [15]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.77      0.79       123
           1       0.62      0.65      0.63        69

    accuracy                           0.73       192
   macro avg       0.71      0.71      0.71       192
weighted avg       0.73      0.73      0.73       192



In [16]:
print(confusion_matrix(y_test, y_pred))

[[95 28]
 [24 45]]


In [17]:
model_cv = GaussianNB()

In [18]:
scores = cross_validate(model_cv, X_train, y_train, scoring=['accuracy', 'f1'], cv = 10)

In [19]:
scores_df = pd.DataFrame(scores)
print(scores_df)

   fit_time  score_time  test_accuracy   test_f1
0  0.004051    0.004074       0.724138  0.529412
1  0.003802    0.004512       0.827586  0.761905
2  0.004048    0.002529       0.810345  0.685714
3  0.002048    0.001998       0.637931  0.461538
4  0.001881    0.001983       0.775862  0.628571
5  0.002025    0.001956       0.810345  0.731707
6  0.001823    0.001964       0.736842  0.571429
7  0.001894    0.001985       0.684211  0.571429
8  0.001790    0.001990       0.754386  0.631579
9  0.001840    0.001900       0.684211  0.470588


In [20]:
scores_df.test_accuracy.mean()

0.7445856019358741