In [3]:
import numpy as np
import pandas as pd


In [4]:
df = pd.read_csv('diabetes2.csv')

In [5]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [None]:
df.describe()

In [8]:
df.isnull().sum()*100/df.isnull().count()

Pregnancies                 0.0
Glucose                     0.0
BloodPressure               0.0
SkinThickness               0.0
Insulin                     0.0
BMI                         0.0
DiabetesPedigreeFunction    0.0
Age                         0.0
Outcome                     0.0
dtype: float64

In [9]:
def outlier_count(col, data=df):
    print(15*'-' + col + 15*'-')
    q75, q25 = np.percentile(data[col], [75, 25])
    iqr = q75 - q25
    min_val = q25 - (iqr*1.5)
    max_val = q75 + (iqr*1.5)
    outlier_count = len(np.where((data[col] > max_val) | (data[col] < min_val))[0])
    outlier_percent = round(outlier_count/len(data[col])*100, 2)
    print('Number of outliers: {}'.format(outlier_count))
    print('Percent of data that is outlier: {}%'.format(outlier_percent))

In [10]:
cont_vars = list(df.columns)

In [11]:
for col in cont_vars:
    outlier_count(col)

---------------Pregnancies---------------
Number of outliers: 4
Percent of data that is outlier: 0.52%
---------------Glucose---------------
Number of outliers: 5
Percent of data that is outlier: 0.65%
---------------BloodPressure---------------
Number of outliers: 45
Percent of data that is outlier: 5.86%
---------------SkinThickness---------------
Number of outliers: 1
Percent of data that is outlier: 0.13%
---------------Insulin---------------
Number of outliers: 34
Percent of data that is outlier: 4.43%
---------------BMI---------------
Number of outliers: 19
Percent of data that is outlier: 2.47%
---------------DiabetesPedigreeFunction---------------
Number of outliers: 29
Percent of data that is outlier: 3.78%
---------------Age---------------
Number of outliers: 9
Percent of data that is outlier: 1.17%
---------------Outcome---------------
Number of outliers: 0
Percent of data that is outlier: 0.0%


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [21]:
X = df[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age']]

In [22]:
Y = df['Outcome']

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
#we are not using random_state variable to ensure split is being done randomly

In [24]:
model = LogisticRegression(random_state=0)

In [25]:
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=0)

In [26]:
model = LogisticRegression(solver='liblinear', random_state=0).fit(X_train, Y_train)

In [27]:
model.classes_

array([0, 1], dtype=int64)

In [28]:
model.intercept_

array([-5.38318143])

In [29]:
model.coef_

array([[ 1.20970985e-01,  2.69526001e-02, -1.55003995e-02,
         1.13186257e-03,  4.99891202e-05,  4.75385087e-02,
         8.24799209e-01,  2.21710881e-03]])

In [30]:
predict = model.predict(X_test)

In [31]:
model.score(X_test, Y_test)

0.7727272727272727

In [32]:
confusion_matrix(Y_test, predict)

array([[87, 11],
       [24, 32]], dtype=int64)

In [33]:
print(classification_report(Y_test, predict))

              precision    recall  f1-score   support

           0       0.78      0.89      0.83        98
           1       0.74      0.57      0.65        56

    accuracy                           0.77       154
   macro avg       0.76      0.73      0.74       154
weighted avg       0.77      0.77      0.76       154



In [34]:
X_std = np.copy(X)

X_std[:,0] = (X_std[:,0] - X_std[:,0].mean()) / X_std[:,0].std()
X_std[:,1] = (X_std[:,1] - X_std[:,1].mean()) / X_std[:,1].std()

In [35]:
# Define Logistic Regression hypothesis or sigmoid function

def sigmoid(X, theta):
    
    z = np.dot(X, theta[1:]) + theta[0]
    
    return 1.0 / ( 1.0 + np.exp(-z))

In [36]:
def lrCostFunction(Y, hx):
  
    # compute cost for given theta parameters
    j = -Y.dot(np.log(hx)) - ((1 - Y).dot(np.log(1-hx)))
    
    return j

In [37]:
def lrGradient(X, Y, theta, alpha, num_iter):
    # empty list to store the value of the cost function over number of iterations
    cost = []
    
    for i in range(num_iter):
        # call sigmoid function 
        hx = sigmoid(X, theta)
        # calculate error
        error = hx - Y
        # calculate gradient
        grad = X.T.dot(error)
        # update values in theta
        theta[0] = theta[0] - alpha * error.sum()
        theta[1:] = theta[1:] - alpha * grad
        
        cost.append(lrCostFunction(Y, hx))
        
    return cost     

In [38]:
m, n = X.shape

# initialize theta(weights) parameters to zeros
theta = np.zeros(1+n)

# set learning rate to 0.01 and number of iterations to 500
alpha = 0.01
num_iter = 500

cost = lrGradient(X_std, Y, theta, alpha, num_iter)

  return 1.0 / ( 1.0 + np.exp(-z))
  j = -Y.dot(np.log(hx)) - ((1 - Y).dot(np.log(1-hx)))


In [39]:
print ('\n Logisitc Regression bias(intercept) term :', theta[0])
print ('\n Logisitc Regression estimated coefficients :', theta[1:])


 Logisitc Regression bias(intercept) term : -141.1396437856634

 Logisitc Regression estimated coefficients : [409.7981491  781.52456618 -32.6442238  -80.18011307 100.25294793
 121.57896558  32.96245481 249.46074741]


In [40]:
def lrPredict(X):
    
    return np.where(sigmoid(X,theta) >= 0.5, 1, 0)

In [None]:
model.intercept_ = -141.10642916992526

In [None]:
model.coef_ = [[ 403.01394505,  775.22842044,-138.02126201,  -49.62383344 , 193.29476266,
   86.55557229 ,  32.8753586,   150.57091712]]