In [0]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [79]:
!wget https://raw.githubusercontent.com/susanli2016/Machine-Learning-with-Python/master/diabetes.csv

--2018-10-19 00:52:55--  https://raw.githubusercontent.com/susanli2016/Machine-Learning-with-Python/master/diabetes.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 23875 (23K) [text/plain]
Saving to: ‘diabetes.csv’


2018-10-19 00:52:55 (1.63 MB/s) - ‘diabetes.csv’ saved [23875/23875]



In [0]:
dataset = pd.read_csv("diabetes.csv")

In [82]:
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [0]:
features = dataset.drop(["Outcome"], axis=1)
X = np.array(features)
y = np.array(dataset["Outcome"])

In [84]:
X[:3] #las primeras tres filas

array([[  6.   , 148.   ,  72.   ,  35.   ,   0.   ,  33.6  ,   0.627,
         50.   ],
       [  1.   ,  85.   ,  66.   ,  29.   ,   0.   ,  26.6  ,   0.351,
         31.   ],
       [  8.   , 183.   ,  64.   ,   0.   ,   0.   ,  23.3  ,   0.672,
         32.   ]])

In [85]:
y[:3] # las primeras tres clases

array([1, 0, 1])

In [0]:
X = (X - np.min(X)) / (np.max(X) - np.min(X))

In [87]:
X[:3] #las primeras tres filas con los datos normalizados

array([[0.0070922 , 0.1749409 , 0.08510638, 0.04137116, 0.        ,
        0.03971631, 0.00074113, 0.05910165],
       [0.00118203, 0.10047281, 0.07801418, 0.03427896, 0.        ,
        0.03144208, 0.00041489, 0.03664303],
       [0.00945626, 0.21631206, 0.07565012, 0.        , 0.        ,
        0.02754137, 0.00079433, 0.03782506]])

In [48]:
X.shape

(768, 8)

In [0]:
def sigmoid(z):
   return 1 / (1 + np.exp(-z))

In [0]:
def forward_propagation(W, b, X, y):
  data_number = X.shape[0]
  Z = np.dot(W, X.T) + b
  A = sigmoid(Z)
  
  cost = (- 1 / data_number) * np.sum(y * np.log(A) + (1 - y) * (np.log(1 - A)))
  
  return A, cost

In [0]:
def backward_propagation(X, A, y):
  data_number = X.shape[0]
  dW = (1 / data_number) * np.dot((A - y), X)
  db = (1 / data_number) * np.sum(A - y)
  
  return dW, db

In [0]:
def optimize(W, b, X, y, num_iterations, learning_rate):
  costs = []
  
  for i in range(num_iterations):
    A, cost = forward_propagation(W, b, X, y)
    dW, db = backward_propagation(X, A, y)
    
    W = W - learning_rate * dW
    b = b - learning_rate * db
    
    if i % 100 == 0:
      costs.append(cost)
      print ("Costo e iteracion %i: %f" % (i, cost))
    
  params = {
    "W": W,
    "b": b
  }
    
  gradients = {
    "dW": dW,
    "db": db
  }
    
  return params, gradients, costs

In [0]:
def predict(W, b, X):
  data_number = X.shape[0]
  y_prediction = np.zeros((1, data_number))
  
  Z = np.dot(W, X.T) + b
  A = sigmoid(Z)
    
  for i in range(A.shape[1]):
    y_prediction[0, i] = 1 if A[0, i] > 0.5 else 0
   
  return y_prediction

In [0]:
def model(X_train, y_train, X_val, Y_val, num_iterations=2000, learning_rate=0.5):
  dimensions = X.shape[1]
  W = np.zeros(shape=(1, dimensions))
  b = 0
  
  params, gradients, costs = optimize(W, b, X_train, y_train, num_iterations, learning_rate)

  W = params["W"]
  b = params["b"]

  y_prediction_validation = predict(W, b, X_val)
  y_prediction_train = predict(W, b, X_train)

  print("train accuracy: {} %".format(100 - np.mean(np.abs(y_prediction_train - y_train)) * 100))
  print("test accuracy: {} %".format(100 - np.mean(np.abs(y_prediction_validation - y_val)) * 100))

In [0]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0, test_size=0.20)

In [82]:
model(X_train, y_train, X_val, y_val, num_iterations = 1000, learning_rate = 0.003)

Costo e iteracion 0: 0.693147
Costo e iteracion 100: 0.687603
Costo e iteracion 200: 0.682854
Costo e iteracion 300: 0.678784
Costo e iteracion 400: 0.675295
Costo e iteracion 500: 0.672302
Costo e iteracion 600: 0.669732
Costo e iteracion 700: 0.667524
Costo e iteracion 800: 0.665626
Costo e iteracion 900: 0.663992
train accuracy: 64.00651465798046 %
test accuracy: 69.48051948051949 %
