In [121]:
import numpy as np
import pandas as pd

df = pd.read_csv('fetal_health.csv')
print(df)

      baseline value  accelerations  fetal_movement  uterine_contractions  \
0              120.0          0.000           0.000                 0.000   
1              132.0          0.006           0.000                 0.006   
2              133.0          0.003           0.000                 0.008   
3              134.0          0.003           0.000                 0.008   
4              132.0          0.007           0.000                 0.008   
...              ...            ...             ...                   ...   
2121           140.0          0.000           0.000                 0.007   
2122           140.0          0.001           0.000                 0.007   
2123           140.0          0.001           0.000                 0.007   
2124           140.0          0.001           0.000                 0.006   
2125           142.0          0.002           0.002                 0.008   

      light_decelerations  severe_decelerations  prolongued_decelerations  

In [122]:
#checking for null_values
null_values = df.isnull().sum()
#count null values for eachn column
print(null_values)

baseline value                                            0
accelerations                                             0
fetal_movement                                            0
uterine_contractions                                      0
light_decelerations                                       0
severe_decelerations                                      0
prolongued_decelerations                                  0
abnormal_short_term_variability                           0
mean_value_of_short_term_variability                      0
percentage_of_time_with_abnormal_long_term_variability    0
mean_value_of_long_term_variability                       0
histogram_width                                           0
histogram_min                                             0
histogram_max                                             0
histogram_number_of_peaks                                 0
histogram_number_of_zeroes                                0
histogram_mode                          

In [123]:
#Define training Set
X = df.iloc[:, :-1]  # All columns except the last one
y = df.iloc[:, -1]   # Only the last column


# Displaying the shapes of X and y to verify
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of X: (2126, 21)
Shape of y: (2126,)


In [124]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
#scaling the features for more efficient gradient descent

# Initialize the class
scaler_linear = StandardScaler()

x = scaler_linear.fit_transform(X)

print(x[0])

[-1.35222005 -0.8223883  -0.20320955 -1.48246456 -0.63843755 -0.0574756
 -0.2687543   1.51319018 -0.94309501  1.80254152 -1.02856029 -0.1655066
 -1.06856207 -2.11959194 -0.70139685 -0.45844382 -1.06561383  0.15326971
 -1.18164215  1.87056871  1.11298001]


In [125]:
from sklearn.model_selection import train_test_split
#splitting training set for model evaluation after fitting is done

# Get 60% of the dataset as the training set. Put the remaining 40% in temporary variables: x_ and y_.
x_train, x_, y_train, y_ = train_test_split(x, y, test_size=0.40, random_state=1)

# Split the 40% subset above into two: one half for cross validation and the other for the test set
x_cv, x_test, y_cv, y_test = train_test_split(x_, y_, test_size=0.50, random_state=1)

print(f"the shape of the training set (input) is: {x_train.shape}")
print(f"the shape of the training set (target) is: {y_train.shape}\n")
print(f"the shape of the cross validation set (input) is: {x_cv.shape}")
print(f"the shape of the cross validation set (target) is: {y_cv.shape}\n")
print(f"the shape of the test set (input) is: {x_test.shape}")
print(f"the shape of the test set (target) is: {y_test.shape}")


the shape of the training set (input) is: (1275, 21)
the shape of the training set (target) is: (1275,)

the shape of the cross validation set (input) is: (425, 21)
the shape of the cross validation set (target) is: (425,)

the shape of the test set (input) is: (426, 21)
the shape of the test set (target) is: (426,)


In [126]:
#define the model 
#the neural network is a softmax model with one layer 
#i.e outputs g(z_1)...(g(z_(number of activations)))
#the model in equation is given by can be given by g(z_1) = e(z_1)/(e(z_1) +...e(z_n)).... g(z_n) = e(z_n)/(e(z_1) +...e(z_n))


def softmax(z):
    ez = np.exp(z)              #element-wise exponenial
    sm = ez/np.sum(ez)
    return(sm)

def my_dense(a_in, W, b):
    """
    Computes dense layer
    Args:
      a_in (ndarray (n, )) : Data, 1 example 
      W    (ndarray (n,j)) : Weight matrix, n features per unit, j units
      b    (ndarray (j, )) : bias vector, j units  
    Returns
      a_out (ndarray (j,))  : j units|
    """
    units = W.shape[1]
    z_out = np.matmul(a_in,W)
    #matrix multiplication of input features and weight paramters(n.j)
    a_out = softmax(z_out)
    return(a_out)



In [127]:
#specify cost function and gradient descent to get most suitable parameters
from sklearn.preprocessing import OneHotEncoder
#one hot encoding y[i] to set the target to 1 and all other classes to 0
encoder = OneHotEncoder(sparse=False)


In [128]:
def compute_cost_logistic(X, y_enc, w, b):
    """
    Computes cost

    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,n)) : target values
      w (ndarray (n,j)) : model parameters  
      b (ndarray)  (j,)     : model parameter
      
    Returns:
      cost (scalar): cost
    """

    m = X.shape[0]
    units = w.shape[1]
    cost = 0.0
    


    
    for i in range(m):
        #iterate through each training example
        z = np.matmul(X[i],w)
        #matrix multiplication of input features of each training example and weight paramters(n.j)
        f_wb_i = softmax(z)
        
        #calculate loss for each training example that is given by the dot product of the y-encoded fr that 
        #training example(1d vector) and log(f_wb_i) for that training example(1d vector)
        
        cost += -(np.dot(y_enc[i],np.log(f_wb_i)))
             
    cost = cost / m
    return cost


In [140]:
#test cost function
vector = np.array([1, 2, 3])
vector_encoded = encoder.fit_transform(vector.reshape(-1, 1))
print(y_encoded[0])

# Create a 3x3 matrix
matrix = np.array([[1, 2, 3],
                   [4, 5, 6],
                   [7, 8, 9]])

cost = compute_cost_logistic(matrix, vector_encoded, matrix, vector)
print(cost)

[1. 0. 0.]




In [130]:
#train parameters i.e run gradient descent to find best w and b that reduces the cost the most
def compute_gradient_logistic(X, y_enc, w, b): 
    """
    Computes the gradient for logistic regression 
 
    Args:
      X (ndarray (m,n): Data, m examples with n features
      y (ndarray (m,n)): target values
      w (ndarray (n,j)): model parameters  
      b (vector)  (n,)    : model parameter
    Returns
      dj_dw (ndarray (n,j)): The gradient of the cost w.r.t. the parameters w. 
      dj_db (vector)  (n.)    : The gradient of the cost w.r.t. the parameter b. 
    """
    m,n = X.shape
    units = w.shape[1]
    dj_dw = np.zeros((n,units))#(n,)
    dj_db = np.zeros(units)
    z = np.zeros(units)
    
    
   
    #one hot encoding y[i] to set the target to 1 and all other classes to 0

    for i in range(m):
        #iterating through each example
        z = np.matmul(X[i],w)
        #matrix multiplication of input features of each training example and weight paramters(n.j)
        f_wb = softmax(z)
        #error for all units or activation
        err = f_wb - y_enc[i]
        for index,unit_err in enumerate(err):
            #iterating through each activation error
            for k in range(n):
                #iterating through each feature in each training example and multiply by that activation error
                #and store in n of W(n.j)
                dj_dw[k,index] += unit_err * X[i,k] #(sum over all training examples)
            dj_db[index] += unit_err #(sum over all training examples)

    dj_dw = dj_dw/m                                   
    dj_db = dj_db/m                                   
        
    return dj_db, dj_dw  

In [131]:
#test gradient function
dj,dw = compute_gradient_logistic(matrix, vector_encoded, matrix, vector)
print(dj)

[-0.33333129 -0.33250903  0.66584032]


In [132]:
import math
def gradient_descent(X, y_enc, w_in, b_in, alpha, num_iters): 
    """
    Performs batch gradient descent
    
    Args:
      X (ndarray (m,n)   : Data, m examples with n features
      y (ndarray (m,))   : target values
      w_in (ndarray (n,j)): Initial values of model parameters  
      b_in (ndarray (n,))     : Initial values of model parameter
      alpha (float)      : Learning rate
      num_iters (scalar) : number of iterations to run gradient descent
      
    Returns:
      w (ndarray (n,j))   : Updated values of parameters
      b (ndarray(n,))         : Updated value of parameter 
    """
    # An array to store cost J and w's at each iteration primarily for graphing later
    J_history = []
    w = w_in  #avoid modifying global w within function
    b = b_in
    
    for i in range(num_iters):
        # Calculate the gradient and update the parameters
        dj_db, dj_dw = compute_gradient_logistic(X, y_enc, w, b)   

        # Update Parameters using w, b, alpha and gradient
        w = w - alpha * dj_dw               
        b = b - alpha * dj_db               
      
        # Save cost J at each iteration
        if i<100000:      # prevent resource exhaustion 
            J_history.append( compute_cost_logistic(X, y_enc, w, b) )

        # Print cost every at intervals 10 times or as many iterations if < 10
        if i% math.ceil(num_iters / 10) == 0:
            print(f"Iteration {i:4d}: Cost {J_history[-1]}   ")
        
    return w, b, J_history         #return final w,b and J history for graphing


In [141]:
y_encoded = encoder.fit_transform(y_train.values.reshape(-1, 1))
m,n = x_train.shape
i,j = y_encoded.shape
w_tmp = np.zeros((n,j))
b_tmp = np.zeros(j)
alph = 0.1
iters = 10000

w_out, b_out, _ = gradient_descent(x_train, y_encoded, w_tmp, b_tmp, alph, iters) 
print(w_out)
print(b_out)
print(f"\nupdated parameters: w:{w_out}, b:{b_out}")



[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
[0. 0. 0.]
Iteration    0: Cost 1.0613743119820704   
Iteration 1000: Cost 0.7019270080076103   
Iteration 2000: Cost 0.6991955477085571   
Iteration 3000: Cost 0.6981991279881226   
Iteration 4000: Cost 0.697513866887841   
Iteration 5000: Cost 0.6969596768047015   
Iteration 6000: Cost 0.696478130357526   
Iteration 7000: Cost 0.6960433836588407   
Iteration 8000: Cost 0.6956426982220274   
Iteration 9000: Cost 0.6952693803561969   
[[-0.64819785 -0.11348069  0.76167854]
 [ 0.0842043  -0.49538381  0.41117951]
 [ 0.050053   -0.00244436 -0.04760865]
 [ 0.38165127 -0.2410268  -0.14062447]
 [ 0.24815317 -0.08012542 -0.16802775]
 [-1.97084944  1.01207772  0.95877172]
 [-1.44972559  0.70577665  0.74394894]
 [-0.54539999  0.26588419  0.279515

In [165]:
#model evaluation
#testing the model on the training data, cross-validation data to check for evaluate it's errors (if the predicted values 
#align with the target)
#finding the error on differnt training sets will help us evaluate if the model has a problem of overfitting(high variance) or
#a problem of underfitting(high bias)
#if training error is very high then it has a problem of high bias
#if cross validation is much higher than training error then it has the problem of high variance


def evaluate_error(x_samp, y_samp):
    y_pred = []
    count = 0
    error = 0
    for x_in in x_samp:
        x_out = my_dense(x_in, w_out, b_out)
        y_pred.append((np.argmax(x_out)+ 1))
    for i in range(len(y_pred)):
        if y_pred[i] != y_train_samp.values[i]:
            count +=1
    error = count/len(y_pred)
    return error
train_error = evaluate_error(x_train, y_train)
cv_error = evaluate_error(x_cv, y_cv)
test_error =  evaluate_error(x_test, y_test)
print(train_error)
print(cv_error)
print(test_error)

0.19294117647058823
0.46588235294117647
0.5187793427230047


In [174]:
#from the evaluation, we see the model does really well on the training set doesn't do as well on the cross validation set,
#the difference in error depicts a problem of overfitting(high variance) where the parameters fit the training data really well
#but doesn't do as well on new data
#the solution to this is simplifying the model and one way (out of multiple ways) is by reducing the amount of features
#using all features except histogram features

y_encoded = encoder.fit_transform(y_train.values.reshape(-1, 1))
x_train_new = x_train[:, :11]
print(x_train_new)
m,n = x_train_new.shape
i,j = y_encoded.shape
w_tmp = np.zeros((n,j))
b_tmp = np.zeros(j)
alph = 0.1
iters = 10000

w_out, b_out, _ = gradient_descent(x_train_new, y_encoded, w_tmp, b_tmp, alph, iters) 
print(w_out)
print(b_out)
print(f"\nupdated parameters: w:{w_out}, b:{b_out}")



[[-0.84401403  0.4713793  -0.16034157 ...  0.64234865 -0.53536128
  -0.26437707]
 [-0.53909042 -0.8223883   0.13973429 ...  1.7748084  -0.53536128
  -0.06888834]
 [ 1.08716886 -0.8223883  -0.20320955 ... -0.60335709 -0.42662161
   0.39317594]
 ...
 [-1.04729644  3.57642154 -0.20320955 ...  0.98208657 -0.53536128
  -1.15296222]
 [-1.25057885 -0.04612774 -0.07460561 ... -0.82984904 -0.42662161
   0.41094764]
 [-0.64073162  0.21262578 -0.20320955 ... -0.37686514 -0.53536128
  -0.12220345]]
Iteration    0: Cost 1.0705767180527315   
Iteration 1000: Cost 0.7530869408169706   
Iteration 2000: Cost 0.7513038093913915   
Iteration 3000: Cost 0.7503722135136556   
Iteration 4000: Cost 0.7495140342952283   
Iteration 5000: Cost 0.7486914190698973   
Iteration 6000: Cost 0.747902948530841   
Iteration 7000: Cost 0.7471488309674976   
Iteration 8000: Cost 0.746429160689353   
Iteration 9000: Cost 0.7457438502347554   
[[-0.12144817  0.23670113 -0.11525295]
 [ 0.14469748 -0.25698603  0.11228855]
 [

In [175]:
print(w_out)

[[-0.12144817  0.23670113 -0.11525295]
 [ 0.14469748 -0.25698603  0.11228855]
 [ 0.06081182  0.01389577 -0.07470759]
 [ 0.28812657 -0.19405856 -0.09406801]
 [ 0.01551792 -0.10568266  0.09016474]
 [-2.5747259   1.25245903  1.32226687]
 [-1.69380956  0.6795429   1.01426666]
 [-0.52001016  0.15580333  0.36420683]
 [-0.10236051 -0.06121132  0.16357183]
 [-0.70830172  0.28456305  0.42373867]
 [-0.11341123  0.04813811  0.06527312]]


In [176]:
x_cv_new = x_cv[:,:11]
x_test_new = x_test[:,:11]

train_error = evaluate_error(x_train_new, y_train)
cv_error = evaluate_error(x_cv_new, y_cv)
test_error =  evaluate_error(x_test_new, y_test)

print(train_error)
print(cv_error)
print(test_error)

0.19137254901960785
0.4470588235294118
0.4953051643192488


In [None]:
#we have reduced th error in the cv and test models,further alterations to the complexity of the model will 
#help reduce the difference in error between the training and the cross-validation set
#Note: typically a baseline performance is established to measure how accurate the training and cross validation set is
#i.e if the ideal performance of fetal mortality prediction is 70% that is 0.3 error, that is ued to compare to
#the training and cross validation set