Implement and train Softmax Regression with mini-batch SGD and early stopping.

Expected outcomes:
* Implement Softmax Regression Model.
* Implement mini-batch SGD.
* The training should support early stopping.
* Train and evaluate the model with cross-validation. The evaluation metric is the *accuracy*.
* Retrain the model with early stopping.


**DO NOT USE SKLEARN**

In [283]:
import numpy as np
import pandas as pd 

from sklearn import datasets
from sklearn.model_selection import StratifiedShuffleSplit

# Seed NumPy's global random generator so code that draws from np.random is repeatable between runs.
np.random.seed(42)

In [284]:

# Load the Iris dataset that ships with scikit-learn.
iris = datasets.load_iris()
# Feature matrix and class-label vector as provided by the dataset object.
X = iris["data"]
y = iris["target"]
# Build one DataFrame column per feature name (X.T iterates over the feature columns),
# then append the labels as the last column, "target".
df = pd.DataFrame({fname: values for fname, values in zip(iris["feature_names"], X.T)})
df["target"] = y

# Show the first rows as a quick sanity check.
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## Your Code
You can start writing your code from here. Please don't modify any of the previous code.

In [285]:
def create_mini_batches(X, y, n):
    '''
    Split the features and the targets into n aligned chunks.

    X: matrix of feature values of the training set
    y: target values, one entry (or row) per row of X
    n: number of batches

    Returns (X_batches, y_batches): two lists of n arrays each, cut by
    np.array_split in the original row order, so chunk i of X lines up
    with chunk i of y.
    '''
    feature_chunks = np.array_split(X, n)
    target_chunks = np.array_split(y, n)
    return feature_chunks, target_chunks

In [286]:
def one_hot_encoder(vect):
    '''
    One-hot encode a vector of class labels.

    vect: the vector to encode (1-D array-like of n labels; an (n, 1)
          column is flattened)

    Returns a float matrix of shape (n, k), where k is the number of
    distinct labels. Column j corresponds to the j-th smallest distinct
    label, so labels do not have to be the integers 0..k-1. For labels
    that already are 0..k-1, column j is simply class j.

    Raises ValueError if vect has more than one label per observation.
    '''
    labels = np.atleast_1d(np.asarray(vect))
    if labels.ndim > 1:
        if labels.size != labels.shape[0]:
            raise ValueError("one_hot_encoder expects one label per observation")
        labels = labels.reshape(-1)

    # `inverse` holds, for every observation, the position of its label in
    # the sorted unique categories; that position is the column to switch on.
    # Indexing by position (not by the label value) is what makes arbitrary
    # labels safe.
    cats, inverse = np.unique(labels, return_inverse=True)

    n = labels.shape[0]
    k = cats.shape[0]

    # Start from zeros so every cell is defined before the ones are set.
    new_matrix = np.zeros((n, k))
    new_matrix[np.arange(n), inverse.reshape(-1)] = 1.0

    return new_matrix

In [314]:
def SoftMax(S):
    '''
    Convert a matrix of class scores into probabilities and predictions.

    S: Score (logit) matrix, one row per observation and one column per class

    Returns (predict, probabilities):
        predict: float vector with, for each row, the index of the class
                 that has the highest probability (first index on ties)
        probabilities: matrix with the same shape as S; every row is the
                       softmax of the matching row of S and sums to 1
    '''
    S = np.asarray(S, dtype=float)

    # Softmax is unchanged when a constant is subtracted from a row, so
    # shift each row by its maximum: the largest exponent becomes exp(0) = 1
    # and np.exp can no longer overflow to inf (which produced NaNs).
    shifted = S - S.max(axis=1, keepdims=True)
    exponentials = np.exp(shifted)
    probabilities = exponentials / exponentials.sum(axis=1, keepdims=True)

    # Kept as floats because callers have always received a float vector.
    predict = probabilities.argmax(axis=1).astype(float)

    return predict, probabilities

In [315]:
def cross_entropy(prop, Y):
    '''
    Mean cross-entropy loss: -1/n * sum_i sum_k Y[i, k] * log(prop[i, k])

    prop: Probability matrix, one row per observation and one column per class
    Y: One-hot encoded target matrix with the same shape as prop; each
       column represents a class and is 1 only for the instances that
       belong to that class

    Returns the loss averaged over the n observations (rows of Y).

    Raises ValueError if Y has no rows.
    '''
    prop = np.asarray(prop, dtype=float)
    Y = np.asarray(Y, dtype=float)

    # Number of observations
    n = Y.shape[0]
    if n == 0:
        raise ValueError("cross_entropy needs at least one observation")

    # log(0) is -inf and 0 * -inf is NaN, so a probability of exactly 0
    # would poison the whole loss. Clip to a tiny positive floor first.
    eps = 1e-15
    negative_log_prop = -np.log(np.clip(prop, eps, 1.0))

    # Y is zero everywhere except the true class, so the elementwise
    # product keeps only the penalty of the true class of each row.
    return np.sum(Y * negative_log_prop) / n

In [316]:
def Softmax_Gradient_Descent(X_data, y_data, batch, alpha=0.05, threshold = 0.2, max_iter=5000):
    '''
    Fit softmax-regression weights with (mini-)batch gradient descent.

    # X_data (n, m): Feature matrix WITHOUT a bias column
                     n observations
                     m features (a leading column of ones is added here)

    # y_data (n,): Target vector of class labels; it is one-hot encoded
                   here with one_hot_encoder into a matrix with k columns

    # batch: number of chunks the training set is split into (rows keep
             their original order, there is no shuffling)
        - 1                  = full-batch gradient descent
        - n                  = one observation per weight update
        - anything between   = mini-batch gradient descent
        Chunks that end up empty because batch > n are skipped.

    # alpha: Learning rate

    # threshold: stop training as soon as the cross-entropy loss of the
                 current batch is <= threshold (checked before that
                 batch's update is applied)

    # max_iter: maximum number of passes over all the batches
                 -------------------------------------------------------------

    Returns (W, predicts):
        W (m+1, k): Weight matrix; row 0 holds the bias terms
        predicts (n,): predicted class index for every training row, as
                       returned by SoftMax on the final weights

    Raises ValueError if batch is smaller than 1.
    '''
    if batch < 1:
        raise ValueError("batch must be a positive number of batches")

    n = X_data.shape[0]
    m = X_data.shape[1]
    k = np.unique(y_data).shape[0]

    # Prepend the bias column: X_data becomes (n, m+1)
    X_data = np.column_stack((np.ones(n), X_data))

    # Turn the label vector into a one-hot matrix
    Y_data = one_hot_encoder(y_data)

    # Random initial weights W (m+1, k)
    W = np.random.random((m + 1, k))

    # Split X and Y into aligned batches once; they are reused every pass
    X_batches, y_batches = create_mini_batches(X_data, Y_data, batch)

    converged = False
    for _ in range(max_iter):

        for X, Y in zip(X_batches, y_batches):
            b = X.shape[0]
            if b == 0:
                # np.array_split pads with empty chunks when batch > n;
                # an empty chunk has no gradient and would divide by zero.
                continue

            # Class probabilities for the current weights
            _, class_probabilties = SoftMax(X.dot(W))

            # Cross-entropy loss of this batch
            loss = cross_entropy(class_probabilties, Y)

            # Stop before updating once the loss is small enough
            if loss <= threshold:
                converged = True
                break

            # Gradient of the mean cross-entropy w.r.t. W for all classes
            # at once: column c equals 1/b * (p_c - y_c).dot(X).
            gradient = X.T.dot(class_probabilties - Y) / b
            W -= alpha * gradient

        if converged:
            break

    # Predictions of the final model on the full training set
    predicts, _ = SoftMax(X_data.dot(W))
    return W, predicts

In [372]:
def accuracy(y, p):
    '''
    Fraction of predictions that match the actual labels.

    y: Actual vector (array-like of n labels)
    p: Predict vector (array-like of n labels; float class indices such as
       the ones SoftMax returns compare equal to integer labels)

    Returns the number of positions where p equals y divided by n.

    Raises ValueError if the vectors are empty or differ in length.
    '''
    actual = np.asarray(y).ravel()
    predicted = np.asarray(p).ravel()

    total = actual.shape[0]
    if total == 0:
        raise ValueError("accuracy is undefined for empty input")
    if predicted.shape[0] != total:
        # A silent truncation would hide a train/test mix-up.
        raise ValueError("y and p must have the same number of elements")

    # Correct predictions across all classes (not only "true positives").
    correct = int(np.count_nonzero(actual == predicted))

    return correct / total

Use the following cell to train and evaluate your model.

In [384]:
# Three stratified random train/test splits, each holding out 20% of the rows for testing.
split = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=42)
# Running fold counter, used only for the printed report.
indx = 0
for train_index, test_index in split.split(df, df["target"]):
    strat_train_set = df.loc[train_index]
    strat_test_set = df.loc[test_index]
    
    # Train on strat_train_set and evaluate on strat_test_set for this fold.
    indx += 1
    print("Fold ", indx)
    # Every column except the last is a feature; the last column is "target".
    X_train = strat_train_set.iloc[:, :-1].to_numpy()
    y_train = strat_train_set.iloc[:, -1].to_numpy()
    
    # Softmax_Gradient_Descent adds the bias column itself, so the raw features are passed.
    # It returns the fitted weights and the predictions on the training rows.
    Weight, train_predicts = Softmax_Gradient_Descent(X_train, y_train, batch=3, alpha=0.05, threshold = 0.2, max_iter=500)
    
    acc = accuracy(y_train, train_predicts)
    print("The accuray of the the train is {0}".format(acc))
    
    # The test features need the leading column of ones added by hand so that
    # they line up with the bias row of Weight.
    X_test = strat_test_set.iloc[:, :-1].to_numpy()
    n = X_test.shape[0]
    ones = np.ones(n)
    X_test = np.column_stack((ones, X_test))
    y_test = strat_test_set.iloc[:, -1].to_numpy()
    
    # Score the held-out rows with the trained weights; SoftMax returns (predictions, probabilities).
    test_predicts, probabilities = SoftMax(X_test.dot(Weight))
    
    acc2 = accuracy(y_test, test_predicts)
    print("The accuray of the the test is {0}".format(acc2))
    print("---------------------------")

Fold  1
The accuray of the the train is 0.975
The accuray of the the test is 0.9666666666666667
---------------------------
Fold  2
The accuray of the the train is 0.9916666666666667
The accuray of the the test is 0.9
---------------------------
Fold  3
The accuray of the the train is 0.9833333333333333
The accuray of the the test is 0.9
---------------------------
