# HW2

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
import time

from typing import Final
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report, multilabel_confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier


In [None]:
def OneHotEncoding(df: pd.Series) -> np.ndarray:
    """One-hot encode bean class names.

    The column order of the result is fixed:
    SEKER, BARBUNYA, BOMBAY, CALI, DERMASON, HOROZ, SIRA.

    Args:
        df: Series of class labels. Each value is compared through
            ``str(value)``, so the match is case-sensitive and exact.

    Returns:
        Integer array of shape ``(len(df), 7)`` holding exactly one ``1``
        per row. An empty series yields shape ``(0, 7)`` so that callers
        can still use ``argmax(axis=1)`` or index the class axis.

    Raises:
        ValueError: if a label is not one of the seven known class names.
    """
    classNames = ('SEKER', 'BARBUNYA', 'BOMBAY', 'CALI',
                  'DERMASON', 'HOROZ', 'SIRA')
    columnOfClass = {name: column for column, name in enumerate(classNames)}

    encodedClassNames: list[list[int]] = []
    for i in range(df.shape[0]):
        strClassName = str(df.iloc[i])
        if strClassName not in columnOfClass:
            raise ValueError(f'Unknown Class Name: {strClassName}')

        encodedClassName = [0] * len(classNames)
        encodedClassName[columnOfClass[strClassName]] = 1
        encodedClassNames.append(encodedClassName)

    # reshape keeps the class axis even when there are no rows at all
    return np.asarray(encodedClassNames, dtype=int).reshape(-1, len(classNames))


In [None]:
# Load the dataset from disk; the 'Class' column carries the label and
# every remaining column is treated as a model input
rawData: Final[pd.DataFrame] = pd.read_csv('./Dry_Beans_Dataset.csv')
inputAttributes: Final[pd.DataFrame] = rawData.drop(columns=['Class'])

# Convert the textual labels into one-hot target rows
encodedClassNames: Final[np.ndarray] = OneHotEncoding(rawData['Class'])

# Rescale the inputs with a MinMaxScaler left at its default settings.
# The result is wrapped in a new DataFrame built from the scaled array.
# NOTE(review): the scaler is fitted on every row of the dataset, i.e.
# before any train/test split is made - confirm that is acceptable here.
normalizedAttributes: Final[pd.DataFrame] = pd.DataFrame(
    MinMaxScaler().fit(inputAttributes).transform(inputAttributes))


In [None]:
# Setup some constants
# Learning rate given to the SGD optimizer in BuildDefaultModel, and the
# default value of BuildModel's learningRate parameter
DEFAULT_LEARNING_RATE: Final[float] = 0.3
# Number of epochs passed to the Keras fit(...) calls in the training and
# 10-fold validation cells
DEFAULT_EPOCHS: Final[int] = 500


In [None]:
def BuildDefaultModel():
    """Build and compile the baseline bean classifier.

    Architecture: 16 inputs -> Dense(12, sigmoid) -> Dense(3, sigmoid)
    -> Dense(7, sigmoid). The model is compiled with plain SGD at
    DEFAULT_LEARNING_RATE, a mean-squared-error loss and the 'accuracy'
    metric.

    Returns:
        A newly constructed, compiled ``keras.Sequential`` model.
    """
    SGD_optimizer: Final = tf.keras.optimizers.SGD(
        learning_rate=DEFAULT_LEARNING_RATE)
    lossFunction = tf.keras.losses.MeanSquaredError()

    model: keras.Sequential = keras.Sequential([
        # shape has to be a tuple: (16) is just the int 16, (16,) is the
        # one-element tuple Keras documents for a 16-feature vector
        keras.Input(shape=(16,)),
        tf.keras.layers.Dense(
            units=12, activation='sigmoid', name='hidden_layer_1'),
        tf.keras.layers.Dense(
            units=3, activation='sigmoid', name='hidden_layer_2'),
        tf.keras.layers.Dense(units=7, activation='sigmoid', name='output'),
    ], name='Beans_Classifier')

    model.compile(loss=lossFunction, optimizer=SGD_optimizer,
                  metrics=['accuracy'])
    return model


# Baseline model instance used by the training and evaluation cells
beansClassifier = BuildDefaultModel()


In [None]:
# Hold out 10% of the samples as the test set; the fixed random_state makes
# the shuffled split identical on every run
X_Train, X_Test, y_Train, y_Test = train_test_split(
    normalizedAttributes, encodedClassNames, test_size=0.1, random_state=44)


In [None]:
# Train the (already compiled) baseline model and time the whole fit
startTime = time.time()
beansClassifier.fit(
    x=X_Train, y=y_Train,
    epochs=DEFAULT_EPOCHS,
    # verbose must be an int (or 'auto'); 2 logs one line per epoch.
    # The string '2' is not one of the documented values.
    validation_data=(X_Test, y_Test), verbose=2)
endTime = time.time()

print(f'Training took {endTime-startTime} seconds')


In [None]:
# Print the MSE, Accuracy Score, and Confusion Matrices

predicted_y: np.ndarray = beansClassifier.predict(
    X_Test)  # raw continuous outputs, one column per class
predicted_y_argmaxed = predicted_y.argmax(axis=1)
# Index of the hot column = class index in OneHotEncoding's column order
true_y_argmaxed = y_Test.argmax(axis=1)

# scikit-learn metrics take (y_true, y_pred) in that order. MSE and accuracy
# are symmetric, but precision and recall are not: swapping the arguments
# swaps the two columns of the report.
print(f'MSE of test set is: {mean_squared_error(y_Test, predicted_y)}')
print(f'Accuracy: {accuracy_score(true_y_argmaxed, predicted_y_argmaxed)}')

print('Precision & Recall:')
print(classification_report(true_y_argmaxed, predicted_y_argmaxed))

# This prints one 2x2 matrix per class, each laid out as
# [[TN, FP], [FN, TP]] for that class versus all the others.
# labels is pinned to every one-hot column so that matrix i always refers to
# column i of the OneHotEncoding function, even if a class is missing from
# this particular test split.
print('Confusion Matrix:')
print(multilabel_confusion_matrix(
    y_true=true_y_argmaxed, y_pred=predicted_y_argmaxed,
    labels=list(range(y_Test.shape[1]))))


In [None]:
def generateValidationData(inputDF: pd.DataFrame, expectedOutputs: np.ndarray,
                           trainIndexes: np.ndarray,
                           testIndexes: np.ndarray):
    """Slice inputs and targets into train/test parts by row position.

    Args:
        inputDF: feature table, indexed positionally through ``.iloc``.
        expectedOutputs: target array, indexed along its first axis.
        trainIndexes: row positions that make up the training part.
        testIndexes: row positions that make up the test part.

    Returns:
        The 4-tuple ``(X_Train, X_Test, y_Train, y_Test)``.
    """
    indexGroups = (trainIndexes, testIndexes)

    # Features first, then targets, each in (train, test) order
    featureParts = [inputDF.iloc[positions] for positions in indexGroups]
    targetParts = [expectedOutputs[positions] for positions in indexGroups]

    return (*featureParts, *targetParts)


In [None]:
# Do 10-fold validation

# Shuffle before splitting so a fold is not just a contiguous slice of the
# file (presumably the CSV may be grouped by class - verify); the fixed seed
# keeps the folds reproducible.
KFolder = KFold(n_splits=10, shuffle=True, random_state=44)
mseScores: list[float] = []
accuracyScores: list[float] = []

for trainIndexes, testIndexes in KFolder.split(normalizedAttributes):
    # Fold-local names: the notebook-level X_Train / y_Train / X_Test / y_Test
    # from the hold-out split are read again by later cells and must not be
    # overwritten here.
    foldXTrain, foldXTest, foldYTrain, foldYTest = generateValidationData(
        normalizedAttributes, encodedClassNames, trainIndexes, testIndexes)

    # Start every fold from a freshly initialised model. Reusing one model
    # would carry weights over from previous fits, so the held-out rows of
    # this fold would already have been trained on.
    foldClassifier = BuildDefaultModel()
    foldClassifier.fit(
        x=foldXTrain, y=foldYTrain,
        epochs=DEFAULT_EPOCHS,
        validation_data=(foldXTest, foldYTest))

    foldPredictions: np.ndarray = foldClassifier.predict(foldXTest)

    # scikit-learn metrics take (y_true, y_pred)
    mseScores.append(mean_squared_error(foldYTest, foldPredictions))
    accuracyScores.append(accuracy_score(
        foldYTest.argmax(axis=1), foldPredictions.argmax(axis=1)))


In [None]:
# Report the per-fold scores followed by their mean, accuracy first
averageAccuracy = np.average(accuracyScores)
print(f'Accuracy Scores: {accuracyScores}\nAverage accuracy is: {averageAccuracy}\n')

averageMse = np.average(mseScores)
print(f'MSE Loss: {mseScores}\nAverage MSE is: {averageMse}')


## Exercise 3

In [None]:
def BuildModel(numNodesLayer1=12, numNodesLayer2=3, learningRate=DEFAULT_LEARNING_RATE):
    """Build and compile a bean classifier with tunable hidden sizes.

    Args:
        numNodesLayer1: number of units in the first hidden layer.
        numNodesLayer2: number of units in the second hidden layer.
        learningRate: learning rate for the SGD optimizer.

    Returns:
        A newly constructed ``keras.Sequential`` model compiled with SGD,
        a mean-squared-error loss and the 'accuracy' metric.
    """
    # Same input width and number of classes as BuildDefaultModel: the model
    # is trained on the same 16 normalized attributes and 7-column one-hot
    # targets, so 6 inputs / 1 output cannot match the data.
    numInputFeatures = 16
    numClasses = 7

    SGD_optimizer: Final = tf.keras.optimizers.SGD(learning_rate=learningRate)
    lossFunction = tf.keras.losses.MeanSquaredError()

    # 'relu' is the documented activation identifier ('ReLU' is the name of
    # the layer class, not of the activation function).
    # NOTE(review): the output layer keeps the ReLU activation chosen for this
    # exercise, whereas BuildDefaultModel uses sigmoid - confirm it is intended.
    model: keras.Sequential = keras.Sequential([
        keras.Input(shape=(numInputFeatures,)),
        tf.keras.layers.Dense(
            units=numNodesLayer1, activation='relu', name='hidden_layer_1'),
        tf.keras.layers.Dense(
            units=numNodesLayer2, activation='relu', name='hidden_layer_2'),
        tf.keras.layers.Dense(
            units=numClasses, activation='relu', name='output'),
    ], name='Beans_Classifier')

    model.compile(loss=lossFunction, optimizer=SGD_optimizer,
                  metrics=['accuracy'])
    return model

# Wrap the builder so scikit-learn can construct a model per candidate
wrappedBeansClassifier = KerasClassifier(build_fn=BuildModel)

# Search space: learningRate and numNodesLayer* are arguments of
# BuildModel(...), while epochs is an argument of the model's fit(...).
# The Keras-1 spelling 'nb_epoch' is not a fit() argument, so it would never
# control how long a candidate is trained.
param_grid = dict(
    epochs=np.arange(500, 1001, 50),
    learningRate=np.array([0.1, 0.3, 0.6]),
    numNodesLayer1=np.array([12, 13, 14]),
    numNodesLayer2=np.array([3, 4, 5]),
)

# 11 * 3 * 3 * 3 = 297 candidates, each cross-validated with cv=10
grid = GridSearchCV(estimator=wrappedBeansClassifier,
                    param_grid=param_grid, n_jobs=-1, cv=10)

# NOTE(review): X_Train / y_Train are notebook-level names; make sure no
# intervening cell has rebound them before running this search.
grid_result = grid.fit(X_Train, y_Train)


In [None]:
# Display the hyper-parameter combination that GridSearchCV selected as best
grid_result.best_params_