<a href="https://colab.research.google.com/github/tomaldridge12/breast-cancer-classification/blob/main/GroupProj_MLiS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# TODO: Insert citation for dataset. And instructions for how to use the script and its purpose.

from pandas import read_csv, Series, DataFrame
from numpy import ones, matmul, log, where, exp, mean, arange, random
from numpy.linalg import inv, LinAlgError
from json import dumps
from datetime import datetime
from os import path

In [None]:
# -------------[START]---------------
# -----   DEFINE:  PARAMETERS   -----
# -----------------------------------
# Percentage (0-100) of the data used for training; the remainder is used for testing.
# Set to False (boolean) to loop through a range of train:test splits instead of a single one.
percentageOfDataForTraining = False
numberOfTrials = 100 # Number of seeded random trials to run for each train:test split (int).
# seeds = [1,2,3]  # int64 -- unused; the seed list is derived from numberOfTrials in the main body.
fillDataGapsWith = False  # Value written into data gaps ("?"). Set to False (boolean) to have rows with gaps completely removed.
verbose = True # (boolean) Enable/disable additional progress output.
# -----------------------------------
# -----   DEFINE:  PARAMETERS   -----
# -------------[END]-----------------

In [None]:
# -------------[START]---------------
# -----    NESTED  FUNCTIONS    -----
# -----------------------------------
def getAllData():
    """
    Retrieve all rows of data from the Wisconsin dataset (stored remotely) and their respective column names (stored remotely).

    The "Class" column is recoded for the classifier: 4 becomes 1.0 (malignant), 2 becomes -1.0 (benign)
    and any other value becomes 0.0 (inconclusive). Feature values are returned unscaled.

    :param (global) fillDataGapsWith: this value will replace data gaps (eg. "?"). Set to False (boolean) for rows with gaps to be removed entirely.
    :param (global) verbose: when True, the total number of records is printed.
    :return: DataFrame of all rows of data, with named columns, an initial column of ones ('ColumnOfOnes') and a fresh 0..n-1 row index.
    """
    columnNames = read_csv('https://raw.githubusercontent.com/reecehill/MLiS/main/breast-cancer-wisconsin-names.csv',
                           header=None).iloc[0].to_numpy() # get column names and convert to nd.array

    allData = read_csv('https://raw.githubusercontent.com/reecehill/MLiS/main/breast-cancer-wisconsin.csv',
                       header=None, # use the first row as data (column names are supplied via "names")
                       names=columnNames, # labelled by columnNames
                       usecols=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], # load columns 1-10 only (column 0 is not loaded)
                       na_values=["?"], # convert ? values to NaN
                       dtype='float64' # ensure dtype=float64 to allow for NaN
                       )

    if fillDataGapsWith is False:
        # Remove rows if they are missing any data.
        allData = allData.dropna()
    else:
        # Keep all rows and replace missing data with fillDataGapsWith
        allData = allData.fillna(fillDataGapsWith)

    # Renumber the rows 0..n-1. drop=True is essential: without it the old row number is kept
    # as an extra 'index' column and ends up being used as a feature by the model.
    # NOTE: the data used to be multiplied by (10^5) here. In Python "^" is bitwise XOR, so that
    # was a factor of 15, not 100000. A uniform rescaling of the features does not change the
    # least-squares predictions, so the scaling has been removed rather than "corrected".
    allData = allData.reset_index(drop=True)

    # Add a column of ones (the intercept term) as the first column.
    allData.insert(loc=0, column='ColumnOfOnes', value=1.0)

    # Rewrite cells of "Class" column: 4 -> 1.0 (malignant), 2 -> -1.0 (benign).
    # Values outside the mapping come back as NaN from map() and are set to 0.0 (inconclusive).
    allData["Class"] = allData["Class"].map({4.0: 1.0, 2.0: -1.0}).fillna(0.0)

    if(verbose==True):
        print("Total number of records: "+str(len(allData)))
    return allData

def changeSeed(currentSeed):
    """
    Retire a seed whose training sample produced a singular matrix and add a fresh seed in its place.

    The new seed is one greater than the last seed currently in the list, and is appended to the end.

    NOTE: this removes an element from the global list. A caller that is looping directly over
    `seeds` at the time will skip the element that follows the removed one; iterate over a copy instead.

    :param currentSeed: the seed to retire. Must be present in the global seeds list (list.remove raises ValueError otherwise).
    :param (global) seeds: list of seeds; modified in place.
    :return: None
    """
    print("Changing seed #"+str(currentSeed)+" as it generates singular matrix. An additional seed has been added in its place.")
    lastSeedValue = seeds[-1]  # read before removal, in case currentSeed is itself the last entry
    seeds.remove(currentSeed)
    seeds.append(lastSeedValue + 1)

def stratifyAndRandomlySample(allData, seed, percentageOfDataForTraining):
    """
    Stratify all data based on their Class. Then, randomly sample (according to seed) to produce a training-testing dataset split of user-defined proportions.

    The same fraction is drawn from every class, each class being sampled with the same seed.
    Rows not drawn for training form the testing set. The row index of allData must be unique,
    because training rows are selected and removed by index label.

    :param allData: wisconsin dataset, without split. Must contain a 'Class' column.
    :param seed: controls the random sampling to allow reproducibility
    :param percentageOfDataForTraining: defines the size (in percentage) of allData that should be exclusively used for training. The testing dataset will use what remains.
    :param (global) verbose: when True, the sizes of both subsets are printed.
    :return: tuple (trainDatumX, trainDatumY, testDatumX, testDatumY); the X parts are DataFrames without the 'Class' column, the Y parts are the 'Class' Series.
    """
    trainingFraction = percentageOfDataForTraining / 100

    # -- PRODUCE TRAINING DATASET
    # Sample each class separately and collect the chosen row labels. This avoids
    # groupby(...).apply, whose handling of the grouping column differs between pandas versions.
    trainIndex = []
    for _, classRows in allData.groupby('Class'):  # Stratify by class
        sampledRows = classRows.sample(
            frac=trainingFraction,  # Take percentage of this class
            random_state=int(seed)  # Seed
        )
        trainIndex.extend(sampledRows.index)
    trainDatum = allData.loc[trainIndex]

    # -- PRODUCE TESTING DATASET
    testDatum = allData.drop(trainDatum.index)

    # -- SPLIT TRAINING DATA INTO X (FEATURES) AND Y (TARGET)
    trainDatumY = trainDatum.Class
    trainDatumX = trainDatum.drop('Class', axis=1)

    # -- SPLIT TESTING DATA INTO X (FEATURES) AND Y (TARGET)
    testDatumY = testDatum.Class
    testDatumX = testDatum.drop('Class', axis=1)

    # The full training matrix is intentionally no longer printed here: it was debugging
    # output that ran on every trial regardless of `verbose`.
    if(verbose==True):
        print("Number of records (training): "+str(len(trainDatumX)))
        print("Number of records (testing): "+str(len(testDatumX)))
    return trainDatumX, trainDatumY, testDatumX, testDatumY


def calculateOptimalBeta(trainDatumX, trainDatumY):
    """
    Use principles of linear model to find the optimal values for Beta (B*).
    Equation defined: B* = ((X^T * X) ^-1 ) * X^T * Y

    B* is obtained by solving the normal equations (X^T X) B = X^T Y directly, rather than by
    forming the explicit inverse of X^T X.

    :param trainDatumX: features of training set (array-like, one row per record)
    :param trainDatumY: target of training set (array-like, one value per record)
    :param (global) verbose: when True, the resulting B* is printed.
    :return: ndarray for beta
    :raises LinAlgError: if X^T X is singular, so no unique solution exists.
    """
    from numpy import asarray
    from numpy.linalg import solve

    # Work on plain float arrays so the matrix products do not depend on how pandas
    # objects (e.g. a Series target) interact with numpy.
    X = asarray(trainDatumX, dtype='float64')
    Y = asarray(trainDatumY, dtype='float64')

    Xt = X.T
    XtX = matmul(Xt, X)
    XtY = matmul(Xt, Y)

    # solve() raises LinAlgError for a singular X^T X, exactly as inv() did.
    optimalBeta = solve(XtX, XtY)

    # The intermediate matrices are intentionally no longer printed: they were debugging
    # output that ran on every trial regardless of `verbose`.
    if(verbose==True):
        print('B* = ', optimalBeta)
    return optimalBeta


def computePredictions(optimalBeta, trainDatumX):
    """
    Score every row of the given features with optimalBeta and squash the scores through the sigmoid function.

    :param optimalBeta: optimal beta values
    :param trainDatumX: features to score; must provide to_numpy() (e.g. a DataFrame)
    :return: ndarray of predicted Y values, scaled (by sigmoid function) to be between 0 and 1
    """
    featureMatrix = trainDatumX.to_numpy()
    linearScores = matmul(featureMatrix, optimalBeta)  # y^ = X*beta
    return 1 / (1 + exp(-linearScores))  # p = sig(y^)

def classifyPredictions(predictedYValues):
    """
    Turn sigmoid-scaled predictions into class labels using 0.5 as the threshold:
    1.0 (malignant) above it, -1.0 (benign) below it, 0.0 (inconclusive) otherwise.

    :param predictedYValues: predicted Y values, scaled (by sigmoid function) to be between 0 and 1
    :param (global) verbose: when True, the classified values are printed.
    :return: ndarray of predicted Y classes
    """
    def toClassLabel(probability):
        """
        Classify a single predicted value.

        :param probability: one predicted value.
        :return: 1.0, -1.0 or 0.0, each representing a class.
        """
        if probability > 0.5:
            return 1.0  # Malignant
        if probability < 0.5:
            return -1.0  # Benign
        # Neither above nor below the threshold. This is unlikely, so make sure it is
        # noticed by pausing until the user acknowledges it.
        print("WARNING: The sequence has generated a prediction that is inconclusive.")
        input("Press Enter to continue anyway.")
        return 0.0  # Inconclusive

    classLabels = Series(predictedYValues).apply(toClassLabel)
    if (verbose==True):
        print("Y^ =", classLabels)
        print("Y^ (classified): ", classLabels.to_numpy())
    return classLabels.to_numpy()

def calculateStats(predictedYClass, trainDatumY):
    """
    Calculate performance metrics of the model by comparing predicted classes with known classes.

    Each ratio metric falls back to 0 when its numerator count is zero.

    NOTE(review): the negative log likelihood below is evaluated on the class labels that are
    passed in (predictedYClass), not on pre-threshold scores, and the known classes are used
    as given. Confirm this is the intended quantity.

    :param predictedYClass: predicted classes (target, Y), 1.0 for malignant and -1.0 for benign
    :param trainDatumY: actual classes of known data, in the same order as predictedYClass
    :return: negativeLogLikelihood, accuracy, precision, sensitivity, specificity
    """
    def countWhere(predictedLabel, actualLabel):
        # Number of records predicted as predictedLabel whose known class is actualLabel.
        selected = (predictedYClass == predictedLabel) & (trainDatumY == actualLabel)
        return where(selected)[0].size

    truePositives = countWhere(1.0, 1.0)
    falsePositives = countWhere(1.0, -1.0)
    trueNegatives = countWhere(-1.0, -1.0)
    falseNegatives = countWhere(-1.0, 1.0)

    # Negative log likelihood (NLL) = Negative mean of [ Y*BtX - log(1 + exp(BtX))]
    perRecordTerms = (trainDatumY*predictedYClass) - log(1+exp(predictedYClass))
    negativeLogLikelihood = -mean(perRecordTerms)

    correctPredictions = truePositives + trueNegatives
    accuracy = correctPredictions / trainDatumY.size if correctPredictions > 0 else 0

    if truePositives > 0:
        precision = truePositives / (truePositives + falsePositives)
        sensitivity = truePositives / (truePositives + falseNegatives)
    else:
        precision = sensitivity = 0

    specificity = trueNegatives / (trueNegatives + falsePositives) if trueNegatives > 0 else 0

    return negativeLogLikelihood, accuracy, precision, sensitivity, specificity

def produceSequenceReport(negativeLogLikelihood, accuracy, precision, sensitivity, specificity):
    """
    Bundle the five performance metrics of one sequence into a dictionary, sequenceReport.
    For definitions of parameters, consult function calculateStats.
    :return: sequenceReport, keyed 'NLL', 'Accuracy', 'Precision', 'Sensitivity', 'Specificity'
    """
    metricNames = ('NLL', 'Accuracy', 'Precision', 'Sensitivity', 'Specificity')
    metricValues = (negativeLogLikelihood, accuracy, precision, sensitivity, specificity)
    return dict(zip(metricNames, metricValues))

def addSequenceReportToTotalReport(sequenceReport, reportOfTotals):
    """
    Add the metrics of one sequence to a running tally of totals and return the result as a new dictionary.
    Neither input dictionary is modified. For definitions of the metrics, consult function calculateStats.

    :param sequenceReport: metrics of a single sequence (consult produceSequenceReport)
    :param reportOfTotals: running totals so far, with the same five keys
    :return: newReportOfTotals, holding the five summed metrics
    """
    metricNames = ('NLL', 'Accuracy', 'Precision', 'Sensitivity', 'Specificity')
    return {
        metricName: reportOfTotals[metricName] + sequenceReport[metricName]
        for metricName in metricNames
    }

def storeAveragesReportToCsv(averagesReport):
    """
    Store a report of averaged performance metrics, averagesReport, as one row of a csv file for plotting later.

    The file is created with a header row on the first call and appended to (without a header)
    on later calls. The current train:test split is added to the stored row as
    'Training Set Size' and 'Testing Set Size'; averagesReport itself is left unmodified.

    :param averagesReport: dictionary of averaged metrics for a given train-test data split.
    :param fileName (global): name of csv file (without extension) to which records will be saved.
    :param percentageOfDataForTraining (global): the current training percentage.
    :return: None
    """
    csvPath = fileName + '.csv'
    fileAlreadyExists = path.isfile(csvPath)

    # Work on a copy so the caller's dictionary does not silently gain extra keys.
    reportRow = dict(averagesReport)
    reportRow['Training Set Size'] = percentageOfDataForTraining
    reportRow['Testing Set Size'] = 100 - percentageOfDataForTraining

    DataFrame({'>': reportRow}).transpose().to_csv(
        csvPath,
        header=not fileAlreadyExists,  # column names are written only when the file is created
        mode='a' if fileAlreadyExists else 'w',
    )

def runSequence(optimalBeta, knownDataX, knownDataY, reportOfTotals = False):
    """
    Main orchestrator of the prediction process.
    Predict classes for the given features (knownDataX) with optimalBeta, score those predictions
    against knownDataY, and package the metrics in a sequence-specific report. Optionally add that
    report to a running tally.

    :param optimalBeta: optimal values for Beta
    :param knownDataX: features taken from either training or testing sub-set.
    :param knownDataY: known targets taken from either training or testing sub-set.
    :param reportOfTotals: the running-tally report to which the sequence-specific report should be added. Specify "False" (boolean) to skip the tally (i.e., when training)
    :return: sequenceReport alone when reportOfTotals is False; otherwise the tuple (sequenceReport, updated reportOfTotals)
    """
    predictedYClasses = classifyPredictions(computePredictions(optimalBeta, knownDataX))

    # calculateStats returns the five metrics in exactly the order produceSequenceReport expects.
    sequenceReport = produceSequenceReport(*calculateStats(predictedYClasses, knownDataY))

    if reportOfTotals is False:
        return sequenceReport
    return sequenceReport, addSequenceReportToTotalReport(sequenceReport, reportOfTotals)


# -----    NESTED  FUNCTIONS    -----
# -------------[END]-----------------

In [None]:
# -------------[START]---------------
# -----        MAIN BODY        -----
# -----------------------------------
startSeedingFrom = 10
seedUntil = startSeedingFrom+numberOfTrials
seeds = list(range(startSeedingFrom,seedUntil))
seedLength = len(seeds)

allData = getAllData()
fileName = datetime.now().strftime("%Y%m%d-%H%M%S")  # timestamp; used as the csv file name
allReports = {'splits': {}}

# Prepare for looping through train:test splits
if percentageOfDataForTraining is not False:
    percentagesOfDataForTraining=[percentageOfDataForTraining] # User has requested one split, so only run this single split.
else:
    numberOfRows = len(allData)
    increment = 100/numberOfRows  # one row's worth of percentage
    startIncrementingFrom = 0
    continueIncrementingUntil = 100 # NOTE: arange excludes its stop value, so 100% itself is not included.
    percentagesOfDataForTraining= arange(startIncrementingFrom,continueIncrementingUntil,increment)


# ------ MAIN LOOPS [START]------
for percentageOfDataForTraining in percentagesOfDataForTraining:
    reportOfTotals = {
        "NLL": 0,
        "Accuracy": 0,
        "Precision": 0,
        "Sensitivity": 0,
        "Specificity": 0,
    }
    # 'Average Metrics' starts empty: reportOfTotals is rebound to a new dictionary on every
    # trial, so storing the initial object here would only ever show the starting zeros.
    splitReport = {'seeds': {}, 'Average Metrics': {}}
    completedTrials = 0  # trials that actually contributed to reportOfTotals

    # Iterate over a snapshot: changeSeed() removes from and appends to `seeds`, and mutating a
    # list while looping over it skips elements. Replacement seeds take effect from the next split.
    for seed in list(seeds):
        if(verbose==True):
            print('\r\n---\r\n*** STARTING SEED #'+str(seed))

        # -- SPLIT DATA FOR TRAINING AND TESTING
        trainDatumX, trainDatumY, testDatumX, testDatumY = stratifyAndRandomlySample(
            allData, seed, percentageOfDataForTraining)

        # -- USING TRAINING DATASET
        if(verbose==True):
            print('\r\n---\r\n** Commencing: TRAINING')

        try:
            optimalBeta = calculateOptimalBeta(trainDatumX.to_numpy(), trainDatumY)
        except LinAlgError:
            # This sample of data leads to a matrix that cannot be inverted, so retire the seed.
            changeSeed(seed)
            continue

        seedReportForTraining = runSequence(optimalBeta, knownDataX = trainDatumX, knownDataY = trainDatumY, reportOfTotals=False)
        if(verbose==True):
            print('\r\n---\r\n** ** Finished: TRAINING')

        # -- USING TESTING DATASET
        if(verbose==True):
            print('\r\n---\r\n** Commencing: TESTING')
        seedReportForTesting, reportOfTotals = runSequence(optimalBeta, knownDataX = testDatumX, knownDataY = testDatumY, reportOfTotals=reportOfTotals)
        if(verbose==True):
            print('\r\n---\r\n** Finished: TESTING')

        # -- STORE PERFORMANCE TRAINING AND TESTING REPORTS TO "splitReport"
        splitReport['seeds'].update({
            seed: {
            'Training': seedReportForTraining,
            'Testing': seedReportForTesting
            }
            })
        completedTrials += 1

    # ------  ADD TO AVERAGES REPORT  ------
    # Average over the trials that actually ran; skipped (singular) seeds add nothing to the
    # totals, so dividing by the full seed count would bias every mean towards zero.
    # When no trial completed, the totals are all zero and the means are reported as zero.
    trialDivisor = max(completedTrials, 1)
    averagesReport = {
        'Mean NLL': (reportOfTotals['NLL'] / trialDivisor), # NLL is NOT expressed as a percentage
        'Mean Accuracy': str((reportOfTotals['Accuracy'] / trialDivisor) * 100)+'%',
        'Mean Precision': str((reportOfTotals['Precision'] / trialDivisor) * 100)+'%',
        'Mean Sensitivity': str((reportOfTotals['Sensitivity'] / trialDivisor) * 100)+'%',
        'Mean Specificity': str((reportOfTotals['Specificity'] / trialDivisor) * 100)+'%',
        }
    splitReport['Average Metrics'].update(averagesReport)
    splitReport['Completed Trials'] = completedTrials
    storeAveragesReportToCsv(averagesReport)

    # ------  ADD TO FINAL REPORT  ------
    allReports['splits'].update({
            percentageOfDataForTraining: splitReport
            })

# ----------------------------

# ------  PRINT FINISH  ------
if(verbose==True):
    print(dumps(allReports, indent=4))
    print('\r\n---\r\nFinished. A final report is produced above.')
else:
    # Nothing has been printed in this mode, so point at the csv file instead of "above".
    print('\r\n---\r\nFinished. Averaged metrics were saved to '+fileName+'.csv. Set verbose to true to increase verbosity.')
# ----------------------------


# -----        MAIN BODY        -----
# -------------[END]-----------------

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  3.0000e+01  9.0000e+01 -1.5000e+01  1.0500e+02  6.0000e+01]
---inv(XtX):
[[ 1.31941395e+14  1.10767343e+10  2.87995092e+12  1.46028888e+13
   2.06158430e+12 -2.90339789e+13 -2.90339789e+13 -1.15105124e+13
   8.24371377e+12  3.61327082e+13  6.18475291e+12]
 [ 2.71038566e-04 -1.11940479e+06 -2.91045247e+08 -1.47783821e+09
   8.14358756e-06  1.47783821e+09  1.47783821e+09  1.47783821e+09
  -4.16418583e+08 -2.56746684e+09  6.50565287e-06]
 [ 1.32074418e-01 -2.91045247e+08 -7.56717641e+10 -3.84237934e+11
   5.64135843e-03  3.84237934e+11  3.84237934e+11  3.84237934e+11
  -1.08268832e+11 -6.67541377e+11  4.35177846e-03]
 [ 5.45357767e+13 -1.02195389e+09 -2.65708012e+11  3.35543165e+11
  -2.74877907e+10 -7.47860026e+11 -1.51751817e+12  7.22736776e+11
  -4.35142429e+11  1.17732417e+12 -1.92414535e+11]
 [-0.00000000e+00 -8.32837167e+07 -2.16537663e+10  2.74877907e+11
  -0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -2.74877907e



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 [1.500e+01 4.500e+01 3.000e+01 1.500e+02 1.500e+02]
 [1.500e+01 4.500e+01 3.000e+01 1.200e+02 1.350e+02]
 [1.500e+01 1.500e+01 1.500e+01 1.500e+02 1.500e+01]
 [1.500e+01 1.500e+01 1.500e+01 3.000e+01 1.500e+01]]
---XtX:
[[5.0000000e+00 3.1770000e+04 2.5500000e+02 2.5500000e+02 3.0000000e+02
  2.8500000e+02 2.1000000e+02 3.9000000e+02 3.4500000e+02 2.1000000e+02
  9.0000000e+01]
 [3.1770000e+04 2.2497525e+08 1.8272250e+06 1.8499500e+06 2.1989250e+06
  2.0565000e+06 1.3353750e+06 2.7904500e+06 2.4423750e+06 1.5174000e+06
  5.9220000e+05]
 [2.5500000e+02 1.8272250e+06 1.5975000e+04 1.6650000e+04 1.9575000e+04
  1.9800000e+04 1.1025000e+04 2.5650000e+04 2.2050000e+04 1.5975000e+04
  5.1750000e+03]
 [2.5500000e+02 1.8499500e+06 1.6650000e+04 2.0475000e+04 2.3625000e+04
  2.0250000e+04 1.2375000e+04 3.1275000e+04 2.7000000e+04 1.5975000e+04
  5.1750000e+03]
 [3.0000000e+02 2.1989250e+06 1.9575000e+04 2.3625000e+04 2.7450000e+0



[1;30;43mStreaming output truncated to the last 5000 lines.[0m

---
*** STARTING SEED #603
trainDatumX:
[[1.000e+00 4.980e+03 7.500e+01 3.000e+01 3.000e+01 3.000e+01 3.000e+01
  1.500e+01 3.000e+01 3.000e+01 1.500e+01]
 [1.000e+00 5.835e+03 7.500e+01 1.500e+01 1.500e+01 1.500e+01 4.500e+01
  3.000e+01 3.000e+01 3.000e+01 1.500e+01]
 [1.000e+00 5.310e+03 1.500e+01 1.500e+01 1.500e+01 1.500e+01 3.000e+01
  1.500e+01 3.000e+01 1.500e+01 1.500e+01]
 [1.000e+00 3.000e+02 1.050e+02 4.500e+01 3.000e+01 1.500e+02 7.500e+01
  1.500e+02 7.500e+01 6.000e+01 6.000e+01]
 [1.000e+00 2.655e+03 7.500e+01 1.500e+02 1.500e+02 4.500e+01 1.200e+02
  1.500e+01 7.500e+01 1.500e+02 4.500e+01]]
Number of records (training): 5
Number of records (testing): 678

---
** Commencing: TRAINING
318   -1.0
375   -1.0
340   -1.0
20     1.0
171    1.0
Name: Class, dtype: float64
---Xt:
[[1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00]
 [4.980e+03 5.835e+03 5.310e+03 3.000e+02 2.655e+03]
 [7.500e+01 7.500e+01 1.500e




---XtY:
[-2.0000e+00 -1.3995e+04  1.0500e+02  1.0500e+02  7.5000e+01  6.0000e+01
  7.5000e+01  2.1000e+02  1.0500e+02  1.9500e+02 -1.5000e+01]
---inv(XtX):
[[-1.81788905e+15  1.21347566e+11  1.46338180e+12 -2.96823693e+13
   3.20942442e+13 -9.09296872e+11  8.94304847e+12  1.33842873e+12
   2.53281039e+13 -2.29478533e+13  3.08969283e+13]
 [ 8.66610150e+10 -4.39543327e+06  3.63518208e+08  1.44287321e+09
  -1.72975495e+09 -1.84833435e+08 -6.51330226e+08  1.59781938e+08
  -1.23457799e+09  8.80902372e+08 -2.40725042e+09]
 [-8.66610150e+11 -9.21305552e+07  6.86290123e+10 -1.48854857e+11
  -1.19863528e+11  2.90842879e+10  2.54128457e+10  3.38538534e+10
  -3.99028885e+10  4.19561632e+10  2.40725042e+10]
 [-2.76420379e+12 -7.94726673e+08 -8.96938281e+10 -7.80026862e+10
   1.71816437e+11 -7.80824798e+10  1.42219274e+11 -4.88205474e+10
  -1.67842104e+11  1.09252673e+11  7.55915879e+11]
 [ 3.81072974e+12  2.32305001e+08  8.86548388e+10  4.39788071e+11
  -3.61093273e+11 -8.77045258e+10 -1.61540664



trainDatumX:
[[1.0000e+00 1.0065e+04 4.5000e+01 3.0000e+01 1.5000e+01 3.0000e+01
  3.0000e+01 1.5000e+01 4.5000e+01 1.5000e+01 1.5000e+01]
 [1.0000e+00 6.4950e+03 4.5000e+01 3.0000e+01 3.0000e+01 4.5000e+01
  3.0000e+01 1.5000e+01 1.5000e+01 1.5000e+01 1.5000e+01]
 [1.0000e+00 3.8550e+03 4.5000e+01 1.5000e+01 1.5000e+01 1.5000e+01
  3.0000e+01 1.5000e+01 3.0000e+01 1.5000e+01 1.5000e+01]
 [1.0000e+00 4.7850e+03 6.0000e+01 6.0000e+01 6.0000e+01 6.0000e+01
  9.0000e+01 7.5000e+01 1.0500e+02 4.5000e+01 1.5000e+01]
 [1.0000e+00 3.5700e+03 1.2000e+02 1.5000e+02 1.5000e+02 1.2000e+02
  9.0000e+01 1.3500e+02 4.5000e+01 1.5000e+02 1.5000e+02]
 [1.0000e+00 1.0950e+03 1.3500e+02 6.0000e+01 7.5000e+01 1.5000e+02
  9.0000e+01 1.5000e+02 6.0000e+01 1.2000e+02 1.5000e+01]]
Number of records (training): 6
Number of records (testing): 677

---
** Commencing: TRAINING
655   -1.0
418   -1.0
249   -1.0
306   -1.0
231    1.0
71     1.0
Name: Class, dtype: float64
---Xt:
[[1.0000e+00 1.0000e+00 1.0000e+00 



trainDatumX:
[[1.000e+00 5.310e+03 1.500e+01 1.500e+01 1.500e+01 1.500e+01 3.000e+01
  1.500e+01 3.000e+01 1.500e+01 1.500e+01]
 [1.000e+00 2.940e+03 1.200e+02 6.000e+01 6.000e+01 7.500e+01 6.000e+01
  1.050e+02 1.050e+02 1.200e+02 3.000e+01]
 [1.000e+00 5.580e+03 6.000e+01 1.500e+01 3.000e+01 1.500e+01 3.000e+01
  1.500e+01 3.000e+01 1.500e+01 1.500e+01]
 [1.000e+00 7.875e+03 4.500e+01 1.500e+01 1.500e+01 3.000e+01 3.000e+01
  1.500e+01 1.500e+01 1.500e+01 1.500e+01]
 [1.000e+00 3.450e+03 1.050e+02 6.000e+01 1.050e+02 6.000e+01 4.500e+01
  1.050e+02 1.050e+02 9.000e+01 1.500e+01]
 [1.000e+00 9.075e+03 1.500e+02 7.500e+01 1.500e+02 4.500e+01 7.500e+01
  1.200e+02 1.050e+02 1.200e+02 4.500e+01]]
Number of records (training): 6
Number of records (testing): 677

---
** Commencing: TRAINING
340   -1.0
190   -1.0
358   -1.0
510   -1.0
224    1.0
590    1.0
Name: Class, dtype: float64
---Xt:
[[1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00]
 [5.310e+03 2.940e+03 5.580e+03 7.875e




---Xt:
[[1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00]
 [9.555e+03 9.015e+03 5.595e+03 5.655e+03 4.245e+03 9.540e+03]
 [6.000e+01 1.500e+01 7.500e+01 1.500e+01 1.500e+02 1.500e+02]
 [3.000e+01 1.500e+01 1.500e+01 1.500e+01 6.000e+01 1.500e+02]
 [6.000e+01 1.500e+01 1.500e+01 1.500e+01 9.000e+01 1.050e+02]
 [4.500e+01 1.500e+01 3.000e+01 1.500e+01 1.500e+01 1.200e+02]
 [3.000e+01 1.500e+01 3.000e+01 1.500e+01 3.000e+01 1.050e+02]
 [3.000e+01 1.500e+01 1.500e+01 1.500e+01 1.500e+02 1.500e+01]
 [3.000e+01 3.000e+01 3.000e+01 3.000e+01 7.500e+01 1.500e+02]
 [1.500e+01 1.500e+01 1.500e+01 1.500e+01 4.500e+01 1.500e+02]
 [1.500e+01 1.500e+01 1.500e+01 1.500e+01 1.500e+01 4.500e+01]]
---XtX:
[[6.00000000e+00 4.36050000e+04 4.65000000e+02 2.85000000e+02
  3.00000000e+02 2.40000000e+02 2.25000000e+02 2.40000000e+02
  3.45000000e+02 2.55000000e+02 1.20000000e+02]
 [4.36050000e+04 3.44882925e+08 3.28072500e+06 2.27632500e+06
  2.26102500e+06 2.02635000e+06 1.80360000e+06 1.3704750



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 [3.45000000e+02 1.21927500e+06 1.91250000e+04 1.32750000e+04
  1.03500000e+04 1.73250000e+04 1.46250000e+04 2.45250000e+04
  1.91250000e+04 1.32750000e+04 9.22500000e+03]
 [2.55000000e+02 5.05800000e+05 2.13750000e+04 1.86750000e+04
  1.55250000e+04 1.41750000e+04 1.37250000e+04 2.54250000e+04
  1.32750000e+04 1.91250000e+04 8.32500000e+03]
 [1.65000000e+02 4.39650000e+05 1.19250000e+04 7.87500000e+03
  6.07500000e+03 1.14750000e+04 8.32500000e+03 1.46250000e+04
  9.22500000e+03 8.32500000e+03 5.62500000e+03]]
---XtY:
[-3.0000e+00 -2.5605e+04  4.5000e+01  6.0000e+01  6.0000e+01  7.5000e+01
  0.0000e+00  1.3500e+02 -1.0500e+02  1.0500e+02  1.5000e+01]
---inv(XtX):
[[-7.61572032e+14 -2.89740434e+10  1.17827704e+13 -1.04982472e+13
  -6.58295988e+10 -6.18536257e+12  6.59643845e+12 -1.05183310e+13
   1.82771821e+13  7.97921107e+12  2.81248646e+12]
 [-1.23167451e+10 -1.02342572e+04  2.63734161e+08 -5.22852672e+08
   5.63070782



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  6.0750000e+03]
 [2.5500000e+02 1.1243250e+06 1.5300000e+04 9.9000000e+03 9.2250000e+03
  5.4000000e+03 7.4250000e+03 1.1025000e+04 9.6750000e+03 9.2250000e+03
  4.5000000e+03]
 [2.2500000e+02 5.8702500e+05 1.8675000e+04 1.6875000e+04 1.5075000e+04
  5.6250000e+03 9.6750000e+03 1.7325000e+04 9.2250000e+03 1.6425000e+04
  5.1750000e+03]
 [1.2000000e+02 4.9477500e+05 7.4250000e+03 5.4000000e+03 4.9500000e+03
  2.7000000e+03 3.8250000e+03 6.0750000e+03 4.5000000e+03 5.1750000e+03
  2.2500000e+03]]
---XtY:
[-3.000e+00 -2.412e+04 -6.000e+01  3.000e+01  1.500e+01  0.000e+00
 -4.500e+01  1.050e+02 -1.050e+02  4.500e+01 -3.000e+01]
---inv(XtX):
[[ 1.34187206e+15  5.05649390e+10  7.44990101e+12  9.80967315e+11
  -4.02129053e+13  3.16424680e+13 -3.08446128e+12 -1.76794955e+13
  -7.58474085e+11  6.81135402e+13 -1.61293277e+14]
 [-3.13704350e-02 -2.44028038e+06 -3.59534643e+08  1.09123837e+09
   3.48007900e+08 -1.00965255e+08  1.488



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
---
** Commencing: TRAINING
547   -1.0
495   -1.0
28    -1.0
272   -1.0
600   -1.0
31     1.0
231    1.0
115    1.0
Name: Class, dtype: float64
---Xt:
[[1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00
  1.000e+00]
 [8.430e+03 7.650e+03 4.350e+02 4.215e+03 9.225e+03 4.800e+02 3.570e+03
  1.755e+03]
 [1.500e+01 1.500e+01 1.500e+01 3.000e+01 6.000e+01 1.500e+02 1.200e+02
  6.000e+01]
 [1.500e+01 1.500e+01 1.500e+01 1.500e+01 1.500e+01 1.050e+02 1.500e+02
  7.500e+01]
 [1.500e+01 1.500e+01 4.500e+01 1.500e+01 4.500e+01 1.050e+02 1.500e+02
  7.500e+01]
 [1.500e+01 1.500e+01 1.500e+01 3.000e+01 1.500e+01 4.500e+01 1.200e+02
  1.500e+02]
 [3.000e+01 3.000e+01 3.000e+01 3.000e+01 3.000e+01 1.200e+02 9.000e+01
  6.000e+01]
 [1.500e+01 1.500e+01 1.500e+01 1.500e+01 1.500e+01 7.500e+01 1.350e+02
  1.500e+02]
 [4.500e+01 1.500e+01 1.500e+01 4.500e+01 3.000e+01 1.050e+02 4.500e+01
  1.050e+02]
 [1.500e+01 1.500e+



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 [3.90000000e+02 1.78627500e+06 3.10500000e+04 2.83500000e+04
  2.85750000e+04 1.82250000e+04 1.93500000e+04 3.21750000e+04
  2.63250000e+04 2.34000000e+04 9.00000000e+03]
 [3.60000000e+02 1.46047500e+06 2.83500000e+04 3.37500000e+04
  2.99250000e+04 2.22750000e+04 2.16000000e+04 3.03750000e+04
  2.94750000e+04 2.97000000e+04 9.00000000e+03]
 [3.75000000e+02 1.60380000e+06 2.85750000e+04 2.99250000e+04
  2.94750000e+04 1.93500000e+04 2.00250000e+04 3.46500000e+04
  2.85750000e+04 2.45250000e+04 8.77500000e+03]
 [2.55000000e+02 1.02667500e+06 1.82250000e+04 2.22750000e+04
  1.93500000e+04 1.50750000e+04 1.48500000e+04 1.86750000e+04
  2.04750000e+04 2.00250000e+04 6.07500000e+03]
 [3.15000000e+02 1.37205000e+06 1.93500000e+04 2.16000000e+04
  2.00250000e+04 1.48500000e+04 1.59750000e+04 2.09250000e+04
  2.22750000e+04 1.89000000e+04 6.75000000e+03]
 [4.35000000e+02 1.89517500e+06 3.21750000e+04 3.03750000e+04
  3.46500000e



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  1.  1. -1.  1.  1.  1.  1.  1.  1.  1.  1. -1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1. -1.  1.  1.  1.  1. -1.  1.  1.  1.  1.  1. -1.  1.
 -1.  1.  1.  1.  1. -1.  1.  1.  1.  1.  1.  1.  1. -1.  1.  1.  1.  1.
  1.  1.  1.  1.  1. -1.  1.  1.  1. -1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1. -1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1. -1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1. -1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1. -1.  1. -1.  1.  1.  1. -1.  1.  1.  1.  1.  1.  1.  1. -1.
  1.  1.  1.  1.  1.  1.  1. -1.  1.  1.  1. -1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1. -1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1. -1.  1.  1. -1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1. -1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1. -1.  1.  1.  1.  1. -1.
  1.  1.  1.  1.  1.  1.  1.  1. -1.  1.  1.  1.  1.  1.  1



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  3.24000000e+06 3.88417500e+06 1.42537500e+06]
 [6.00000000e+02 4.08172500e+06 5.71500000e+04 4.59000000e+04
  2.99250000e+04 1.84500000e+04 3.55500000e+04 4.68000000e+04
  4.09500000e+04 5.42250000e+04 1.39500000e+04]
 [4.80000000e+02 3.21502500e+06 4.59000000e+04 4.50000000e+04
  2.45250000e+04 1.35000000e+04 3.30750000e+04 3.21750000e+04
  3.37500000e+04 5.24250000e+04 1.17000000e+04]
 [3.30000000e+02 2.24752500e+06 2.99250000e+04 2.45250000e+04
  1.71000000e+04 9.67500000e+03 1.95750000e+04 2.52000000e+04
  2.43000000e+04 2.99250000e+04 8.10000000e+03]
 [2.40000000e+02 1.41682500e+06 1.84500000e+04 1.35000000e+04
  9.67500000e+03 9.00000000e+03 1.19250000e+04 1.91250000e+04
  1.21500000e+04 1.68750000e+04 6.30000000e+03]
 [4.65000000e+02 3.18555000e+06 3.55500000e+04 3.30750000e+04
  1.95750000e+04 1.19250000e+04 2.72250000e+04 2.52000000e+04
  2.72250000e+04 3.93750000e+04 1.10250000e+04]
 [4.65000000e+02 2.68672500



trainDatumX:
[[1.000e+00 6.465e+03 7.500e+01 1.500e+01 1.500e+01 4.500e+01 6.000e+01
  1.500e+01 4.500e+01 3.000e+01 1.500e+01]
 [1.000e+00 6.930e+03 9.000e+01 1.500e+01 1.500e+01 4.500e+01 3.000e+01
  1.500e+01 1.500e+01 1.500e+01 1.500e+01]
 [1.000e+00 6.810e+03 3.000e+01 4.500e+01 1.500e+01 1.500e+01 4.500e+01
  1.500e+01 1.500e+01 1.500e+01 1.500e+01]
 [1.000e+00 3.300e+02 4.500e+01 1.500e+01 1.500e+01 1.500e+01 3.000e+01
  1.500e+01 3.000e+01 1.500e+01 1.500e+01]
 [1.000e+00 1.170e+03 4.500e+01 1.500e+01 1.500e+01 1.500e+01 3.000e+01
  4.500e+01 4.500e+01 1.500e+01 1.500e+01]
 [1.000e+00 8.145e+03 6.000e+01 1.500e+01 1.500e+01 1.500e+01 3.000e+01
  1.500e+01 3.000e+01 1.500e+01 1.500e+01]
 [1.000e+00 4.110e+03 4.500e+01 1.500e+01 1.500e+01 1.500e+01 3.000e+01
  1.500e+01 4.500e+01 3.000e+01 1.500e+01]
 [1.000e+00 5.370e+03 1.200e+02 1.500e+02 7.500e+01 4.500e+01 1.200e+02
  6.000e+01 6.000e+01 1.500e+02 4.500e+01]
 [1.000e+00 8.400e+02 1.200e+02 1.500e+02 1.500e+02 1.500e+01 4.500



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
673   -1.0
655   -1.0
321   -1.0
539   -1.0
389   -1.0
      ... 
180    1.0
642    1.0
571    1.0
195    1.0
652    1.0
Name: Class, Length: 682, dtype: float64
---Xt:
[[1.0000e+00 1.0000e+00 1.0000e+00 ... 1.0000e+00 1.0000e+00 1.0000e+00]
 [1.0335e+04 1.0065e+04 5.0250e+03 ... 8.7900e+03 3.0150e+03 1.0020e+04]
 [1.5000e+01 4.5000e+01 1.5000e+01 ... 1.2000e+02 1.5000e+02 7.5000e+01]
 ...
 [1.5000e+01 4.5000e+01 1.5000e+01 ... 1.5000e+02 1.2000e+02 1.0500e+02]
 [1.5000e+01 1.5000e+01 1.5000e+01 ... 1.5000e+02 1.5000e+01 1.5000e+02]
 [1.2000e+02 1.5000e+01 1.5000e+01 ... 1.5000e+01 1.5000e+01 4.5000e+01]]
---XtX:
[[6.82000000e+02 3.59125500e+06 4.54350000e+04 3.22650000e+04
  3.29250000e+04 2.89800000e+04 3.31050000e+04 3.63000000e+04
  3.52650000e+04 2.93850000e+04 1.64100000e+04]
 [3.59125500e+06 2.51973893e+10 2.28090150e+08 1.60366500e+08
  1.62481725e+08 1.47486150e+08 1.61211600e+08 1.70095275e+08
  1.74753900e+08 1

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
sys.version

'3.7.12 (default, Sep 10 2021, 00:21:48) \n[GCC 7.5.0]'

<h2>
TO-DO: The main aim is to iterate through every possible train:test split to find the optimal one. To do this we will:
</h2>
<ul>
<li> Iterate through every possible value of the training split. This is stored as a percentage.
<ul>
<li>There are 699 rows in the data, meaning each row contributes 100/699 (about 0.14) percentage points to the split. This makes us want to iterate from 0 to 100, going up by 100/699 each time until we have finished iterating. <strong>NOTE: I have made the calculation of 699 dynamic, to allow for rows being removed when they contain data gaps.</strong></li>
</ul>

<li> Store the output for each of these splits into a vector
<ul><li>
We could also store every metric for each training split, to allow each metric to be plotted against the split. (What resolution do we want this at: every single split, or only integer values?) !! TALKING POINT !!
</ul>
<li> Keep track of the optimal training:test split, where we define output as our "favourite" metric when averaged over X training splits.
<ul><li>
Either find the argmax of the accuracy, i.e. the index (split) that gave the best accuracy, or keep a running maxAccuracy variable: compare on each iteration and, if the new accuracy is higher, overwrite maxAccuracy and store that index (split) in another variable. The first method seems more appropriate, if possible.
</ul>
<li>
Restructure our report form to accommodate for the new information.
<ul><li> Visualised in our Facebook chat; will try to show here. (We could remove the metrics from each seed and just use the average split metrics if it gets way too much - it might, unless verbose == True.) We could also output as a JSON file? Not sure if necessary, might be useful for something? You tell me tbh:
<br>
Splits
<br> - Split 1 0.14%
<br> ----- Seed 1
<br> --------- metrics
<br> ----- Seed 2 
<br> --------- metrics
<br> ----- &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;...
<br> ----- Seed N 
<br> --------- metrics
<br> ----- Size of training split (could be put in split title? See above)
<br> ----- Metrics for split
<br>
<br> - Split 2 0.28%
<br> ----- Seed 1
<br> --------- metrics
<br> ----- Seed 2
<br> --------- metrics
<br> ----- &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;...
<br> ----- Seed N
<br> --------- metrics
<br> ----- Size of training split (could be put in split title? See above)
<br> ----- Metrics for split
<br>
<br> &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;...
<br>
<br> - Split M 99.86%
<br> ----- Seed 1
<br> --------- metrics
<br> ----- Seed 2
<br> --------- metrics
<br> ----- &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;...
<br> ----- Seed N
<br> --------- metrics
<br> ----- Size of training split (could be put in split title? See above)
<br> ----- Metrics for split
<br>
<br> - Highest Accuracy Recorded
<br> - Optimal training:test split
</ul>
<li> Plot metrics
<ul><li> Line graphs of metrics with regard to each split
<li> Distribution of data initially (box plots, anything else? See Tom's code)

