In [None]:
!gdown --id '17zWpL9tR5f-HWsGcBKnQXyQfkOnctzLl'
!unzip 'data.zip'

In [172]:
from sklearn.ensemble import RandomForestClassifier
from function import RVFL_train_val
from option import option as op

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import StratifiedKFold

import scipy.io
import numpy as np
import pandas as pd
import random
import h5py
import time
from pathlib import Path

In [168]:
# Benchmark configuration: which datasets to evaluate and where files live.
LIST_OF_DATASET = [
    'cardiotocography-3clases',
    'contrac',
    'image-segmentation',
    'led-display',
    'molec-biol-splice',
    'statlog-image',
    'steel-plates',
    'titanic',
    'waveform',
    'yeast',
]

# Input and output folders are both resolved against the current working directory.
_working_dir = Path.cwd()
datadir = _working_dir.joinpath('data')
outputdir = _working_dir.joinpath('output')

# Create the output folder if needed; an existing one is left untouched.
outputdir.mkdir(exist_ok=True)

In [None]:
# Random-forest benchmark.
# For every dataset: grid-search (ntree, mtry) with 4-fold cross-validation on the
# training split, save the whole grid to CSV, then refit the best setting on the full
# training split and score it on the held-out test split.
rf_results_dict = {'dataset_name':[],
                   'best_ntree':[],
                   'best_mtry':[],
                   'best_cv_acc':[],
                   'final_train_acc':[],
                   'final_test_acc':[],
                   }

for dataset_name in LIST_OF_DATASET:
    # Reading datasets.
    train_data = scipy.io.loadmat(datadir/f'{dataset_name}/{dataset_name}_Train.mat')
    test_data = scipy.io.loadmat(datadir/f'{dataset_name}/{dataset_name}_Test.mat')

    # Grab training/testing data + targets as numpy arrays.
    train_x = train_data['Data']
    train_y = train_data['Label']
    test_x = test_data['Data']
    test_y = test_data['Label']

    # loadmat returns 2-D arrays; scikit-learn wants 1-D targets and emits a
    # DataConversionWarning for column vectors. Flatten once and use the flat
    # labels for every scikit-learn call below. train_y/test_y themselves are
    # left as loaded because other cells read them.
    train_y_flat = train_y.ravel()
    test_y_flat = test_y.ravel()

    print(f'doing dataset: {dataset_name}....')

    # START====== Do random forests
    # Estimators created without random_state draw from NumPy's global RNG,
    # so seeding it here makes the search repeatable.
    np.random.seed(6969)
    ntree_range = list(range(200, 601, 200)) # Number of trees to search through
    # Candidate max_features values: 1 up to ceil(sqrt(number of feature columns)).
    mtry_range = list(range(1,int(np.ceil(np.sqrt(train_x.shape[1])))+1))

    # One row per grid point: [ntree, mtry, mean validation acc, mean training acc].
    cv_score_array = np.zeros((len(ntree_range)*len(mtry_range), 4))

    count = 0
    for ntree in ntree_range:
        for mtry in mtry_range:
            rf = RandomForestClassifier(n_estimators=ntree, max_features=mtry)
            cv_score = cross_validate(rf,train_x,train_y_flat, scoring='accuracy',cv=4, return_train_score=True)
            cv_score_array[count] = [ntree, mtry, cv_score['test_score'].mean(), cv_score['train_score'].mean()]
            count+=1

    np.savetxt(outputdir/f"rf_tuning_{dataset_name}.csv", cv_score_array, header = 'ntrees,mtry,average_val_acc,average_train_acc',delimiter=",")

    # Pick the row with the highest mean validation accuracy (first one wins on ties).
    best_ntree, best_mtry, best_val_acc, best_train_acc = cv_score_array[cv_score_array[:,2].argmax()]
    # The score array is float; the hyper-parameters are integers, so convert them back
    # before using and recording them.
    best_ntree, best_mtry = int(best_ntree), int(best_mtry)

    # Test on testing set with best params.
    np.random.seed(6969)

    rf = RandomForestClassifier(n_estimators=best_ntree, max_features=best_mtry)
    rf.fit(train_x, train_y_flat)
    final_train_acc = rf.score(train_x, train_y_flat)
    final_test_acc = rf.score(test_x, test_y_flat)

    # Store results for each dataset.
    rf_results_dict['dataset_name'].append(dataset_name)
    rf_results_dict['best_ntree'].append(best_ntree)
    rf_results_dict['best_mtry'].append(best_mtry)
    rf_results_dict['best_cv_acc'].append(best_val_acc)
    rf_results_dict['final_train_acc'].append(final_train_acc)
    rf_results_dict['final_test_acc'].append(final_test_acc)


In [None]:
# RVFL benchmark.
# For every dataset: grid-search (N, C) with stratified k-fold cross-validation on the
# training split, save the whole grid to CSV, then evaluate the best setting by training
# on the full training split and scoring on the held-out test split.
rvfl_results_dict = {'dataset_name':[],
                   'best_N':[],
                   'best_C':[],
                   'best_cv_acc':[],
                   'final_train_acc':[],
                   'final_test_acc':[],
                   }

for dataset_name in LIST_OF_DATASET:
    # Reading datasets.
    train_data = scipy.io.loadmat(datadir/f'{dataset_name}/{dataset_name}_Train.mat')
    test_data = scipy.io.loadmat(datadir/f'{dataset_name}/{dataset_name}_Test.mat')

    # Grab training/testing data + targets as numpy arrays.
    train_x = train_data['Data']
    train_y = train_data['Label']
    test_x = test_data['Data']
    test_y = test_data['Label']

    print(f'doing dataset: {dataset_name}....')

    # START======Do RVFL

    # Generate datasplits and parameter list
    np.random.seed(6969)
    n_splits = 4  # single source of truth for the fold count and the averaging below
    skf = StratifiedKFold(n_splits=n_splits)
    N_list = list(range(3, 204, 20)) # N is number of neurons
    C_list = list(range(-5, 15)) # C is ridge regression parameter.

    # One row per grid point: [N, C, mean validation acc, mean training acc].
    cv_score_array = np.zeros((len(N_list)*len(C_list), 4))
    count = 0
    for num_neuron in N_list:
        for param_c in C_list:
            total_val_acc = 0
            total_train_acc = 0
            for train_index, val_index in skf.split(train_x,train_y):
                option = op(N=num_neuron, C=param_c)
                fold_train_x = train_x[train_index]
                fold_train_y = train_y[train_index]
                fold_val_x = train_x[val_index]
                fold_val_y = train_y[val_index]
                # RVFL_train_val is unpacked as (accuracy on the first pair, accuracy on the second pair).
                train_acc, val_acc = RVFL_train_val(fold_train_x, fold_train_y, fold_val_x, fold_val_y, option)
                total_val_acc += val_acc
                total_train_acc += train_acc

            # Average over the folds actually run instead of a separately hard-coded 4.
            cv_score_array[count] = [num_neuron, param_c, total_val_acc/n_splits, total_train_acc/n_splits]
            count +=1

    np.savetxt(outputdir/f"rvfl_tuning_{dataset_name}.csv", cv_score_array, header = 'N,C,average_val_acc,average_train_acc',delimiter=",")

    # Pick the row with the highest mean validation accuracy (first one wins on ties).
    best_N, best_C, best_val_acc, best_train_acc = cv_score_array[cv_score_array[:,2].argmax()]
    # The score array is float; N and C were drawn from integer grids, so convert them
    # back before using and recording them.
    best_N, best_C = int(best_N), int(best_C)

    # Test on testing set with best params.
    option = op(N=best_N, C=best_C)
    final_train_acc, final_test_acc = RVFL_train_val(train_x, train_y, test_x, test_y, option)

    # SAVE ==== results
    rvfl_results_dict['dataset_name'].append(dataset_name)
    rvfl_results_dict['best_N'].append(best_N)
    rvfl_results_dict['best_C'].append(best_C)
    rvfl_results_dict['best_cv_acc'].append(best_val_acc)
    rvfl_results_dict['final_train_acc'].append(final_train_acc)
    rvfl_results_dict['final_test_acc'].append(final_test_acc)

In [186]:
# Turn both per-dataset result dictionaries into DataFrames, then write each one
# to a CSV file in the working directory without the index column.
rf_df, rvfl_df = (
    pd.DataFrame(results) for results in (rf_results_dict, rvfl_results_dict)
)
rf_df.to_csv(path_or_buf='rf_results.csv', index=False)
rvfl_df.to_csv(path_or_buf='rvfl_results.csv', index=False)

## Do significance testing

## RF base code

In [131]:
# Tune random forests (single-dataset version of the benchmark loop).
# NOTE(review): this cell defines none of train_x, train_y, test_x, test_y or
# dataset_name; it relies on values left in the kernel by an earlier cell.

# scikit-learn wants 1-D targets and emits a DataConversionWarning for column
# vectors, so flatten the labels once and use them for every call below.
train_y_flat = train_y.ravel()
test_y_flat = test_y.ravel()

# Estimators created without random_state draw from NumPy's global RNG,
# so seeding it here makes the search repeatable.
np.random.seed(6969)
ntree_range = list(range(200, 601, 200)) # Number of trees to search through
# Candidate max_features values: 1 up to ceil(sqrt(number of feature columns)).
mtry_range = list(range(1,int(np.ceil(np.sqrt(train_x.shape[1])))+1))

# One row per grid point: [ntree, mtry, mean validation acc, mean training acc].
cv_score_array = np.zeros((len(ntree_range)*len(mtry_range), 4))

count = 0
for ntree in ntree_range:
    for mtry in mtry_range:
        rf = RandomForestClassifier(n_estimators=ntree, max_features=mtry)
        cv_score = cross_validate(rf,train_x,train_y_flat, scoring='accuracy',cv=4, return_train_score=True)
        cv_score_array[count] = [ntree, mtry, cv_score['test_score'].mean(), cv_score['train_score'].mean()]
        count+=1

np.savetxt(outputdir/f"rf_tuning_{dataset_name}.csv", cv_score_array, header = 'ntrees,mtry,average_val_acc,average_train_acc',delimiter=",")

# Pick the row with the highest mean validation accuracy (first one wins on ties).
best_ntree, best_mtry, best_val_acc, best_train_acc = cv_score_array[cv_score_array[:,2].argmax()]

# Test on testing set with best params.
np.random.seed(6969)

# The score array is float, so the integer hyper-parameters are cast back here.
rf = RandomForestClassifier(n_estimators=int(best_ntree), max_features=int(best_mtry))
rf.fit(train_x, train_y_flat)
final_train_acc = rf.score(train_x, train_y_flat)
final_test_acc = rf.score(test_x, test_y_flat)


## RVFL base code

In [122]:
# Tune RVFL (single-dataset version of the benchmark loop).
# NOTE(review): this cell defines none of train_x, train_y, test_x, test_y or
# dataset_name; it relies on values left in the kernel by an earlier cell.

# Generate datasplits and parameter list
np.random.seed(6969)
n_splits = 4  # single source of truth for the fold count and the averaging below
skf = StratifiedKFold(n_splits=n_splits)
N_list = list(range(3, 204, 20)) # N is number of neurons
C_list = list(range(-5, 15)) # C is ridge regression parameter.

# One row per grid point: [N, C, mean validation acc, mean training acc].
cv_score_array = np.zeros((len(N_list)*len(C_list), 4))
count = 0
for num_neuron in N_list:
    for param_c in C_list:
        total_val_acc = 0
        total_train_acc = 0
        for train_index, val_index in skf.split(train_x,train_y):
            option = op(N=num_neuron, C=param_c)
            fold_train_x = train_x[train_index]
            fold_train_y = train_y[train_index]
            fold_val_x = train_x[val_index]
            fold_val_y = train_y[val_index]
            train_acc, val_acc = RVFL_train_val(fold_train_x, fold_train_y, fold_val_x, fold_val_y, option)
            total_val_acc += val_acc
            total_train_acc += train_acc

        # Average over the folds actually run instead of a separately hard-coded 4.
        cv_score_array[count] = [num_neuron, param_c, total_val_acc/n_splits, total_train_acc/n_splits]
        count +=1

np.savetxt(outputdir/f"rvfl_tuning_{dataset_name}.csv", cv_score_array, header = 'N,C,average_val_acc,average_train_acc',delimiter=",")

# Pick the row with the highest mean validation accuracy (first one wins on ties).
best_N, best_C, best_val_acc, best_train_acc = cv_score_array[cv_score_array[:,2].argmax()]

# Test on testing set with best params.
# The score array is float, while the search passed integers to op(); cast back
# so the final model gets the same kind of values as the search did.
option = op(N=int(best_N), C=int(best_C))
final_train_acc, final_test_acc = RVFL_train_val(train_x, train_y, test_x, test_y, option)


In [87]:
# Random-forest search grid (these are the forest parameters, not RVFL ones).
np.random.seed(6969)  # fix NumPy's global RNG state for repeatable results
ntree_range = [*range(200, 601, 200)]  # forest sizes to try
# Largest candidate is ceil(sqrt(number of columns in train_x)).
mtry_upper = int(np.ceil(np.sqrt(train_x.shape[1])))
mtry_range = [*range(1, mtry_upper + 1)]  # per-split feature counts to try: 1 .. mtry_upper


array([[200.        ,   1.        ,   0.91470588],
       [200.        ,   2.        ,   0.92352941],
       [200.        ,   3.        ,   0.92941176],
       [200.        ,   4.        ,   0.93647059],
       [200.        ,   5.        ,   0.93705882],
       [400.        ,   1.        ,   0.91411765],
       [400.        ,   2.        ,   0.92882353],
       [400.        ,   3.        ,   0.93117647],
       [400.        ,   4.        ,   0.93352941],
       [400.        ,   5.        ,   0.93705882],
       [600.        ,   1.        ,   0.91176471],
       [600.        ,   2.        ,   0.92352941],
       [600.        ,   3.        ,   0.93058824],
       [600.        ,   4.        ,   0.93588235],
       [600.        ,   5.        ,   0.93588235]])

In [100]:
# Scratch check: values produced by the neuron-count grid (start 3, stop before 204, step 20).
list(range(3, 204, 20))

[3, 23, 43, 63, 83, 103, 123, 143, 163, 183, 203]

In [46]:
# Scratch check: show the tree-count grid currently held in ntree_range.
# NOTE(review): the saved output lists 100..900, which does not match the
# range(200, 601, 200) assigned to ntree_range in this notebook — it looks stale.
list(ntree_range)

[100, 200, 300, 400, 500, 600, 700, 800, 900]

In [47]:
# Scratch check of range(): the integers 0 through 9.
list(range(10))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [52]:
# Scratch check: floor(sqrt(number of columns in train_x)) + 1.
# NOTE(review): train_x is not defined in this cell; it comes from an earlier cell's state.
int(np.sqrt(train_x.shape[1]))+1

9