In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.datasets import fetch_covtype
from sklearn import naive_bayes as nb
from sklearn import neural_network as nn

import time


import codes as c

import importlib
importlib.reload(c)

<module 'codes' from '/Users/pmccauley/analysis/comp5318/comp5318_assignment_2/codes/__init__.py'>

In [14]:
def read_covtype(crop=False):
    '''
    Name:
        read_covtype
    
    Purpose: 
        Read the covtype dataset using the sklearn.datasets function 
        fetch_covtype and return it in X, y array format along with class 
        name and class number arrays
    
    Parameters: 
        No Required Inputs:
        
        1 Optional Settings:
        
        1 (crop) = Boolean, default=False. Set to only keep the first 10 
                   columns of X, which encode the most information
    
    Returns: 
        4 Outputs: 
        
        1 (X) = NumPy array, data array
        2 (y) = NumPy array, class labels
        3 (cnames) = list, class names
        4 (cnums) = NumPy array, class numbers (numeric class labels, 
                    starting at 1, one per entry of cnames)
    '''
    
    data = fetch_covtype()
    X = data['data']
    y = data['target']
    
    # Plain truthiness test rather than comparing against True
    if crop:
        X = X[:, :10]
    
    cnames = ['Spruce/Fir','Lodgepole Pine','Ponderosa Pine','Cottonwood/Willow','Aspen','Douglas-fir','Krummholz']
    # Derived from cnames so the two arrays cannot drift out of step
    cnums = np.arange(1, len(cnames) + 1)
    
    return X, y, cnames, cnums
    

<h2>Naive Bayes</h2>

In [None]:
def covtype_naive_bayes(X, y, kfold=10, style='prop'):
    '''
    Name:
        covtype_naive_bayes
    
    Purpose: 
        Cross-validate a scikit-learn Gaussian Naive Bayes classifier on the 
        given data, average the confusion matrices returned for the folds, 
        and compute/display the metrics for that mean confusion matrix
    
    Parameters: 
        2 Required Inputs:
        
        1 (X) = NumPy array, data array
        2 (y) = NumPy array, class labels. 1 is subtracted from the labels 
                before they are passed on (presumably so that they start 
                at 0; confirm against c.cross_validate_classifier)
        
        2 Optional Settings:
        
        1 (kfold) = default=10. Passed to c.cross_validate_classifier 
                    (presumably the number of folds)
        2 (style) = default='prop'. Passed to c.cross_validate_classifier
    
    Returns: 
        3 Outputs, as returned by c.metrics_wrapper: 
        
        1 (df_total)
        2 (df_class)
        3 (df_conf)
    
    Notes:
        Reads the notebook-level variable cnames (class names), so 
        read_covtype must have been run and unpacked beforehand.
    '''
    
    # The classifier class itself (not an instance) is handed to the helper
    classifier = nb.GaussianNB
    
    # The full X, y go to the cross-validation helper, so no separate 
    # train/test split is made here
    confs = c.cross_validate_classifier(classifier, X, y-1, kfold=kfold, style=style)
    
    # Mean confusion matrix across the returned matrices
    conf = np.mean(confs, axis=0)
    df_total, df_class, df_conf = c.metrics_wrapper(conf, cnames, do_display=True)
    
    return df_total, df_class, df_conf
    

In [None]:
# Load the full (uncropped) covtype data. cnames is kept at notebook level
# because covtype_naive_bayes reads it as a global.
X, y, cnames, cnums = read_covtype(crop=False)
# Cross-validate Gaussian Naive Bayes with the default kfold and style settings
df_total, df_class, df_conf = covtype_naive_bayes(X, y)

<h2>Multi-Layer Perceptron (MLP)</h2>

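The following two functions explore MLP hyperparameters: `mlp_explore_param` sweeps a single hyperparameter over a list of values, and `mlp_explore_params` is a wrapper that defines the values to test for each hyperparameter and passes them to `mlp_explore_param`.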

In [6]:
def _mlp_explore_csv_path(param, values, kwargs):
    '''Return the Path of the CSV file that caches the results of one sweep.'''
    
    if param in ('learning_rate_init', 'tol'):
        # These sweeps are stored per learning-rate schedule. Fall back to 
        # scikit-learn's default schedule name when none was supplied.
        suffix = '_'+kwargs.get('learning_rate', 'constant')
    elif param=='hidden_layer_sizes':
        # A list of values is stored as the depth sweep, anything else 
        # (e.g. a NumPy array) as the width sweep
        suffix = '_'+('depth' if isinstance(values, list) else 'width')
    else:
        suffix = ''
    
    return Path('mlp_explore_results/mlp_explore_'+param+suffix+'.csv')


def mlp_explore_param(param, values, X_train, y_train, X_test, y_test, args=None, redo=False):
    '''
    Name:
        mlp_explore_param
    
    Purpose: 
        Explore a given scikit-learn MLP hyperparameter by looping through an 
        array of values and recording classification performance for each step. 
        Results are written to CSV files and read from there on subsequent 
        calls unless redo=True.
    
    Parameters: 
        6 Required Inputs:
        
        1 (param) = String, hyperparameter to test (e.g. 'hidden_layer_sizes')
        2 (values) = List or numpy array containing param values to be tested
        3 (X_train) = NumPy array, Training data
        4 (y_train) = NumPy array, Training labels
        5 (X_test) = NumPy array, Test data
        6 (y_test) = NumPy array, Test labels
        
        2 Optional Settings:
        
        1 (args) = Dictionary, default={'solver':'sgd', 'early_stopping':True}. 
                   Arguments passed to the MLP classifier via **kwargs that 
                   will be kept constant for each test. The dictionary is 
                   copied, so the caller's object is never modified
        2 (redo) = Boolean, default=False. Results will be written to and read 
                   from a CSV file. Set redo=True to remake an existing CSV
        
    
    Returns: 
        Out: Pandas DataFrame with one row per parameter value. When freshly 
             computed, the first columns are the parameter value, 'Run Time', 
             'loss' and 'n_iter', followed by the columns of the totals 
             table returned by c.metrics_wrapper
    
    Raises:
        ValueError if results have to be computed and values is empty
    
    Notes:
        Reads the notebook-level variable cnames (class names). 1 is 
        subtracted from the true and predicted labels and the confusion 
        matrix is built with dim=7, so labels are expected to run from 1 to 7.
    ''' 
    
    # Work on a copy: the swept parameter is written into this dictionary 
    # on every iteration and must not leak back into the caller's args
    kwargs = dict(args) if args is not None else {'solver':'sgd', 'early_stopping':True}
    
    file = _mlp_explore_csv_path(param, values, kwargs)
    
    if file.is_file() and not redo:
        print('Reading existing results from '+file.name)
        return pd.read_csv(file.resolve(), index_col=0)
    
    # len() rather than np.shape(): values may be a ragged list of lists 
    # (the hidden_layer_sizes depth sweep), which NumPy cannot turn into 
    # a regular array
    n_values = len(values)
    if n_values==0:
        raise ValueError('values must contain at least one value to test')
    
    runtimes = np.zeros(n_values)
    n_iter = np.zeros(n_values)
    loss = np.zeros(n_values)
    
    print('Exploring param = '+param+' from '+str(values[0])+' to '+str(values[-1]))
    print('Working on '+param+' = ', sep=' ', end='', flush=True)
    
    totals = []
    for i, param_value in enumerate(values):
        kwargs[param] = param_value
        print(param_value, sep=' ', end=',', flush=True)
        
        # The run time covers both fitting and predicting
        start = time.perf_counter()
        
        _classifier = nn.MLPClassifier(**kwargs)
        _classifier = _classifier.fit(X_train, y_train)
        y_pred = _classifier.predict(X_test)
        
        runtimes[i] = time.perf_counter()-start
        n_iter[i] = _classifier.n_iter_
        loss[i] = _classifier.loss_
        
        conf = c.construct_confusion_matrix(y_test-1, y_pred-1, dim=7)
        df_total, df_class, df_conf = c.metrics_wrapper(conf, cnames, do_display=False)
        
        totals.append(df_total)
    
    # One concatenation at the end; DataFrame.append no longer exists in 
    # pandas 2.x
    df_totals = pd.concat(totals, ignore_index=True)
    
    # Each insert goes to position 0, so the final column order is 
    # param, 'Run Time', 'loss', 'n_iter', then the metric columns
    df_totals.insert(0, 'n_iter', n_iter)
    df_totals.insert(0, 'loss', loss)
    df_totals.insert(0, 'Run Time', runtimes)
    df_totals.insert(0, param, values)
    
    # Make sure the results directory exists before writing
    file.parent.mkdir(parents=True, exist_ok=True)
    df_totals.to_csv(path_or_buf=file)
    print(' ')
    print('Wrote results to '+file.name)
    
    return df_totals


In [12]:
def mlp_explore_params(X_train, y_train, X_test, y_test, redo=False):
    '''
    Name:
        mlp_explore_params
    
    Purpose: 
        Explore scikit-learn MLP hyperparameters by looping through arrays 
        of possible values. This code is a wrapper for mlp_explore_param 
        that defines the values to be tested and passes them to the main 
        routine. 
    
    Parameters: 
        4 Required Inputs:
    
        1 (X_train) = NumPy array, Training data
        2 (y_train) = NumPy array, Training labels
        3 (X_test) = NumPy array, Test data
        4 (y_test) = NumPy array, Test labels
        
        1 Optional Setting:
        
        1 (redo) = Boolean, default=False. Results will be written to and read 
                   from CSV files. Set redo=True to remake existing CSVs
        
    
    Returns: 
        2 Outputs:
        
        1 (output) = List with one Pandas DataFrame per entry of params, as 
                     returned by mlp_explore_param
        2 (params) = List of the hyperparameter names that were explored, in 
                     the same order as output
    ''' 
    
    # Each entry pairs a hyperparameter name with the values to sweep over.
    # 'hidden_layer_sizes' appears twice: an array of single-layer widths 
    # and a list of layouts with 1 to 5 layers of 100 units.
    sweeps = [
        ('hidden_layer_sizes', np.concatenate([np.arange(1,10,1), np.arange(10,100,10), np.arange(100,1100,100)])),
        ('hidden_layer_sizes', [[100],[100,100],[100,100,100],[100,100,100,100],[100,100,100,100,100]]),
        ('activation', ['identity', 'logistic', 'tanh', 'relu']),
        ('alpha', np.sort(np.append(np.geomspace(1e-6,1e-1,num=6),np.geomspace(5e-6,5e-1,num=6)))),
        ('batch_size', np.sort(np.append(np.geomspace(1e1,1e5,num=5),np.geomspace(5e1,5e4,num=4))).round().astype(int)),
        ('momentum', np.linspace(0.01, 0.99, 50)),
        ('learning_rate_init', np.sort(np.append(np.geomspace(1e-5,1e0,num=6),np.geomspace(5e-5,5e-1,num=5)))),
        ('shuffle', [False,True]),
        ('nesterovs_momentum', [False,True]),
        ('power_t', np.linspace(0.1, 2, 20)),
        ('tol', np.sort(np.append(np.geomspace(1e-6,1e1,num=8),np.geomspace(5e-6,5e0,num=7)))),
    ]
    params = [name for name, _ in sweeps]
    
    learning_rates = ['constant','invscaling','adaptive']
    
    base_args = {'solver':'sgd', 'early_stopping':True}
    
    output = []
    for param, param_values in sweeps:
        
        if param in ('learning_rate_init', 'tol'):
            # These are explored once per learning-rate schedule; each run 
            # is cached in its own CSV by mlp_explore_param.
            # NOTE(review): only the table of the last schedule ends up in 
            # output, which keeps output aligned with params.
            for rate in learning_rates:
                args = {**base_args, 'learning_rate':rate}
                df_totals = mlp_explore_param(param, param_values, X_train, y_train, X_test, y_test, args=args, redo=redo)
        
        elif param=='power_t':
            # power_t is only used by the 'invscaling' schedule, so these 
            # settings have to reach the classifier. A power_t CSV cached 
            # without them needs redo=True to be regenerated.
            args = {**base_args, 'learning_rate_init':0.1, 'learning_rate':'invscaling'}
            df_totals = mlp_explore_param(param, param_values, X_train, y_train, X_test, y_test, args=args, redo=redo)
            
        else:
            df_totals = mlp_explore_param(param, param_values, X_train, y_train, X_test, y_test, redo=redo)
                    
        output.append(df_totals)
        
    return output, params
        

In [13]:
# Load the full (uncropped) covtype data. cnames is kept at notebook level
# because mlp_explore_param reads it as a global.
X, y, cnames, cnums = read_covtype(crop=False)
# Train/test split via the project helper. regularize=True is handled inside
# c.split_dataset (not shown here) - presumably feature scaling; TODO confirm
X_train, y_train, X_test, y_test = c.split_dataset(X,y,regularize=True)

# redo=False reuses cached CSV results where they exist (see the output below)
tables, params = mlp_explore_params(X_train, y_train, X_test, y_test, redo=False)

Reading existing results from mlp_explore_hidden_layer_sizes_width.csv
Reading existing results from mlp_explore_hidden_layer_sizes_depth.csv
Reading existing results from mlp_explore_activation.csv
Reading existing results from mlp_explore_alpha.csv
Reading existing results from mlp_explore_batch_size.csv
Reading existing results from mlp_explore_momentum.csv
Reading existing results from mlp_explore_learning_rate_init_constant.csv
Reading existing results from mlp_explore_learning_rate_init_invscaling.csv
Reading existing results from mlp_explore_learning_rate_init_adaptive.csv
Reading existing results from mlp_explore_shuffle.csv
Reading existing results from mlp_explore_nesterovs_momentum.csv
Reading existing results from mlp_explore_power_t.csv
Reading existing results from mlp_explore_tol_constant.csv
Reading existing results from mlp_explore_tol_invscaling.csv
Reading existing results from mlp_explore_tol_adaptive.csv


In [None]:
# Train and evaluate a single MLP, timing fit + predict together; the elapsed
# time is printed in minutes.
# Original author's note: hidden layer number = mean([features,columns])
# NOTE(review): mlp_args() is not defined in the visible part of this notebook -
# confirm it is defined in an earlier cell before running this on a fresh kernel.
start = time.time()

kwargs = mlp_args()

_classifier = nn.MLPClassifier(**kwargs)
_classifier = _classifier.fit(X_train, y_train)
y_pred = _classifier.predict(X_test)
print((time.time()-start)/60)

# 1 is subtracted from the true and predicted labels before the 7-class
# confusion matrix is built; cnames comes from an earlier read_covtype call
conf = c.construct_confusion_matrix(y_test-1, y_pred-1, dim=7)
df_total, df_class, df_conf = c.metrics_wrapper(conf, cnames, do_display=True)