In [77]:
import warnings
# 'always' shows every matching warning each time it is triggered.
warnings.filterwarnings('always')

import pandas as pd
import numpy as np

from sklearn.datasets import fetch_covtype
from sklearn import naive_bayes as nb
from sklearn import neural_network as nn

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

import time

# Disabled name-by-name import of the project helpers; the package is
# imported as a whole (alias `c`) just below instead.
#from codes import cross_validate_classifier, \
#                  format_classifier_performance, \
#                  plot_confusion_matrix, metrics_wrapper

# Project-local helper package; later cells call it as `c.<name>`.
import codes as c

# Reload `codes` so that edits to the package are picked up without
# restarting the kernel.
import importlib
importlib.reload(c)

<module 'codes' from '/Users/pmccauley/analysis/comp5318/comp5318_assignment_2/codes/__init__.py'>

In [12]:
def class_count_split(X, y, count=250, random_state=None):
    '''
    Split a dataset by holding out the same number of samples of every class.

    Parameters:
        X = data, array whose first axis indexes samples
        y = labels, one entry per sample in X

        count: number of samples of each class drawn (without replacement)
                into the held-out set returned as X_test / y_test,
                default 250

        random_state: optional integer seed for the draw. If None (default)
                the global NumPy random state is used, so np.random.seed()
                keeps working exactly as before.

    Returns:
        X_train, y_train, X_test, y_test
        X_test / y_test hold `count` samples of each class, in the order
        they appear in the original table; X_train / y_train hold every
        remaining sample.

    Raises:
        ValueError if some class has fewer than `count` samples
    '''
    X = np.asarray(X)
    y = np.asarray(y)

    # Same legacy generator as np.random.choice, optionally privately seeded.
    if random_state is None:
        rng = np.random
    else:
        rng = np.random.RandomState(random_state)

    # loop through each class and randomly select <count> representatives
    per_class_picks = []
    for this_class in np.unique(y):
        want = np.where(y == this_class)[0]
        if want.size < count:
            raise ValueError(
                'class %r has only %d samples, cannot draw %d without replacement'
                % (this_class, want.size, count)
            )
        per_class_picks.append(rng.choice(want, size=count, replace=False))

    # sort pick indices by the order they appear in the original table;
    # indices stay integer throughout (no float buffer to cast back)
    if per_class_picks:
        picks = np.sort(np.concatenate(per_class_picks))
    else:
        picks = np.empty(0, dtype=int)

    X_test = X[picks]
    y_test = y[picks]

    X_train = np.delete(X, picks, axis=0)
    y_train = np.delete(y, picks)

    return X_train, y_train, X_test, y_test

In [11]:
def split_dataset(X, y, style='prop', count=250, test_size=0.1):
    '''
    Partition a dataset into training and test sets.

    Parameters:
        X = data
        y = labels

        style: 'prop' = classes proportionally represented in the test
                        dataset (stratified shuffle split)
                'equal' = equal number of samples of each class in the
                        test set (see class_count_split)
                'random' = random test/training partition

        count: number of samples of each class placed in the test set,
                default 250, only used if style='equal'

        test_size: fraction of dataset used for test set, default 0.1, only
                    used if style='prop' or 'random'

    Returns:
        X_train, y_train, X_test, y_test

    Raises:
        ValueError if style is not one of 'prop', 'equal', 'random'
    '''

    if style == 'equal':
        return class_count_split(X, y, count=count)

    if style == 'prop':
        sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size)
        # n_splits=1, so the generator yields exactly one index pair
        train_index, test_index = next(sss.split(X, y))
        return X[train_index], y[train_index], X[test_index], y[test_index]

    if style == 'random':
        # train_test_split returns X pieces first, then y pieces
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
        return X_train, y_train, X_test, y_test

    # Previously an unknown style reached the return statement with nothing
    # assigned and failed with an opaque UnboundLocalError.
    raise ValueError(
        "unknown style %r; expected 'prop', 'equal' or 'random'" % (style,)
    )

In [8]:
def read_covtype(crop=False, regularize=True):
    '''
    Load the dataset returned by sklearn's fetch_covtype.

    Parameters:
        crop: if true, keep only the first 10 feature columns, default False

        regularize: if true (default), standardize every feature column with
                    sklearn's StandardScaler; if false, return the features
                    exactly as loaded

    Returns:
        X, y, cnames, cnums
        X = feature matrix, y = labels, cnames = list of the 7 class names,
        cnums = array of the integers 1..7 (presumably the label values in
        y that correspond to cnames -- confirm against the dataset docs)
    '''
    data = fetch_covtype()
    X = data['data']
    y = data['target']

    if crop:
        X = X[:, 0:10]

    cnames = ['Spruce/Fir','Lodgepole Pine','Ponderosa Pine','Cottonwood/Willow','Aspen','Douglas-fir','Krummholz']
    cnums = np.arange(1, 8)

    if regularize:
        # Imported here so the unscaled path does not need the scaler at all.
        from sklearn.preprocessing import StandardScaler
        # NOTE: the scaler is fit on the FULL dataset, before any train/test
        # split, so test-set statistics leak into the scaling. For a strict
        # evaluation call with regularize=False and fit a scaler on the
        # training partition only.
        X = StandardScaler().fit_transform(X)

    return X, y, cnames, cnums
    

In [59]:
# Load all feature columns (no cropping); read_covtype's default
# regularize=True is used.
X, y, cnames, cnums = read_covtype(crop=False)

In [None]:
# NOTE(review): this cell has no execution count, i.e. it was never run.
# `regularize` is not a parameter of the split_dataset defined in this
# notebook -- confirm that c.split_dataset accepts it, otherwise this call
# raises TypeError.
X_train, y_train, X_test, y_test = c.split_dataset(X,y,regularize=True)

In [60]:
# The Gaussian naive Bayes *class* (not an instance) is handed over --
# presumably c.cross_validate_classifier instantiates it itself; confirm.
_classifier = nb.GaussianNB
# y-1 shifts every label down by one (presumably 1..7 -> 0..6 so labels can
# index the confusion matrix directly -- confirm in `codes`).
confs = c.cross_validate_classifier(_classifier, X, y-1, kfold=1, style='equal')

In [63]:
# Collapse the first axis of `confs` (presumably one confusion matrix per
# cross-validation fold -- confirm in `codes`) into a single mean matrix.
conf = np.mean(confs, axis=0)
print(conf.shape)
df_total, df_class, df_conf = c.metrics_wrapper(conf, cnames, do_display=True)
# Select the first row by position explicitly: the rendered table shows a
# string row label, and integer keys on a non-integer index only work through
# the deprecated positional fallback of Series.__getitem__.
print(df_total['Overall Accuracy'].iloc[0])

(7, 7)
Average/overall metrics:


Unnamed: 0,Average F-Meas,Average Precision,Average Recall,Overall Accuracy
Class-Averaged or Overall:,0.55,0.63,0.59,0.59


Class-specific metrics:


Unnamed: 0_level_0,Class,F-Meas,Precision,Recall
Confusion Matrix Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Spruce/Fir,0.58,0.46,0.8
1,Lodgepole Pine,0.31,0.67,0.2
2,Ponderosa Pine,0.52,0.4,0.74
3,Cottonwood/Willow,0.77,0.72,0.84
4,Aspen,0.65,0.71,0.61
5,Douglas-fir,0.15,0.61,0.09
6,Krummholz,0.84,0.86,0.83


Confusion matrix (yellow = col max; red = row max):


Predicted,0,1,2,3,4,5,6
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,199,11,1,0,7,1,31
1,144,51,17,0,34,2,2
2,2,1,185,52,6,4,0
3,0,0,39,211,0,0,0
4,41,11,39,0,152,7,0
5,5,1,175,32,15,22,0
6,40,1,1,0,1,0,207


0.59


In [37]:
# Stratified split with the default test_size=0.1, using the split_dataset
# defined in this notebook (not c.split_dataset).
# NOTE(review): the recorded output below shows 10 feature columns although
# the visible load cell uses crop=False -- the output appears to come from a
# different kernel state; re-run from a fresh kernel to confirm.
X_train, y_train, X_test, y_test = split_dataset(X,y, style='prop')
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(522910, 10) (522910,) (58102, 10) (58102,)


In [38]:
# Fit Gaussian naive Bayes on the training partition and predict the test
# partition. Note `_classifier` is an instance here, whereas the
# cross-validation cell binds the same name to the class.
_classifier = nb.GaussianNB()
_classifier = _classifier.fit(X_train, y_train)
y_pred = _classifier.predict(X_test)

In [43]:
# Both label vectors are shifted down by one before building the 7x7
# confusion matrix (presumably so labels 1..7 become indices 0..6 -- confirm
# in `codes`); the metrics tables are then rendered.
conf = c.construct_confusion_matrix(y_test-1, y_pred-1, dim=7)
df_total, df_class, df_conf = c.metrics_wrapper(conf, cnames, do_display=True)

Average/overall metrics:


Unnamed: 0,Average F-Meas,Average Precision,Average Recall,Overall Accuracy
Class-Averaged or Overall:,0.45,0.43,0.47,0.63


Class-specific metrics:


Unnamed: 0_level_0,Class,F-Meas,Precision,Recall
Confusion Matrix Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Spruce/Fir,0.65,0.63,0.68
1,Lodgepole Pine,0.69,0.73,0.65
2,Ponderosa Pine,0.56,0.49,0.65
3,Cottonwood/Willow,0.45,0.42,0.49
4,Aspen,0.2,0.18,0.22
5,Douglas-fir,0.31,0.31,0.3
6,Krummholz,0.28,0.28,0.29


Confusion matrix (yellow = col max; red = row max):


Predicted,0,1,2,3,4,5,6
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,14307,5321,152,0,198,69,1137
1,6842,18477,1247,2,705,656,402
2,0,705,2325,132,10,403,0
3,0,0,108,134,0,33,0
4,0,683,37,0,210,19,0
5,0,288,857,54,13,525,0
6,1439,8,14,0,0,0,590


In [64]:
#hidden layer number = mean([features,columns])
_classifier = nn.MLPClassifier(hidden_layer_sizes=[2])
_classifier = _classifier.fit(X_train, y_train)
y_pred = _classifier.predict(X_test)

In [82]:

# Sweep the hidden-layer width of a one-hidden-layer MLP, recording the
# fit+predict wall time and the overall metrics for each width.
#layer_sizes = np.arange(2,202,2)   # full sweep (slow)
layer_sizes = np.arange(2,6,2)
runtimes = np.zeros(layer_sizes.shape[0])

print(layer_sizes,runtimes)

# One df_total per layer size; concatenated once after the loop instead of
# growing a DataFrame inside it.
totals = []
for i, layer_size in enumerate(layer_sizes):
    start = time.time()
    _classifier = nn.MLPClassifier(hidden_layer_sizes=layer_size)
    _classifier = _classifier.fit(X_train, y_train)
    y_pred = _classifier.predict(X_test)
    # timing covers training and prediction only, not the metrics below
    runtimes[i] = time.time()-start
    conf = c.construct_confusion_matrix(y_test-1, y_pred-1, dim=7)
    df_total, df_class, df_conf = c.metrics_wrapper(conf, cnames, do_display=False)
    totals.append(df_total)

# The original cell ended the loop with a dangling `df_tots = ` (a
# SyntaxError). Presumably the intent was to gather the overall metrics for
# every layer size -- confirm; here they are stacked with the layer size as
# the outer index level.
df_tots = pd.concat(totals, keys=layer_sizes.tolist())

print(runtimes)

[2 4] [0. 0.]
[28.47710705 44.28450298]


In [83]:
# Recompute the confusion matrix and metric tables for whatever `y_pred`
# currently holds in the kernel, without rendering them (do_display=False).
conf = c.construct_confusion_matrix(y_test-1, y_pred-1, dim=7)
df_total, df_class, df_conf = c.metrics_wrapper(conf, cnames, do_display=False)

In [92]:
# Scratch check: concatenating along axis=1 places the frames side by side,
# so the column count doubles while the row count is unchanged.
blah = pd.concat([df_conf,df_conf], axis=1)
print(blah.shape, df_conf.shape)

(7, 14) (7, 7)


In [66]:
# Build and render the metric tables for the current `y_pred`.
# NOTE(review): this cell's execution count (66) predates the layer-size
# sweep cell, so the output below presumably belongs to the earlier
# hidden_layer_sizes=[2] fit -- confirm by re-running in order.
conf = c.construct_confusion_matrix(y_test-1, y_pred-1, dim=7)
df_total, df_class, df_conf = c.metrics_wrapper(conf, cnames, do_display=True)

Average/overall metrics:


Unnamed: 0,Average F-Meas,Average Precision,Average Recall,Overall Accuracy
Class-Averaged or Overall:,0.15,0.21,0.2,0.5


Class-specific metrics:


Unnamed: 0_level_0,Class,F-Meas,Precision,Recall
Confusion Matrix Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Spruce/Fir,0.02,0.58,0.01
1,Lodgepole Pine,0.66,0.51,0.96
2,Ponderosa Pine,0.38,0.36,0.4
3,Cottonwood/Willow,0.0,0.0,0.0
4,Aspen,0.0,0.0,0.0
5,Douglas-fir,0.0,0.0,0.0
6,Krummholz,0.0,0.0,0.0


Confusion matrix (yellow = col max; red = row max):


Predicted,0,1,2,3,4,5,6
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,267,20355,562,0,0,0,0
1,38,27144,1149,0,0,0,0
2,0,2140,1435,0,0,0,0
3,0,168,107,0,0,0,0
4,0,853,96,0,0,0,0
5,1,1160,576,0,0,0,0
6,156,1885,10,0,0,0,0
