# The goal of this project is to train a classification model that can accurately predict a samples Iron Deficiency Chlorosis (IDC) rating, given the percentages of green, yellow, and brown that exists within the sample.



# 3 models (M0, M1, & M2) are trained. Training data consists of color percentages as features (vector [%green %yellow %brown]) and IDC ratings as labels (IDC Bins for Model M0, see below).

**Although Model MB is trained, it is not used in the final hierarchical model.**

## The training data for Model MB is left as is. Features are percentages of green, yellow, and brown for each sample, and labels are the respective IDC rating for each sample.


## <u>Model M0 Training Data</u>
### The training data for Model M0 is first binned. 
* Samples with IDC ratings 1 & 2 belong to Bin 1. 
* Samples with IDC rating 3 belongs to Bin 2.
* Samples with IDC ratings 4 & 5 belong to Bin 3.

Therefore, the features are the color percentages, and the labels are the bin numbers.

## <u>Model M1 Training Data</u>
### The training data for Model M1 consists of only samples where the IDC rating is 1 or 2.
The features are the color percentages, and the labels are ratings 1 or 2.


## <u>Model M2 Training Data</u>
### The training data for Model M2 consists of only samples where the IDC rating is 4 or 5.
The features are the color percentages, and the labels are ratings 4 or 5.


## The final hierarchical model uses the following logic:

1. Use Model M0 to predict the bin (1, 2, or 3) the sample belongs to.
2. If the predicted bin is 1, use Model M1 to predict an IDC rating of 1 or 2.
3. Else if Model M0 predicts the bin to be 2, return an IDC Rating of 3.
4. Else if Model M0 predicts the bin to be 3, use Model M2 to predict and IDC Rating of 4 or 5.

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import imblearn
print(imblearn.__version__)
#import models and performance metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn import svm
from sklearn import svm as svm_linear

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split as tts
import sklearn
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from matplotlib import pyplot
import pandas as pd

print(imblearn.__version__)
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import KMeansSMOTE


import dataframe_image as dfi

0.9.0
0.9.0


In [5]:
import os
#os.chdir('E:/Discovery_OneDrive_therin_young/isu_fall2020/research_projects/soydisease/colorseg')
#df = pd.read_csv('fullcanopy_data_2021colors_final.csv',index_col = False)


os.chdir('C:/Users/theri/OneDrive/isu_phd_files/Projects/soydisease/paper2_workflows_and_data/2021_tls_soybean/2023_experiments/data_v2/')

In [6]:
# read input data
df = pd.read_csv('fullcanopy_colorpercentages_output.csv',index_col = False)

In [7]:
# rename columns
df.rename(columns = {'percgreen':'%g','percyellow':'%y','percbrown':'%b','idc_score':'rating'},inplace=True)

In [8]:
#update datatype for rating column to integer 
df['rating'] = df['rating'].astype(int)

In [9]:
#define features (X) and classification labels (y)
X = np.array(df[['%b','%g','%y']])
y = np.array(df['rating'])

In [10]:
#identify the number of samples within the rating with the highest number of samples
u, inv = np.unique(df.rating, return_inverse=True)
counts = np.bincount(inv)
max_count = max(counts)

# Establishing Datasets

## 1a. Model MB Data

In [11]:
#define features (X) and classification labels (y)
X = np.array(df[['%b','%g','%y']])
y = np.array(df['rating'])

u, inv = np.unique(y, return_inverse=True)
counts = np.bincount(inv)
max_count_base = max(counts)
print(max_count_base)

226


## 1b. Model M0 Data

In [12]:
#bin the ratings, ratings 1 & 2 go to bin 1, rating 3 goes to bin 2, ratings 4 & 5 go to bin 3
# and create a new column for the bins

bin_lst = []
for i in range(0,df.shape[0]):
    if df['rating'][i] == 1:
        bin_lst.append(1)
    elif df['rating'][i] == 2:
        bin_lst.append(1)
    elif df['rating'][i] == 3:
        bin_lst.append(2)
    elif df['rating'][i] == 4:
        bin_lst.append(3)
    elif df['rating'][i] == 5:
        bin_lst.append(3)

df['severity_level'] = bin_lst

In [13]:
#define features (X) and classification labels (y)
X_m0 = np.array(df[['%b','%g','%y']])
y_m0 = np.array(df['severity_level'])

u, inv = np.unique(y_m0, return_inverse=True)
counts = np.bincount(inv)
max_count_m0 = max(counts)
print(max_count_m0)

390


In [14]:
print(counts)

[209 123 390]


## 1c. Model M1 Data

In [15]:
#model1 dataset
X_m1 = X[((y ==1)|(y == 2))]
y_m1 = y[((y == 1)|(y == 2))]

u, inv = np.unique(y_m1, return_inverse=True)
counts = np.bincount(inv)
max_count_m1 = max(counts)
print(max_count_m1)

124


In [16]:
print(counts)

[ 85 124]


## 1d. Model M2 Data

In [17]:
#model2 dataset
X_m2 = X[((y == 4)|(y == 5))]
y_m2 = y[((y == 4)|(y == 5))]

u, inv = np.unique(y_m2, return_inverse=True)
counts = np.bincount(inv)
max_count_m2 = max(counts)
print(counts)

[164 226]


In [18]:
def misclassification_cost(arg1, arg2):
    
    # function for calculating the missclassification cost of a classifier given test labels and predicted labels returned from the
    # trained classifier.
    
    '''
    inputs:
    arg1 = array of test labels
    arg2 = array of predicted labels
    
    returns:
    misclassification cost
    '''

    #print confusion matrix
    CM = confusion_matrix(arg1,arg2)
    print(CM)

    #define cost matrix shape
    cM = np.zeros(CM.shape)

    #assign weights to cost matrix
    if cM.shape == (3,3):
        cM[0] = [0,1,2]
        cM[1] = [1,0,1]
        cM[2] = [2,1,0]

    # for binary classification
    elif cM.shape ==(2,2):
        cM[0] = [0,1]
        cM[1] = [1,0]

    elif cM.shape == (5,5):
        cM[0] = [0,1,2,3,4]
        cM[1] = [1,0,1,2,3]
        cM[2] = [2,1,0,1,2]
        cM[3] = [3,2,1,0,1]
        cM[4] = [4,3,2,1,0]

    #calculate classification cost
    cM_matrix = np.matrix(CM * cM)
    clcost = cM_matrix.sum()/arg2.shape[0]
    
    return(clcost)

In [19]:
#Calculate performance of hierarchical classifier for unbinned as-is data

def add_hierarchical_results(arg1, arg2, arg3):
    
    '''
    This is a function that adds classification performance of the 
    hierarchical classifier to either the unbinned or binned classifier data dataframe
    
    arg1: array of test labels
    arg2: array of predicted labels
    arg3: dataframe returned from classification_pipeline0 for hierarchical classifier data to be added to
    '''
    
    value = 'hierarchical'
    if value in list(arg3['Model']):
        print('hierarchical data already exists')
        return(arg3)
    else:       
  

        report_dict = classification_report(arg1,arg2,output_dict=True)
        model = 'hierarchical'
        accuracy = report_dict['accuracy']
        mpca = report_dict['macro avg']['recall']
        f1_wt = report_dict['weighted avg']['f1-score']
        cost = misclassification_cost(arg1,arg2)
        unique_predictions = np.unique(arg2)

        #Add hierarchical classification results for as-is data to unbinned classification results for as-is data
        table = arg3
        table.loc[len(table.index)] = model,accuracy,mpca,f1_wt,cost,unique_predictions

        return (table)
    

# Define the Classification Model

In [42]:
def classification_model(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8):
    
    '''
    arg1: Input array (features) - Training data (features)
    arg2: Target array  - Training data (labels)
    arg3: number of items in the majority class (max count)
    arg4: random seed for splitting data
    arg5: oversample rate
    arg6: undersample rate
    arg7: undersample dictionary
    arg8: oversample dictionary
    '''
    
    #Define SMOTE strategies to be used during training
    SMOTE_dict = {'SMOTE': 'sampling_strategy = over_sampling_dict,random_state = %d' % (42),
                  'BorderlineSMOTE': 'sampling_strategy = over_sampling_dict,random_state = %d' % (42),        
                 'SVMSMOTE': 'sampling_strategy = over_sampling_dict,random_state = %d' % (42)}
    
    
    #create dataframe for storing classification results
    output = pd.DataFrame()
    
    for key_1 in SMOTE_dict.keys():
        
        print(key_1)
    
    
    
    
        #Define classifiers to use during training
        modelDict = {'DecisionTreeClassifier':'max_depth = %d,random_state = %d' % (4,0),
                     'RandomForestClassifier':'',
                     'svm.SVC':'kernel="rbf"', 
                     'KNeighborsClassifier':'n_neighbors = 4',
                     'LinearDiscriminantAnalysis':'solver = "lsqr",shrinkage = 0.02',
                     'QuadraticDiscriminantAnalysis':'',
                     'svm_linear.SVC':'kernel="linear"'
                    }

        model_lst = list(modelDict.keys())  
    

        #create lists for storing model scores for cross-validation
        models = []
        accuracy = []
        mpca = []
        f1_wt = []

        #create lists for storing model scores
        accuracy2 = []
        mpca2 = []
        f1_wt2 = []
        cost2 = []
        unique2 = []
        smote = []
    
    
    
        for key in modelDict.keys():
            print(key)
            models.append(key)
            smote.append(key_1)

            #define classifier with parameters including penalizing parameters
            clf = eval('%s(%s)' % (key,modelDict[key]))

            #define over and under sampling rates
            over_sample_rate = arg5
            under_sample_rate = arg6


            #assign sampling rates to dictionaries
            under_sampling_dict = arg7
            over_sampling_dict = arg8

            # define pipeline
            over = eval('%s(%s)' % (key_1,SMOTE_dict[key_1]))
            under = RandomUnderSampler(sampling_strategy=under_sampling_dict, random_state=1)
            steps = [('o', over),('u',under), ('model',clf)]
            pipeline = Pipeline(steps=steps)


            #evaluate pipeline

            # define cross-validation method for model evaluation
            cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)       

            
            #calculate cross-validated mean per class accuracy (recall macroaverage from classification report)
            results = cross_validate(pipeline, arg1, arg2, scoring=['recall_macro','accuracy','f1_macro','f1_weighted'], cv=cv, n_jobs=-1,error_score='raise')

            #append cross-validation results to respective lists
            accuracy.append(results['test_accuracy'].mean())
            mpca.append(results['test_recall_macro'].mean())
            f1_wt.append(results['test_f1_weighted'].mean())
            




            #train each model with training data and predict on test data

            # Split the data into training and testing sets
            X_train, X_test, y_train, y_test = tts(arg1, arg2, test_size=0.03, random_state=arg4,stratify=arg2)

            # Create a dictionary to store the indices of each label in the training set
            label_indices = {}
            for i in range(len(y_train)):
                label = y_train[i]
                if label not in label_indices:
                    label_indices[label] = []
                label_indices[label].append(i)

            # Create a list to store the 5 samples from each label
            samples = []
            for label in label_indices:
                label_samples = label_indices[label][:5] # Get the first 5 indices for each label
                for idx in label_samples:
                    samples.append(idx)

            # Get the data and labels for the selected samples
            X_test = X_train[samples]
            y_test = y_train[samples]
            

            
            print(np.unique(y_train))
            #train models
            model = pipeline.fit(X_train, y_train)  
            

            #predict on test data
            y_hat = model.predict(X_test)      
            print(classification_report(y_test, y_hat))

            #print confusion matrix
            CM = confusion_matrix(y_test,y_hat)
            print(CM)


            #capture classification accuracy metrics
            report_dict = classification_report(y_test,y_hat,output_dict=True)

            #mean per class accuracy
            mpca2.append(report_dict['macro avg']['recall']) #mean per class accuracy

            #return f1 score
            f1_wt2.append(report_dict['weighted avg']['f1-score'])

            #accuracy
            accuracy2.append(report_dict['accuracy'])   

            #misclassification cost
            cost2.append(misclassification_cost(y_test,y_hat))

            #unique label predictions
            unique2.append(np.unique(y_hat))
   
        
        
        #create and populate dataframe with cross-validation results
        df_scores = pd.DataFrame()
        df_scores['Model'] = models
        df_scores['CV Accuracy'] = accuracy
        df_scores['CV MPCA'] = mpca
        df_scores['CV F1_weighted'] = f1_wt
        


        #create and populate dataframe with trained model results
        df_scores2 = pd.DataFrame()
        df_scores2['SMOTE MDL'] = smote
        df_scores2['Model'] = models
        df_scores2['Accuracy'] = accuracy2
        df_scores2['MPCA'] = mpca2
        df_scores2['F1_weighted'] = f1_wt2
        df_scores2['Misclassification_Cost'] = cost2
        df_scores2['Unique Predictions'] = unique2
        
        output = output.append(df_scores2)
        
        
        print(key_1)
        
    
    return(df_scores2,model,X_test,y_test,X_train,y_train,df_scores,output)

In [41]:
#Define SMOTE undersampling and oversampling rates for each model's dataset
'''
osr = oversample rate
usr = undersample rate
usd = undersample dictionary
osd = oversample dictionary
'''

mb_osr = int(0.7*max_count_base)
mb_usr = int(0.7*max_count_base)
mb_usd = {}
mb_osd = {1:mb_osr,2:mb_osr}

m0_osr = int(0.7*max_count_m0)
m0_usr = int(0.7*max_count_m0)
m0_usd = {}
m0_osd = {1:int(0.7*max_count_m0)}

m1_osr = int(0.85*max_count_m1)
m1_usr = int(0.7*max_count_m1)
m1_usd = {}
m1_osd = {1:m1_osr}

m2_osr = int(0.85*max_count_m2)
m2_usr = {}
m2_usd = {}
m2_osd = {}

In [29]:
# go to location of input data
os.chdir('C:/Users/theri/OneDrive/isu_phd_files/Projects/soydisease/paper2_workflows_and_data/2021_tls_soybean/2023_experiments/full_canopy_v2')

In [30]:
# Varying random seed when splitting test and training data to assess variance in classifier accuracy
rdm_seed_lst = [42,43,44,45,46]

In [36]:
#define arguments for the classification model for each dataset
mb_args = [X,y,max_count_base,rdm_seed_lst[i],mb_osr,mb_usr,mb_usd,mb_osd]
m0_args = [X_m0,y_m0,max_count_m0,rdm_seed_lst[i],m0_osr,m0_usr,m0_usd,m0_osd]
m1_args = [X_m1,y_m1,max_count_m1,rdm_seed_lst[i],m1_osr,m1_usr,m1_usd,m1_osd]
m2_args = [X_m2,y_m2,max_count_m2,rdm_seed_lst[i],m2_osr,m2_usr,m2_usd,m2_osd]

In [37]:
# train model on each dataset using the arguments defined above

output_mb = pd.DataFrame()
output_m0 = pd.DataFrame()
output_m1 = pd.DataFrame()
output_m2 = pd.DataFrame()


for i in range(len(rdm_seed_lst)): 
    mb_results = classification_model(*mb_args)
    m0_results = classification_model(*m0_args)
    m1_results = classification_model(*m1_args)
    m2_results = classification_model(*m2_args)
    
    output_mb = output_mb.append(mb_results[7])
    output_m0 = output_m0.append(m0_results[7])
    output_m1 = output_m1.append(m1_results[7])
    output_m2 = output_m2.append(m2_results[7])
    
#output.to_excel('mb_output_2021colors_EX1B.xlsx')

SMOTE
DecisionTreeClassifier
[1 2 3 4 5]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         5
           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         5

    accuracy                           1.00        25
   macro avg       1.00      1.00      1.00        25
weighted avg       1.00      1.00      1.00        25

[[5 0 0 0 0]
 [0 5 0 0 0]
 [0 0 5 0 0]
 [0 0 0 5 0]
 [0 0 0 0 5]]
[[5 0 0 0 0]
 [0 5 0 0 0]
 [0 0 5 0 0]
 [0 0 0 5 0]
 [0 0 0 0 5]]
RandomForestClassifier
[1 2 3 4 5]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         5
           4       1.00      1.00      1.00         5
           5       1.00      1.0

[[5 0 0 0 0]
 [0 5 0 0 0]
 [0 0 5 0 0]
 [0 0 0 5 0]
 [0 0 0 0 5]]
BorderlineSMOTE
SVMSMOTE
DecisionTreeClassifier
[1 2 3 4 5]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         5
           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         5

    accuracy                           1.00        25
   macro avg       1.00      1.00      1.00        25
weighted avg       1.00      1.00      1.00        25

[[5 0 0 0 0]
 [0 5 0 0 0]
 [0 0 5 0 0]
 [0 0 0 5 0]
 [0 0 0 0 5]]
[[5 0 0 0 0]
 [0 5 0 0 0]
 [0 0 5 0 0]
 [0 0 0 5 0]
 [0 0 0 0 5]]
RandomForestClassifier
[1 2 3 4 5]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         5
 

[1 2 3]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         5

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15

[[5 0 0]
 [0 5 0]
 [0 0 5]]
[[5 0 0]
 [0 5 0]
 [0 0 5]]
svm.SVC
[1 2 3]
              precision    recall  f1-score   support

           1       0.71      1.00      0.83         5
           2       1.00      0.60      0.75         5
           3       1.00      1.00      1.00         5

    accuracy                           0.87        15
   macro avg       0.90      0.87      0.86        15
weighted avg       0.90      0.87      0.86        15

[[5 0 0]
 [2 3 0]
 [0 0 5]]
[[5 0 0]
 [2 3 0]
 [0 0 5]]
KNeighborsClassifier
[1 2 3]
              precision    recall  f1-score   support

           1      

[1 2]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

[[5 0]
 [0 5]]
[[5 0]
 [0 5]]
SMOTE
BorderlineSMOTE
DecisionTreeClassifier
[1 2]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

[[5 0]
 [0 5]]
[[5 0]
 [0 5]]
RandomForestClassifier
[1 2]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5

    accuracy                        

[4 5]
              precision    recall  f1-score   support

           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

[[5 0]
 [0 5]]
[[5 0]
 [0 5]]
svm.SVC
[4 5]
              precision    recall  f1-score   support

           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

[[5 0]
 [0 5]]
[[5 0]
 [0 5]]
KNeighborsClassifier
[4 5]
              precision    recall  f1-score   support

           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.

[1 2 3 4 5]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         5
           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         5

    accuracy                           1.00        25
   macro avg       1.00      1.00      1.00        25
weighted avg       1.00      1.00      1.00        25

[[5 0 0 0 0]
 [0 5 0 0 0]
 [0 0 5 0 0]
 [0 0 0 5 0]
 [0 0 0 0 5]]
[[5 0 0 0 0]
 [0 5 0 0 0]
 [0 0 5 0 0]
 [0 0 0 5 0]
 [0 0 0 0 5]]
SMOTE
BorderlineSMOTE
DecisionTreeClassifier
[1 2 3 4 5]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       0.83      1.00      0.91         5
           3       1.00      0.80      0.89         5
           4       1.00      1.00      1.00         5
           5       1.00      1.00      

[1 2 3 4 5]
              precision    recall  f1-score   support

           1       0.83      1.00      0.91         5
           2       1.00      0.80      0.89         5
           3       1.00      1.00      1.00         5
           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         5

    accuracy                           0.96        25
   macro avg       0.97      0.96      0.96        25
weighted avg       0.97      0.96      0.96        25

[[5 0 0 0 0]
 [1 4 0 0 0]
 [0 0 5 0 0]
 [0 0 0 5 0]
 [0 0 0 0 5]]
[[5 0 0 0 0]
 [1 4 0 0 0]
 [0 0 5 0 0]
 [0 0 0 5 0]
 [0 0 0 0 5]]
svm_linear.SVC
[1 2 3 4 5]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         5
           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         5

    accuracy  

[1 2 3]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         5

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15

[[5 0 0]
 [0 5 0]
 [0 0 5]]
[[5 0 0]
 [0 5 0]
 [0 0 5]]
svm.SVC
[1 2 3]
              precision    recall  f1-score   support

           1       0.83      1.00      0.91         5
           2       1.00      0.80      0.89         5
           3       1.00      1.00      1.00         5

    accuracy                           0.93        15
   macro avg       0.94      0.93      0.93        15
weighted avg       0.94      0.93      0.93        15

[[5 0 0]
 [1 4 0]
 [0 0 5]]
[[5 0 0]
 [1 4 0]
 [0 0 5]]
KNeighborsClassifier
[1 2 3]
              precision    recall  f1-score   support

           1      

              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

[[5 0]
 [0 5]]
[[5 0]
 [0 5]]
RandomForestClassifier
[1 2]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

[[5 0]
 [0 5]]
[[5 0]
 [0 5]]
svm.SVC
[1 2]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00  

[4 5]
              precision    recall  f1-score   support

           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

[[5 0]
 [0 5]]
[[5 0]
 [0 5]]
svm.SVC
[4 5]
              precision    recall  f1-score   support

           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

[[5 0]
 [0 5]]
[[5 0]
 [0 5]]
KNeighborsClassifier
[4 5]
              precision    recall  f1-score   support

           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.

[[5 0 0 0 0]
 [0 5 0 0 0]
 [0 1 4 0 0]
 [0 0 0 5 0]
 [0 0 0 0 5]]
KNeighborsClassifier
[1 2 3 4 5]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       0.83      1.00      0.91         5
           3       1.00      0.80      0.89         5
           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         5

    accuracy                           0.96        25
   macro avg       0.97      0.96      0.96        25
weighted avg       0.97      0.96      0.96        25

[[5 0 0 0 0]
 [0 5 0 0 0]
 [0 1 4 0 0]
 [0 0 0 5 0]
 [0 0 0 0 5]]
[[5 0 0 0 0]
 [0 5 0 0 0]
 [0 1 4 0 0]
 [0 0 0 5 0]
 [0 0 0 0 5]]
LinearDiscriminantAnalysis
[1 2 3 4 5]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         5
           4       1.00 

[[5 0 0]
 [0 5 0]
 [0 0 5]]
KNeighborsClassifier
[1 2 3]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         5

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15

[[5 0 0]
 [0 5 0]
 [0 0 5]]
[[5 0 0]
 [0 5 0]
 [0 0 5]]
LinearDiscriminantAnalysis
[1 2 3]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         5

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15

[[5 0 0]
 [0 5 0]
 [0 0 5]]
[[5 0 0]
 [0 5 0]
 [0 0 5]]
QuadraticDiscriminantAnalysis
[1 2

[1 2]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

[[5 0]
 [0 5]]
[[5 0]
 [0 5]]
svm.SVC
[1 2]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

[[5 0]
 [0 5]]
[[5 0]
 [0 5]]
KNeighborsClassifier
[1 2]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.

[[5 0]
 [0 5]]
svm.SVC
[4 5]
              precision    recall  f1-score   support

           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

[[5 0]
 [0 5]]
[[5 0]
 [0 5]]
KNeighborsClassifier
[4 5]
              precision    recall  f1-score   support

           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

[[5 0]
 [0 5]]
[[5 0]
 [0 5]]
LinearDiscriminantAnalysis
[4 5]
              precision    recall  f1-score   support

           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         5

    accuracy                     

[[5 0 0 0 0]
 [0 5 0 0 0]
 [0 0 5 0 0]
 [0 0 0 5 0]
 [0 0 0 0 5]]
[[5 0 0 0 0]
 [0 5 0 0 0]
 [0 0 5 0 0]
 [0 0 0 5 0]
 [0 0 0 0 5]]
KNeighborsClassifier
[1 2 3 4 5]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         5
           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         5

    accuracy                           1.00        25
   macro avg       1.00      1.00      1.00        25
weighted avg       1.00      1.00      1.00        25

[[5 0 0 0 0]
 [0 5 0 0 0]
 [0 0 5 0 0]
 [0 0 0 5 0]
 [0 0 0 0 5]]
[[5 0 0 0 0]
 [0 5 0 0 0]
 [0 0 5 0 0]
 [0 0 0 5 0]
 [0 0 0 0 5]]
LinearDiscriminantAnalysis
[1 2 3 4 5]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5
           3

[1 2 3 4 5]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         5
           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         5

    accuracy                           1.00        25
   macro avg       1.00      1.00      1.00        25
weighted avg       1.00      1.00      1.00        25

[[5 0 0 0 0]
 [0 5 0 0 0]
 [0 0 5 0 0]
 [0 0 0 5 0]
 [0 0 0 0 5]]
[[5 0 0 0 0]
 [0 5 0 0 0]
 [0 0 5 0 0]
 [0 0 0 5 0]
 [0 0 0 0 5]]
KNeighborsClassifier
[1 2 3 4 5]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       0.83      1.00      0.91         5
           3       1.00      0.80      0.89         5
           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         5

    accu

[1 2 3]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         5

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15

[[5 0 0]
 [0 5 0]
 [0 0 5]]
[[5 0 0]
 [0 5 0]
 [0 0 5]]
svm_linear.SVC
[1 2 3]
              precision    recall  f1-score   support

           1       0.71      1.00      0.83         5
           2       1.00      0.60      0.75         5
           3       1.00      1.00      1.00         5

    accuracy                           0.87        15
   macro avg       0.90      0.87      0.86        15
weighted avg       0.90      0.87      0.86        15

[[5 0 0]
 [2 3 0]
 [0 0 5]]
[[5 0 0]
 [2 3 0]
 [0 0 5]]
BorderlineSMOTE
SVMSMOTE
DecisionTreeClassifier
[1 2 3]
              precision    recall  f1-s

[1 2]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

[[5 0]
 [0 5]]
[[5 0]
 [0 5]]
svm_linear.SVC
[1 2]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

[[5 0]
 [0 5]]
[[5 0]
 [0 5]]
BorderlineSMOTE
SVMSMOTE
DecisionTreeClassifier
[1 2]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5

    accuracy                           1.

[4 5]
              precision    recall  f1-score   support

           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

[[5 0]
 [0 5]]
[[5 0]
 [0 5]]
RandomForestClassifier
[4 5]
              precision    recall  f1-score   support

           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

[[5 0]
 [0 5]]
[[5 0]
 [0 5]]
svm.SVC
[4 5]
              precision    recall  f1-score   support

           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       

[[5 0 0 0 0]
 [0 5 0 0 0]
 [0 0 5 0 0]
 [0 0 0 5 0]
 [0 0 0 0 5]]
svm.SVC
[1 2 3 4 5]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       0.83      1.00      0.91         5
           3       1.00      0.80      0.89         5
           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         5

    accuracy                           0.96        25
   macro avg       0.97      0.96      0.96        25
weighted avg       0.97      0.96      0.96        25

[[5 0 0 0 0]
 [0 5 0 0 0]
 [0 1 4 0 0]
 [0 0 0 5 0]
 [0 0 0 0 5]]
[[5 0 0 0 0]
 [0 5 0 0 0]
 [0 1 4 0 0]
 [0 0 0 5 0]
 [0 0 0 0 5]]
KNeighborsClassifier
[1 2 3 4 5]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       0.83      1.00      0.91         5
           3       1.00      0.80      0.89         5
           4       1.00      1.00      1.00

[1 2 3]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         5

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15

[[5 0 0]
 [0 5 0]
 [0 0 5]]
[[5 0 0]
 [0 5 0]
 [0 0 5]]
svm.SVC
[1 2 3]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         5

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15

[[5 0 0]
 [0 5 0]
 [0 0 5]]
[[5 0 0]
 [0 5 0]
 [0 0 5]]
KNeighborsClassifier
[1 2 3]
              precision    recall  f1-score   support

           1      

 [0 0 5]]
svm_linear.SVC
[1 2 3]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         5

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15

[[5 0 0]
 [0 5 0]
 [0 0 5]]
[[5 0 0]
 [0 5 0]
 [0 0 5]]
SVMSMOTE
SMOTE
DecisionTreeClassifier
[1 2]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

[[5 0]
 [0 5]]
[[5 0]
 [0 5]]
RandomForestClassifier
[1 2]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00  

[1 2]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

[[5 0]
 [0 5]]
[[5 0]
 [0 5]]
SVMSMOTE
SMOTE
DecisionTreeClassifier
[4 5]
              precision    recall  f1-score   support

           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

[[5 0]
 [0 5]]
[[5 0]
 [0 5]]
RandomForestClassifier
[4 5]
              precision    recall  f1-score   support

           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         5

    accuracy                           1.00

#  Hierarchical classification with trained models (M0, M1, and M2)

### <u>Logic</u>
1. Use Model M0 to predict the bin (1, 2, or 3) the sample belongs to.
2. If the predicted bin is 1, use Model M1 to predict an IDC rating of 1 or 2.
3. Else if Model M0 predicts the bin to be 2, return an IDC Rating of 3.
4. Else if Model M0 predicts the bin to be 3, use Model M2 to predict and IDC Rating of 4 or 5.


In [39]:
#hierarchical classification of validation data

rdm_seed_lst = [42,43,44,45,46]


seed = []
mpca = []
f1_wt = []
cost = []
accuracy = []
unique_predictions = []



for i in range(len(rdm_seed_lst)):
    seed.append(rdm_seed_lst[i])
    
    #mb = smote_classifier_base(X,y,max_count_base,rdm_seed_lst[i
    X_train, X_test, y_train, y_test = tts(X,y, random_state=i, stratify = y)
    mbx = [X_train,  y_train, X_test, y_test]

    pred_list = []
    for i in range(0,len(mbx[2])):
        if m0_results[1].predict(mbx[2])[i] == 1:    #idc score is 1 or 2
            pred = m1_results[1].predict(mbx[2])[i]  #use model 1 to classify as 1 or 2
            pred_list.append(pred)                          
        elif m0_results[1].predict(mbx[2])[i] == 2:
            pred = 3
            pred_list.append(pred)
        elif m0_results[1].predict(mbx[2])[i] == 3:   #idc score is 4 or 5
            pred = m2_results[1].predict(mbx[2])[i]   #use model 2 to classify as 4 or 5
            pred_list.append(pred)
    pred_list = np.array(pred_list)                         

    print(classification_report(mbx[3],pred_list))
    
    #print classification report
    report_dict = classification_report(mbx[3],pred_list,output_dict=True)

    #mean per-class accuracy
    mpca.append(report_dict['macro avg']['recall']) #mean per class accuracy

    #return f1 score
    f1_wt.append(report_dict['weighted avg']['f1-score'])

    #accuracy
    accuracy.append(accuracy_score(pred_list,mbx[3]))       

    #uniqueness of the predictions
    unique_predictions.append(np.unique(pred_list))

    #misclassification cost
    cost.append(misclassification_cost(mbx[3],pred_list))
    
df_hierarchy_cv = pd.DataFrame()
df_hierarchy_cv['Seed'] = seed
df_hierarchy_cv['Accuracy'] = accuracy
df_hierarchy_cv['MPCA'] = mpca
df_hierarchy_cv['F1_weighted'] = f1_wt
df_hierarchy_cv['Misclassification Cost'] = cost

#df_hierarchy_cv.to_excel('hierarchy_results.xlsx')

              precision    recall  f1-score   support

           1       0.95      1.00      0.98        21
           2       0.88      0.97      0.92        31
           3       0.89      0.77      0.83        31
           4       0.93      0.90      0.91        41
           5       0.97      0.98      0.97        57

    accuracy                           0.93       181
   macro avg       0.92      0.93      0.92       181
weighted avg       0.93      0.93      0.93       181

[[21  0  0  0  0]
 [ 0 30  1  0  0]
 [ 1  4 24  2  0]
 [ 0  0  2 37  2]
 [ 0  0  0  1 56]]
              precision    recall  f1-score   support

           1       0.95      1.00      0.98        21
           2       0.94      1.00      0.97        31
           3       0.96      0.84      0.90        31
           4       0.93      0.95      0.94        41
           5       0.98      0.98      0.98        57

    accuracy                           0.96       181
   macro avg       0.95      0.95      0

In [45]:
df_hierarchy_cv

Unnamed: 0,Seed,Accuracy,MPCA,F1_weighted,Misclassification Cost
0,42,0.928177,0.925366,0.926807,0.077348
1,43,0.955801,0.954477,0.955061,0.049724
2,44,0.955801,0.948387,0.954941,0.049724
3,45,0.955801,0.959198,0.955301,0.044199
4,46,0.939227,0.932179,0.93736,0.066298


In [120]:
df_hierarchy_cv['Accuracy'].mean(), df_hierarchy_cv['MPCA'].mean(), df_hierarchy_cv['F1_weighted'].mean(),df_hierarchy_cv['Misclassification Cost'].mean()

(0.9425414364640885,
 0.9368351642284309,
 0.9419874255316805,
 0.060773480662983416)

In [121]:
df_hierarchy_cv['Accuracy'].std(), df_hierarchy_cv['MPCA'].std(), df_hierarchy_cv['F1_weighted'].std(),df_hierarchy_cv['Misclassification Cost'].std()

(0.012719187219029121,
 0.01725571509631964,
 0.01322261029264977,
 0.015130457389645472)