This script was adapted from the "General.ipynb" notebook, which was created to benchmark ML methods. It trains the same models as "General.ipynb" on the same training set, and then uses them to classify the new data collected with the belt.

Created by Samuel Horovatin, February 2022

In [1]:
# Data Processing Imports
import pandas as pd
import os
from skimage.transform import resize, rotate, resize 
from skimage.io import imread
import numpy as np
import glob
import pickle

# Logger Imports
import time
import logging
import sys
from logging.handlers import TimedRotatingFileHandler

# Model Specific Imports
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso

In [2]:
# Shared logging configuration: every handler uses the same record layout,
# and the file handler writes to LOG_FILE.
_LOG_RECORD_LAYOUT = "%(asctime)s —  %(levelname)s — %(message)s"
FORMATTER = logging.Formatter(_LOG_RECORD_LAYOUT)
LOG_FILE = "General_Script_Full_Set.log"

def get_console_handler():
    """Return a stream handler that writes FORMATTER-formatted records to stdout."""
    handler = logging.StreamHandler(stream=sys.stdout)
    handler.setFormatter(FORMATTER)
    return handler

def get_file_handler():
    """Return a handler that writes FORMATTER-formatted records to LOG_FILE, rotating at midnight."""
    handler = TimedRotatingFileHandler(filename=LOG_FILE, when='midnight')
    handler.setFormatter(FORMATTER)
    return handler

def get_logger(logger_name):
    """Return a DEBUG-level logger that writes to both the console and the log file.

    Handlers already attached to the named logger are removed *and closed*
    before new ones are added, so calling this repeatedly (for example by
    re-running the notebook cell) neither duplicates log entries nor leaks the
    file stream held by the previous file handler.

    Args:
        logger_name: Name passed to ``logging.getLogger``.

    Returns:
        The configured ``logging.Logger`` instance.
    """
    logger = logging.getLogger(logger_name)

    # Iterate over a copy because removeHandler mutates logger.handlers.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    logger.setLevel(logging.DEBUG)  # better to have too much log than not enough
    logger.addHandler(get_console_handler())
    logger.addHandler(get_file_handler())
    # Records are fully handled here, so they are not passed up to parent loggers.
    logger.propagate = False
    return logger

# Logger shared by all cells below; get_logger attaches a console handler and a file handler to it.
logger = get_logger("General_Script")

In [3]:
# Training-data layout.
# NOTE: the folder an image is stored in dictates its health class. Every
# category pair below is ordered [Infected, Healthy].

Categories_Durum = ['CFP-CK1', 'CFP-CK2']
Categories_Bread = ['CFP-CK3', 'CFP-CK4']
Categories_Test_B179 = ['CFP-B179-A', 'CFP-B179-B']
Categories_Test_B223 = ['CFP-B223-A', 'CFP-B223-B']

All_Categories = [
    Categories_Durum,
    Categories_Bread,
    Categories_Test_B179,
    Categories_Test_B223,
]

# The infected folder is the first entry of each pair.
Infected_Categories = [infected for infected, _healthy in All_Categories]

# Directory holding the pickles, and the file name of each pickle
Pickle_location = '/birl2/users/sch923/Thesis/Data/'
Eval_Data_Pickle_Name = 'Eval_Data.pkl'
Models_Pickle_Name = 'Models_L1_LogR.pkl'
Prediction_Pickle_Name = 'Predictions_L1_LogR.pkl'

# Root directory containing one sub-folder per category
datadir = '/birl2/users/sch923/Thesis/Data/Wheat/TestSamples'

In [4]:
# Load and pre-process every labelled image, keeping one image list and one
# label list per category pair (same order as All_Categories).

img_arr, label_arr = [], []
total_imgs = total_infected = total_healthy = 0

for Categories in All_Categories:
    group_images, group_labels = [], []
    img_arr.append(group_images)
    label_arr.append(group_labels)

    # A folder's position inside its pair becomes the class label of its images.
    for class_label, folder in enumerate(Categories):
        logger.info(f'loading category: {folder}')
        path = os.path.join(datadir, folder)
        image_count = 0

        for file_name in os.listdir(path):
            raw_image = imread(os.path.join(path, file_name))
            group_images.append(resize(raw_image, (150, 150, 3)))  # every image is resized to 150x150x3
            group_labels.append(class_label)
            image_count += 1

        logger.info(f'loaded category: {folder} successfully, found {image_count} images')

        # Running totals over the entire dataset
        total_imgs += image_count
        if folder in Infected_Categories:
            total_infected += image_count
        else:
            total_healthy += image_count

logger.info(f'Number Of Healthy Kernel Images Found: {total_healthy}')
logger.info(f'Number Of Infected Kernel Images Found: {total_infected}')
logger.info(f'Total Number Of Images Found: {total_imgs}')

2022-03-01 08:34:46,413 —  INFO — loading category: CFP-CK1
2022-03-01 08:35:04,000 —  INFO — loaded category: CFP-CK1 successfully, found 43 images
2022-03-01 08:35:04,002 —  INFO — loading category: CFP-CK2
2022-03-01 08:35:12,544 —  INFO — loaded category: CFP-CK2 successfully, found 48 images
2022-03-01 08:35:12,546 —  INFO — loading category: CFP-CK3
2022-03-01 08:35:16,042 —  INFO — loaded category: CFP-CK3 successfully, found 24 images
2022-03-01 08:35:16,043 —  INFO — loading category: CFP-CK4
2022-03-01 08:35:23,805 —  INFO — loaded category: CFP-CK4 successfully, found 31 images
2022-03-01 08:35:23,806 —  INFO — loading category: CFP-B179-A
2022-03-01 08:35:30,951 —  INFO — loaded category: CFP-B179-A successfully, found 29 images
2022-03-01 08:35:30,952 —  INFO — loading category: CFP-B179-B
2022-03-01 08:35:41,858 —  INFO — loaded category: CFP-B179-B successfully, found 74 images
2022-03-01 08:35:41,859 —  INFO — loading category: CFP-B223-A
2022-03-01 08:35:43,928 —  INFO

In [5]:
# Flatten each loaded image into a 1-D array, preserving the per-group nesting of img_arr.
logger.info('Flattening images...')
flat_arr = [[image.flatten() for image in images] for images in img_arr]
logger.info('Image flattening complete!')


2022-03-01 08:35:56,921 —  INFO — Flattening images...
2022-03-01 08:35:57,010 —  INFO — Image flattening complete!


In [6]:
# Build the training dataframe (flattened pixels plus a trailing 'label' column)
# and derive the feature / label sets handed to the models.


def list_arr_fun(values):
    """Convert a list into a numpy array."""
    return np.array(values)


flat_img_list = [list_arr_fun(group) for group in flat_arr]
label_img_list = [list_arr_fun(group) for group in label_arr]

# Only the first two groups (Durum and Bread, per All_Categories order) are used for training.
training_pixels = np.concatenate((flat_img_list[0], flat_img_list[1]), axis=0)
training_labels = np.concatenate((label_img_list[0], label_img_list[1]), axis=0)

df_Training = pd.DataFrame(training_pixels)
df_Training['label'] = training_labels

x_Training = df_Training.iloc[:, :-1]  # every column except the last: image data
y_Training = df_Training.iloc[:, -1]   # last column: label data

logger.info('======= Start Of Data Split =======')
x_train, x_test, y_train, y_test = train_test_split(
    x_Training,
    y_Training,
    test_size=0.20,
    random_state=77,
    stratify=y_Training,
)
# Every image should be part of the training set, so the two halves of the
# split are concatenated back together.
x_train = pd.concat([x_train, x_test])
y_train = pd.concat([y_train, y_test])
logger.info(f'Splitted Successfully. Training Set Length = {len(y_train)}')

logger.info('Test Splitted Successfully')
logger.info('======= End Of Data Split =======\n')


2022-03-01 08:36:04,095 —  INFO — Splitted Successfully. Training Set Length = 146
2022-03-01 08:36:04,096 —  INFO — Test Splitted Successfully



In [7]:
def modelTrainer(model, x_train, y_train, model_name, method_name):
    """Fit ``model`` inside a scikit-learn pipeline and log how long training took.

    Args:
        model: Estimator (or search wrapper such as GridSearchCV) to train.
        x_train: Training features passed to ``fit``.
        y_train: Training labels passed to ``fit``.
        model_name: Name used in the "trained in" log line.
        method_name: Name used in the "Starting to train" log line.

    Returns:
        The fitted pipeline wrapping ``model``.
    """
    # The pipeline contains only the model: StandardScaler() was removed as it
    # appears to negatively affect the baseline.
    pipe = make_pipeline(model)
    logger.info(f"Starting to train {method_name} model...")
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments during a long training run.
    start = time.perf_counter()
    pipe.fit(x_train, y_train)
    elapsed = time.perf_counter() - start
    logger.info(f"The {model_name} model trained in: {elapsed} seconds")
    return pipe

In [8]:
# Reuse previously trained models when their pickle exists; otherwise train
# them and cache the result.
Models_Pickle_File = os.path.join(Pickle_location, Models_Pickle_Name)
Pickle_Path = glob.glob(Models_Pickle_File)
if not Pickle_Path:
    # Train all models

    logger.info('======= Start Of Logistic Regression Model Generation =======')
    LogR_param_grid = {
        'penalty': ['l1'],
        'C': [0.1, 1, 10, 100],
        'max_iter': [2048],
        'solver': ['liblinear', 'saga'],
    }
    LogR_model = GridSearchCV(LogisticRegression(), LogR_param_grid, n_jobs=-1)
    LogR_model = modelTrainer(LogR_model, x_train, y_train, "L1_LogR", "L1_LogR")

    # logger.info('======= Start Of SVM Model Generation =======')
    # SVM_param_grid={'C':[0.1,1,10,100],'gamma':[0.0001,0.001,0.1,1],'kernel':['rbf','poly']}
    # svc=svm.SVC(probability=True)
    # SVM_model= GridSearchCV(svm.SVC(probability=True), SVM_param_grid, n_jobs=-1)
    # SVM_model= modelTrainer(SVM_model, x_train, y_train, "SVM", "SVM")

    # logger.info('======= Start Of K Nearest Neighbors Model Generation =======')
    # KNN_param_grid={'n_neighbors':[5], 'weights':['uniform', 'distance']}
    # KNN_model= GridSearchCV(KNeighborsClassifier(), KNN_param_grid, n_jobs=-1)
    # KNN_model= modelTrainer(KNN_model, x_train, y_train, "KNN", "KNN")

    # logger.info('======= Start Of Random Forest Model Generation =======')
    # RanF_param_grid={'n_estimators':[100], 'criterion': ['gini', 'entropy']}
    # RanF_model= GridSearchCV(RandomForestClassifier(), RanF_param_grid, n_jobs=-1)
    # RanF_model= modelTrainer(RanF_model, x_train, y_train, "RanF", "RanF")

    Trained_Models = [LogR_model]
    with open(Models_Pickle_File, "wb") as file_handle:
        pickle.dump(Trained_Models, file_handle, pickle.HIGHEST_PROTOCOL)
    logger.info('Successfully created trained models pickle')
else:
    logger.info('Found Pickle, starting unpickling...')
    # Read back the same file the training branch writes (the models pickle,
    # not the evaluation-data pickle).
    # NOTE: pickle.load can execute arbitrary code; only load pickles produced by this script.
    with open(Models_Pickle_File, "rb") as file_handle:
        Trained_Models = pickle.load(file_handle)
    logger.info('Successfully loaded models pickle!')


2022-03-01 08:36:04,297 —  INFO — Starting to train L1_LogR model...




2022-03-01 08:45:21,707 —  INFO — The L1_LogR model trained in: 557.4090864658356 seconds
2022-03-01 08:45:21,724 —  INFO — Successfully created trained models pickle




In [9]:
# Location of the data to be evaluated by the models: the image directory,
# the label file, and the glob pattern used to find images.
_THESIS_DATA_ROOT = '/birl2/users/sch923/Thesis/Data'
Eval_Inpath = _THESIS_DATA_ROOT + '/phenoSEEDOutput'
Eval_Labels = _THESIS_DATA_ROOT + '/UGRep2FDK.txt'
Eval_Extension = "*.jpg"

In [10]:
# Load the evaluation images. A pickle of previously pre-processed images is
# reused when present, which saves re-indexing and re-processing every image.
Eval_Pickle_File = os.path.join(Pickle_location, Eval_Data_Pickle_Name)
Pickle_Path = glob.glob(Eval_Pickle_File)

if not Pickle_Path:
    # Collect every image matching Eval_Extension below Eval_Inpath (recursively)
    Evale_Images_Paths = glob.glob(os.path.join(Eval_Inpath, '**', Eval_Extension), recursive=True)
    if not Evale_Images_Paths:
        # Use the script's logger (not the root logger) so the error reaches the log file.
        logger.error(f' there are no {Eval_Extension} found in supplied directory: \n{Eval_Inpath}')

    # Perform the same pre-processing as for the training images: resize to
    # 150x150x3, then flatten.
    Eval_img_tuple_arr = []  # (lot name, single-column DataFrame of the flattened image)
    for img_path in Evale_Images_Paths:
        # glob already returns a usable path; it must not be joined with an unrelated directory.
        img = imread(img_path)
        Eval_img_resized = resize(img, (150, 150, 3))
        Eval_flat_df = pd.DataFrame(Eval_img_resized.flatten())

        # The lot name is the directory two levels above the image file.
        lot_name = os.path.basename(os.path.dirname(os.path.dirname(img_path)))
        Eval_img_tuple_arr.append((lot_name, Eval_flat_df))

    if Eval_img_tuple_arr:
        with open(Eval_Pickle_File, "wb") as file_handle:
            pickle.dump(Eval_img_tuple_arr, file_handle, pickle.HIGHEST_PROTOCOL)
        logger.info('Successfully created evaluation image pickle')
    else:
        # An empty pickle would be picked up on the next run and hide the problem.
        logger.warning('No evaluation images were loaded, skipping creation of evaluation image pickle')
else:
    # NOTE: pickle.load can execute arbitrary code; only load pickles produced by this script.
    with open(Eval_Pickle_File, "rb") as file_handle:
        Eval_img_tuple_arr = pickle.load(file_handle)
    logger.info('Successfully loaded eval images pickle!')

logger.info(f'Successfully loaded evaluation images, found {len(Eval_img_tuple_arr)} images')



2022-03-01 08:46:15,047 —  INFO — Successfully loaded eval images pickle!
2022-03-01 08:46:15,048 —  INFO — Successfully loaded evaluation images, found 46586 images


In [11]:
Model_Label = ['L1_LogR']
# Model_Label = ['Logistic Regression', 'Support Vector Machine', 'K Nearest Neighbors', 'Random Forest'] # NOTE, these label names need to be in the same order as the models in Trained_Models
Prediction_dict_arr = []  # one (model name, {lot: (infected fraction, infected count, kernel count)}) per model

# Class label the models emit for an infected kernel. Training labels are the
# position of an image's folder inside its category pair, and every pair is
# ordered [Infected, Healthy], so infected is 0 and healthy is 1.
Infected_Label = 0

Predictions_Pickle_File = os.path.join(Pickle_location, Prediction_Pickle_Name)
Pickle_Path = glob.glob(Predictions_Pickle_File)

if not Pickle_Path:
    for model_name, model in zip(Model_Label, Trained_Models):
        logger.info(f'Starting predictions for {model_name} model...')

        # Counts are keyed by lot so that every image is counted exactly once,
        # including the first image of each lot and all images of the last
        # lot, regardless of the order in which lots appear.
        infected_by_lot = {}
        kernels_by_lot = {}
        for label, img in Eval_img_tuple_arr:
            # img is stored as a single column; transpose it into one row (one sample).
            pred = model.predict(img.T)
            kernels_by_lot[label] = kernels_by_lot.get(label, 0) + 1
            infected_by_lot[label] = infected_by_lot.get(label, 0) + int(pred[0] == Infected_Label)

        Infected_Percent_dict = {
            lot: (infected_by_lot[lot] / kernel_count, infected_by_lot[lot], kernel_count)
            for lot, kernel_count in kernels_by_lot.items()
        }

        # Save each model's prediction results in the list
        Prediction_dict_arr.append((model_name, Infected_Percent_dict))

    with open(Predictions_Pickle_File, "wb") as file_handle:
        pickle.dump(Prediction_dict_arr, file_handle, pickle.HIGHEST_PROTOCOL)
    logger.info('Successfully created predictions pickle')

else:
    # NOTE(review): a predictions pickle written before the counting logic was
    # corrected still holds the old counts; delete it to force a recompute.
    # NOTE: pickle.load can execute arbitrary code; only load pickles produced by this script.
    with open(Predictions_Pickle_File, "rb") as file_handle:
        Prediction_dict_arr = pickle.load(file_handle)
    logger.info('Successfully loaded prediction pickle!')


2022-03-01 08:46:15,202 —  INFO — Starting predictions for L1_LogR model...
2022-03-01 13:28:18,772 —  INFO — Successfully created predictions pickle


In [12]:
import math

# Compare each model's predicted infected-kernel count per lot against the
# recorded count, and save the comparison as a CSV.
# Label file columns: CN, Abbrev., Greenhouse Entry, Tall/Short, U of G FDK, U of G FDK%, UofG Imaging #
Eval_Label_List = pd.read_csv(Eval_Labels, delimiter='\t').values.tolist()
logger.info(f'Loaded labels from {Eval_Labels}')
headers = ['ModelName', 'Abbrev.', 'PredictedInfected', 'ActualInfected', 'Accuracy']
metrics_rows = []
for model_name, Prediction_dict in Prediction_dict_arr:
    logger.info(f'Starting metric gathering for {model_name}...')
    for Eval_Entry in Eval_Label_List:
        lot_abbrev = Eval_Entry[1]       # "Abbrev." column: the lot label
        actual_infected = Eval_Entry[4]  # "U of G FDK" column: recorded number of infected kernels

        # Skip lots without predictions or without a recorded count.
        if lot_abbrev not in Prediction_dict or math.isnan(actual_infected):
            continue

        # Index 1 of the prediction tuple is the predicted infected count.
        predicted_infected = float(Prediction_dict[lot_abbrev][1])

        # Lots where either count is zero are excluded (the ratio below would
        # be zero or undefined).
        if actual_infected <= 0.0 or predicted_infected <= 0.0:
            continue

        # Accuracy is the smaller count divided by the larger one, so it lies in (0, 1].
        accuracy = min(actual_infected, predicted_infected) / max(actual_infected, predicted_infected)

        # Values are listed in the same order as `headers`: predicted before actual.
        metrics_rows.append([model_name, lot_abbrev, predicted_infected, actual_infected, accuracy])
    # TODO: generate confusion matrix
    logger.info(f'Completed metric gathering for {model_name}!')

pd.DataFrame(metrics_rows, columns=headers).to_csv('L1_LogR_Prediction.csv', index=False)
# pd.DataFrame(metrics_rows, columns =headers).to_csv('Full_Set_Prediction.csv',index=False)


logger.info('Saved gathered metrics to csv!')


2022-03-01 13:28:19,098 —  INFO — Loaded labels from /birl2/users/sch923/Thesis/Data/UGRep2FDK.txt
2022-03-01 13:28:19,098 —  INFO — Starting metric gathering for L1_LogR...
2022-03-01 13:28:19,104 —  INFO — Completed metric gathering for L1_LogR!
2022-03-01 13:28:19,129 —  INFO — Saved gathered metrics to csv!
