In [329]:
import cv2
import os
import random
import numpy as np
import time
import pickle
import pandas as pd

In [330]:
# hyper-parameters
ROOT_FOLDER = "..\.."
DATA_FOLDER = os.path.join(ROOT_FOLDER, "data")
TRAIN_FOLDER = os.path.join(DATA_FOLDER, "train_set")
TEST_FOLDER = os.path.join(DATA_FOLDER, "test_set")
PROCESSED_DATA = os.path.join(ROOT_FOLDER, 'processed_data')
TEMPLATE_FOLDER = os.path.join(PROCESSED_DATA, 'templates')

In [432]:
def match_template(input_path, template_path, category, method = "pixel_count", debug_mode = False):
    """
    function that matches the image to a template.

    :input_path: str path to image 
    :template_path: str path to template  
    :method: str identifier of what method to use for matching
    :debug_mode: decides whether to return additional debugging data
    :return: an error variable dependent on the chosen method and debugging info or None depending on mode.
    """

    # sometimes the category is given as as 
     
    category_converter = {'00':1, '01':2, '02':2, '03':3, '04':3, '05':4, '06':5, '07':6, '08':6, '09':6, '10':6}
    if category in list(category_converter.keys()):
        category = category_converter[category]

        stored_parameters = {
            1: 50,
            2: 50,
            3: 50,
            4: 50,
            5: 50,
            6: 25
        }

    else:
        category = int(category)

        stored_parameters = {
            1: 35,
            2: 25,
            3: 50,
            4: 75,
            5: 75,
            6: 100
        }

    # load in image and template
    sample_image = cv2.imread(input_path, cv2.IMREAD_GRAYSCALE)
    template_image = cv2.imread(template_path, cv2.IMREAD_GRAYSCALE)

    if method == "pixel_count":
        # important for this method is that the oprder of subtracting does matter. 
        # because a 2 will have holes in the same spots as a 5 and will register a false positive

        thresh = stored_parameters[category]

        diff = template_image - sample_image
        errors = (diff > thresh).sum()

        if debug_mode:
            return errors, sample_image, template_image

        return errors, None, None

In [388]:
def gather_samples(processed_data=True):
    """
    gather all files from all folders

    :type: str representing which files we want
    :return: list containg a list for each sample with the path and the category
    """

    if processed_data:
        data_folder = PROCESSED_DATA
        templates = [1,2,3,4,5,6]
    else:
        data_folder = DATA_FOLDER
        template_folder = os.path.join(DATA_FOLDER, "train_set")
        templates = os.listdir(template_folder)

    all_files = []

    for template in templates:

        # get the folder name, all filenames inside it, and make a list of all the image files inside
        train_folder = os.path.join(data_folder, 'train_set')
        folder = os.path.join(train_folder, str(template))
        filenames = os.listdir(folder)
        files = [file for file in filenames if ".png" in file]

        for file in files:
            random_file_path = os.path.join(folder,file)

            all_files.append([random_file_path, template])

    return all_files


In [340]:
all_files = gather_samples()

df = pd.DataFrame(all_files)
df = df.set_axis(['path', 'category'], axis=1)

In [389]:
# code below calculates the errors for each of the categories.

def train_model(processed_data = True):
    """
    A very basic model where each sample n has x features that represent the error per category
    calculated by the match template - pixel count method.

    :returns: a list containing a list for each sample containing x errors, 1 for each category. 
    """

    if processed_data:
        all_templates = os.listdir(TEMPLATE_FOLDER)
    else:
        template_folder = os.path.join(DATA_FOLDER, "templates")
        all_templates = os.listdir(template_folder)
        all_templates.remove('ano.png')
    
    all_errors = []
    # for each sample
    for idx, row in df.iterrows():

        sample_path = row['path']
        errors_per_row = []

        for idx, template in enumerate(all_templates):
            category = template
            category = category.replace(".png","")
            template_file = template
            correct_template_path = os.path.join(template_folder, template_file)
            errors, _, _ = match_template(sample_path, correct_template_path, category)
            errors_per_row.append(errors)
        all_errors.append(errors_per_row)

    return all_errors


In [None]:
all_errors = train_model()

results = pd.DataFrame(all_errors)
results = results.transpose().reset_index(drop=True).transpose()
results = results.set_axis([1,2,3,4,5,6], axis=1)

df = pd.concat([df,results], axis=1)
df.to_csv('results.csv')

df.head()

In [263]:
thresholds = {}

for x in range(6):
    col = x+1

    correct = df.loc[df.category == col]
    false = df.loc[df.category != col]
    
    print(correct[col].mean(), false[col].mean())

    thresholds[col] = (correct[col].mean() + false[col].mean())/2

5077.420624151968 5529.879499485773
5736.462890625 6387.752478817379
5769.956910569105 6368.983710915559
5832.717811158798 6556.790920375953
5771.823399558499 6686.0946160635485
5733.020665901263 7280.811141022986


In [264]:
thresholds

{1: 5303.65006181887,
 2: 6062.10768472119,
 3: 6069.470310742332,
 4: 6194.754365767376,
 5: 6228.959007811023,
 6: 6506.915903462124}

In [322]:
average = 0
for key, val in thresholds.items():
    average += val

average = average - 500
average = average / 6

In [323]:
# gather all anomaly files 
anom_files = []

train_folder = os.path.join(PROCESSED_DATA, 'train_set')
folder = os.path.join(train_folder, "ano")
filenames = os.listdir(folder)
files = [file for file in filenames if ".png" in file]

for file in files:
    random_file_path = os.path.join(folder,file)

    anom_files.append([random_file_path, template])



In [324]:
predictions = []
total_detection = 0

for anom in anom_files:
    prediction = 0
    for idx, template in enumerate(all_templates):
        category = idx + 1
        template_file = template
        correct_template_path = os.path.join(TEMPLATE_FOLDER, template_file)
        errors, _, _ = match_template(anom[0], correct_template_path, category)
        if errors > average:
            prediction = 1
    predictions.append(prediction)

correct_predictions = [1 for x in predictions]

from sklearn.metrics import f1_score
f1_score(correct_predictions, predictions, average='macro')

In [325]:
correct_predictions = [1 for x in predictions]

In [326]:
from sklearn.metrics import f1_score
f1_score(correct_predictions, predictions, average='macro')

0.44

In [445]:
# now try with the old images
all_files = gather_samples(processed_data=False)

df = pd.DataFrame(all_files)
df = df.set_axis(['path', 'category'], axis=1)

In [446]:
all_errors = train_model(processed_data=False)

In [447]:
results = pd.DataFrame(all_errors)
results = results.transpose().reset_index(drop=True).transpose()
results = results.set_axis([1,2,3,4,5,6,7,8,9,10,11], axis=1)

df = pd.concat([df,results], axis=1)
df.to_csv('results.csv')
df = df[df.category != "ano"]

df.head()

Unnamed: 0,path,category,1,2,3,4,5,6,7,8,9,10,11
0,..\..\data\train_set\00\16_09_21_00_000.png,0,5212,6575,6435,7241,6962,7876,8134,10128,9965,10064,10176
1,..\..\data\train_set\00\16_09_21_00_001.png,0,7718,7810,8564,8488,9142,9239,9321,12242,12369,11866,12491
2,..\..\data\train_set\00\16_09_21_00_002.png,0,2547,5038,4951,5853,4696,7310,7227,8043,8479,8138,8663
3,..\..\data\train_set\00\16_09_21_00_003.png,0,2329,4782,4542,5059,4336,6893,6905,7885,8254,8400,8276
4,..\..\data\train_set\00\16_09_21_00_004.png,0,3795,5476,5726,6524,5837,7495,7561,9945,9539,9579,9929


In [448]:
df['category'] = pd.to_numeric(df.category)

In [468]:
thresholds = {}

for x in range(10):
    col = x+1

    correct = df.loc[df.category == col]
    false = df.loc[df.category != col]
    
    print(correct[col].mean(), false[col].mean())

    thresholds[col] = (correct[col].mean() + false[col].mean())/2

8503.953703703704 8180.433758912286
9335.028925619834 9155.987021521274
9064.86303630363 9183.330427493713
9822.004807692309 9802.97948545485
9311.492489270386 9663.397942897676
8987.030905077263 10396.141394527802
10920.125907990314 10235.96346216304
10825.793969849246 12040.647011177709
10877.939334637966 11959.446864686468
10932.016666666666 11982.497317509347


In [477]:
thresholds

{1: 8342.193731307994,
 2: 9245.507973570555,
 3: 9124.096731898671,
 4: 9812.492146573579,
 5: 9487.44521608403,
 6: 9691.586149802533,
 7: 10578.044685076677,
 8: 11433.220490513479,
 9: 11418.693099662218,
 10: 11457.256992088007}

In [473]:
average = 0
for key, val in thresholds.items():
    average += val

average = average - 500
average = average / 10

In [474]:
average

10009.053721657776

In [475]:
# gather all anomaly files 
anom_files = []

train_folder = os.path.join(DATA_FOLDER, 'train_set')
folder = os.path.join(train_folder, "ano")
filenames = os.listdir(folder)
files = [file for file in filenames if ".png" in file]

for file in files:
    random_file_path = os.path.join(folder,file)

    anom_files.append([random_file_path, template])


In [476]:
template_folder = os.path.join(DATA_FOLDER, "templates")
all_templates = os.listdir(template_folder)
all_templates.remove('ano.png')

predictions = []
total_detection = 0

for anom in anom_files:
    prediction = 0
    for idx, template in enumerate(all_templates):
        category = template
        category = category.replace('.png', '')
        template_file = template
        correct_template_path = os.path.join(template_folder, template_file)
        errors, _, _ = match_template(anom[0], correct_template_path, category)
        if errors > average:
            prediction = 1
    predictions.append(prediction)

correct_predictions = [1 for x in predictions]

from sklearn.metrics import f1_score
f1_score(correct_predictions, predictions, average='macro')

0.4042553191489362