# ICSE 2022
---

**Article:** Evaluating Test Oracles based on Deep Neural Networks for Scientific Software

**Licence**: GNU GENERAL PUBLIC LICENSE, Version 3 (GPLv3)



This notebook is part of the implementation of the **Select Data** task of the **Test Oracle based on CNN** (TOrC) method presented in the abovementioned article. It relies on <a href="https://ieeexplore.ieee.org/document/1284395">structural similarity</a> to devise sets with the most dissimilar images in order to have more representative samples and avoid data redundancy.




In [None]:
from skimage.metrics import structural_similarity as ssim
import numpy as np
import cv2
import glob
import itertools
import operator
import os


# Structural similarity
---

Every pair of images is compared using structural similarity (SSIM). A pair whose score is above a predefined limit (threshold) is discarded as too similar; all other pairs are accepted.


In [None]:
def compare_images(im_n, limit=0.85):
    """Score every unordered pair of images with SSIM and split the pairs by `limit`.

    Parameters
    ----------
    im_n : dict
        Maps an image name (str) to the image data handed to `ssim`.
    limit : float
        Similarity threshold; pairs scoring strictly above it are discarded.

    Returns
    -------
    tuple of (dict, dict, dict)
        ``(all_pairs, accepted, discarded)``. Every dictionary maps the key
        ``'<name_a>  X  <name_b>'`` to the SSIM score of that pair.
        ``accepted`` holds the pairs that are not above `limit`,
        ``discarded`` holds the pairs that are.
    """
    names = list(im_n)

    # One entry per unordered pair, in the order itertools.combinations yields them.
    scores = {
        first + '  X  ' + second: ssim(im_n[first], im_n[second])
        for first, second in itertools.combinations(names, 2)
    }

    discarded = {pair: score for pair, score in scores.items() if score > limit}
    # "Accepted" is defined as "not discarded" so that a score which fails the
    # `>` comparison for any reason (e.g. NaN) still ends up in exactly one set.
    accepted = {pair: score for pair, score in scores.items() if pair not in discarded}

    return scores, accepted, discarded
      

# Find the best images
---

The most suitable images are then selected. Each image is scored by the number of accepted pairs in which it appears, and the ranking is written to the file `...final.txt`, with the best (highest-scoring) images first. The function `find_best_images` assumes that the following directory structure has already been created within the output directory:

<ul>
<li>train
    <ul>
        <li>correct</li>
        <li>mutantd</li>
    </ul>
</li>
<li>val
    <ul>
        <li>correct</li>
        <li>mutantd</li>
    </ul>
</li>
</ul>

Images of the training and validation datasets are automatically moved to the `train` and `val` directories, respectively. Note that you must work with `correct` or `mutantd` images separately. The parameter `output_end` defines the class which is being considered.


In [None]:
def find_best_images(input_file, input_dir, output_dir, output_end, max_selected=50):
    """Rank images by the number of pairs they appear in and move the best ones.

    Reads ``input_file + '.txt'``, where each line is expected to look like
    ``<name_a>  X  <name_b> - <score>``. The first and third whitespace-separated
    tokens of a line are taken as the two image names, so image names must not
    contain whitespace.

    The full ranking is written to ``input_file + 'final.txt'`` as
    ``<name> - <count>`` lines, highest count first (ties keep the order in
    which the names were first seen). The top `max_selected` images are then
    moved from `input_dir` into ``<output_dir>/train/<output_end>/`` or
    ``<output_dir>/val/<output_end>/``; those directories must already exist.

    Parameters
    ----------
    input_file : str
        Path prefix (without ``.txt``) of the file holding the pairs.
    input_dir : str
        Directory containing the ``<name>.png`` images.
    output_dir : str
        Directory containing the ``train`` and ``val`` sub-directories.
    output_end : str
        Class sub-directory to move into (e.g. ``"correct"`` or ``"mutantd"``).
    max_selected : int, optional
        How many of the best-ranked images are moved (default 50).

    Raises
    ------
    OSError
        If the input file cannot be read, the ranking file cannot be written,
        or an image cannot be moved.
    """
    counts = {}
    with open(input_file + '.txt', 'r') as pairs_file:
        for line in pairs_file:
            tokens = line.split()
            if len(tokens) < 3:
                # Blank or truncated line: there is no second name to count.
                continue
            for name in (tokens[0], tokens[2]):
                counts[name] = counts.get(name, 0) + 1

    # sorted() is stable, so equally scored images keep first-seen order.
    ranking = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)

    # Write the complete ranking before touching any image, so the file is
    # complete even if a move fails part-way through.
    with open(input_file + 'final.txt', 'w') as ranking_file:
        for name, count in ranking:
            ranking_file.write(str(name) + ' - ' + str(count) + '\n')

    for position, (name, _count) in enumerate(ranking[:max_selected], start=1):
        # Every 9th and every 10th position goes to validation; with the
        # default of 50 images this yields 40 train / 10 val.
        if position % 9 == 0 or position % 10 == 0:
            subset = "val"
        else:
            subset = "train"
        os.rename(
            os.path.join(input_dir, name + ".png"),
            os.path.join(output_dir, subset, output_end, name + ".png"),
        )
        

# Handling files
---

Auxiliary functions to handle files.


In [None]:
def short_name(s):
    """Return the file name in path `s` without its directory or extension.

    The extension is whatever follows the last dot of the file name, so
    ``.png``, ``.jpeg`` and ``.tiff`` are all stripped correctly, and a name
    without an extension is returned unchanged.
    """
    stem, _extension = os.path.splitext(os.path.basename(s))
    return stem
    
def save_files(data, name):
    """Write `data` to ``name + '.txt'`` and print how many entries it has.

    Each entry becomes one ``<key> - <value>`` line, in the iteration order of
    `data`. An existing file with the same name is overwritten.

    Parameters
    ----------
    data : dict
        Entries to write; keys and values are converted with ``str``.
    name : str
        Output path without the ``.txt`` suffix; also used in the printed summary.
    """
    # The context manager closes the file even if a write fails.
    with open(name + '.txt', "w") as out:
        for key in data:
            out.write(str(key) + ' - ' + str(data[key]) + '\n')
    print('length ' + name + ' :', len(data))
    print('#######################################################\n\n')
    

# Main code
---

Run the main code.

In [None]:
# Input directory
dirimages =  "/Users/valdivino/Documents/Des/pythonw/DL/pyTorch/TestOracle/versao4/input/"
# Output directory
dirdest = "/Users/valdivino/Documents/Des/pythonw/DL/pyTorch/TestOracle/versao4/output/"

# Load every file of the input directory as a grayscale image, keyed by its
# short name. The paths are sorted because glob yields them in arbitrary
# order, and that order decides both the pair keys and how ties are broken
# in the ranking; sorting makes a run reproducible.
im_name = {}
for f in sorted(glob.iglob(dirimages+"*.*")):
    image = cv2.imread(f)
    if image is None:
        # cv2.imread returns None for files it cannot decode; "*.*" also
        # matches non-image files, so skip them instead of crashing below.
        print('skipping unreadable file:', f)
        continue
    # Convert the images to grayscale
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    im_name[short_name(f)] = image


# Compare the images based on a limit (threshold).
orig, accepted, discarded = compare_images(im_name, limit = 0.85)

# Save files.
save_files(orig,'orig')
save_files(accepted,'accepted')
save_files(discarded,'discarded')

# Find the best images. Choose the value of the last parameter according to the class: "correct" or "mutantd".
find_best_images('accepted', dirimages, dirdest, "correct")
