In [None]:
# This notebook finds the best bagging ensemble, given the validation-set predictions of several models.
# Each model's predictions on the validation set must be saved as a file in the same folder. The script brute-forces
# every combination of models and selects the one with the highest validation accuracy.

In [None]:
import tensorflow as tf
import os
import numpy as np
import csv
import itertools
from scipy import stats

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the dataset
# archive and the prediction files used in this notebook are read from paths
# under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Fix TensorFlow's global random seed so that any TensorFlow random operation
# executed in this session is repeatable from run to run.
SEED = 1234
tf.random.set_seed(SEED)

In [None]:
#unzip our dataset in the current working directory.
!unzip "/content/drive/My Drive/challenge1/dataset/MaskDataset.zip" 

Archive:  /content/drive/My Drive/challenge1/dataset/MaskDataset.zip
replace MaskDataset/test/test/10001.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [None]:
# Folder on Drive that holds the prediction files, one per model, computed on
# the validation set.
validResultPath = "/content/drive/MyDrive/challenge1/validationResults"
# Validation split of the unzipped dataset; it contains one sub-folder per class.
val_dir = "/content/MaskDataset/val"

In [None]:
# File names of the validation images, one list per class sub-folder of val_dir.
# The sub-folder an image sits in is used as its ground-truth label.
none_images, all_images, some_images = [
    os.listdir(os.path.join(val_dir, class_folder))
    for class_folder in ("0_none", "1_all", "2_some")
]

In [None]:
# Load every model's validation predictions into `models`:
#     models[<file name>] = {<image Id>: <predicted Category as int>}
# Each file in validResultPath is read as a comma-separated CSV with the
# columns "Id" and "Category".
#
# The listing is sorted because os.listdir returns entries in arbitrary order;
# a fixed order keeps the later search over model combinations reproducible.
content = sorted(os.listdir(validResultPath))  # every entry in the folder
models = {}
for prediction in content:
    res_path = os.path.join(validResultPath, prediction)
    if not os.path.isfile(res_path):
        # Skip sub-directories: open() would raise on them.
        continue
    # `with` closes the file even when a row fails to parse; newline="" is what
    # the csv module expects from a file object it reads.
    with open(res_path, newline="") as f:
        reader = csv.DictReader(f, delimiter=",")
        models[prediction] = {row["Id"]: int(row["Category"]) for row in reader}

In [None]:
# Brute-force search for the best majority-voting ensemble.
# Every combination of at least 4 models is evaluated on the validation set;
# the most accurate one is kept in maxComb (model names), maxAcc (accuracy) and
# bestDict (image Id -> voted label).


def _majority_vote(votes):
    """Return the most frequent label in `votes`.

    Ties are resolved in favour of the smallest label, which is also what
    scipy.stats.mode does. np.unique is used instead of
    `stats.mode(votes).mode[0]` because, from SciPy 1.11 on, the mode of a 1-D
    input is a scalar and indexing it with [0] raises IndexError.

    Parameters:
        votes: non-empty sequence of integer labels.

    Returns:
        The winning label as a plain Python int.
    """
    labels, counts = np.unique(votes, return_counts=True)  # labels come back sorted
    return int(labels[counts.argmax()])  # argmax returns the first (smallest) label on ties


# Ground-truth membership per predicted label. Sets make each lookup O(1);
# they are built once instead of scanning the lists for every image of every
# combination.
images_by_class = {0: set(none_images), 1: set(all_images), 2: set(some_images)}

maxAcc = 0
maxComb = None
bestDict = None
print("numero di modelli {}".format(len(models)))

# Names of the models; invariant across the whole search.
keys = list(models.keys())

# i is the size of the combination (models drawn without repetition).
# The lowest value is 4, since the models have to vote on 3 classes.
for i in range(4, len(models) + 1):
    print("controllo con {} combinazioni".format(i))

    # itertools.combinations is consumed lazily: the full list of combinations
    # of size i is never held in memory.
    for combination in itertools.combinations(keys, i):
        # One (initially empty) ballot box per image of the first model.
        votes = {key: [] for key in models[combination[0]].keys()}

        # Each model of the combination casts its vote for every image.
        for name in combination:
            model = models[name]
            for key in model.keys():
                votes[key].append(model[key])

        # Majority voting. The name listDict is kept for compatibility with
        # the rest of the notebook.
        listDict = {key: _majority_vote(ballots) for key, ballots in votes.items()}

        # Accuracy of the ensemble: a prediction is right when the image is
        # found in the folder listing of the predicted class.
        total = len(listDict)
        right = sum(
            1
            for immagine, label in listDict.items()
            if immagine in images_by_class.get(label, ())
        )
        # Guard against prediction files without rows (would divide by zero).
        accuracy = right / total if total else 0.0

        # Keep this combination only if it is strictly better than the best so far.
        if accuracy > maxAcc:
            maxAcc = accuracy
            maxComb = combination
            bestDict = listDict
            print("MaxAcc={}\nmaxComb={}\n".format(maxAcc, maxComb))


print("Il massimo è: {}\nAccuracy:{}".format(maxComb, maxAcc))

In [None]:
# Save the best ensemble's predictions on the validation set to a CSV file
# (columns "Id" and "Category") so that they can be reused later.
pathBestBagging = "/content/drive/MyDrive/challenge1/bestBagging/bestBag.csv"
if bestDict is None:
    # Checked before opening the file so that an existing bestBag.csv is not
    # truncated when the search produced no result.
    raise ValueError("bestDict is None: no model combination was evaluated, nothing to save")
# Make sure the destination folder exists before opening the file for writing.
os.makedirs(os.path.dirname(pathBestBagging), exist_ok=True)
with open(pathBestBagging, "w", newline="") as r:
    writer = csv.DictWriter(r, delimiter=",", fieldnames=["Id", "Category"])
    writer.writeheader()
    for key, category in bestDict.items():
        # The value must come from bestDict: listDict holds the votes of the
        # last combination evaluated, which is generally not the best one.
        writer.writerow({"Id": key, "Category": category})