In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss

In [2]:
def fix_file_index(data_path, inter_path):
    """ Freeze the sample order of the training input and the test output.

    Reads ``{data_path}/train_label.csv`` (columns ``filename`` and ``family``)
    and the entries of ``{data_path}/test/pe``, then writes into ``inter_path``:

    * ``test_filename.txt``     -- test entry names in sorted order, one per line.
    * ``train_filename_de.npy`` -- training file names whose family is 7, 8 or 9
      and which also occur among the test entries.
    * ``train_filename.txt``    -- the retained training file names, in the row
      order of the label file.
    * ``train_y.npy``           -- the ``family`` values of the retained rows.

    A training file name is discarded (all of its rows) when one of its rows
    has family 7, 8 or 9 and the name is absent from the test entries.
    """
    labels = pd.read_csv(f"{data_path}/train_label.csv")
    all_train_names = labels['filename'].tolist()

    # A sorted listing pins the row order used for every test-set result.
    test_names = os.listdir(f"{data_path}/test/pe")
    test_names.sort()
    test_frame = pd.DataFrame({'filename': test_names})
    test_frame.to_csv(f"{inter_path}/test_filename.txt", header=False, index=False)

    # Both selections below share these two row masks.
    is_family_789 = labels['family'].isin([7, 8, 9])
    is_in_test = labels['filename'].isin(test_names)

    # Family 7/8/9 samples present in both the training and the test set.
    shared_names = labels[is_family_789 & is_in_test]['filename'].tolist()
    np.save(f"{inter_path}/train_filename_de", shared_names)

    # Family 7/8/9 samples missing from the test set are removed from training.
    dropped_names = labels[is_family_789 & ~is_in_test]['filename'].tolist()
    kept_names = list(set(all_train_names) - set(dropped_names))
    kept_rows = labels[labels['filename'].isin(kept_names)]

    kept_filenames = kept_rows['filename']
    kept_families = np.array(kept_rows['family'])
    kept_filenames.to_csv(f"{inter_path}/train_filename.txt", header=False, index=False)
    np.save(f"{inter_path}/train_y.npy", kept_families)

In [3]:
def vote_weight_results(labels_loss, vote_list, feature_list):
    """ Ensemble several prediction matrices by weighted soft voting.

    Each matrix ``vote_list[k]`` is multiplied by the weight found under
    ``labels_loss[feature_list[k]]`` and the products are summed.  The arg-max
    column of that sum is the ensemble's choice for each row.  The row that is
    finally returned is copied, unweighted, from the one matrix holding the
    largest value in the chosen column (the first such matrix on ties).

    Args:
        labels_loss: indexable by the entries of ``feature_list``; each value
            must broadcast against the matrices in ``vote_list``.
        vote_list: sequence of 2-D arrays of one shape
            (presumably samples x classes -- confirm against the caller).
        feature_list: keys into ``labels_loss``, aligned with ``vote_list``.

    Returns:
        Float array with the shape of ``vote_list[0]``.
    """
    out_shape = vote_list[0].shape
    weighted_sum = np.zeros(out_shape, dtype=float)
    for idx in range(len(vote_list)):
        weight = np.array(labels_loss[feature_list[idx]])
        weighted_sum += vote_list[idx] * weight

    # Column with the highest weighted score in every row.
    winners = np.argmax(weighted_sum, axis=1)

    selected = np.zeros(out_shape, dtype=float)
    for row in range(len(winners)):
        column = winners[row]
        candidate_scores = np.array([probs[row, column] for probs in vote_list])
        best_model = np.argmax(candidate_scores)
        selected[row, :] = vote_list[best_model][row, :]
    return selected

In [None]:
def vote_results(vote_list):
    """ Unweighted soft voting result ensemble.

    The matrices in ``vote_list`` are summed and the arg-max column of the sum
    is the ensemble's choice for each row.  The row that is returned is then
    copied from the single matrix that holds the largest value in that chosen
    column (the first such matrix on ties).

    Args:
        vote_list: non-empty sequence of 2-D arrays, all of the same shape
            (presumably samples x classes -- confirm against the caller).

    Returns:
        A new float array with the shape of ``vote_list[0]``.

    Raises:
        IndexError: if ``vote_list`` is empty.
    """
    result_ensemble = np.zeros(vote_list[0].shape, dtype=float)
    for res in vote_list:
        result_ensemble += res

    # Column chosen by the summed scores, one per row.
    pred = np.argmax(result_ensemble, axis=1)
    rows = np.arange(pred.shape[0])

    # stacked[m, i, :] is row i of model m.
    stacked = np.stack([np.asarray(res) for res in vote_list])
    # confidence[m, i] is the value model m gives to the chosen column of row i.
    confidence = stacked[:, rows, pred]
    # argmax returns the first maximum, so the earliest model wins ties.
    best_model = np.argmax(confidence, axis=0)

    # astype always copies, so the result never aliases an input array.
    return stacked[best_model, rows].astype(float)

In [4]:
def get_class_logloss(x):
    """ Log loss of one frame of predictions against its ``family`` column.

    Every column of ``x`` except the last is taken as the prediction matrix,
    and the ``family`` column supplies the labels; both are converted to numpy
    arrays and handed to ``log_loss_custom``.

    NOTE(review): the positional slice drops the last column, so this presumably
    expects ``family`` to be the last column of ``x`` -- confirm against the caller.
    """
    predictions = np.array(x.iloc[:, :-1])
    labels = np.array(x['family'])
    return log_loss_custom(labels, predictions)

In [5]:
def load_data(data_type, feature_list, inter_path):
    """ Load the saved array of every requested feature into one dict.

    For each name in ``feature_list`` the file
    ``{inter_path}/feature/{data_type}_{feature}.npy`` is read with ``np.load``.

    Returns:
        dict mapping each feature name to its loaded array, in the order the
        names appear in ``feature_list`` (empty dict for an empty list).
    """
    return {
        feature: np.load(f"{inter_path}/feature/{data_type}_{feature}.npy")
        for feature in feature_list
    }

In [1]:
def log_loss_custom(y_true, y_pred, eps=1e-15):
    """ Logloss that supports single category calculation.

    Computes the mean of ``-log(max(p_i, eps))`` where ``p_i`` is
    ``y_pred[i][y_true[i]]``, i.e. the value predicted for the true label of
    sample ``i``.  Only the entries selected by ``y_true`` are read, so the
    labels may all belong to a single category.

    Args:
        y_true: sequence of integer labels; ``y_true[i]`` indexes ``y_pred[i]``.
        y_pred: sequence or 2-D array with one row of predicted values per sample.
        eps: lower bound applied to each selected value before the log, so a
            predicted 0 gives a finite loss.  Defaults to 1e-15, the value that
            was previously hard-coded.

    Returns:
        The mean negative log of the selected (clipped) values.

    Raises:
        ZeroDivisionError: if ``y_true`` is empty.
    """
    n_samples = len(y_true)
    total = 0.0
    for i in range(n_samples):
        # Value the prediction assigns to this sample's true label.
        true_prob = y_pred[i][y_true[i]]
        total -= np.log(max(true_prob, eps))
    return total / n_samples