In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.metrics import classification_report, accuracy_score

import pandas as pd
import numpy as np
import h5py

In [9]:
def data_importing(verbose = True):
    """Load jet images, selected jet features and one-hot targets from the HDF5 file.

    Reads the "jetImage", "jetFeatureNames" and "jets" datasets from
    'tutorials/Data/JetDataset/jetImage_7_100p_30000_40000.h5'. Everything is
    copied into memory before the file is closed, so the returned objects stay
    valid after this function returns.

    Parameters
    ----------
    verbose : bool, default True
        When truthy, print the image array shape, the selected feature names,
        the first feature row and the first target row.

    Returns
    -------
    images : numpy.ndarray
        Full content of the "jetImage" dataset.
    feature : pandas.DataFrame
        The columns of "jets" listed in ``features_list``; column labels are the
        selected feature names converted with ``astype(str)``.
    feature_names : array-like
        The entries of "jetFeatureNames" at the positions in ``features_list``.
    target : pandas.DataFrame
        Columns ``-6:-1`` of "jets" (five columns; the very last column is
        excluded). Presumably the one-hot encoded class labels -- confirm
        against the dataset description.
    """
    # Column positions (in the "jets" table) of the features we are interested in.
    # h5py fancy indexing needs this list in increasing order.
    features_list = [12, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 48, 52]

    # Open read-only and close the handle deterministically: the original left
    # the file open for the lifetime of the kernel.
    with h5py.File('tutorials/Data/JetDataset/jetImage_7_100p_30000_40000.h5', 'r') as f:

        # Copy the images into a numpy array and report their shape
        images = np.array(f.get("jetImage"))

        if verbose:
            print('The file contains {} bidimensional item, each having {} rows x {} columns. They resemble an image.\n'\
                                .format(images.shape[0], images.shape[1], images.shape[2]))

        # Names of the selected features (materialised in memory by the indexing)
        feature_names = (f.get("jetFeatureNames"))[features_list]

        if verbose:
            print('Names of the features:\n', feature_names)

        # Look the table up once and reuse the handle for features and targets
        jets = f.get("jets")

        # X: the selected feature columns
        feature = pd.DataFrame(jets[:,features_list], columns = feature_names)
        feature.columns = feature.columns.astype(str)

        if verbose:
            print('\nPrint the dataset: \n', feature.head(1))

        # y: the targets are one-hot encoded, so this is a multiclass
        # classification problem.
        target = pd.DataFrame(jets[0:,-6:-1])
        if verbose:
            print("\nSample y's row:\n", target[0:1])

    return images, feature, feature_names, target

In [10]:
def replace_numbers_with_letters(dataset, mode = 'letter'):
    """Collapse a one-hot encoded frame into a single 'label' column.

    For every row the label of the column holding the maximum value is taken
    (``DataFrame.idxmax(axis=1)``). With ``mode='letter'`` the labels 0..4 are
    translated to 'A'..'E'; any other label is kept unchanged.

    Parameters
    ----------
    dataset : pandas.DataFrame
        One row per sample, one column per class.
    mode : str, default 'letter'
        'letter' translates 0..4 to 'A'..'E'; any other value returns the raw
        column labels.

    Returns
    -------
    pandas.DataFrame
        A single column named 'label', indexed like ``dataset``.
    """
    number_to_letter = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E'}

    labels = dataset.idxmax(axis=1)
    if mode == 'letter':
        # One pass with a lookup table instead of five chained
        # DataFrame.applymap calls (applymap is deprecated since pandas 2.1).
        # Labels outside the table fall through untouched.
        labels = labels.map(lambda value: number_to_letter.get(value, value))

    return labels.to_frame(name='label')

In [11]:
# 80% of the dataset is used for training and the remaining 20% for testing.
# Passing "stratify" makes the random split preserve the original class distribution in both parts.

# Train/test splitting, with optional feature scaling
def train_test(dataset, scaling = False):
    """Split a dataset into stratified train and test parts (80% / 20%).

    The last column of ``dataset`` is the target; every other column is a
    feature.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature columns followed by the target as the last column.
    scaling : bool, default False
        When truthy, the features are passed through
        ``sklearn.preprocessing.normalize(..., axis=0)`` before splitting, so
        each feature column is rescaled independently.

    Returns
    -------
    X_train, X_test, y_train, y_test
        The result of ``train_test_split`` with ``test_size=0.2``,
        ``random_state=42`` and stratification on the target.
    """
    X = dataset.iloc[:, :-1]
    y = dataset.iloc[:, -1]

    if scaling:
        # normalize() returns a bare array. Rebuild the frame with the original
        # row index as well as the column labels, otherwise X falls back to a
        # RangeIndex and no longer lines up with y by label.
        # NOTE(review): scaling is computed on the full dataset before the
        # split, so test rows influence the scale of the training features.
        X = pd.DataFrame(normalize(X, axis=0), columns=X.columns, index=X.index)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        stratify=y,
        random_state=42)

    return X_train, X_test, y_train, y_test

In [None]:
# Define the classification report. "macro avg" for precision, recall and f1 will be used as a benchmark
def classification_reporting(tuning, model, y_test, y_pred, X_train):
    """Print the macro-averaged classification metrics and the accuracy.

    Parameters
    ----------
    tuning, model :
        Values interpolated into the headline (formatted with ``str.format``).
    y_test :
        True labels.
    y_pred :
        Predicted labels.
    X_train :
        Training features; only ``X_train.shape[1]`` is read, to report the
        number of features.

    Returns
    -------
    None
        The results are printed, not returned.
    """
    metrics_by_group = classification_report(y_test, y_pred, output_dict = True)

    headline = '{} {} performance using {} features:\n'.format(tuning, model, X_train.shape[1])
    macro_average = metrics_by_group['macro avg']
    print(headline, macro_average)

    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy score:', accuracy)