In [1]:
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from plotly import express as px
from sklearn import feature_selection


In [2]:
def plot_scores(df, predicate=None):
    """Show a Plotly Express bar chart of feature scores.

    The chart puts the 'score' column on the x axis (fixed to the range
    0.0-1.0) and the 'label' column on the y axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least a 'score' and a 'label' column.
    predicate : callable, optional
        Called as ``predicate(df)``; its result is used to index ``df`` and
        thereby select the rows that are plotted. When omitted (or falsy),
        only rows whose 'score' is greater than 0.2 are kept.

    Returns
    -------
    None
        The figure is displayed through ``fig.show()``.
    """
    def _score_above_default_threshold(frame):
        # Default row filter: keep features scoring above 0.2.
        return frame['score'] > 0.2

    row_filter = predicate or _score_above_default_threshold
    visible_rows = df[row_filter(df)]

    axis_titles = {'score': 'Score', 'label': 'Feature'}
    chart = px.bar(
        visible_rows,
        x='score',
        y='label',
        labels=axis_titles,
        range_x=(0.0, 1.0),
    )

    layout_options = {
        # Tick step of 1 on the y axis (presumably so every feature label is drawn).
        "yaxis": {"dtick": 1},
        "margin": {"t": 10, "b": 10},
        "bargap": 0.25,
        "height": 1000,
    }
    chart.update_layout(**layout_options)
    chart.show()


In [3]:
# For every pre-processing scenario, load the training split, score each
# feature against the class label and store a table of normalised scores
# (sorted ascending) in `scores_list`, in the same order as `scenarios`.
scenarios = ['raw', 'min_max', 'standardized']
scores_list = []

# Smallest positive float64. Used as a floor for the p-values: a p-value that
# underflowed to exactly 0 would otherwise give -log10(0) = inf, and the
# normalisation below would then turn that entry into NaN and all others into 0.
min_pvalue = np.nextafter(0.0, 1.0)

for s in scenarios:
    base_dir = pathlib.Path(f'../gtzan/processed/{s}').absolute()
    df = pd.read_csv(base_dir.joinpath('train.csv'))

    # Features are every column from the third one onwards; the target is the
    # 'label' column.
    X = df.iloc[:, 2:].to_numpy()
    Y = df['label'].to_numpy()
    X_labels = df.columns[2:].to_list()

    # Feature scoring: fit the f_classif selector and turn its p-values into
    # scores via -log10(p), so smaller p-values map to larger scores.
    # Only `pvalues_` is used; the k=5 selection itself is not applied here.
    selector = feature_selection.SelectKBest(feature_selection.f_classif, k=5)
    selector.fit(X, Y)
    scores = -np.log10(np.clip(selector.pvalues_, min_pvalue, None))
    # Normalise so that the best-scoring feature has score 1.0.
    scores /= scores.max()

    # Build the per-scenario score table, lowest score first, with a fresh
    # 0..n-1 index.
    score_df = pd.DataFrame(data={
        'label': X_labels,
        'score': scores
    }).sort_values(by=['score'],
                   ignore_index=True,
                   ascending=True)

    scores_list.append(score_df)
    

In [4]:
# Validation: observations indicated that every scenario yields the same
# values, so each consecutive pair of score tables must agree on both the
# label order and (within floating-point tolerance) the scores.
for current, following in zip(scores_list, scores_list[1:]):
    same_labels = current['label'].equals(following['label'])
    assert same_labels

    same_scores = np.allclose(current['score'], following['score'])
    assert same_scores


In [5]:
# Plot the score table of the first scenario ('raw') with the default filter.
plot_scores(df=scores_list[0])