In [1]:
import numpy as np
from MTCFeatures import MTCFeatureLoader
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from fractions import Fraction
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score

### Récupération des données

In [2]:
fl = MTCFeatureLoader('MTC-FS-INST-2.0')
seqs = fl.sequences()

### Récupération des informations des features de chaque séquence.
On les met sous forme de dataframe pandas.


In [3]:
%%time
phrase_data = []
for ii, x in enumerate(seqs):
    phrase_data.append({
        'id': x['id'],
        **x['features']
    })

CPU times: total: 9.39 s
Wall time: 17.2 s


In [4]:
df = pd.DataFrame(phrase_data)

### Récupération d'un échantillon du jeu de données

In [5]:
df = df.sample(frac=1/4)

### Création des sous-séquences
Taille = 4, décalage = 2, transformation des valeurs None en 0.

In [6]:
numerical_columns = [ 'scaledegree', 'imaweigth', 'pitch40', 'midipitch', 'diatonicpitch', 'diatonicinterval', 'chromaticinterval', 'pitchproximity', 'pitchreversal', 'duration', 'onsettick', 'phrasepos', 'phrase_ix', 'songpos', 'IOI', 'IOR', 'beatstrength', 'beat_str', 'beat', 'timesignature', 'gpr2a_Frankland', 'gpr2b_Frankland', 'gpr3a_Frankland', 'gpr3d_Frankland', 'gpr_Frankland_sum', 'lbdm_spitch', 'lbdm_sioi', 'lbdm_srest', 'lbdm_rpitch', 'lbdm_rioi', 'lbdm_rrest', 'lbdm_boundarystrength']

# On remplace les valeurs None par des 0.
for col in numerical_columns:
    if col in df.columns:
        df[col] = df[col].apply(
        lambda x: [0 if v is None else v for v in x]
    )

In [7]:
size = 4 #taille des sous sequences 
step = size // 2 #décalage

subsequences = []
labels = []
ids = []
list_columns = []

for col in df.columns:
    if col != 'id' and df[col].apply(lambda x: isinstance(x, list)).all():
        list_columns.append(col) #prend les colonnes qui sont composées de liste

for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
    song_id = row['id']
    sequence_length = len(row['scaledegree'])

    if sequence_length == 0:
        continue # Passage a la sequence suivante si ça taille est de 0
    
    #Parcours de séquences pour créer des sous-séquences.
    start_idx = 0
    while start_idx + size <= sequence_length: 
        end_idx = start_idx + size

        subseq = {}
        for col in list_columns:
            subseq[col] = row[col][start_idx:end_idx]
        if len(subseq['scaledegree']) == size:
            label = subseq['phrase_end'][-1] #Utilisation de la dernière valeur de phrase_end, pour labelliser la sous-séquence.
            subsequences.append(subseq)
            labels.append(label)
            ids.append(song_id)

        start_idx += step  
    
    # Dernière sous séquences
    if start_idx < sequence_length:
        end_idx = sequence_length

        subseq = {}
        for col in list_columns:
            subseq[col] = row[col][start_idx:end_idx]
        if len(subseq['scaledegree']) == size:
            label = subseq['phrase_end'][-1]
            subsequences.append(subseq)
            labels.append(label)
            ids.append(song_id)

subsequences = pd.DataFrame(subsequences)
subsequences['id'] = ids
subsequences['label'] = labels

100%|██████████| 4527/4527 [00:38<00:00, 116.94it/s]


### Choix des features à utiliser
Sélection des features, puis on étend les listes de sous-séquences. 

In [8]:
feature_columns = ["duration", "beatinphrase", 'restduration_frac', "beatinphrase_end", "beatstrength", "gpr2b_Frankland", "gpr_Frankland_sum", "lbdm_srest", "lbdm_boundarystrength", "pitch40", 'imaweight']

feature_arrays = []
for idx, row in tqdm(subsequences.iterrows(), total=subsequences.shape[0], desc="Processing subsequences"):
    feature_vector = []
    for col in feature_columns:
        feature_vector.extend(row[col]) #on étend les sequences.
    feature_arrays.append(feature_vector)

features = np.array(feature_arrays)
labels = np.array(subsequences['label'])
ids = np.array(subsequences['id'])

label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)



Processing subsequences: 100%|██████████| 154481/154481 [00:13<00:00, 11616.25it/s]


### Formatage des données
Transformation des fractions en valeur numérique, transformation des données catégorielles et normalisation.


In [9]:
numerical_columns = [ 'scaledegree', 'imaweigth', 'pitch40', 'midipitch', 'diatonicpitch', 'diatonicinterval', 'chromaticinterval', 'pitchproximity', 'pitchreversal', 'duration', 'onsettick', 'phrasepos', 'phrase_ix', 'songpos', 'IOI', 'IOR', 'beatstrength', 'beat_str', 'beat', 'timesignature', 'gpr2a_Frankland', 'gpr2b_Frankland', 'gpr3a_Frankland', 'gpr3d_Frankland', 'gpr_Frankland_sum', 'lbdm_spitch', 'lbdm_sioi', 'lbdm_srest', 'lbdm_rpitch', 'lbdm_rioi', 'lbdm_rrest', 'lbdm_boundarystrength']

# On remplace les valeurs None par des 0
for col in numerical_columns:
    if col in df.columns:
        df[col] = df[col].apply(
        lambda x: [0 if v is None else v for v in x]
    )

In [10]:
refactor = [4,5,6,7,8,9,10,11,12,13,14,15] #index des colones fraction a reformater

for x in refactor:
    features[:, x] = [
        float(Fraction(value)) if isinstance(value, str) and '/' in value else
        float(value) if value is not None else 0.0  # Remplacement des None par 0.0
        for value in features[:, x]
    ]
cat_columns = [] #index des colones catégorielles
num_columns = [] #index des colones numériques a normaliser

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(sparse_output=False), cat_columns),  # Encodage catégoriel
        ('num', StandardScaler(), num_columns)  # Normalisation 
    ],
    remainder='passthrough' #données non modifiées
)

final_features = preprocessor.fit_transform(features)

### Recherche des hyperparamètres
On utilise la fonction GridSearchCV et le multithreading avec `n_jobs=-1`

In [11]:
param = {
    'min_samples_split': [2, 5, 10],
    'max_depth': [10, 20, None],
    'criterion': ['gini', 'entropy'],
    'class_weight': ['balanced', None]
}

In [12]:
grid_search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=param, cv=3, scoring=make_scorer(f1_score, average='macro'),n_jobs=-1)
grid_search.fit(features, labels)

### Nos résultats

In [13]:
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best parameters: {'class_weight': None, 'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 10}
Best score: 0.9334208010174511
