# Modèle naïf - moyenne par genre

### Importations

In [2]:
# Silence UserWarning / DeprecationWarning messages so they do not clutter the cell outputs.
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Make the project's `src` package importable: the part of the working
# directory that precedes the first 'notebooks' is used as the project root
# (the whole working directory when 'notebooks' does not occur in it).
import os
import sys
project_dir = os.getcwd().split('notebooks')[0]
if project_dir not in sys.path:  # avoid piling up duplicates when the cell is re-run
    sys.path.append(project_dir)

# NOTE: the import order below is kept as-is on purpose; the star-imports can
# define names that later imports rebind, so reordering could change which wins.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from src.utils.tools import *
from src.utils.metrics import *
from src.utils.extract_data import transform_parquet_to_csv
from src.utils.metrics_plot import *
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
# `sklearn.externals.joblib` no longer exists in recent scikit-learn releases
# (removed in 0.23); joblib is a standalone package there. Fall back to the
# old location only for legacy environments that lack the standalone package.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
from sklearn import metrics

### Dataframe des métriques

In [3]:
# Metric names used as the columns of the (initially empty) results table.
# NOTE(review): this rebinds `metrics`, which the import cell bound to the
# `sklearn.metrics` module; from here on `metrics` is this list.
metrics = [
    'Accuracy',
    'Precision micro',
    'Recall micro',
    'F1 score micro',
    'Hamming-Loss',
]
# Empty dataframe with one column per metric.
final_res = pd.DataFrame(columns=metrics)

### Importation des données et mise en forme des dataframes

In [4]:
# Load the raw parquet dataset. `df0` is kept untouched; `df` is a working copy.
# NOTE(review): machine-specific absolute path; edit PARQUET_PATH (ideally to a
# project-relative location) to run the notebook elsewhere.
PARQUET_PATH = "C:/Users/22sir/Desktop/Stat app/dataset2.parquet"
df0 = pd.read_parquet(PARQUET_PATH, engine="pyarrow")
df = df0.copy()
# Row-wise sum over the first 21 columns (presumably the 0/1 label indicators,
# given the column name; confirm against the dataset).
df["Nombre labels"] = df.iloc[:,0:21].sum(axis=1)
# The former bare `df.sort_values("Nombre labels")` statement was dropped:
# its result was neither assigned nor displayed, so it had no effect.

# Label names: the first 21 column names, also stored as a one-column dataframe.
labels = df.columns[:21]
df_labels = pd.DataFrame(data=labels,columns=['Label'])

In [5]:
#useful quantities
p = 0.5  #proportion of the titles in the dataset that are used
n_titre = df0.shape[0]  #number of titles (rows of the raw dataframe)
n_label = df_labels.shape[0]  #number of labels (rows of df_labels)
#lengths of the audio / usage feature vectors stored in the row labelled 0
n_audio, n_usage = (len(df0.loc[0, col]) for col in ('audio_features', 'usage_features'))

In [6]:
#df: the dataframe used from here on, with the features split into individual columns
#(this rebinds `df`, replacing the parquet-based copy built earlier)
df = pd.read_csv(filepath_or_buffer='dataset.csv')
df

Unnamed: 0,song_index,asian,rnb,reggae,blues,pop,dance,folk,arabic-music,indie,...,usage_feature_119,usage_feature_120,usage_feature_121,usage_feature_122,usage_feature_123,usage_feature_124,usage_feature_125,usage_feature_126,usage_feature_127,usage_feature_128
0,195,0,0,0,0,0,0,0,0,0,...,0.008582,-0.000024,-0.272207,0.210503,0.030972,0.080964,-0.015423,0.102148,0.365081,-0.066897
1,417,0,0,0,0,0,0,0,0,0,...,-0.182309,-0.051225,-0.004617,-0.005726,-0.038334,-0.063972,0.053881,-0.001152,0.045032,0.027505
2,530,0,0,0,0,1,0,0,0,0,...,-0.044620,-0.132282,-0.141684,-0.106271,-0.189065,-0.229804,0.083824,-0.160790,0.040417,-0.094503
3,909,0,0,0,0,0,0,0,0,0,...,-0.127573,-0.068515,-0.106374,-0.102694,-0.066937,0.026911,-0.032158,0.063670,-0.026653,-0.068251
4,1257,0,0,0,0,0,0,0,0,0,...,0.024463,-0.052952,-0.059056,0.041670,-0.025673,-0.000446,-0.003438,0.060359,0.058938,0.029523
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50557,49454,0,0,0,0,1,0,0,0,0,...,-0.034780,0.112294,-0.084898,-0.191570,0.153120,0.106828,0.056458,0.127816,0.086341,-0.024928
50558,49589,0,0,0,0,0,0,0,1,0,...,0.063418,0.166352,0.021962,0.003740,0.146321,-0.094655,-0.118755,-0.008160,-0.068332,0.041923
50559,49689,0,0,0,0,0,0,0,0,0,...,-0.000624,-0.037719,0.018047,0.028091,-0.000380,0.015697,-0.035737,-0.022669,-0.003089,-0.014504
50560,50137,0,0,0,0,0,0,0,0,0,...,0.063921,0.000640,-0.081422,-0.012015,-0.046315,-0.112069,0.017265,-0.063093,-0.134029,-0.007111


### Séparation train-test

In [43]:
Y = df.iloc[:,1:22]
X = df[["audio_feature_"+str(i) for i in range(1,n_audio)]+["usage_feature_"+str(i) for i in range(1,n_usage)]]
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3,random_state=17)

In [44]:
#le dataframe d'entraînement avec les sorties Y et les features X associées à chaque titre
df_train = pd.concat([X_train, y_train], axis = 1)

### Calcul des features moyennes pour chaque label

In [69]:
#le data frame avec les features moyennes de chaque label (moyenne calculée sur l'échantillon d'entraînement)
df_moyenne = pd.DataFrame(columns=df_train.columns)

In [70]:
for label in labels:
    df_moyenne.loc['Average_'+label] = df_train[df_train[label] == 1].mean(axis = 0)

In [72]:
df_moyenne = df_moyenne.iloc[:,0:n_audio+n_usage-2]
df_moyenne

Unnamed: 0,audio_feature_1,audio_feature_2,audio_feature_3,audio_feature_4,audio_feature_5,audio_feature_6,audio_feature_7,audio_feature_8,audio_feature_9,audio_feature_10,...,usage_feature_118,usage_feature_119,usage_feature_120,usage_feature_121,usage_feature_122,usage_feature_123,usage_feature_124,usage_feature_125,usage_feature_126,usage_feature_127
Average_asian,-0.017372,0.001212,0.274875,0.186976,-0.324015,-0.262456,-0.280398,0.984705,0.10267,0.57717,...,-0.017922,-0.034216,-0.026146,0.062942,0.008912,0.041301,0.020249,-0.012543,-0.004957,0.020776
Average_rnb,-0.100891,0.061423,0.360813,0.39132,-0.221719,-0.29205,-0.079659,1.184045,0.207745,0.551979,...,0.02439,-0.010755,-0.024389,-0.022088,-0.008739,-0.012021,-0.0152,-0.017383,0.024492,0.02754
Average_reggae,-0.076302,-0.100962,0.198539,0.417918,-0.314878,-0.296773,-0.147408,0.944873,0.447923,0.420649,...,0.006735,0.004193,-0.0049,-0.010944,0.004945,0.011266,0.001955,0.004784,0.013095,0.024917
Average_blues,0.359204,-0.355346,-0.080626,0.221473,-0.410956,-0.273909,-0.375709,0.374167,0.298612,0.068188,...,0.016024,0.09768,0.158647,0.136837,0.029234,0.068584,-0.126249,-0.049042,0.059617,0.013766
Average_pop,-0.078773,0.088664,0.403303,0.340886,-0.36205,-0.282711,-0.171007,1.129757,0.151901,0.62254,...,0.023307,-0.019409,-0.027166,-0.031467,-0.039008,-0.014533,-0.042409,-0.003536,0.009805,0.03608
Average_dance,-0.16263,0.039029,0.326422,0.459264,-0.483101,-0.345011,-0.097579,1.223383,0.222559,0.574642,...,0.008147,-0.002195,0.004231,-0.01306,-0.020103,0.016495,0.014421,0.019299,-0.003688,0.016934
Average_folk,0.251646,-0.215971,-0.104468,0.21634,-0.376029,-0.242505,-0.383655,0.73346,0.153658,0.392836,...,0.024055,0.057336,0.171865,-0.031007,-0.003097,-0.016103,0.123387,-0.00442,-0.085019,0.150344
Average_arabic-music,-0.061391,-0.079829,0.305863,0.436104,-0.389252,-0.30949,-0.258469,0.982556,0.351005,0.434354,...,-0.003514,0.007479,0.008573,-0.003483,0.002778,0.026327,-0.009198,-0.005657,-0.002639,-0.007884
Average_indie,-0.097463,0.02764,0.106531,0.156141,-0.352382,-0.299294,-0.189,1.014254,0.008941,0.583772,...,-0.006655,0.093862,0.039897,0.078635,-0.008341,0.018154,0.065861,0.009586,-0.020487,-0.014628
Average_rock,-0.066771,0.002207,0.260427,-0.145764,-0.494928,-0.251064,-0.34328,0.680581,-0.128246,0.460504,...,0.033187,0.020191,0.032484,0.058654,-0.003417,0.037576,0.008059,0.010017,-0.035442,-0.002106
