In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Timing decorator to find exécution time of function
def timing(func):
    def wrapper(*args, **kwargs):
        print("Calculation start")
        start = time.perf_counter()
        data = func(*args, **kwargs)
        print("Finished!")
        end = time.perf_counter()
        print(f"Execution time: {round(end-start,2)} second(s)")
        return data
    return wrapper

In [3]:
@timing
def read_data(path):
    d = pd.read_csv(path)
    return d

In [4]:
X = read_data("X.csv")

Calculation start
Finished!
Execution time: 19.72 second(s)


In [5]:
y = read_data("y_1.csv")

Calculation start
Finished!
Execution time: 1.1 second(s)


In [6]:
X.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x136,x137,x138,x139,x140,x141,x142,x143,x144,x145
0,NO,NO,rpoSk1aXo+6hZQxVMp/PAw8+w67/vzWkyQs/xGqFCnw=,hCXwO/JldK5zcd9ejOD1FwmEgCf96eTdEVy7OtY2Y2g=,0.837674,0.072627,0.869502,0.279803,0.297919,NO,...,7.0,0.885,4565,3456,YES,NO,YES,4,0.623843,0.285871
1,NO,NO,/VV6+dCb+in5lV7V/e9b5HoZ/BN34M+dPmKWY8BeHh4=,hCXwO/JldK5zcd9ejOD1FwmEgCf96eTdEVy7OtY2Y2g=,1.289474,0.082577,0.948578,0.069268,0.527802,NO,...,0.0,0.67,4676,3306,YES,NO,YES,0,0.932244,0.526946
2,YES,YES,MZZbXga8gvaCBqWpzrh2iKdOkcsz/bG/z4BVjUnqWT0=,YvZUuCDjLu9VvkCdBWgARWQrvm+FSXgxp0zIrMjcLBc=,0.653912,0.041257,0.941,0.090423,0.422868,YES,...,1.5,0.963333,3306,4678,YES,NO,YES,11,0.668876,0.412886
3,YES,NO,X/hdUOVR5KuExVGLzjhLcM2CyIqym9t0Nh+ZX05M+1w=,+yhSY//Hpg7u0bSA7NYmcmRFgv3bF4Tw3BMHrBqaTtA=,1.179921,0.051104,0.949501,0.270638,0.411161,YES,...,0.0,0.95,4677,3307,YES,NO,YES,4,0.738434,0.404105
4,NO,NO,4FIxS25OrBv/DHbmmVLtScptssXXAhNxD087PPzA9BU=,B+EJpnEbkYtLnwDQYN1dP1rcfnoCnxAjKLYwQZE07Ew=,0.706815,0.0,1.0,0.0,0.503363,YES,...,3.0,1.0,892,1262,NO,NO,YES,7,0.59588,0.48991


In [7]:
y.head()

Unnamed: 0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,...,y24,y25,y26,y27,y28,y29,y30,y31,y32,y33
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## Traitement des données

In [10]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Columns: 145 entries, x1 to x145
dtypes: float64(55), int64(30), object(60)
memory usage: 110.6+ MB


In [13]:
X_float_col = X.dtypes[X.dtypes=='float64'].index
X_int_col = X.dtypes[X.dtypes=='int64'].index
X_obj_col = X.dtypes[X.dtypes=='object'].index

In [12]:
X[X_float_col].head()

Unnamed: 0,x5,x6,x7,x8,x9,x16,x19,x20,x21,x28,...,x122,x123,x124,x125,x132,x135,x136,x137,x144,x145
0,0.837674,0.072627,0.869502,0.279803,0.297919,0.619213,0.99537,0.0,0.94,0.623843,...,0.072627,0.869502,0.279803,0.298795,0.315104,0.990917,7.0,0.885,0.623843,0.285871
1,1.289474,0.082577,0.948578,0.069268,0.527802,0.937689,0.99879,0.0,0.85,0.938899,...,0.082577,0.948578,0.069268,0.530368,0.924985,0.99274,0.0,0.67,0.932244,0.526946
2,0.653912,0.041257,0.941,0.090423,0.422868,0.651347,0.982471,0.0,0.97,0.668876,...,0.041257,0.941,0.090423,0.424077,0.433091,0.947951,1.5,0.963333,0.668876,0.412886
3,1.179921,0.051104,0.949501,0.270638,0.411161,0.753553,0.995162,0.0,0.95,0.758391,...,0.051104,0.949501,0.270638,0.411375,0.732083,0.99365,0.0,0.95,0.738434,0.404105
4,0.706815,0.0,1.0,0.0,0.503363,0.568938,0.973059,0.0,1.0,0.59588,...,0.0,1.0,0.0,0.503363,0.224247,0.898574,3.0,1.0,0.59588,0.48991


In [9]:
X.replace("YES",1, inplace=True)
X.replace("NO",0, inplace=True)
cat = X.dtypes[X.dtypes=='object'].index
X[cat].head()

Unnamed: 0,x3,x4,x34,x35,x61,x64,x65,x91,x94,x95
0,rpoSk1aXo+6hZQxVMp/PAw8+w67/vzWkyQs/xGqFCnw=,hCXwO/JldK5zcd9ejOD1FwmEgCf96eTdEVy7OtY2Y2g=,rpoSk1aXo+6hZQxVMp/PAw8+w67/vzWkyQs/xGqFCnw=,hCXwO/JldK5zcd9ejOD1FwmEgCf96eTdEVy7OtY2Y2g=,EfLSpSt1wC0MjAdFeWuFDYR2laBmHlXisFYHkb7CIIQ=,X/hdUOVR5KuExVGLzjhLcM2CyIqym9t0Nh+ZX05M+1w=,+yhSY//Hpg7u0bSA7NYmcmRFgv3bF4Tw3BMHrBqaTtA=,UB4XMFemJzxaAfdj75Bwjm7xYNDCzCkUk7ANRZlr034=,X/hdUOVR5KuExVGLzjhLcM2CyIqym9t0Nh+ZX05M+1w=,+yhSY//Hpg7u0bSA7NYmcmRFgv3bF4Tw3BMHrBqaTtA=
1,/VV6+dCb+in5lV7V/e9b5HoZ/BN34M+dPmKWY8BeHh4=,hCXwO/JldK5zcd9ejOD1FwmEgCf96eTdEVy7OtY2Y2g=,vBrSPPWbvTBrPn/YdHWxt+pFeV6E5wPpNjXS8RILB88=,N8wBprLMRuVKWZMBXX9JHVBdT6S9zQ+NL1ZfWnoGQrk=,0DeSv/rVzbodXvNJK0+KRGddE1skGm60/T8G7xoLB4g=,m/m91A62TG6D7crdQTdYd9eeMO8xNcbN7N7iwgclLp0=,+yhSY//Hpg7u0bSA7NYmcmRFgv3bF4Tw3BMHrBqaTtA=,N8wBprLMRuVKWZMBXX9JHVBdT6S9zQ+NL1ZfWnoGQrk=,wVxsLFUAZzdPJIUFpt1aktySAsTOyUo72bEDtGlI+RQ=,IoM2E9pNxABFR+H3yfapUL+ThKm7GtTzY7js9H/H99o=
2,MZZbXga8gvaCBqWpzrh2iKdOkcsz/bG/z4BVjUnqWT0=,YvZUuCDjLu9VvkCdBWgARWQrvm+FSXgxp0zIrMjcLBc=,YDvUCZu63XmryhV8xiJBhsWEYeM3giPDKXmzpKBDxdc=,B+EJpnEbkYtLnwDQYN1dP1rcfnoCnxAjKLYwQZE07Ew=,bCou8rTLsVvraVfOMQ+PxfDbaX5b2+tIkuRaNCGPgEw=,MZZbXga8gvaCBqWpzrh2iKdOkcsz/bG/z4BVjUnqWT0=,YvZUuCDjLu9VvkCdBWgARWQrvm+FSXgxp0zIrMjcLBc=,Q4Bg+jJRQ1ivy6iLEkWu2O+LgzkeqHGxW+q3MXgJqtc=,MZZbXga8gvaCBqWpzrh2iKdOkcsz/bG/z4BVjUnqWT0=,YvZUuCDjLu9VvkCdBWgARWQrvm+FSXgxp0zIrMjcLBc=
3,X/hdUOVR5KuExVGLzjhLcM2CyIqym9t0Nh+ZX05M+1w=,+yhSY//Hpg7u0bSA7NYmcmRFgv3bF4Tw3BMHrBqaTtA=,8Whd23AFTt1KV61HEnaVzYZCSZsw5sqqmf4WUmWd3bQ=,+yhSY//Hpg7u0bSA7NYmcmRFgv3bF4Tw3BMHrBqaTtA=,8Whd23AFTt1KV61HEnaVzYZCSZsw5sqqmf4WUmWd3bQ=,8Whd23AFTt1KV61HEnaVzYZCSZsw5sqqmf4WUmWd3bQ=,+yhSY//Hpg7u0bSA7NYmcmRFgv3bF4Tw3BMHrBqaTtA=,+yhSY//Hpg7u0bSA7NYmcmRFgv3bF4Tw3BMHrBqaTtA=,p8+Fg/7zVuXwMfQhnbFo6jWbBSc9xt7pQ/ZZRzVmxNE=,+yhSY//Hpg7u0bSA7NYmcmRFgv3bF4Tw3BMHrBqaTtA=
4,4FIxS25OrBv/DHbmmVLtScptssXXAhNxD087PPzA9BU=,B+EJpnEbkYtLnwDQYN1dP1rcfnoCnxAjKLYwQZE07Ew=,xTrPdVJop7A4z6r8XR9ZqxHDaeGn/EWMXEalNIGdbUk=,wueA/6OX5rJJgWep/A3FouhsW3Oup3Te1LLzxFjRZ3I=,9mx6vCyxztkC0/4nqSSLnaDoFkEtpTHi3LzpnellMDg=,xTrPdVJop7A4z6r8XR9ZqxHDaeGn/EWMXEalNIGdbUk=,wueA/6OX5rJJgWep/A3FouhsW3Oup3Te1LLzxFjRZ3I=,FHs4ckB5vuW2CnNz8QXDKdqnhZeKUWmCGl84gU0YpOQ=,mPvFNpHgDw4+ry3ew4Udv9ewcU7LQ5PC1iPNPXZcRcw=,bvw4TBmE4y4uNMa25WcnFYtEBgSwmT6fAm7KrXKV9Og=


# Réduction de dimensionnalité (PCA)

Nous allons d'abord normaliser les données

In [None]:
np.where()

In [1]:
trainLabels.head()

NameError: name 'trainLabels' is not defined

In [None]:
df_features1 = scaler.fit_transform(df_features)

In [None]:
df_features2 = pd.DataFrame(data=df_features1, columns=list(df_features.columns))

In [None]:
df_features2.head()

In [None]:
# PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=33)

In [None]:
df = pca.fit_transform(df_features2)

In [None]:
from sklearn.model_selection import train_test_split
X_train, x, y_train, y = train_test_split(df, df_targets, test_size = 0.3, random_state=1236)
X_val, X_test, y_val, y_test = train_test_split(x,y, test_size=0.5, random_state=1236)

In [None]:
print("Train set 70%")
print(f"X Train set size: {X_train.shape}")
print(f"y Train set size: {y_train.shape}\n")

print("Validation set 15%")
print(f"X validation set size: {X_val.shape}")
print(f"y validation set size: {y_val.shape}")

print("test set 15%")
print(f"X test set size: {X_test.shape}")
print(f"y test set size: {y_test.shape}")

# Entrainement des modèles

## 1 - Label Transformation method
Il en existe 3 methode
* Binary Relevance
* Classification chain
* Label Powerset

### 1.a - Binary relevance avec le reseau des neuronnes

In [None]:
from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain, LabelPowerset
from sklearn.metrics import accuracy_score, hamming_loss
from tensorflow import keras
from sklearn.neural_network import MLPClassifier

In [None]:
# MLP Classifier
mlp_clf = MLPClassifier()
br_mlp = BinaryRelevance(mlp_clf)

In [None]:
br_mlp.fit(X_train, y_train)

le modele avec tous ces paramètres  
BinaryRelevance(classifier=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False),
        require_dense=[True, True])

In [None]:
y_pred = br_mlp.predict(X_test)

In [None]:
# accuracy score
accuracy_score(y_test, y_pred)

On obtient un accuracy de 83.65 %
