# Réduction de dimension

In [None]:
# Librairies
from pickleshare import PickleShareDB
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go

## Chargement des données

Nous chargeons les données depuis le fichier des données préparées.

In [None]:
# Cleaned data: open the PickleShare store located at ../prep_data/kity.
db = PickleShareDB('../prep_data/kity')

# Only one dataset is loaded at a time. The commented-out lines below look like
# the other available captures (attacks 1-3 and a normal-traffic baseline);
# presumably these keys exist in the store -- verify before uncommenting.
#df_net_1 = db['net_attack_1_clean']
#df_net_2 = db['net_attack_2_clean']
#df_net_3 = db['net_attack_3_clean']
df_net_4 = db['net_attack_4_clean']
#df_net_norm = db['net_norm_clean']

In [None]:
# Dataset used by every following cell; rebind this name to analyse another capture.
data = df_net_4

In [None]:
# Preview the first rows of the selected dataset.
data.head()

## Préparation des données

In [None]:
# Remove the 'Time' column; the remaining columns feed the feature/target split.
time_columns = ['Time']
data_reduced = data.drop(labels=time_columns, axis='columns')

In [None]:
# Separate the two label columns (kept for colouring the plots) from the
# feature matrix that goes through encoding, scaling and PCA.
label_columns = ['label', 'label_n']
y_label = data_reduced['label']
y_label_n = data_reduced['label_n']
X = data_reduced.drop(columns=label_columns)

In [None]:
# Replace every object-typed feature column with integer codes so that the
# scaler and the PCA receive purely numeric input.
object_columns = X.select_dtypes(include=['object']).columns
for column in object_columns:
    # A fresh encoder per column: each column gets its own code mapping.
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])

In [None]:
# Standardize the features: learn the per-column statistics first, then apply
# them to the same matrix.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Display the dimensions of the scaled matrix (rows, feature columns).
X_scaled.shape

# Choix du nombre de composantes

In [None]:
# Fit a PCA that keeps as many components as there are feature columns, so the
# explained variance of every component can be inspected afterwards.
n_features = X_scaled.shape[1]
pca_var = PCA(n_components=n_features)
pca_var.fit(X_scaled)

In [None]:
# Explained-variance ratio of each component of the full PCA fitted above.
variance_exp = pca_var.explained_variance_ratio_

In [None]:
# Running total of the explained-variance ratios.
sum_variance = variance_exp.cumsum()

# Build the three table columns separately: component names, then the
# individual and cumulative ratios formatted as percentages.
pc_names = [f'PC{rank}' for rank in range(1, len(variance_exp) + 1)]
explained_pct = [f'{ratio * 100:.2f}%' for ratio in variance_exp]
cumulative_pct = [f'{ratio * 100:.2f}%' for ratio in sum_variance]

table_df = pd.DataFrame({
    'Composante principale': pc_names,
    'Variance expliquée (%)': explained_pct,
    'Variance cumulée (%)': cumulative_pct,
})

print(table_df)

In [None]:
# Bar chart of the explained-variance ratio per principal component, with the
# percentage written on each bar.
pc_labels = [f'PC{rank}' for rank in range(1, len(variance_exp) + 1)]
bar_texts = [f'{ratio:.2%}' for ratio in variance_exp]

variance_bars = go.Bar(
    x=pc_labels,
    y=variance_exp,
    text=bar_texts,
    textposition='auto',
)

fig = go.Figure(data=[variance_bars])

# Show the y axis as whole percentages to match the bar annotations.
fig.update_layout(
    title='Variance expliquée par chaque composante principale',
    xaxis_title='Composantes principales',
    yaxis_title='Variance expliquée',
    yaxis={'tickformat': '.0%'},
)

fig.show()

Pour conserver plus de 90 % de la variance expliquée, nous retenons les 6 premières composantes principales.

# PCA

In [None]:
# Project the scaled features onto 6 principal components -- the count the
# notebook text retains after inspecting the explained variance.
pca = PCA(n_components=6)
X_pca = pca.fit_transform(X_scaled)

In [None]:
# Wrap the projected data in a DataFrame with one 'PCA<k>' column per component.
# The column count is read from X_pca itself instead of being hard-coded, so
# changing n_components in the PCA cell cannot desynchronize the column list
# from the data (which would raise a shape-mismatch error).
n_kept = X_pca.shape[1]
df_pca = pd.DataFrame(X_pca, columns=[f'PCA{rank}' for rank in range(1, n_kept + 1)])

# Re-attach both label columns by position: .values drops the original index,
# which may not match the fresh 0..n-1 index of df_pca.
df_pca['label'] = y_label.values
df_pca['label_n'] = y_label_n.values

In [None]:
# Print the first rows of the projected data together with its label columns.
print(df_pca.head())

# Visualisation du résultat

In [None]:
# Scatter plot of components 1 and 2, coloured by the 'label' column.
pc12_axis_labels = {'PCA1': 'Dim 1', 'PCA2': 'Dim 2'}
fig1 = px.scatter(
    data_frame=df_pca,
    x='PCA1',
    y='PCA2',
    color='label',
    title='PC1 - PC2',
    labels=pc12_axis_labels,
)

fig1.show()

In [None]:
# Scatter plot of components 3 and 4, coloured by the 'label' column.
pc34_axis_labels = {'PCA3': 'Dim 3', 'PCA4': 'Dim 4'}
fig2 = px.scatter(
    data_frame=df_pca,
    x='PCA3',
    y='PCA4',
    color='label',
    title='PC3 - PC4',
    labels=pc34_axis_labels,
)
fig2.show()


In [None]:
# Scatter plot of components 5 and 6, coloured by the 'label' column.
pc56_axis_labels = {'PCA5': 'Dim 5', 'PCA6': 'Dim 6'}
fig3 = px.scatter(
    data_frame=df_pca,
    x='PCA5',
    y='PCA6',
    color='label',
    title='PC5 - PC6',
    labels=pc56_axis_labels,
)

fig3.show()

In [None]:
# TODO: investigate why this does not work.
# The 3D scatter of the first three components is currently disabled: it is
# wrapped in a string literal, so none of the code below is executed.
'''
fig_3d = px.scatter_3d(df_pca, x='PCA1', y='PCA2', z='PCA3', color='label', 
                       title='Visualisation 3D des trois premières composantes principales')
fig_3d.show()
'''

In [None]:
# TODO : Remettre les labels dans le résultat de la PCA ?
# TODO : Affichage 3D
# TODO : Enregistrer les résultats dans la base de données
# TODO : Utiliser Time dans la PCA ?