In [52]:
import pandas as pd

## Load data

In [53]:
def load_data(file, encoding='utf-16', sep='\t', data_dir='../datasets/Physical dataset'):
    """Read one CSV file of the physical dataset into a DataFrame.

    Parameters
    ----------
    file : str
        File name without the ``.csv`` extension (e.g. ``'phy_att_1'``).
    encoding : str, default 'utf-16'
        Text encoding forwarded to ``pd.read_csv``.
    sep : str, default '\\t'
        Field separator forwarded to ``pd.read_csv``.
    data_dir : str, default '../datasets/Physical dataset'
        Directory that contains the CSV files. The default keeps the
        location this notebook has always read from.

    Returns
    -------
    pandas.DataFrame
        The parsed content of ``<data_dir>/<file>.csv``.
    """
    return pd.read_csv(f'{data_dir}/{file}.csv', encoding=encoding, sep=sep)

In [54]:
# Raw frames, kept exactly as read from disk.
df_load_phy_1, df_load_phy_2, df_load_phy_3 = (
    load_data(f'phy_att_{run}') for run in (1, 2, 3)
)
# phy_att_4 is read with utf-8 / comma instead of the utf-16 / tab defaults.
df_load_phy_4 = load_data('phy_att_4', sep=',', encoding='utf-8')
df_load_phy_norm = load_data('phy_norm')

In [55]:
# Working copies: the df_load_* frames stay untouched so the raw data can be exported too.
df_phy_1, df_phy_2, df_phy_3, df_phy_4, df_phy_norm = (
    raw.copy()
    for raw in (df_load_phy_1, df_load_phy_2, df_load_phy_3, df_load_phy_4, df_load_phy_norm)
)

## Exploration

In [56]:
# (rows, columns) of every dataset, one per line.
for frame in (df_phy_1, df_phy_2, df_phy_3, df_phy_4, df_phy_norm):
    print(frame.shape)

(2420, 43)
(2104, 43)
(1254, 43)
(1717, 43)
(3428, 43)


In [57]:
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping
# it in print() only appended a stray "None" line after the report.
df_phy_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2420 entries, 0 to 2419
Data columns (total 43 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Time           2420 non-null   object
 1   Tank_1         2420 non-null   int64 
 2   Tank_2         2420 non-null   int64 
 3   Tank_3         2420 non-null   int64 
 4   Tank_4         2420 non-null   int64 
 5   Tank_5         2420 non-null   int64 
 6   Tank_6         2420 non-null   int64 
 7   Tank_7         2420 non-null   int64 
 8   Tank_8         2420 non-null   int64 
 9   Pump_1         2420 non-null   bool  
 10  Pump_2         2420 non-null   bool  
 11  Pump_3         2420 non-null   bool  
 12  Pump_4         2420 non-null   bool  
 13  Pump_5         2420 non-null   bool  
 14  Pump_6         2420 non-null   bool  
 15  Flow_sensor_1  2420 non-null   int64 
 16  Flow_sensor_2  2420 non-null   int64 
 17  Flow_sensor_3  2420 non-null   int64 
 18  Flow_sensor_4  2420 non-null

In [58]:
# On voit qu'on a une colonne de temps, on la transformera en datetime
# On a aussi des entiers (Tank et Flow_sensor),
# des booléens (Valve et Pump)
# 2 colonnes étiquetées : Label : catégorielle, Label_n : entier

In [59]:
# Frequency of each Label value per dataset.
# df_phy_2 contains the typo "nomal", which is turned into "normal" later on.
for frame in (df_phy_1, df_phy_2, df_phy_3, df_phy_4, df_phy_norm):
    print(frame['Label'].value_counts())

Label
normal            1610
MITM               533
physical fault     277
Name: count, dtype: int64
Label
normal            1549
nomal              249
physical fault     123
MITM                96
DoS                 80
scan                 7
Name: count, dtype: int64
Label
normal            911
physical fault    152
MITM              114
DoS                77
Name: count, dtype: int64
Label
normal            1159
MITM               265
DoS                153
physical fault     133
scan                 7
Name: count, dtype: int64
Label
normal    3428
Name: count, dtype: int64


In [60]:
# Label : quel type d'attaques (ou juste normal si pas d'attaque)
# Label_n : 0 pour normal, 1 pour attaque
# Label_n n'est pas réellement un int, c'est une catégorie : on va le transformer en booléen

In [61]:
# Store Label with the 'category' dtype instead of object.
for frame in (df_phy_1, df_phy_2, df_phy_3, df_phy_4, df_phy_norm):
    frame['Label'] = frame['Label'].astype('category')

## Clean data

In [62]:
# Parse the Time column into datetimes (day comes first in the source strings).
for frame in (df_phy_1, df_phy_2, df_phy_3, df_phy_4, df_phy_norm):
    frame['Time'] = pd.to_datetime(frame['Time'], dayfirst=True)

In [63]:
# df_phy_2 spells the column 'Lable_n'; rename it to 'Label_n' (in place).
column_fix = {'Lable_n': 'Label_n'}
df_phy_2.rename(column_fix, axis='columns', inplace=True)

In [64]:
# Label_n is a 0/1 flag rather than a quantity: cast it to bool in every dataset.
for frame in (df_phy_1, df_phy_2, df_phy_3, df_phy_4, df_phy_norm):
    frame['Label_n'] = frame['Label_n'].astype('bool')

In [65]:
# Fix the "nomal" typo in df_phy_2's Label column.
# The .str accessor on a categorical column returns a plain object column, which
# silently undid the conversion of Label to 'category' for this frame only.
# Cast back to 'category' so df_phy_2 keeps the same Label dtype as the other frames.
df_phy_2['Label'] = (
    df_phy_2['Label']
    .str.replace('nomal', 'normal', regex=False)
    .astype('category')
)

## Exploration (2)

In [66]:
# Check that Label_n is True exactly when Label names an attack,
# and False exactly when Label is "normal".
label_columns = ['Label', 'Label_n']
for frame in (df_phy_1, df_phy_2, df_phy_3, df_phy_4, df_phy_norm):
    print(frame[label_columns].value_counts())

Label           Label_n
normal          False      1610
MITM            True        533
physical fault  True        277
Name: count, dtype: int64
Label           Label_n
normal          False      1798
physical fault  True        123
MITM            True         96
DoS             True         80
scan            True          7
Name: count, dtype: int64
Label           Label_n
normal          False      911
physical fault  True       152
MITM            True       114
DoS             True        77
Name: count, dtype: int64
Label           Label_n
normal          False      1159
MITM            True        265
DoS             True        153
physical fault  True        133
scan            True          7
Name: count, dtype: int64
Label   Label_n
normal  False      3428
Name: count, dtype: int64


In [67]:
# Are consecutive Time values always the same distance apart?
for frame in (df_phy_1, df_phy_2, df_phy_3, df_phy_4, df_phy_norm):
    print(frame['Time'].diff().value_counts())

Time
0 days 00:00:01    2419
Name: count, dtype: int64
Time
0 days 00:00:01    2102
0 days 00:00:02       1
Name: count, dtype: int64
Time
0 days 00:00:01    1253
Name: count, dtype: int64
Time
0 days 00:00:01    1716
Name: count, dtype: int64
Time
0 days 00:00:01    3427
Name: count, dtype: int64


In [68]:
# Tout est espacé d'une seconde, sauf dans df_phy_2 où deux lignes consécutives sont espacées de 2 secondes (il manque donc une mesure)

In [69]:
# Number of missing values per column, for each dataset.
for frame in (df_phy_1, df_phy_2, df_phy_3, df_phy_4, df_phy_norm):
    print(frame.isna().sum())

Time             0
Tank_1           0
Tank_2           0
Tank_3           0
Tank_4           0
Tank_5           0
Tank_6           0
Tank_7           0
Tank_8           0
Pump_1           0
Pump_2           0
Pump_3           0
Pump_4           0
Pump_5           0
Pump_6           0
Flow_sensor_1    0
Flow_sensor_2    0
Flow_sensor_3    0
Flow_sensor_4    0
Valv_1           0
Valv_2           0
Valv_3           0
Valv_4           0
Valv_5           0
Valv_6           0
Valv_7           0
Valv_8           0
Valv_9           0
Valv_10          0
Valv_11          0
Valv_12          0
Valv_13          0
Valv_14          0
Valv_15          0
Valv_16          0
Valv_17          0
Valv_18          0
Valv_19          0
Valv_20          0
Valv_21          0
Valv_22          0
Label_n          0
Label            0
dtype: int64
Time             0
Tank_1           0
Tank_2           0
Tank_3           0
Tank_4           0
Tank_5           0
Tank_6           0
Tank_7           0
Tank_8           0

In [70]:
# non

## Clean data (2)

In [71]:
# Combler le trou de 1 seconde : TODO 
# df2
# combien de différence pour chaque colonne
# genre est ce que si gros trous on peut combler avec la moyenne ?
# ou dupliquer la ligne précédente
# plot 
# regarder ce qu'il se passe sur une visualisation pour dire la décision qu'on prend

In [72]:
# Transformer pour "perdre" le timestamp ? 

## Exploration (3)

### Flow sensor

In [73]:
# Summary statistics of the four Flow_sensor columns in df_phy_1.
flow_sensor_columns = [f'Flow_sensor_{sensor}' for sensor in range(1, 5)]
print(df_phy_1[flow_sensor_columns].describe())

       Flow_sensor_1  Flow_sensor_2  Flow_sensor_3  Flow_sensor_4
count    2420.000000    2420.000000         2420.0    2420.000000
mean     1634.090909    1595.041322            0.0    1283.821488
std      1963.983001    1958.977859            0.0    1738.336519
min         0.000000       0.000000            0.0       0.000000
25%         0.000000       0.000000            0.0       0.000000
50%         0.000000       0.000000            0.0       0.000000
75%      4000.000000    4000.000000            0.0    2769.000000
max      4000.000000    4000.000000            0.0    4789.000000


In [74]:
# How many distinct values does each Flow_sensor column take, per dataset?
flow_sensor_columns = ['Flow_sensor_1', 'Flow_sensor_2', 'Flow_sensor_3', 'Flow_sensor_4']
for frame in (df_phy_1, df_phy_2, df_phy_3, df_phy_4, df_phy_norm):
    print(frame[flow_sensor_columns].nunique())

Flow_sensor_1      3
Flow_sensor_2      2
Flow_sensor_3      1
Flow_sensor_4    686
dtype: int64
Flow_sensor_1      2
Flow_sensor_2      2
Flow_sensor_3      1
Flow_sensor_4    608
dtype: int64
Flow_sensor_1      2
Flow_sensor_2      2
Flow_sensor_3      1
Flow_sensor_4    262
dtype: int64
Flow_sensor_1      2
Flow_sensor_2      2
Flow_sensor_3      1
Flow_sensor_4    486
dtype: int64
Flow_sensor_1      2
Flow_sensor_2      2
Flow_sensor_3      1
Flow_sensor_4    965
dtype: int64


In [75]:
# qu'une valeur pour Flow_sensor_3, vérifions que c'est la même dans chaque dataset
# 2 valeurs pour Flow_sensor_1 (3 pour df_phy_1) et pour Flow_sensor_2, beaucoup plus pour Flow_sensor_4

In [76]:
# Is Flow_sensor_3 zero everywhere? Print its value counts for each dataset.
for frame in (df_phy_1, df_phy_2, df_phy_3, df_phy_4, df_phy_norm):
    print(frame['Flow_sensor_3'].value_counts())

Flow_sensor_3
0    2420
Name: count, dtype: int64
Flow_sensor_3
0    2104
Name: count, dtype: int64
Flow_sensor_3
0    1254
Name: count, dtype: int64
Flow_sensor_3
0    1717
Name: count, dtype: int64
Flow_sensor_3
0    3428
Name: count, dtype: int64


In [77]:
# Flow_sensor_3 only contains zeros: remove it from every dataset (in place).
for frame in (df_phy_1, df_phy_2, df_phy_3, df_phy_4, df_phy_norm):
    frame.drop(columns='Flow_sensor_3', inplace=True)

In [78]:
# pour les autres Flow_sensor on a plus de la moitié des valeurs à 0
# on va donc analyser visuellement la distribution des valeurs non nulles
## TODO : commenter viz

In [79]:
# How many rows of df_phy_1 have all three remaining flow sensors at 0?
all_flow_zero = (df_phy_1[['Flow_sensor_1', 'Flow_sensor_2', 'Flow_sensor_4']] == 0).all(axis=1)
print(df_phy_1[all_flow_zero].shape)

(452, 42)


In [80]:
# est ce qu'on a maximum qu'un Flow_sensor à la fois ?

# Fonction pour vérifier la condition
def check_condition(row):
    """Tell whether exactly one flow sensor of ``row`` is non-zero.

    ``row`` is indexed by column name. Returns True when a single column among
    Flow_sensor_1, Flow_sensor_2 and Flow_sensor_4 differs from 0, and False
    otherwise (including when all three are 0).
    """
    columns = ['Flow_sensor_1', 'Flow_sensor_2', 'Flow_sensor_4']
    active = [name for name in columns if row[name] != 0]
    return len(active) == 1

# Evaluate the check on every row of a copy of df_phy_1 and count both outcomes.
test = df_phy_1.assign(
    Condition_met=lambda frame: frame.apply(check_condition, axis=1)
)

condition_counts = test['Condition_met'].value_counts()
print(condition_counts)

Condition_met
False    1462
True      958
Name: count, dtype: int64


In [81]:
# non : on a des lignes où plusieurs Flow_sensor ne sont pas à 0

In [82]:
# Which values does Flow_sensor_1 take in each dataset?
for frame in (df_phy_1, df_phy_2, df_phy_3, df_phy_4, df_phy_norm):
    print(frame['Flow_sensor_1'].value_counts())

Flow_sensor_1
0       1368
4000     987
100       65
Name: count, dtype: int64
Flow_sensor_1
0       1385
4000     719
Name: count, dtype: int64
Flow_sensor_1
0       876
4000    378
Name: count, dtype: int64
Flow_sensor_1
0       1118
4000     599
Name: count, dtype: int64
Flow_sensor_1
0       2052
4000    1376
Name: count, dtype: int64


In [83]:
# Flow_sensor_1 is 0 or 4000 everywhere, plus a few 100 in df_phy_1 only,
# so it is stored as a categorical column.
# NOTE(review): categories are inferred per frame, so df_phy_1 ends up with a
# different category set than the other frames — confirm this is intended.
for frame in (df_phy_1, df_phy_2, df_phy_3, df_phy_4, df_phy_norm):
    frame["Flow_sensor_1"] = frame["Flow_sensor_1"].astype('category')

In [84]:
# Which values does Flow_sensor_2 take in each dataset?
for frame in (df_phy_1, df_phy_2, df_phy_3, df_phy_4, df_phy_norm):
    print(frame['Flow_sensor_2'].value_counts())

Flow_sensor_2
0       1455
4000     965
Name: count, dtype: int64
Flow_sensor_2
0       1335
4000     769
Name: count, dtype: int64
Flow_sensor_2
0       867
4000    387
Name: count, dtype: int64
Flow_sensor_2
0       1049
4000     668
Name: count, dtype: int64
Flow_sensor_2
0       2012
4000    1416
Name: count, dtype: int64


In [85]:
# Flow_sensor_2 is either 0 or 4000 in every dataset, so the numeric value is
# dropped in favour of a boolean flag: 0 -> False, anything else -> True.
for frame in (df_phy_1, df_phy_2, df_phy_3, df_phy_4, df_phy_norm):
    frame['Flow_sensor_2'] = frame['Flow_sensor_2'] != 0

### Tank

In [86]:
# Summary statistics of the eight Tank columns in df_phy_1.
tank_columns = [f'Tank_{tank}' for tank in range(1, 9)]
print(df_phy_1[tank_columns].describe())

            Tank_1       Tank_2       Tank_3       Tank_4       Tank_5  \
count  2420.000000  2420.000000  2420.000000  2420.000000  2420.000000   
mean    686.320661   714.946281   922.677686   362.064463   361.733058   
std     654.955181   656.051613   942.572849   366.970644   278.799110   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       2.000000     5.000000    73.250000    27.000000    30.500000   
50%     598.000000   639.500000   747.500000   231.000000   370.000000   
75%    1244.000000  1293.250000  1226.250000   692.250000   605.250000   
max    1980.000000  1952.000000  3427.000000  1134.000000   835.000000   

            Tank_6       Tank_7       Tank_8  
count  2420.000000  2420.000000  2420.000000  
mean    198.284298   207.890083   163.835124  
std     190.497343   216.415569   179.054521  
min       0.000000     0.000000     0.000000  
25%       3.000000     0.000000     0.000000  
50%     165.500000   144.000000    86.000000  
75% 

In [87]:
# Number of distinct values per Tank column in df_phy_1.
tank_columns = [f'Tank_{tank}' for tank in range(1, 9)]
print(df_phy_1[tank_columns].nunique())

Tank_1    1049
Tank_2    1120
Tank_3    1234
Tank_4     807
Tank_5     721
Tank_6     482
Tank_7      99
Tank_8     424
dtype: int64


In [88]:
# est ce qu'on a qu'un Tank à la fois ?

# Fonction pour vérifier la condition
def check_condition(row):
    """Tell whether exactly one tank of ``row`` is non-zero.

    ``row`` is indexed by column name. Returns True when a single column among
    Tank_1 ... Tank_8 differs from 0, and False otherwise (including when all
    eight are 0).
    """
    columns = ['Tank_1', 'Tank_2', 'Tank_3', 'Tank_4', 'Tank_5', 'Tank_6', 'Tank_7', 'Tank_8']
    active = [name for name in columns if row[name] != 0]
    return len(active) == 1

# Evaluate the check on every row of a copy of df_phy_1 and count both outcomes.
test = df_phy_1.assign(
    Condition_met=lambda frame: frame.apply(check_condition, axis=1)
)

condition_counts = test['Condition_met'].value_counts()
print(condition_counts)

Condition_met
False    2414
True        6
Name: count, dtype: int64


In [89]:
# non on a que 6 lignes où une seule colonne est non nulle

In [90]:
# TODO : plot distribution tank

### Pump

In [91]:
# TODO : visualiser la répartition des valeurs non nulles pour chaque dataset

In [92]:
# Can several pumps be True on the same row?
# Compare the number of True values per pump, and their total, with the row count.
print("nb ligne dans phy1 : ", len(df_phy_1))
pump_counts = [df_phy_1[f'Pump_{pump}'].sum() for pump in range(1, 7)]
val_1, val_2, val_3, val_4, val_5, val_6 = pump_counts
print(*pump_counts)
print(sum(pump_counts))

nb ligne dans phy1 :  2420
726 590 0 461 991 966
3734


In [93]:
# oui : plusieurs true à la fois 
# mais on voit que Pump_3 toujours à False, vérifions dans autres datasets

In [94]:
# Value counts of Pump_3 in every dataset.
# Only False everywhere -> the column can be removed.
for frame in (df_phy_1, df_phy_2, df_phy_3, df_phy_4, df_phy_norm):
    print(frame['Pump_3'].value_counts().to_dict())

{False: 2420}
{False: 2104}
{False: 1254}
{False: 1717}
{False: 3428}


In [95]:
# Pump_3 is constant (always False): remove it from every dataset (in place).
for frame in (df_phy_1, df_phy_2, df_phy_3, df_phy_4, df_phy_norm):
    frame.drop(columns='Pump_3', inplace=True)

### Valv

In [96]:
# How many True / False values does each of the 22 valves have in df_phy_1?
print("nb ligne dans phy1 : ", len(df_phy_1))
valve_names = [f'Valv_{valve}' for valve in range(1, 23)]
for valve_name in valve_names:
    print(valve_name, df_phy_1[valve_name].value_counts().to_dict())
sum_valv = sum(df_phy_1[valve_name].sum() for valve_name in valve_names)
print(sum_valv)
# Same as for the pumps: the total exceeds the row count, so several valves are True at once.
# Many columns are constantly False; check the other datasets next.

nb ligne dans phy1 :  2420
Valv_1 {False: 2420}
Valv_2 {False: 2420}
Valv_3 {False: 2420}
Valv_4 {False: 2420}
Valv_5 {False: 2420}
Valv_6 {False: 2420}
Valv_7 {False: 2420}
Valv_8 {False: 2420}
Valv_9 {False: 2420}
Valv_10 {True: 1615, False: 805}
Valv_11 {True: 1615, False: 805}
Valv_12 {True: 1614, False: 806}
Valv_13 {True: 1269, False: 1151}
Valv_14 {True: 1269, False: 1151}
Valv_15 {True: 1269, False: 1151}
Valv_16 {False: 2420}
Valv_17 {False: 1486, True: 934}
Valv_18 {False: 1861, True: 559}
Valv_19 {False: 2420}
Valv_20 {True: 1500, False: 920}
Valv_21 {False: 2420}
Valv_22 {False: 1372, True: 1048}
12692


In [97]:
# Valv_1 to Valv_9, Valv_16, Valv_19 and Valv_21 are always False in df_phy_1.

# Check the same columns over all datasets stacked together.
test_all = pd.concat([df_phy_1, df_phy_2, df_phy_3, df_phy_4, df_phy_norm])

for valve in (*range(1, 10), 16, 19, 21):
    print(test_all[f'Valv_{valve}'].value_counts().to_dict())

{False: 10923}
{False: 10923}
{False: 10923}
{False: 10923}
{False: 10923}
{False: 10923}
{False: 10923}
{False: 10923}
{False: 10923}
{False: 10923}
{False: 10923}
{False: 10923}


In [98]:
# on peut supprimer ces colonnes

def drop_columns(df, columns=None):
    """Remove the constant valve columns from ``df`` in place and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; it is mutated in place.
    columns : iterable of str, optional
        Columns to remove. Defaults to the valves found to be always False:
        Valv_1 to Valv_9, Valv_16, Valv_19 and Valv_21.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, so both ``drop_columns(df)`` and
        ``df = drop_columns(df)`` work.

    Notes
    -----
    Columns that are already absent are skipped instead of raising KeyError,
    so the cell calling this function can be re-run safely.
    """
    if columns is None:
        columns = [
            "Valv_1", "Valv_2", "Valv_3", "Valv_4", "Valv_5", "Valv_6",
            "Valv_7", "Valv_8", "Valv_9", "Valv_16", "Valv_19", "Valv_21",
        ]
    df.drop(columns=list(columns), inplace=True, errors='ignore')
    return df

# drop_columns edits each frame in place and hands the same object back,
# so the existing names keep pointing at the trimmed frames.
for frame in (df_phy_1, df_phy_2, df_phy_3, df_phy_4, df_phy_norm):
    drop_columns(frame)

In [99]:
# Can several of the remaining valves be True on the same row?
# Compare the number of True values per valve, and their total, with the row count.
print("nb ligne dans phy1 : ", len(df_phy_1))
valve_counts = [
    df_phy_1[f'Valv_{valve}'].sum()
    for valve in (10, 11, 12, 13, 14, 15, 17, 18, 20, 22)
]
val_1, val_2, val_3, val_4, val_5, val_6, val_7, val_8, val_9, val_10 = valve_counts
print(*valve_counts)
print(sum(valve_counts))

nb ligne dans phy1 :  2420
1615 1615 1614 1269 1269 1269 934 559 1500 1048
12692


In [100]:
# oui : plusieurs true à la fois

### Préparation à exporter les données pour les autres notebooks et les visualisations

In [101]:
# Stacked views of the cleaned frames (attack runs only / every run) and of the raw frames.
# NOTE(review): the frames are concatenated with their original row labels, so the
# stacked frames contain repeated index values — confirm downstream code expects that.
attack_frames = [df_phy_1, df_phy_2, df_phy_3, df_phy_4]
df_phy_attack = pd.concat(attack_frames)
df_phy_all = pd.concat([*attack_frames, df_phy_norm])
df_phy_load_all = pd.concat(
    [df_load_phy_1, df_load_phy_2, df_load_phy_3, df_load_phy_4, df_load_phy_norm]
)
# Cleaned frames by dataset name, plus the stacked frame under "all".
dict_dfs = dict(
    phy_att_1=df_phy_1,
    phy_att_2=df_phy_2,
    phy_att_3=df_phy_3,
    phy_att_4=df_phy_4,
    phy_norm=df_phy_norm,
    all=df_phy_all,
)

In [102]:
import os
from pickleshare import PickleShareDB

# Store every cleaned and raw frame in a PickleShare database under ../prep_data/kity.
data_dir = '../prep_data'
os.makedirs(data_dir, exist_ok=True)
db = PickleShareDB(os.path.join(data_dir, 'kity'))

# Key -> object, written in this order.
exports = {
    'df_phy_1': df_phy_1,
    'df_phy_2': df_phy_2,
    'df_phy_3': df_phy_3,
    'df_phy_4': df_phy_4,
    'df_phy_norm': df_phy_norm,
    'df_phy_attack': df_phy_attack,
    'df_phy_all': df_phy_all,
    'dict_dfs': dict_dfs,
    'df_load_phy_1': df_load_phy_1,
    'df_load_phy_2': df_load_phy_2,
    'df_load_phy_3': df_load_phy_3,
    'df_load_phy_4': df_load_phy_4,
    'df_load_phy_norm': df_load_phy_norm,
    'df_load_phy_all': df_phy_load_all,
    'dict_dfs_load': {
        "phy_att_1": df_load_phy_1,
        "phy_att_2": df_load_phy_2,
        "phy_att_3": df_load_phy_3,
        "phy_att_4": df_load_phy_4,
        "phy_norm": df_load_phy_norm,
        "all": df_phy_load_all,
    },
}
for key, value in exports.items():
    db[key] = value