## Partie1: Exploration des Donnees

### Importer les bibliothèque

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve,  precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
import joblib

### Telecharger Le Fichier csv, et Lire les 5 premier ligne

In [4]:
df1 = pd.read_csv("data/mainSimulationAccessTraces.csv")
df2 = pd.read_csv("data/periodicityDataset.csv")

print("Duplicates in df1:", df1.duplicated().sum())
print("Duplicates in df2:", df2.duplicated().sum())

common_rows = pd.merge(df1, df2, how='inner')
print("Number of duplicate rows between df1 and df2:", len(common_rows))
print(df1["timestamp"])
df1['timestamp'] = pd.to_datetime(df1['timestamp'], unit='ms')
df2['timestamp'] = pd.to_datetime(df2['timestamp'], unit='ms')
print(df1["timestamp"])
# Overview of the time ranges
print("Main dataset time range:", df1['timestamp'].min(), "to", df1['timestamp'].max())
print("Periodicity dataset time range:", df2['timestamp'].min(), "to", df2['timestamp'].max())

# Optional: check for overlap
overlap = pd.merge(df1, df2, on='timestamp', how='inner')
print("Number of events with same timestamp in both datasets:", len(overlap))

Duplicates in df1: 1912
Duplicates in df2: 0
Number of duplicate rows between df1 and df2: 0
0         1520031600000
1         1520031603269
2         1520031603279
3         1520031603290
4         1520031603464
              ...      
357947    1520117997001
357948    1520117997339
357949    1520117997401
357950    1520117998907
357951    1520117999000
Name: timestamp, Length: 357952, dtype: int64
0        2018-03-02 23:00:00.000
1        2018-03-02 23:00:03.269
2        2018-03-02 23:00:03.279
3        2018-03-02 23:00:03.290
4        2018-03-02 23:00:03.464
                   ...          
357947   2018-03-03 22:59:57.001
357948   2018-03-03 22:59:57.339
357949   2018-03-03 22:59:57.401
357950   2018-03-03 22:59:58.907
357951   2018-03-03 22:59:59.000
Name: timestamp, Length: 357952, dtype: datetime64[ns]
Main dataset time range: 2018-03-02 23:00:00 to 2018-03-03 22:59:59
Periodicity dataset time range: 2018-02-28 07:00:00 to 2018-03-01 18:59:48.040000
Number of events with same ti

In [5]:
cols = [
    "sourceID", "sourceAddress", "sourceType", "sourceLocation",
    "destinationServiceAddress", "destinationServiceType", "destinationLocation",
    "accessedNodeAddress", "accessedNodeType"
]

main_sub = df1[cols]
period_sub = df2[cols]

for col in cols:
    print(f"Column: {col}")
    print(" - Main dataset unique values:", main_sub[col].nunique())
    print(" - Periodicity dataset unique values:", period_sub[col].nunique())
    print()

for col in cols:
    main_values = set(main_sub[col].dropna().unique())
    period_values = set(period_sub[col].dropna().unique())

    only_in_main = main_values - period_values
    only_in_period = period_values - main_values

    print(f"Column: {col}")
    print(f" - Only in main dataset: {len(only_in_main)} values")
    print(f" - Only in periodicity dataset: {len(only_in_period)} values")
    print()

for col in cols:
    print(f"Column: {col}")
    print("Main dataset top 5:")
    print(main_sub[col].value_counts().head())
    print("Periodicity dataset top 5:")
    print(period_sub[col].value_counts().head())
    print("-" * 40)

Column: sourceID
 - Main dataset unique values: 84
 - Periodicity dataset unique values: 1

Column: sourceAddress
 - Main dataset unique values: 89
 - Periodicity dataset unique values: 1

Column: sourceType
 - Main dataset unique values: 8
 - Periodicity dataset unique values: 1

Column: sourceLocation
 - Main dataset unique values: 21
 - Periodicity dataset unique values: 1

Column: destinationServiceAddress
 - Main dataset unique values: 85
 - Periodicity dataset unique values: 2

Column: destinationServiceType
 - Main dataset unique values: 8
 - Periodicity dataset unique values: 2

Column: destinationLocation
 - Main dataset unique values: 21
 - Periodicity dataset unique values: 2

Column: accessedNodeAddress
 - Main dataset unique values: 170
 - Periodicity dataset unique values: 2

Column: accessedNodeType
 - Main dataset unique values: 12
 - Periodicity dataset unique values: 2

Column: sourceID
 - Only in main dataset: 83 values
 - Only in periodicity dataset: 0 values

Colum

In [2]:
df_original= pd.read_csv("data/mainSimulationAccessTraces.csv", low_memory=False)
df_original.head()

Unnamed: 0,sourceID,sourceAddress,sourceType,sourceLocation,destinationServiceAddress,destinationServiceType,destinationLocation,accessedNodeAddress,accessedNodeType,operation,value,timestamp,normality
0,lightcontrol2,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,registerService,none,1520031600000,normal
1,lightcontrol3,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,registerService,none,1520031603269,normal
2,lightcontrol1,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,registerService,none,1520031603279,normal
3,lightcontrol4,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,registerService,none,1520031603290,normal
4,movement4,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,registerService,none,1520031603464,normal


### Lire la 5 dernier ligne 

In [3]:
df_original.tail()

Unnamed: 0,sourceID,sourceAddress,sourceType,sourceLocation,destinationServiceAddress,destinationServiceType,destinationLocation,accessedNodeAddress,accessedNodeType,operation,value,timestamp,normality
357947,lightcontrol23,/agent23/lightcontrol23,/lightControler,room_4,/agent23/tempin23,/sensorService,room_4,/agent23/tempin23,/sensorService,read,20.3479,1520117997001,normal
357948,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charge,/basic/number,read,9,1520117997339,normal
357949,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charging,/basic/text,read,true,1520117997401,normal
357950,lightcontrol28,/agent28/lightcontrol28,/lightControler,room_9,/agent28/movement28,/movementSensor,room_9,/agent28/movement28/movement,/derived/boolean,read,0,1520117998907,normal
357951,lightcontrol28,/agent28/lightcontrol28,/lightControler,room_9,/agent28/tempin28,/sensorService,room_9,/agent28/tempin28,/sensorService,read,20.4131,1520117999000,normal


### Donner les information sur les donner

In [4]:
df_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 357952 entries, 0 to 357951
Data columns (total 13 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   sourceID                   357952 non-null  object
 1   sourceAddress              357952 non-null  object
 2   sourceType                 357952 non-null  object
 3   sourceLocation             357952 non-null  object
 4   destinationServiceAddress  357952 non-null  object
 5   destinationServiceType     357952 non-null  object
 6   destinationLocation        357952 non-null  object
 7   accessedNodeAddress        357952 non-null  object
 8   accessedNodeType           357804 non-null  object
 9   operation                  357952 non-null  object
 10  value                      355902 non-null  object
 11  timestamp                  357952 non-null  int64 
 12  normality                  357952 non-null  object
dtypes: int64(1), object(12)
memory usage: 35.5+ 

#### target = "normality", "normal" = 0, "anomalous" = 1, NaN = NaN

In [5]:
df = df_original.copy()
df['normality'] = df['normality'].apply(lambda x: 0 if x == "normal" else 1 if pd.notna(x) else np.nan)
df['normality'].value_counts()


normality
0    347935
1     10017
Name: count, dtype: int64

### Exploraty Data Analysis (EDA): Explorer les donnees

In [6]:
def exploraty_analysis(df, target=None):
  n_rows, n_cols = df.shape
  dtypes = df.dtypes.astype(str)
  missing_count = df.isnull().sum()
  missing_pct = (missing_count / n_rows *100).round(2)
  missing_info = (
      pd.concat([missing_count, missing_pct], axis=1)
        .rename(columns={0:"count", 1:"percentage"})
        .query("count > 0")
  )
  empty_columns = missing_pct[missing_pct == 100].index.tolist()
  duplicates = int(df.duplicated().sum())
  numeric_stats = df.select_dtypes(include="number").describe()
  if target and target in df.columns:
    target_dist = (
        df[target]
        .value_counts()
        .rename("count")
        .to_frame()
        .assign(percentage=lambda x: x["count"] / n_rows * 100)
        .round(2)
    )
  else:
    target_dist = None
  identifier_keywords = ["id", "seq", "time"]
  numeric_cols = df.select_dtypes(include="number").columns
  filtred_numeric = [
      c for c in numeric_cols
      if not any(k in c.lower() for k in identifier_keywords)
  ]

  corr_matrix = (
      df[filtred_numeric].corr()
      if len(filtred_numeric) > 1
      else None
  )

  return {
      "Dimention": {"rows": n_rows, "columns": n_cols},
      "Type de donnees": dtypes.value_counts().to_dict(),
      "valeur manquante": missing_info,
      "colonees vides": empty_columns,
      "doublons": duplicates,
      "statistiques numeriques": numeric_stats,
      "distribution de la target": target_dist,
      "correlations": corr_matrix,
  }

EDA = exploraty_analysis(df, "normality")

EDA

{'Dimention': {'rows': 357952, 'columns': 13},
 'Type de donnees': {'object': 11, 'int64': 2},
 'valeur manquante':                   count  percentage
 accessedNodeType    148        0.04
 value              2050        0.57,
 'colonees vides': [],
 'doublons': 1912,
 'statistiques numeriques':           timestamp      normality
 count  3.579520e+05  357952.000000
 mean   1.520078e+12       0.027984
 std    2.465664e+07       0.164928
 min    1.520032e+12       0.000000
 25%    1.520057e+12       0.000000
 50%    1.520080e+12       0.000000
 75%    1.520099e+12       0.000000
 max    1.520118e+12       1.000000,
 'distribution de la target':             count  percentage
 normality                    
 0          347935        97.2
 1           10017         2.8,
 'correlations': None}

## Partie 2: Preparation et Feature Engineering

### Supprimer Les doublons

In [7]:
# supprimer les doublons
df_with_dorped_duplcate = df.drop_duplicates()
EDA = exploraty_analysis(df_with_dorped_duplcate, "normality")
EDA

{'Dimention': {'rows': 356040, 'columns': 13},
 'Type de donnees': {'object': 11, 'int64': 2},
 'valeur manquante':                   count  percentage
 accessedNodeType    147        0.04
 value              1464        0.41,
 'colonees vides': [],
 'doublons': 0,
 'statistiques numeriques':           timestamp      normality
 count  3.560400e+05  356040.000000
 mean   1.520078e+12       0.022787
 std    2.463930e+07       0.149223
 min    1.520032e+12       0.000000
 25%    1.520057e+12       0.000000
 50%    1.520080e+12       0.000000
 75%    1.520099e+12       0.000000
 max    1.520118e+12       1.000000,
 'distribution de la target':             count  percentage
 normality                    
 0          347927       97.72
 1            8113        2.28,
 'correlations': None}

### Analyse des colonnes catégorielles

In [8]:
def analyse_categorial(df, number_samples=5):
    categorical_cols = df.select_dtypes(include=['object']).columns
    print(f"{len(categorical_cols)} colonnes catégorielles:")
    
    for col in categorical_cols:
        unique_vals = df[col].nunique()
        sample_vals = df[col].unique()[:number_samples]
        print(f"  - {col}: {unique_vals} valeurs uniques | Ex: {sample_vals}")


analyse_categorial(df_with_dorped_duplcate, 8)

11 colonnes catégorielles:
  - sourceID: 84 valeurs uniques | Ex: ['lightcontrol2' 'lightcontrol3' 'lightcontrol1' 'lightcontrol4'
 'movement4' 'tempin2' 'movement2' 'tempin4']
  - sourceAddress: 89 valeurs uniques | Ex: ['/agent2/lightcontrol2' '/agent3/lightcontrol3' '/agent1/lightcontrol1'
 '/agent4/lightcontrol4' '/agent4/movement4' '/agent2/tempin2'
 '/agent2/movement2' '/agent4/tempin4']
  - sourceType: 8 valeurs uniques | Ex: ['/lightControler' '/movementSensor' '/sensorService' '/batteryService'
 '/doorLockService' '/thermostat' '/washingService' '/smartPhone']
  - sourceLocation: 21 valeurs uniques | Ex: ['BedroomParents' 'Dinningroom' 'BedroomChildren' 'Kitchen' 'Garage'
 'Bathroom' 'Livingroom' 'Watterroom']
  - destinationServiceAddress: 85 valeurs uniques | Ex: ['/agent2/lightcontrol2' '/agent3/lightcontrol3' '/agent1/lightcontrol1'
 '/agent4/lightcontrol4' '/agent4/movement4' '/agent2/tempin2'
 '/agent2/movement2' '/agent4/tempin4']
  - destinationServiceType: 8 valeurs u

In [9]:
# la colone SourceId a 82 values unique:
print("Frequency Encoding pour identifiants...")
def source_id_column_encoding(df):
    df_encoded = df.copy()
    sourceID_freq = df_encoded['sourceID'].value_counts(normalize=True)
    df_encoded['sourceID_freq'] = df_encoded['sourceID'].map(sourceID_freq)
    return df_encoded

df_encoded_SourceId = source_id_column_encoding(df)

df_encoded_SourceId

Frequency Encoding pour identifiants...


Unnamed: 0,sourceID,sourceAddress,sourceType,sourceLocation,destinationServiceAddress,destinationServiceType,destinationLocation,accessedNodeAddress,accessedNodeType,operation,value,timestamp,normality,sourceID_freq
0,lightcontrol2,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,registerService,none,1520031600000,0,0.017667
1,lightcontrol3,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,registerService,none,1520031603269,0,0.017676
2,lightcontrol1,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,registerService,none,1520031603279,0,0.023229
3,lightcontrol4,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,registerService,none,1520031603290,0,0.017692
4,movement4,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,registerService,none,1520031603464,0,0.000187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357947,lightcontrol23,/agent23/lightcontrol23,/lightControler,room_4,/agent23/tempin23,/sensorService,room_4,/agent23/tempin23,/sensorService,read,20.3479,1520117997001,0,0.017567
357948,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charge,/basic/number,read,9,1520117997339,0,0.035653
357949,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charging,/basic/text,read,true,1520117997401,0,0.035653
357950,lightcontrol28,/agent28/lightcontrol28,/lightControler,room_9,/agent28/movement28,/movementSensor,room_9,/agent28/movement28/movement,/derived/boolean,read,0,1520117998907,0,0.017547


In [10]:
# la colone accessedNodeAddress a 170 valeurs
print("Frequency Encoding pour identifiants...")
def accessedNodeAddress_column_encoding(df):
    df_encoded = df.copy()

    accessed_freq = df_encoded['accessedNodeAddress'].value_counts(normalize=True)
    df_encoded['accessed_addr_freq'] = df_encoded['accessedNodeAddress'].map(accessed_freq)



    return df_encoded

df_encoded_accessNode = accessedNodeAddress_column_encoding(df_encoded_SourceId)

df_encoded_accessNode

Frequency Encoding pour identifiants...


Unnamed: 0,sourceID,sourceAddress,sourceType,sourceLocation,destinationServiceAddress,destinationServiceType,destinationLocation,accessedNodeAddress,accessedNodeType,operation,value,timestamp,normality,sourceID_freq,accessed_addr_freq
0,lightcontrol2,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,registerService,none,1520031600000,0,0.017667,0.000003
1,lightcontrol3,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,registerService,none,1520031603269,0,0.017676,0.000003
2,lightcontrol1,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,registerService,none,1520031603279,0,0.023229,0.000003
3,lightcontrol4,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,registerService,none,1520031603290,0,0.017692,0.000003
4,movement4,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,registerService,none,1520031603464,0,0.000187,0.000003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357947,lightcontrol23,/agent23/lightcontrol23,/lightControler,room_4,/agent23/tempin23,/sensorService,room_4,/agent23/tempin23,/sensorService,read,20.3479,1520117997001,0,0.017567,0.017896
357948,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charge,/basic/number,read,9,1520117997339,0,0.035653,0.053169
357949,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charging,/basic/text,read,true,1520117997401,0,0.035653,0.024056
357950,lightcontrol28,/agent28/lightcontrol28,/lightControler,room_9,/agent28/movement28,/movementSensor,room_9,/agent28/movement28/movement,/derived/boolean,read,0,1520117998907,0,0.017547,0.011647


In [11]:
# Etablir la relation entre la source et la destination et accessed node
print("Création de features relationnelles...")
def make_relations(df):
    df_with_relations = df.copy()

    df_with_relations['src_dest_same_addr'] = (
        df_with_relations['sourceAddress'] == df_with_relations['destinationServiceAddress']
    ).astype(int)

    df_with_relations["src_dest_same_type"] = (
        df_with_relations['sourceType'] == df_with_relations['destinationServiceType']
    )

    df_with_relations['src_dest_same_loc'] = (
        df_with_relations['sourceLocation'] == df_with_relations['destinationLocation']
    ).astype(int)

    return df_with_relations

df_relations_maded = make_relations(df_encoded_accessNode)

df_relations_maded

Création de features relationnelles...


Unnamed: 0,sourceID,sourceAddress,sourceType,sourceLocation,destinationServiceAddress,destinationServiceType,destinationLocation,accessedNodeAddress,accessedNodeType,operation,value,timestamp,normality,sourceID_freq,accessed_addr_freq,src_dest_same_addr,src_dest_same_type,src_dest_same_loc
0,lightcontrol2,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,registerService,none,1520031600000,0,0.017667,0.000003,1,True,1
1,lightcontrol3,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,registerService,none,1520031603269,0,0.017676,0.000003,1,True,1
2,lightcontrol1,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,registerService,none,1520031603279,0,0.023229,0.000003,1,True,1
3,lightcontrol4,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,registerService,none,1520031603290,0,0.017692,0.000003,1,True,1
4,movement4,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,registerService,none,1520031603464,0,0.000187,0.000003,1,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357947,lightcontrol23,/agent23/lightcontrol23,/lightControler,room_4,/agent23/tempin23,/sensorService,room_4,/agent23/tempin23,/sensorService,read,20.3479,1520117997001,0,0.017567,0.017896,0,False,1
357948,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charge,/basic/number,read,9,1520117997339,0,0.035653,0.053169,0,False,1
357949,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charging,/basic/text,read,true,1520117997401,0,0.035653,0.024056,0,False,1
357950,lightcontrol28,/agent28/lightcontrol28,/lightControler,room_9,/agent28/movement28,/movementSensor,room_9,/agent28/movement28/movement,/derived/boolean,read,0,1520117998907,0,0.017547,0.011647,0,False,1


In [12]:
# separer agent , device, and accessed node

def separate_path_device(df):
    df_separated_path = df.copy()
    # pour sourceAdress et destinationServiceAdress
    features = ['src', 'sourceAddress',],['dest','destinationServiceAddress']
    for label, feature  in features:
        df_separated_path[label+'_agent_number'] = ( #-----------------
            df_separated_path[feature]
            .str.extract(r'/agent(\d+)/')[0]
            .astype(float)
        )
    
        df_separated_path[label+'_device_info'] = (
        df_separated_path[feature]
        .str.replace(r'^/agent\d+/', '', regex=True)
        )
    
    # pour AccessNodeLocation

    df_separated_path['acc_node_type'] = df_separated_path['accessedNodeAddress'].str.extract(r'/agent\d+/([a-zA-Z]+)\d*')


    df_separated_path['acc_node_event'] = df_separated_path['accessedNodeAddress'].str.extract(r'/([a-zA-Z]+)$')

    return df_separated_path

df_path_separated = separate_path_device(df_relations_maded)

df_path_separated


Unnamed: 0,sourceID,sourceAddress,sourceType,sourceLocation,destinationServiceAddress,destinationServiceType,destinationLocation,accessedNodeAddress,accessedNodeType,operation,...,accessed_addr_freq,src_dest_same_addr,src_dest_same_type,src_dest_same_loc,src_agent_number,src_device_info,dest_agent_number,dest_device_info,acc_node_type,acc_node_event
0,lightcontrol2,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,registerService,...,0.000003,1,True,1,2.0,lightcontrol2,2.0,lightcontrol2,lightcontrol,
1,lightcontrol3,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,registerService,...,0.000003,1,True,1,3.0,lightcontrol3,3.0,lightcontrol3,lightcontrol,
2,lightcontrol1,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,registerService,...,0.000003,1,True,1,1.0,lightcontrol1,1.0,lightcontrol1,lightcontrol,
3,lightcontrol4,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,registerService,...,0.000003,1,True,1,4.0,lightcontrol4,4.0,lightcontrol4,lightcontrol,
4,movement4,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,registerService,...,0.000003,1,True,1,4.0,movement4,4.0,movement4,movement,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357947,lightcontrol23,/agent23/lightcontrol23,/lightControler,room_4,/agent23/tempin23,/sensorService,room_4,/agent23/tempin23,/sensorService,read,...,0.017896,0,False,1,23.0,lightcontrol23,23.0,tempin23,tempin,
357948,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charge,/basic/number,read,...,0.053169,0,False,1,11.0,washingmachine2,11.0,battery4,battery,charge
357949,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charging,/basic/text,read,...,0.024056,0,False,1,11.0,washingmachine2,11.0,battery4,battery,charging
357950,lightcontrol28,/agent28/lightcontrol28,/lightControler,room_9,/agent28/movement28,/movementSensor,room_9,/agent28/movement28/movement,/derived/boolean,read,...,0.011647,0,False,1,28.0,lightcontrol28,28.0,movement28,movement,movement


In [13]:
# pour etre uniforme
df_path_separated['acc_node_event_filled'] = df_path_separated['acc_node_event'].fillna(df_path_separated['acc_node_type'])
df_path_separated

Unnamed: 0,sourceID,sourceAddress,sourceType,sourceLocation,destinationServiceAddress,destinationServiceType,destinationLocation,accessedNodeAddress,accessedNodeType,operation,...,src_dest_same_addr,src_dest_same_type,src_dest_same_loc,src_agent_number,src_device_info,dest_agent_number,dest_device_info,acc_node_type,acc_node_event,acc_node_event_filled
0,lightcontrol2,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,registerService,...,1,True,1,2.0,lightcontrol2,2.0,lightcontrol2,lightcontrol,,lightcontrol
1,lightcontrol3,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,registerService,...,1,True,1,3.0,lightcontrol3,3.0,lightcontrol3,lightcontrol,,lightcontrol
2,lightcontrol1,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,registerService,...,1,True,1,1.0,lightcontrol1,1.0,lightcontrol1,lightcontrol,,lightcontrol
3,lightcontrol4,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,registerService,...,1,True,1,4.0,lightcontrol4,4.0,lightcontrol4,lightcontrol,,lightcontrol
4,movement4,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,registerService,...,1,True,1,4.0,movement4,4.0,movement4,movement,,movement
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357947,lightcontrol23,/agent23/lightcontrol23,/lightControler,room_4,/agent23/tempin23,/sensorService,room_4,/agent23/tempin23,/sensorService,read,...,0,False,1,23.0,lightcontrol23,23.0,tempin23,tempin,,tempin
357948,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charge,/basic/number,read,...,0,False,1,11.0,washingmachine2,11.0,battery4,battery,charge,charge
357949,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charging,/basic/text,read,...,0,False,1,11.0,washingmachine2,11.0,battery4,battery,charging,charging
357950,lightcontrol28,/agent28/lightcontrol28,/lightControler,room_9,/agent28/movement28,/movementSensor,room_9,/agent28/movement28/movement,/derived/boolean,read,...,0,False,1,28.0,lightcontrol28,28.0,movement28,movement,movement,movement


In [14]:
# Group by acc_node_event_filled and list all unique values in 'value' column
event_values = df_path_separated.groupby('acc_node_event_filled')['value'].unique()

# Display nicely
for event, values in event_values.items():
    print(f"=== {event} ===")
    print(values)

=== battery ===
['none']
=== charge ===
['10' '9' 'org.ds2os.vsl.core.utils.AddressParameters@5c494e7d'
 'org.ds2os.vsl.core.utils.AddressParameters@4ff97c45'
 'org.ds2os.vsl.core.utils.AddressParameters@3ebb34ed' '8'
 'org.ds2os.vsl.core.utils.AddressParameters@2c82b391'
 'org.ds2os.vsl.core.utils.AddressParameters@61ba9b2b'
 'org.ds2os.vsl.core.utils.AddressParameters@715cced' '7' '6' '5' '4' '3'
 '2' '1' '0' nan '19.881']
=== charging ===
['false' 'true' nan]
=== doorlock ===
['none']
=== heatingOn ===
['0']
=== heatingcontrol ===
['none' '1' '0']
=== lastChange ===
['1.52121675521E12' '1.521216758314E12' '1.521216770002E12'
 '1.521216782996E12' '1.521216841986E12' '1.521216899605E12'
 '1.521216902949E12' '1.521216905597E12' '1.521216915966E12'
 '1.521216923287E12' '1.521216947917E12' '1.52121694927E12'
 '1.521216969706E12' '1.521217011952E12' '1.521217013402E12'
 '1.521217017856E12' '1.521217019507E12' '1.521217028092E12'
 '1.52121703437E12' '1.521217043809E12' '1.521217059081E12'


In [15]:
from collections import defaultdict

mapping_first = defaultdict(dict)

for event, group in df_path_separated.groupby('acc_node_event_filled'):
    for _, row in group.iterrows():
        val = row['value']
        if val not in mapping_first[event]:  # only keep the first occurrence
            mapping_first[event][val] = row['accessedNodeAddress']

mapping_first


defaultdict(dict,
            {'battery': {'none': '/agent5/battery2'},
             'charge': {'10': '/agent11/battery4/charge',
              '9': '/agent11/battery4/charge',
              'org.ds2os.vsl.core.utils.AddressParameters@5c494e7d': '/agent11/battery4/charge',
              'org.ds2os.vsl.core.utils.AddressParameters@4ff97c45': '/agent5/battery1/charge',
              'org.ds2os.vsl.core.utils.AddressParameters@3ebb34ed': '/agent12/battery5/charge',
              '8': '/agent11/battery4/charge',
              'org.ds2os.vsl.core.utils.AddressParameters@2c82b391': '/agent12/battery6/charge',
              'org.ds2os.vsl.core.utils.AddressParameters@61ba9b2b': '/agent4/battery3/charge',
              'org.ds2os.vsl.core.utils.AddressParameters@715cced': '/agent5/battery2/charge',
              '7': '/agent11/battery4/charge',
              '6': '/agent11/battery4/charge',
              '5': '/agent11/battery4/charge',
              '4': '/agent11/battery4/charge',
          

In [16]:
mapping_first["charge"]

{'10': '/agent11/battery4/charge',
 '9': '/agent11/battery4/charge',
 'org.ds2os.vsl.core.utils.AddressParameters@5c494e7d': '/agent11/battery4/charge',
 'org.ds2os.vsl.core.utils.AddressParameters@4ff97c45': '/agent5/battery1/charge',
 'org.ds2os.vsl.core.utils.AddressParameters@3ebb34ed': '/agent12/battery5/charge',
 '8': '/agent11/battery4/charge',
 'org.ds2os.vsl.core.utils.AddressParameters@2c82b391': '/agent12/battery6/charge',
 'org.ds2os.vsl.core.utils.AddressParameters@61ba9b2b': '/agent4/battery3/charge',
 'org.ds2os.vsl.core.utils.AddressParameters@715cced': '/agent5/battery2/charge',
 '7': '/agent11/battery4/charge',
 '6': '/agent11/battery4/charge',
 '5': '/agent11/battery4/charge',
 '4': '/agent11/battery4/charge',
 '3': '/agent11/battery4/charge',
 '2': '/agent11/battery4/charge',
 '1': '/agent11/battery4/charge',
 '0': '/agent11/battery4/charge',
 nan: '/agent11/battery4/charge',
 '19.881': '/agent12/battery5/charge'}

In [17]:
print(mapping_first["charging"])
print(mapping_first["doorlock"])
print(mapping_first["heatingOn"])
print(mapping_first["heatingcontrol"])
print(mapping_first["lightOn"])
print(mapping_first["lightcontrol"])
print(mapping_first["movement"])

{'false': '/agent11/battery4/charging', 'true': '/agent11/battery4/charging', nan: '/agent5/battery2/charging'}
{'none': '/agent3/doorlock1'}
{'0': '/agent3/heatingcontrol1/heatingOn'}
{'none': '/agent11/heatingcontrol2', '1': '/agent25/heatingcontrol4', '0': '/agent25/heatingcontrol4'}
{'1': '/agent2/lightcontrol2/lightOn', '0': '/agent5/lightcontrol5/lightOn'}
{'none': '/agent2/lightcontrol2'}
{'none': '/agent4/movement4', '1': '/agent3/movement3/movement', '0': '/agent1/movement1/movement'}


In [18]:
# la colone value a 10,623 valeurs unique
print("Traitement de 'value' (mixte numérique/textuel)...")
def value_colmn_encoding(df):  
    df_encoded = df.copy()
    df_encoded['value_numeric'] = pd.to_numeric(
        df_encoded['value'],
        errors='coerce'
    )
    df_encoded['value_is_none'] = (df_encoded['value'] == 'none').astype(int)
    text_values = df_encoded.loc[
        df_encoded['value_numeric'].isna(),
        'value'
    ]
    value_counts = text_values.value_counts()
    common_text_values = value_counts[value_counts > 100].index.tolist()
    def categorize_value(row):
        if not pd.isna(row['value_numeric']):
            return 'numeric'
        elif row['value'] == 'none':
            return 'none'
        elif row['value'] in common_text_values:
            return row['value']
        else:
            return 'other'
    df_encoded['value_category'] = df_encoded.apply(
        categorize_value,
        axis=1
    )

    return df_encoded

df_encoded_value = value_colmn_encoding(df_path_separated)

df_encoded_value


Traitement de 'value' (mixte numérique/textuel)...


Unnamed: 0,sourceID,sourceAddress,sourceType,sourceLocation,destinationServiceAddress,destinationServiceType,destinationLocation,accessedNodeAddress,accessedNodeType,operation,...,src_agent_number,src_device_info,dest_agent_number,dest_device_info,acc_node_type,acc_node_event,acc_node_event_filled,value_numeric,value_is_none,value_category
0,lightcontrol2,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,registerService,...,2.0,lightcontrol2,2.0,lightcontrol2,lightcontrol,,lightcontrol,,1,none
1,lightcontrol3,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,registerService,...,3.0,lightcontrol3,3.0,lightcontrol3,lightcontrol,,lightcontrol,,1,none
2,lightcontrol1,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,registerService,...,1.0,lightcontrol1,1.0,lightcontrol1,lightcontrol,,lightcontrol,,1,none
3,lightcontrol4,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,registerService,...,4.0,lightcontrol4,4.0,lightcontrol4,lightcontrol,,lightcontrol,,1,none
4,movement4,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,registerService,...,4.0,movement4,4.0,movement4,movement,,movement,,1,none
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357947,lightcontrol23,/agent23/lightcontrol23,/lightControler,room_4,/agent23/tempin23,/sensorService,room_4,/agent23/tempin23,/sensorService,read,...,23.0,lightcontrol23,23.0,tempin23,tempin,,tempin,20.3479,0,numeric
357948,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charge,/basic/number,read,...,11.0,washingmachine2,11.0,battery4,battery,charge,charge,9.0000,0,numeric
357949,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charging,/basic/text,read,...,11.0,washingmachine2,11.0,battery4,battery,charging,charging,,0,true
357950,lightcontrol28,/agent28/lightcontrol28,/lightControler,room_9,/agent28/movement28,/movementSensor,room_9,/agent28/movement28/movement,/derived/boolean,read,...,28.0,lightcontrol28,28.0,movement28,movement,movement,movement,0.0000,0,numeric


In [19]:
# separeation de colone value (colone heterogene), a des colone qui ont les meme mesure physique
def create_event_specific_columns(df):
    df_new = df.copy()
    events = df_new['acc_node_event_filled'].unique()
    
    for event in events:
        
        numeric_mask = df_new['acc_node_event_filled'] == event
        if df_new.loc[numeric_mask, 'value_numeric'].notna().any():
            df_new[f'value_{event}'] = np.where(
                df_new['acc_node_event_filled'] == event,
                df_new['value_numeric'],
                np.nan
            )
        else:
            
            df_new[f'value_{event}'] = np.where(
                df_new['acc_node_event_filled'] == event,
                df_new['value_category'],
                np.nan
            )
    return df_new

df_separate_value_cols = create_event_specific_columns(df_encoded_value)

df_separate_value_cols

Unnamed: 0,sourceID,sourceAddress,sourceType,sourceLocation,destinationServiceAddress,destinationServiceType,destinationLocation,accessedNodeAddress,accessedNodeType,operation,...,value_battery,value_doorlock,value_heatingcontrol,value_washingmachine,value_open,value_charge,value_charging,value_questioningservice,value_washing,value_heatingOn
0,lightcontrol2,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,registerService,...,,,,,,,,,,
1,lightcontrol3,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,registerService,...,,,,,,,,,,
2,lightcontrol1,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,registerService,...,,,,,,,,,,
3,lightcontrol4,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,registerService,...,,,,,,,,,,
4,movement4,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,registerService,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357947,lightcontrol23,/agent23/lightcontrol23,/lightControler,room_4,/agent23/tempin23,/sensorService,room_4,/agent23/tempin23,/sensorService,read,...,,,,,,,,,,
357948,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charge,/basic/number,read,...,,,,,,9.0,,,,
357949,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charging,/basic/text,read,...,,,,,,,true,,,
357950,lightcontrol28,/agent28/lightcontrol28,/lightControler,room_9,/agent28/movement28,/movementSensor,room_9,/agent28/movement28/movement,/derived/boolean,read,...,,,,,,,,,,


In [20]:
cols=['value_washingmachine', 'value_washing',
       'value_questioningservice', 'value_lightcontrol','value_doorlock', 'value_battery' ]
df_encoded = df_separate_value_cols.drop(columns=cols)

df_encoded

Unnamed: 0,sourceID,sourceAddress,sourceType,sourceLocation,destinationServiceAddress,destinationServiceType,destinationLocation,accessedNodeAddress,accessedNodeType,operation,...,value_category,value_movement,value_tempin,value_lastChange,value_lightOn,value_heatingcontrol,value_open,value_charge,value_charging,value_heatingOn
0,lightcontrol2,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,registerService,...,none,,,,,,,,,
1,lightcontrol3,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,registerService,...,none,,,,,,,,,
2,lightcontrol1,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,registerService,...,none,,,,,,,,,
3,lightcontrol4,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,registerService,...,none,,,,,,,,,
4,movement4,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,registerService,...,none,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357947,lightcontrol23,/agent23/lightcontrol23,/lightControler,room_4,/agent23/tempin23,/sensorService,room_4,/agent23/tempin23,/sensorService,read,...,numeric,,20.3479,,,,,,,
357948,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charge,/basic/number,read,...,numeric,,,,,,,9.0,,
357949,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charging,/basic/text,read,...,true,,,,,,,,true,
357950,lightcontrol28,/agent28/lightcontrol28,/lightControler,room_9,/agent28/movement28,/movementSensor,room_9,/agent28/movement28/movement,/derived/boolean,read,...,numeric,0.0,,,,,,,,


In [21]:
analyse_categorial(df_encoded, 12)

19 colonnes catégorielles:
  - sourceID: 84 valeurs uniques | Ex: ['lightcontrol2' 'lightcontrol3' 'lightcontrol1' 'lightcontrol4'
 'movement4' 'tempin2' 'movement2' 'tempin4' 'tempin1' 'movement1'
 'movement3' 'tempin3']
  - sourceAddress: 89 valeurs uniques | Ex: ['/agent2/lightcontrol2' '/agent3/lightcontrol3' '/agent1/lightcontrol1'
 '/agent4/lightcontrol4' '/agent4/movement4' '/agent2/tempin2'
 '/agent2/movement2' '/agent4/tempin4' '/agent1/tempin1'
 '/agent1/movement1' '/agent3/movement3' '/agent3/tempin3']
  - sourceType: 8 valeurs uniques | Ex: ['/lightControler' '/movementSensor' '/sensorService' '/batteryService'
 '/doorLockService' '/thermostat' '/washingService' '/smartPhone']
  - sourceLocation: 21 valeurs uniques | Ex: ['BedroomParents' 'Dinningroom' 'BedroomChildren' 'Kitchen' 'Garage'
 'Bathroom' 'Livingroom' 'Watterroom' 'Bedroom' 'Entrance' 'Showerroom'
 'room_1']
  - destinationServiceAddress: 85 valeurs uniques | Ex: ['/agent2/lightcontrol2' '/agent3/lightcontrol3' 

In [22]:
def find_duplicate_columns_by_values(df):
    duplicates = []

    cols = df.columns
    for i in range(len(cols)):
        for j in range(i + 1, len(cols)):
            c1, c2 = cols[i], cols[j]

            # comparer en ignorant les NaN
            if df[c1].equals(df[c2]):
                duplicates.append((c1, c2))

    return duplicates

duplicat = find_duplicate_columns_by_values(df_encoded)

duplicat


[('sourceID', 'src_device_info')]

In [23]:
# encoder les colone de haut cardinalite  src_device_info(84 value), dest_device_info(84 value)  acc_node_info(169 value)

def encoder_haut_cardinality_colonne(df):
    df_encoded = df.copy()

    list_colone = ['src_device_info', 'dest_device_info', 'acc_node_info']

    for colone in list_colone:
        if colone in df_encoded.columns:
            # utilisation frequency encoding
            device_freq = df_encoded[colone].value_counts(normalize=True) # question : Why normalize to True
            df_encoded[f'{colone}_freq'] = df_encoded[colone].map(device_freq)

    return df_encoded


df_encoded_haut_cardns = encoder_haut_cardinality_colonne(df_encoded)

df_encoded_haut_cardns
            

Unnamed: 0,sourceID,sourceAddress,sourceType,sourceLocation,destinationServiceAddress,destinationServiceType,destinationLocation,accessedNodeAddress,accessedNodeType,operation,...,value_tempin,value_lastChange,value_lightOn,value_heatingcontrol,value_open,value_charge,value_charging,value_heatingOn,src_device_info_freq,dest_device_info_freq
0,lightcontrol2,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,registerService,...,,,,,,,,,0.017667,0.000134
1,lightcontrol3,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,registerService,...,,,,,,,,,0.017676,0.000145
2,lightcontrol1,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,registerService,...,,,,,,,,,0.023229,0.000112
3,lightcontrol4,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,registerService,...,,,,,,,,,0.017692,0.000156
4,movement4,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,registerService,...,,,,,,,,,0.000187,0.011798
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357947,lightcontrol23,/agent23/lightcontrol23,/lightControler,room_4,/agent23/tempin23,/sensorService,room_4,/agent23/tempin23,/sensorService,read,...,20.3479,,,,,,,,0.017567,0.017896
357948,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charge,/basic/number,read,...,,,,,,9.0,,,0.035653,0.077228
357949,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charging,/basic/text,read,...,,,,,,,true,,0.035653,0.077228
357950,lightcontrol28,/agent28/lightcontrol28,/lightControler,room_9,/agent28/movement28,/movementSensor,room_9,/agent28/movement28/movement,/derived/boolean,read,...,,,,,,,,,0.017547,0.011705


In [24]:
analyse_categorial(df_encoded_haut_cardns, 21)

19 colonnes catégorielles:
  - sourceID: 84 valeurs uniques | Ex: ['lightcontrol2' 'lightcontrol3' 'lightcontrol1' 'lightcontrol4'
 'movement4' 'tempin2' 'movement2' 'tempin4' 'tempin1' 'movement1'
 'movement3' 'tempin3' 'movement5' 'lightcontrol5' 'tempin5' 'movement6'
 'lightcontrol6' 'movement10' 'tempin6' 'movement11' 'lightcontrol10']
  - sourceAddress: 89 valeurs uniques | Ex: ['/agent2/lightcontrol2' '/agent3/lightcontrol3' '/agent1/lightcontrol1'
 '/agent4/lightcontrol4' '/agent4/movement4' '/agent2/tempin2'
 '/agent2/movement2' '/agent4/tempin4' '/agent1/tempin1'
 '/agent1/movement1' '/agent3/movement3' '/agent3/tempin3'
 '/agent5/movement5' '/agent5/lightcontrol5' '/agent5/tempin5'
 '/agent6/movement6' '/agent6/lightcontrol6' '/agent10/movement10'
 '/agent6/tempin6' '/agent11/movement11' '/agent10/lightcontrol10']
  - sourceType: 8 valeurs uniques | Ex: ['/lightControler' '/movementSensor' '/sensorService' '/batteryService'
 '/doorLockService' '/thermostat' '/washingService' 

In [25]:
# encodage de type des appariels (sourceType, destinationServiceType, accessedNodeType)
def encoder_type_appariels(df):

    df_encoded = df.copy()


    device_cols = ['sourceType', 'destinationServiceType', 'accessedNodeType']
    
    device_risk = {
        '/lightControler': 1,
        '/movementSensor': 2,
        '/sensorService': 1,
        '/batteryService': 1,
        '/doorLockService': 3,
        '/thermostat': 2,
        '/washingService': 1,
        '/smartPhone': 3,
        '/derived/boolean': 1,
        '/basic/number': 1,
        '/basic/text': 1,
        '/basic/composed': 1
    }

    for colone in device_cols:
        if colone in df_encoded.columns:
            df_encoded[f'{colone}_risk'] = df_encoded[colone].map(device_risk).fillna(1)
            # Top 3 One-Hot
            top3 = df_encoded[colone].value_counts().nlargest(3).index
            for val in top3:
                df_encoded[f'{colone}_is_{str(val).replace("/", "")}'] = (df_encoded[colone] == val).astype(int)
    
    return df_encoded


df_type_appariel_encoded = encoder_type_appariels(df_encoded_haut_cardns)

df_type_appariel_encoded

Unnamed: 0,sourceID,sourceAddress,sourceType,sourceLocation,destinationServiceAddress,destinationServiceType,destinationLocation,accessedNodeAddress,accessedNodeType,operation,...,sourceType_is_sensorService,sourceType_is_batteryService,destinationServiceType_risk,destinationServiceType_is_sensorService,destinationServiceType_is_batteryService,destinationServiceType_is_movementSensor,accessedNodeType_risk,accessedNodeType_is_sensorService,accessedNodeType_is_derivedboolean,accessedNodeType_is_basicnumber
0,lightcontrol2,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,registerService,...,0,0,1,0,0,0,1.0,0,0,0
1,lightcontrol3,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,registerService,...,0,0,1,0,0,0,1.0,0,0,0
2,lightcontrol1,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,registerService,...,0,0,1,0,0,0,1.0,0,0,0
3,lightcontrol4,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,registerService,...,0,0,1,0,0,0,1.0,0,0,0
4,movement4,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,registerService,...,0,0,2,0,0,1,2.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357947,lightcontrol23,/agent23/lightcontrol23,/lightControler,room_4,/agent23/tempin23,/sensorService,room_4,/agent23/tempin23,/sensorService,read,...,0,0,1,1,0,0,1.0,1,0,0
357948,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charge,/basic/number,read,...,0,0,1,0,1,0,1.0,0,0,1
357949,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charging,/basic/text,read,...,0,0,1,0,1,0,1.0,0,0,0
357950,lightcontrol28,/agent28/lightcontrol28,/lightControler,room_9,/agent28/movement28,/movementSensor,room_9,/agent28/movement28/movement,/derived/boolean,read,...,0,0,2,0,0,1,1.0,0,1,0


In [26]:
# encodge des device locations (sourceLocation, destinationLocation)

def encoder_devices_location(df):

    df_encoded = df.copy()


    location_cols = ['sourceLocation', 'destinationLocation']
    
    location_grouping = {
        # Bedrooms (high privacy)
        'BedroomParents': 3, 'BedroomChildren': 3, 'Bedroom': 3,
        # Living areas
        'Dinningroom': 2, 'Livingroom': 2,
        # Utility rooms
        'Kitchen': 1, 'Garage': 1, 'Watterroom': 1,
        # Bathrooms
        'Bathroom': 3, 'Showerroom': 3,
        # Entry
        'Entrance': 2
    }
    # Generic rooms
    location_grouping.update({f'room_{i}': 2 for i in range(1, 11)})

    for col in location_cols:
        if col in df_encoded.columns:
            df_encoded[f'{col}_privacy_score'] = df_encoded[col].map(location_grouping).fillna(1)
    
    return df_encoded


df_device_location_encoded = encoder_devices_location(df_type_appariel_encoded)

df_device_location_encoded

Unnamed: 0,sourceID,sourceAddress,sourceType,sourceLocation,destinationServiceAddress,destinationServiceType,destinationLocation,accessedNodeAddress,accessedNodeType,operation,...,destinationServiceType_risk,destinationServiceType_is_sensorService,destinationServiceType_is_batteryService,destinationServiceType_is_movementSensor,accessedNodeType_risk,accessedNodeType_is_sensorService,accessedNodeType_is_derivedboolean,accessedNodeType_is_basicnumber,sourceLocation_privacy_score,destinationLocation_privacy_score
0,lightcontrol2,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,registerService,...,1,0,0,0,1.0,0,0,0,3,3
1,lightcontrol3,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,registerService,...,1,0,0,0,1.0,0,0,0,2,2
2,lightcontrol1,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,registerService,...,1,0,0,0,1.0,0,0,0,3,3
3,lightcontrol4,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,registerService,...,1,0,0,0,1.0,0,0,0,1,1
4,movement4,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,registerService,...,2,0,0,1,2.0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357947,lightcontrol23,/agent23/lightcontrol23,/lightControler,room_4,/agent23/tempin23,/sensorService,room_4,/agent23/tempin23,/sensorService,read,...,1,1,0,0,1.0,1,0,0,2,2
357948,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charge,/basic/number,read,...,1,0,1,0,1.0,0,0,1,1,1
357949,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charging,/basic/text,read,...,1,0,1,0,1.0,0,0,0,1,1
357950,lightcontrol28,/agent28/lightcontrol28,/lightControler,room_9,/agent28/movement28,/movementSensor,room_9,/agent28/movement28/movement,/derived/boolean,read,...,2,0,0,1,1.0,0,1,0,2,2


In [27]:
# encodage de colone operation
def encoder_operation_colmn(df):

    df_encoded = df.copy()

    operation = {
        'registerService': 1,
        'subscribe': 2,
        'read': 3,
        'write': 4,
        'lockSubtree': 5
    }
    
    if 'operation' in df_encoded.columns:
        df_encoded['operation_criticality'] = df_encoded['operation'].map(operation).fillna(1)
        # One-Hot simple
        for op in operation.keys():
            df_encoded[f'op_{op}'] = (df_encoded['operation'] == op).astype(int)

    return df_encoded

df_operation_encoded = encoder_operation_colmn(df_device_location_encoded)
df_operation_encoded

Unnamed: 0,sourceID,sourceAddress,sourceType,sourceLocation,destinationServiceAddress,destinationServiceType,destinationLocation,accessedNodeAddress,accessedNodeType,operation,...,accessedNodeType_is_derivedboolean,accessedNodeType_is_basicnumber,sourceLocation_privacy_score,destinationLocation_privacy_score,operation_criticality,op_registerService,op_subscribe,op_read,op_write,op_lockSubtree
0,lightcontrol2,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,registerService,...,0,0,3,3,1,1,0,0,0,0
1,lightcontrol3,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,registerService,...,0,0,2,2,1,1,0,0,0,0
2,lightcontrol1,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,registerService,...,0,0,3,3,1,1,0,0,0,0
3,lightcontrol4,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,registerService,...,0,0,1,1,1,1,0,0,0,0
4,movement4,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,registerService,...,0,0,1,1,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357947,lightcontrol23,/agent23/lightcontrol23,/lightControler,room_4,/agent23/tempin23,/sensorService,room_4,/agent23/tempin23,/sensorService,read,...,0,0,2,2,3,0,0,1,0,0
357948,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charge,/basic/number,read,...,0,1,1,1,3,0,0,1,0,0
357949,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charging,/basic/text,read,...,0,0,1,1,3,0,0,1,0,0
357950,lightcontrol28,/agent28/lightcontrol28,/lightControler,room_9,/agent28/movement28,/movementSensor,room_9,/agent28/movement28/movement,/derived/boolean,read,...,1,0,2,2,3,0,0,1,0,0


In [28]:
value = [
'value_charge',
'value_charging',
'value_heatingOn',
'value_heatingcontrol',
'value_lastChange',
'value_lightOn',
'value_movement',
'value_open',
'value_semantic',
'value_tempin']

len(value)

10

In [29]:
# encoder colone value_categoie
def encoder_value_categorie_colmn(df):
        
    df_encoded = df.copy()


    value = {'none':0, 'numeric':1, 'false':2, 'true':3, 'twenty':4, 'other':5}
    if 'value_category' in df_encoded.columns:
            df_encoded['value_semantic'] = df_encoded['value_category'].map(value).fillna(0)
            df_encoded['val_is_none'] = (df_encoded['value_category']=='none').astype(int)
            df_encoded['val_is_boolean'] = df_encoded['value_category'].isin(['true','false']).astype(int)
            df_encoded['val_is_numeric'] = (df_encoded['value_category']=='numeric').astype(int)

    return df_encoded

df_value_categorie_encoded = encoder_value_categorie_colmn(df_operation_encoded)
df_value_categorie_encoded

Unnamed: 0,sourceID,sourceAddress,sourceType,sourceLocation,destinationServiceAddress,destinationServiceType,destinationLocation,accessedNodeAddress,accessedNodeType,operation,...,operation_criticality,op_registerService,op_subscribe,op_read,op_write,op_lockSubtree,value_semantic,val_is_none,val_is_boolean,val_is_numeric
0,lightcontrol2,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,registerService,...,1,1,0,0,0,0,0,1,0,0
1,lightcontrol3,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,registerService,...,1,1,0,0,0,0,0,1,0,0
2,lightcontrol1,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,registerService,...,1,1,0,0,0,0,0,1,0,0
3,lightcontrol4,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,registerService,...,1,1,0,0,0,0,0,1,0,0
4,movement4,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,registerService,...,1,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357947,lightcontrol23,/agent23/lightcontrol23,/lightControler,room_4,/agent23/tempin23,/sensorService,room_4,/agent23/tempin23,/sensorService,read,...,3,0,0,1,0,0,1,0,0,1
357948,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charge,/basic/number,read,...,3,0,0,1,0,0,1,0,0,1
357949,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charging,/basic/text,read,...,3,0,0,1,0,0,3,0,1,0
357950,lightcontrol28,/agent28/lightcontrol28,/lightControler,room_9,/agent28/movement28,/movementSensor,room_9,/agent28/movement28/movement,/derived/boolean,read,...,3,0,0,1,0,0,1,0,0,1


In [30]:
analyse_categorial(df_value_categorie_encoded)

19 colonnes catégorielles:
  - sourceID: 84 valeurs uniques | Ex: ['lightcontrol2' 'lightcontrol3' 'lightcontrol1' 'lightcontrol4'
 'movement4']
  - sourceAddress: 89 valeurs uniques | Ex: ['/agent2/lightcontrol2' '/agent3/lightcontrol3' '/agent1/lightcontrol1'
 '/agent4/lightcontrol4' '/agent4/movement4']
  - sourceType: 8 valeurs uniques | Ex: ['/lightControler' '/movementSensor' '/sensorService' '/batteryService'
 '/doorLockService']
  - sourceLocation: 21 valeurs uniques | Ex: ['BedroomParents' 'Dinningroom' 'BedroomChildren' 'Kitchen' 'Garage']
  - destinationServiceAddress: 85 valeurs uniques | Ex: ['/agent2/lightcontrol2' '/agent3/lightcontrol3' '/agent1/lightcontrol1'
 '/agent4/lightcontrol4' '/agent4/movement4']
  - destinationServiceType: 8 valeurs uniques | Ex: ['/lightControler' '/movementSensor' '/sensorService' '/batteryService'
 '/doorLockService']
  - destinationLocation: 21 valeurs uniques | Ex: ['BedroomParents' 'Dinningroom' 'BedroomChildren' 'Kitchen' 'Garage']
  - 

In [31]:
exploraty_analysis(df_value_categorie_encoded)

{'Dimention': {'rows': 357952, 'columns': 63},
 'Type de donnees': {'int64': 28, 'object': 19, 'float64': 15, 'bool': 1},
 'valeur manquante':                        count  percentage
 accessedNodeType         148        0.04
 value                   2050        0.57
 acc_node_event        130688       36.51
 value_numeric          42793       11.95
 value_movement        264904       74.01
 value_tempin          227879       63.66
 value_lastChange      357377       99.84
 value_lightOn         356281       99.53
 value_heatingcontrol  357643       99.91
 value_open            356909       99.71
 value_charge          268475       75.00
 value_charging        318416       88.95
 value_heatingOn       357946      100.00,
 'colonees vides': ['value_heatingOn'],
 'doublons': 1912,
 'statistiques numeriques':           timestamp      normality  sourceID_freq  accessed_addr_freq  \
 count  3.579520e+05  357952.000000  357952.000000       357952.000000   
 mean   1.520078e+12       0.027984

In [32]:
# encoding boolean event value_charging, value_open
def encode_boolean_event(df, col):
    state_map = {
        'true': 1,
        'false': 0,
        'other': -1
    }

    df[col + '_present'] = df[col].notna().astype(int)

    df[col + '_state'] = (
        df[col]
        .map(state_map)
        .fillna(0) 
        .astype(int)
    )

boolean_event = ['value_charging', 'value_open']

for event in boolean_event:
    encode_boolean_event(df_value_categorie_encoded, event)

df_value_categorie_encoded

Unnamed: 0,sourceID,sourceAddress,sourceType,sourceLocation,destinationServiceAddress,destinationServiceType,destinationLocation,accessedNodeAddress,accessedNodeType,operation,...,op_write,op_lockSubtree,value_semantic,val_is_none,val_is_boolean,val_is_numeric,value_charging_present,value_charging_state,value_open_present,value_open_state
0,lightcontrol2,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,registerService,...,0,0,0,1,0,0,0,0,0,0
1,lightcontrol3,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,registerService,...,0,0,0,1,0,0,0,0,0,0
2,lightcontrol1,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,registerService,...,0,0,0,1,0,0,0,0,0,0
3,lightcontrol4,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,registerService,...,0,0,0,1,0,0,0,0,0,0
4,movement4,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,registerService,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357947,lightcontrol23,/agent23/lightcontrol23,/lightControler,room_4,/agent23/tempin23,/sensorService,room_4,/agent23/tempin23,/sensorService,read,...,0,0,1,0,0,1,0,0,0,0
357948,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charge,/basic/number,read,...,0,0,1,0,0,1,0,0,0,0
357949,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charging,/basic/text,read,...,0,0,3,0,1,0,1,1,0,0
357950,lightcontrol28,/agent28/lightcontrol28,/lightControler,room_9,/agent28/movement28,/movementSensor,room_9,/agent28/movement28/movement,/derived/boolean,read,...,0,0,1,0,0,1,0,0,0,0


In [33]:
# encoder les colone src_agent_number, dest_agent_number (frequency encoding)
def encoder_agent_cols(df):
    df_encoded = df.copy()

    freq_src = df_encoded['src_agent_number'].value_counts(normalize=True)
    df_encoded['src_agent_number_freq'] = df_encoded['src_agent_number'].map(freq_src)
    

    freq_dest = df_encoded['dest_agent_number'].value_counts(normalize=True)
    df_encoded['dest_agent_number_freq'] = df_encoded['dest_agent_number'].map(freq_dest)

    return df_encoded


df_agent_encoded = encoder_agent_cols(df_value_categorie_encoded)

df_agent_encoded


Unnamed: 0,sourceID,sourceAddress,sourceType,sourceLocation,destinationServiceAddress,destinationServiceType,destinationLocation,accessedNodeAddress,accessedNodeType,operation,...,value_semantic,val_is_none,val_is_boolean,val_is_numeric,value_charging_present,value_charging_state,value_open_present,value_open_state,src_agent_number_freq,dest_agent_number_freq
0,lightcontrol2,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,registerService,...,0,1,0,0,0,0,0,0,0.030580,0.030180
1,lightcontrol3,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,registerService,...,0,1,0,0,0,0,0,0,0.037519,0.030060
2,lightcontrol1,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,registerService,...,0,1,0,0,0,0,0,0,0.035860,0.036047
3,lightcontrol4,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,registerService,...,0,1,0,0,0,0,0,0,0.052574,0.054303
4,movement4,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,registerService,...,0,1,0,0,0,0,0,0,0.052574,0.054303
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357947,lightcontrol23,/agent23/lightcontrol23,/lightControler,room_4,/agent23/tempin23,/sensorService,room_4,/agent23/tempin23,/sensorService,read,...,1,0,0,1,0,0,0,0,0.029535,0.029915
357948,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charge,/basic/number,read,...,1,0,0,1,0,0,0,0,0.107593,0.107626
357949,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charging,/basic/text,read,...,3,0,1,0,1,1,0,0,0.107593,0.107626
357950,lightcontrol28,/agent28/lightcontrol28,/lightControler,room_9,/agent28/movement28,/movementSensor,room_9,/agent28/movement28/movement,/derived/boolean,read,...,1,0,0,1,0,0,0,0,0.030038,0.030292


In [34]:
# colonnes avec presence binaire
value_cols_is_present = [
    'value_movement',
    'value_lastChange',
    'value_lightOn',
    'value_heatingcontrol',
    'value_open',
    'value_charging',
    'acc_node_event'
]

# colonnes mesures numériques
value_col_mesure = [
    'value_tempin',
    'value_charge'
]


# boucle pour les colonnes avec présence
for col in value_cols_is_present:
    df_agent_encoded[col + '_present'] = df_agent_encoded[col].notna().astype(int)
    df_agent_encoded[col] = df_agent_encoded[col].fillna(0)

# boucle pour colonnes mesures
for col in value_col_mesure:
    df_agent_encoded[col] = df_agent_encoded[col].fillna(0)


In [35]:
exploraty_analysis(df_agent_encoded, "normality")

{'Dimention': {'rows': 357952, 'columns': 74},
 'Type de donnees': {'int64': 37, 'object': 19, 'float64': 17, 'bool': 1},
 'valeur manquante':                    count  percentage
 accessedNodeType     148        0.04
 value               2050        0.57
 value_numeric      42793       11.95
 value_heatingOn   357946      100.00,
 'colonees vides': ['value_heatingOn'],
 'doublons': 1912,
 'statistiques numeriques':           timestamp      normality  sourceID_freq  accessed_addr_freq  \
 count  3.579520e+05  357952.000000  357952.000000       357952.000000   
 mean   1.520078e+12       0.027984       0.024778            0.023599   
 std    2.465664e+07       0.164928       0.013482            0.013977   
 min    1.520032e+12       0.000000       0.000011            0.000003   
 25%    1.520057e+12       0.000000       0.012907            0.012683   
 50%    1.520080e+12       0.000000       0.017667            0.017896   
 75%    1.520099e+12       0.000000       0.040466            0

In [36]:
for i in df_agent_encoded["value_lastChange"]:
    if i > 5000:
        print(i)

1521216755210.0
1521216758314.0
1521216770002.0
1521216782996.0
1521216841986.0
1521216899605.0
1521216902949.0
1521216905597.0
1521216915966.0
1521216923287.0
1521216947917.0
1521216949270.0
1521216969706.0
1521217011952.0
1521217013402.0
1521217017856.0
1521217019507.0
1521217028092.0
1521217034370.0
1521217043809.0
1521217059081.0
1521217059960.0
1521217092117.0
1521217126647.0
1521217145962.0
1521217156513.0
1521217169935.0
1521217204666.0
1521217222784.0
1521217225065.0
1521217255066.0
1521217274587.0
1521217275607.0
1521217290082.0
1521217316165.0
1521217347170.0
1521217364810.0
1521217365495.0
1521217369194.0
1521217370222.0
1521217387666.0
1521217396715.0
1521217418156.0
1521217428289.0
1521217434671.0
1521217446778.0
1521217447122.0
1521217458093.0
1521217467845.0
1521217475751.0
1521217476898.0
1521217508246.0
1521217544199.0
1521217588075.0
1521217594371.0
1521217627977.0
1521217636313.0
1521217638859.0
1521217640291.0
1521217641268.0
1521217655257.0
1521217700140.0
15212177

In [37]:
# Traitement de colone timestamp :
def traite_timestamp_colone(df):
    df_traited = df.copy()

    df_traited['event_time'] = pd.to_datetime(df_traited['timestamp'], unit='ms')

    df_traited['year'] = df_traited['event_time'].dt.year
    df_traited['month'] = df_traited['event_time'].dt.month
    df_traited['day'] = df_traited['event_time'].dt.day
    df_traited['hour'] = df_traited['event_time'].dt.hour
    df_traited['minute'] = df_traited['event_time'].dt.minute
    df_traited['second'] = df_traited['event_time'].dt.second


    return df_traited


df_timestamp_processed = traite_timestamp_colone(df_agent_encoded)

df_timestamp_processed
    


Unnamed: 0,sourceID,sourceAddress,sourceType,sourceLocation,destinationServiceAddress,destinationServiceType,destinationLocation,accessedNodeAddress,accessedNodeType,operation,...,value_lightOn_present,value_heatingcontrol_present,acc_node_event_present,event_time,year,month,day,hour,minute,second
0,lightcontrol2,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,registerService,...,0,0,0,2018-03-02 23:00:00.000,2018,3,2,23,0,0
1,lightcontrol3,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,registerService,...,0,0,0,2018-03-02 23:00:03.269,2018,3,2,23,0,3
2,lightcontrol1,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,registerService,...,0,0,0,2018-03-02 23:00:03.279,2018,3,2,23,0,3
3,lightcontrol4,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,registerService,...,0,0,0,2018-03-02 23:00:03.290,2018,3,2,23,0,3
4,movement4,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,registerService,...,0,0,0,2018-03-02 23:00:03.464,2018,3,2,23,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357947,lightcontrol23,/agent23/lightcontrol23,/lightControler,room_4,/agent23/tempin23,/sensorService,room_4,/agent23/tempin23,/sensorService,read,...,0,0,0,2018-03-03 22:59:57.001,2018,3,3,22,59,57
357948,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charge,/basic/number,read,...,0,0,1,2018-03-03 22:59:57.339,2018,3,3,22,59,57
357949,washingmachine2,/agent11/washingmachine2,/washingService,Watterroom,/agent11/battery4,/batteryService,Watterroom,/agent11/battery4/charging,/basic/text,read,...,0,0,1,2018-03-03 22:59:57.401,2018,3,3,22,59,57
357950,lightcontrol28,/agent28/lightcontrol28,/lightControler,room_9,/agent28/movement28,/movementSensor,room_9,/agent28/movement28/movement,/derived/boolean,read,...,0,0,1,2018-03-03 22:59:58.907,2018,3,3,22,59,58


In [38]:
# utilisation de sin , cos pour leur (eviter que 0 tres loin de 23)
df_timestamp_processed['hour_sin'] = np.sin(2 * np.pi * df_timestamp_processed['hour'] / 24)
df_timestamp_processed['hour_cos'] = np.cos(2 * np.pi * df_timestamp_processed['hour'] / 24)

# creation des relation de timestamp(dela time)
df_timestamp_processed = df_timestamp_processed.sort_values(['sourceID', 'event_time'])

df_timestamp_processed['delta_time'] = (
    df_timestamp_processed.groupby('sourceID')['event_time']
      .diff()
      .dt.total_seconds()
      .fillna(0)
)

# creation des frequence per minute
df_timestamp_processed['events_per_min'] = (
    df_timestamp_processed.groupby(['sourceID', 'minute'])['timestamp']
      .transform('count')
)

# creation des flags
df_timestamp_processed['is_night'] = df_timestamp_processed['hour'].between(0, 5).astype(int)
df_timestamp_processed['is_work_hours'] = df_timestamp_processed['hour'].between(8, 18).astype(int)

In [39]:
df_timestamp_processed

Unnamed: 0,sourceID,sourceAddress,sourceType,sourceLocation,destinationServiceAddress,destinationServiceType,destinationLocation,accessedNodeAddress,accessedNodeType,operation,...,day,hour,minute,second,hour_sin,hour_cos,delta_time,events_per_min,is_night,is_work_hours
243,battery1,/agent5/battery1,/batteryService,Garage,/agent5/battery1,/batteryService,Garage,/agent5/battery1,/batteryService,registerService,...,2,23,6,43,-0.258819,0.965926,0.000,232,0,0
365,battery1,/agent5/battery1,/batteryService,Garage,/agent5/battery1,/batteryService,Garage,/agent5/battery1/charge,/basic/number,read,...,2,23,8,0,-0.258819,0.965926,76.434,248,0,0
391,battery1,/agent5/battery1,/batteryService,Garage,/agent5/battery1,/batteryService,Garage,/agent5/battery1/charging,/basic/text,read,...,2,23,8,11,-0.258819,0.965926,11.613,248,0,0
403,battery1,/agent5/battery1,/batteryService,Garage,/agent5/battery1,/batteryService,Garage,/agent5/battery1/charge,/basic/number,read,...,2,23,8,16,-0.258819,0.965926,4.766,248,0,0
404,battery1,/agent5/battery1,/batteryService,Garage,/agent5/battery1,/batteryService,Garage,/agent5/battery1/charge,/basic/number,subscribe,...,2,23,8,16,-0.258819,0.965926,0.133,248,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357794,washingmachine3,/agent14/washingmachine3,/washingService,Showerroom,/agent12/battery6,/batteryService,Entrance,/agent12/battery6/charge,/basic/number,read,...,3,22,59,19,-0.500000,0.866025,0.144,272,0,0
357877,washingmachine3,/agent14/washingmachine3,/washingService,Showerroom,/agent12/battery6,/batteryService,Entrance,/agent12/battery6/charging,/basic/text,read,...,3,22,59,40,-0.500000,0.866025,20.633,272,0,0
357884,washingmachine3,/agent14/washingmachine3,/washingService,Showerroom,/agent12/battery6,/batteryService,Entrance,/agent12/battery6/charge,/basic/number,read,...,3,22,59,40,-0.500000,0.866025,0.318,272,0,0
357885,washingmachine3,/agent14/washingmachine3,/washingService,Showerroom,/agent12/battery6,/batteryService,Entrance,/agent12/battery6/charge,/basic/number,write,...,3,22,59,40,-0.500000,0.866025,0.030,272,0,0


In [40]:
df_timestamp_processed.value_lastChange_present

243       0
365       0
391       0
403       0
404       0
         ..
357794    0
357877    0
357884    0
357885    0
357886    0
Name: value_lastChange_present, Length: 357952, dtype: int64

In [41]:
df_timestamp_processed['value_lastChange_dt'] = pd.to_datetime(
    df_timestamp_processed['value_lastChange'].replace(0, pd.NA),
    unit='ms'
)

# Calcul du delta (NaT --> NaN)
df_timestamp_processed['delta_lastChange_s'] = (
    df_timestamp_processed['event_time'] - df_timestamp_processed['value_lastChange_dt']
).dt.total_seconds()

# Cas où lastChange n’existe pas --> delta = 0
df_timestamp_processed.loc[
    (df_timestamp_processed['value_lastChange'] == 0) & (df_timestamp_processed['value_lastChange_present'] == 0),
    'delta_lastChange_s'
] = 0

In [42]:
df_timestamp_processed

Unnamed: 0,sourceID,sourceAddress,sourceType,sourceLocation,destinationServiceAddress,destinationServiceType,destinationLocation,accessedNodeAddress,accessedNodeType,operation,...,minute,second,hour_sin,hour_cos,delta_time,events_per_min,is_night,is_work_hours,value_lastChange_dt,delta_lastChange_s
243,battery1,/agent5/battery1,/batteryService,Garage,/agent5/battery1,/batteryService,Garage,/agent5/battery1,/batteryService,registerService,...,6,43,-0.258819,0.965926,0.000,232,0,0,NaT,0.0
365,battery1,/agent5/battery1,/batteryService,Garage,/agent5/battery1,/batteryService,Garage,/agent5/battery1/charge,/basic/number,read,...,8,0,-0.258819,0.965926,76.434,248,0,0,NaT,0.0
391,battery1,/agent5/battery1,/batteryService,Garage,/agent5/battery1,/batteryService,Garage,/agent5/battery1/charging,/basic/text,read,...,8,11,-0.258819,0.965926,11.613,248,0,0,NaT,0.0
403,battery1,/agent5/battery1,/batteryService,Garage,/agent5/battery1,/batteryService,Garage,/agent5/battery1/charge,/basic/number,read,...,8,16,-0.258819,0.965926,4.766,248,0,0,NaT,0.0
404,battery1,/agent5/battery1,/batteryService,Garage,/agent5/battery1,/batteryService,Garage,/agent5/battery1/charge,/basic/number,subscribe,...,8,16,-0.258819,0.965926,0.133,248,0,0,NaT,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357794,washingmachine3,/agent14/washingmachine3,/washingService,Showerroom,/agent12/battery6,/batteryService,Entrance,/agent12/battery6/charge,/basic/number,read,...,59,19,-0.500000,0.866025,0.144,272,0,0,NaT,0.0
357877,washingmachine3,/agent14/washingmachine3,/washingService,Showerroom,/agent12/battery6,/batteryService,Entrance,/agent12/battery6/charging,/basic/text,read,...,59,40,-0.500000,0.866025,20.633,272,0,0,NaT,0.0
357884,washingmachine3,/agent14/washingmachine3,/washingService,Showerroom,/agent12/battery6,/batteryService,Entrance,/agent12/battery6/charge,/basic/number,read,...,59,40,-0.500000,0.866025,0.318,272,0,0,NaT,0.0
357885,washingmachine3,/agent14/washingmachine3,/washingService,Showerroom,/agent12/battery6,/batteryService,Entrance,/agent12/battery6/charge,/basic/number,write,...,59,40,-0.500000,0.866025,0.030,272,0,0,NaT,0.0


In [43]:
# Nombre d'anomalies avec delta_lastChange_s négatif
anomalies_neg_delta = df_timestamp_processed[(df_timestamp_processed['delta_lastChange_s'] < 0) & (df_timestamp_processed['normality'] == 1)]
count_anomalies_neg_delta = anomalies_neg_delta.shape[0]

print("Nombre d'anomalies avec delta_lastChange_s négatif :", count_anomalies_neg_delta)

# Nombre de delta_lastChange_s positifs
positive_delta = df_timestamp_processed[df_timestamp_processed['delta_lastChange_s'] > 0]
count_positive_delta = positive_delta.shape[0]

print("Nombre de delta_lastChange_s positifs :", count_positive_delta)

# ca confirme que value_lastChange est inutile

Nombre d'anomalies avec delta_lastChange_s négatif : 0
Nombre de delta_lastChange_s positifs : 0


In [44]:
df_encoded = df_timestamp_processed.copy()

colmns_to_delete = ['sourceAddress', 'destinationServiceAddress', 'accessedNodeAddress', 'value',
                    'sourceID', 'dest_device_info','src_device_info', 'sourceType',
                    'sourceLocation', 'destinationServiceType', 'accessedNodeType', 'operation',
                    'value_category','destinationLocation', 'value_is_none', 'value_numeric',
                    'value_heatingOn', 'value_lastChange_dt', 'value_lastChange', 'year', 'month',
                    'day', 'hour', 'minute', 'second', 'acc_node_type', 'acc_node_event', 
                    'acc_node_event_filled', 'value_open', 'value_charging', 'timestamp', 'delta_lastChange_s',
                    'value_lastChange', 'value_lastChange_present', 'src_agent_number', 'dest_agent_number',
                    'event_time'
                    ]

df_encoded = df_encoded.drop(columns=colmns_to_delete)

In [45]:
df_encoded

Unnamed: 0,normality,sourceID_freq,accessed_addr_freq,src_dest_same_addr,src_dest_same_type,src_dest_same_loc,value_movement,value_tempin,value_lightOn,value_heatingcontrol,...,value_movement_present,value_lightOn_present,value_heatingcontrol_present,acc_node_event_present,hour_sin,hour_cos,delta_time,events_per_min,is_night,is_work_hours
243,0,0.040505,0.000003,1,True,1,0.0,0.0,0.0,0.0,...,0,0,0,0,-0.258819,0.965926,0.000,232,0,0
365,0,0.040505,0.046168,1,True,1,0.0,0.0,0.0,0.0,...,0,0,0,1,-0.258819,0.965926,76.434,248,0,0
391,0,0.040505,0.018952,1,True,1,0.0,0.0,0.0,0.0,...,0,0,0,1,-0.258819,0.965926,11.613,248,0,0
403,0,0.040505,0.046168,1,True,1,0.0,0.0,0.0,0.0,...,0,0,0,1,-0.258819,0.965926,4.766,248,0,0
404,0,0.040505,0.046168,1,True,1,0.0,0.0,0.0,0.0,...,0,0,0,1,-0.258819,0.965926,0.133,248,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357794,0,0.048375,0.045984,0,False,0,0.0,0.0,0.0,0.0,...,0,0,0,1,-0.500000,0.866025,0.144,272,0,0
357877,0,0.048375,0.018271,0,False,0,0.0,0.0,0.0,0.0,...,0,0,0,1,-0.500000,0.866025,20.633,272,0,0
357884,0,0.048375,0.045984,0,False,0,0.0,0.0,0.0,0.0,...,0,0,0,1,-0.500000,0.866025,0.318,272,0,0
357885,0,0.048375,0.045984,0,False,0,0.0,0.0,0.0,0.0,...,0,0,0,1,-0.500000,0.866025,0.030,272,0,0


In [46]:
analyse_categorial(df_encoded)

0 colonnes catégorielles:


In [47]:
exploraty_analysis(df_encoded, "normality")

{'Dimention': {'rows': 357952, 'columns': 53},
 'Type de donnees': {'int64': 37, 'float64': 15, 'bool': 1},
 'valeur manquante': Empty DataFrame
 Columns: [count, percentage]
 Index: [],
 'colonees vides': [],
 'doublons': 21899,
 'statistiques numeriques':            normality  sourceID_freq  accessed_addr_freq  src_dest_same_addr  \
 count  357952.000000  357952.000000       357952.000000       357952.000000   
 mean        0.027984       0.024778            0.023599            0.465272   
 std         0.164928       0.013482            0.013977            0.498793   
 min         0.000000       0.000011            0.000003            0.000000   
 25%         0.000000       0.012907            0.012683            0.000000   
 50%         0.000000       0.017667            0.017896            0.000000   
 75%         0.000000       0.040466            0.024056            1.000000   
 max         1.000000       0.050029            0.053169            1.000000   
 
        src_dest_same

In [48]:
# Calculer min et max pour toutes les colonnes numeriques
numeric_cols = df_encoded.select_dtypes(include=['int64', 'float64']).columns

intervals = df_encoded[numeric_cols].agg(['min', 'max']).transpose()
intervals = intervals.rename(columns={'min': 'Min', 'max': 'Max'})

# Filtrer celles qui ont min < -5 ou max > 5
intervals_filtered = intervals[(intervals['Min'] < -5) | (intervals['Max'] > 5)]

intervals_filtered


Unnamed: 0,Min,Max
value_tempin,0.0,38.571738
value_charge,0.0,19.881
delta_time,0.0,67045.116
events_per_min,1.0,2103.0


In [49]:
# Standarisation
cols_to_scale = [
'value_tempin',        
'value_charge',          
'delta_time',            
'events_per_min'
]

scaler = StandardScaler()
df_encoded[cols_to_scale] = scaler.fit_transform(df_encoded[cols_to_scale])

df_encoded

Unnamed: 0,normality,sourceID_freq,accessed_addr_freq,src_dest_same_addr,src_dest_same_type,src_dest_same_loc,value_movement,value_tempin,value_lightOn,value_heatingcontrol,...,value_movement_present,value_lightOn_present,value_heatingcontrol_present,acc_node_event_present,hour_sin,hour_cos,delta_time,events_per_min,is_night,is_work_hours
243,0,0.040505,0.000003,1,True,1,0.0,-0.753161,0.0,0.0,...,0,0,0,0,-0.258819,0.965926,-0.072967,0.209502,0,0
365,0,0.040505,0.046168,1,True,1,0.0,-0.753161,0.0,0.0,...,0,0,0,1,-0.258819,0.965926,0.220811,0.274155,0,0
391,0,0.040505,0.018952,1,True,1,0.0,-0.753161,0.0,0.0,...,0,0,0,1,-0.258819,0.965926,-0.028332,0.274155,0,0
403,0,0.040505,0.046168,1,True,1,0.0,-0.753161,0.0,0.0,...,0,0,0,1,-0.258819,0.965926,-0.054648,0.274155,0,0
404,0,0.040505,0.046168,1,True,1,0.0,-0.753161,0.0,0.0,...,0,0,0,1,-0.258819,0.965926,-0.072456,0.274155,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357794,0,0.048375,0.045984,0,False,0,0.0,-0.753161,0.0,0.0,...,0,0,0,1,-0.500000,0.866025,-0.072413,0.371134,0,0
357877,0,0.048375,0.018271,0,False,0,0.0,-0.753161,0.0,0.0,...,0,0,0,1,-0.500000,0.866025,0.006337,0.371134,0,0
357884,0,0.048375,0.045984,0,False,0,0.0,-0.753161,0.0,0.0,...,0,0,0,1,-0.500000,0.866025,-0.071745,0.371134,0,0
357885,0,0.048375,0.045984,0,False,0,0.0,-0.753161,0.0,0.0,...,0,0,0,1,-0.500000,0.866025,-0.072851,0.371134,0,0


In [50]:
exploraty_analysis(df_encoded, "normality")

{'Dimention': {'rows': 357952, 'columns': 53},
 'Type de donnees': {'int64': 36, 'float64': 16, 'bool': 1},
 'valeur manquante': Empty DataFrame
 Columns: [count, percentage]
 Index: [],
 'colonees vides': [],
 'doublons': 21899,
 'statistiques numeriques':            normality  sourceID_freq  accessed_addr_freq  src_dest_same_addr  \
 count  357952.000000  357952.000000       357952.000000       357952.000000   
 mean        0.027984       0.024778            0.023599            0.465272   
 std         0.164928       0.013482            0.013977            0.498793   
 min         0.000000       0.000011            0.000003            0.000000   
 25%         0.000000       0.012907            0.012683            0.000000   
 50%         0.000000       0.017667            0.017896            0.000000   
 75%         0.000000       0.040466            0.024056            1.000000   
 max         1.000000       0.050029            0.053169            1.000000   
 
        src_dest_same

In [51]:
# vérifications de cohérence
y = df_encoded['normality']
y.value_counts(normalize=True)

normality
0    0.972016
1    0.027984
Name: proportion, dtype: float64

In [None]:
df_encoded.to_csv("dataPreprocessed.csv", index=False)

In [66]:
df_pretraited = pd.read_csv("dataPreprocessed.csv", low_memory=False)
df_pretraited.head()

Unnamed: 0,normality,sourceID_freq,accessed_addr_freq,src_dest_same_addr,src_dest_same_type,src_dest_same_loc,value_movement,value_tempin,value_lightOn,value_heatingcontrol,...,value_movement_present,value_lightOn_present,value_heatingcontrol_present,acc_node_event_present,hour_sin,hour_cos,delta_time,events_per_min,is_night,is_work_hours
0,0,0.040505,3e-06,1,True,1,0.0,-0.753161,0.0,0.0,...,0,0,0,0,-0.258819,0.965926,-0.072967,0.209502,0,0
1,0,0.040505,0.046168,1,True,1,0.0,-0.753161,0.0,0.0,...,0,0,0,1,-0.258819,0.965926,0.220811,0.274155,0,0
2,0,0.040505,0.018952,1,True,1,0.0,-0.753161,0.0,0.0,...,0,0,0,1,-0.258819,0.965926,-0.028332,0.274155,0,0
3,0,0.040505,0.046168,1,True,1,0.0,-0.753161,0.0,0.0,...,0,0,0,1,-0.258819,0.965926,-0.054648,0.274155,0,0
4,0,0.040505,0.046168,1,True,1,0.0,-0.753161,0.0,0.0,...,0,0,0,1,-0.258819,0.965926,-0.072456,0.274155,0,0
