In [19]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, classification_report
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input, Embedding, GlobalAveragePooling1D, Dropout, Dense
from tensorflow.keras.preprocessing import sequence

from TransformerComplet import *

# Chargement du jeu de données
data = pd.read_csv("data/BenignAndMaliciousDataset.csv")

In [50]:
# Configuration des paramètres d'entraînement
class TrainingConfig(object):
    epochs = 10
    evaluate_every = 100
    checkpoint_every = 100
    learning_rate = 0.001

# Configuration du modèle
class ModelConfig(object):
    embedding_size = 128
    filters = 128
    num_heads = 8
    num_blocks = 1
    epsilon = 1e-8
    keep_prop = 0.9
    dropout_keep_prob = 0.5
    l2_reg_lambda = 0.0

# Configuration générale
class GeneralConfig(object):
    sequence_length = 4
    batch_size = 32
    max_features = 3
    num_features = 0
    num_classes = 2

    training = TrainingConfig()
    model = ModelConfig()

In [32]:
# Les colonnes à garder comme features
# Séparation des fonctionnalités et de la cible

features = ['DNSRecordType', 'NumericSequence', 'NumericRatio', 'StrangeCharacters', 'ConsoantRatio', 'RegisteredOrg']

# La colonne cible
target = 'Class'

# Séparation en features et target
X = data[features]  
y = data[target]


In [22]:
# Chargement des données spécifiques
def load_data():
    X_train = data[features]  # Adaptation selon la structure réelle du jeu de données
    Y_train = data[target]  # Adaptation selon la structure réelle du jeu de données

    # Retirez cette partie si vous ne comprenez pas la fonction loadDomain
    # domain_train = loadDomain(data['domain'])  # Adaptation selon la structure réelle du jeu de données

    return X_train, Y_train


In [33]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

from sklearn.preprocessing import MinMaxScaler
import numpy as np

from tensorflow.keras.layers.experimental.preprocessing import StringLookup


#je tokenrise d'abord les attribut categoriciel

from tensorflow.keras.preprocessing.text import Tokenizer


#la  tokenrisation s'applique sur une listte de chaine de caractere et non sur une serie pandas to je fais un casting [.tolist()]

categorical_feature1 = data['DNSRecordType'].astype(str).tolist()  # Convertir la série en liste de chaînes de caractères
categorical_feature2 = data['RegisteredOrg'].astype(str).tolist()  # Convertir la série en liste de chaînes de caractères

#combinaison avec separateur  comme on peut le faire directement

tokenizer3 = Tokenizer(num_words= 10)


# Concaténation des features textuelles avec un séparateur
combined_text = [f"{feat1} SEPARATOR {feat2}" for feat1, feat2 in zip(categorical_feature1, categorical_feature2)]

# Tokenisation des textes combinés
tokenizer3 = Tokenizer(num_words=10)
tokenizer3.fit_on_texts(combined_text)
tokens = tokenizer3.texts_to_sequences(combined_text)


print("Tokens feature 1 + Tokens feature 2")
print(tokens)

print("Calcul des longueur de sequence \n")

seq_lengths = [len(x) for x in tokens]

print(seq_lengths)

Tokens feature 1 + Tokens feature 2
[[2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1], [2, 1, 4], [2, 1], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1, 4], [2, 1], [2, 1], [2, 1], [2, 1], [2, 1], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], [3, 1, 5, 6], 

In [34]:
vocab_size = 10000  # Taille du vocabulaire
embedding_dim = 256  # Dimension de l'embedding

In [35]:
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Chargement des données spécifiques
def load_data():
    X_train = pad_sequences(tokens, maxlen=4,padding='post')  # Adaptation selon la structure réelle du jeu de données
    Y_train = data[target]  # Adaptation selon la structure réelle du jeu de données

    # Retirez cette partie si vous ne comprenez pas la fonction loadDomain
    # domain_train = loadDomain(data['domain'])  # Adaptation selon la structure réelle du jeu de données

    return X_train, Y_train


In [47]:
print(X_train.shape)

(90000, 4)


In [None]:
if __name__ == "__main__":
    # Chargement des données
    X_train, Y_train = load_data()

    # Instanciation de l'objet de configuration
    config = GeneralConfig()

    # input_text = Input(shape=(3,))
    # embedding = Embedding(vocab_size, embedding_dim)(input_text)

    # Mise en forme des données
    # X_train = X_train.values.reshape(len(X_train), config.num_features, 1)

    # Construction du modèle Transformer
    inputs_A = Input(shape=(config.sequence_length,))
    embeddings = Embedding(config.max_features, config.model.embedding_size)(inputs_A)
    mask_inputs = masque_remplissage(inputs_A)  # Assurez-vous d'avoir la fonction padding_mask définie
    
    out_seq = Encodeur(
        n_layers=4, d_model=128, num_heads=2,
        middle_units=256, max_seq_len=config.sequence_length)([embeddings, mask_inputs])
    out_seq = GlobalAveragePooling1D()(out_seq)


    ############################## il s'agit la d'un transformer a un block donc maintena t si je veux plusieur block de transformer alor que faire ??? ##################

    '''
    # Ajout de 3 blocs d'encodeur
    num_blocks = 3
    out_seq = embeddings  # Initialisation avec les embeddings
    for _ in range(num_blocks):
        out_seq = Encoder(
            n_layers=config.model.num_blocks,
            d_model=config.model.embedding_size,
            num_heads=config.model.num_heads,
            middle_units=config.model.filters,
            max_seq_len=config.sequence_length)([out_seq, mask_inputs])
     '''       
    out_seq = Dropout(0.3)(out_seq)
    outputs_A = Dense(64, activation='softmax')(out_seq)

    outputs = Dense(1, activation='sigmoid')(outputs_A)

    model = Model(inputs=inputs_A, outputs=outputs)

    print(model.summary())

    # Compilation du modèle
    # opt = Adam(learning_rate=0.0001, decay=1e-5)

    # opt = Adam(learning_rate=0.0001)
   



    # loss = 'sparse_categorical_crossentropy'
    loss = 'binary_crossentropy'

    model.compile(loss=loss,
                  optimizer='adam',
                  metrics=['accuracy'])

    print('Entraînement...')
    history = model.fit(X_train, Y_train,
                        batch_size=config.batch_size,
                        epochs=2,
                        validation_split=0.2)


In [52]:
if __name__ == "__main__":
    # Chargement des données
    X_train, Y_train = load_data()

   
   # Hyperparamètres et dimensions des données*
    taille_num_features = 5
    taille_text_features = 4 # je donne la taille max parmi les longueur des sequences
    vocab_size = 20000  
    embedding_dim = 128 
 
    # Construction du modèle Transformer
    inputs_A = Input(shape=(taille_text_features,))
    embeddings = Embedding(vocab_size, embedding_dim, input_length=taille_text_features)(inputs_A)
    mask_inputs = masque_remplissage(inputs_A)  
    out_seq = Encodeur(
        n_layers=4, d_model=128, num_heads=2,
        middle_units=256, max_seq_len=taille_text_features)([embeddings, mask_inputs])
    out_seq = GlobalAveragePooling1D()(out_seq)
    out_seq = Dropout(0.3)(out_seq)
    # outputs_A = Dense(64, activation='softmax')(out_seq)

    outputs = Dense(1, activation='sigmoid')(out_seq)

    model = Model(inputs=inputs_A, outputs=outputs)

    print(model.summary())

    # Compilation du modèle
    # opt = Adam(learning_rate=0.0001, decay=1e-5)

    # opt = Adam(learning_rate=0.0001)
   



    # loss = 'sparse_categorical_crossentropy'
    loss = 'binary_crossentropy'

    model.compile(loss=loss,
                  optimizer='adam',
                  metrics=['accuracy'])

    print('Entraînement...')
    history = model.fit(X_train, Y_train,
                        batch_size=config.batch_size,
                        epochs=2,
                        validation_split=0.2)


Model: "model_13"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_15 (InputLayer)          [(None, 4)]          0           []                               
                                                                                                  
 tf_op_layer_Equal_14 (TensorFl  [(None, 4)]         0           ['input_15[0][0]']               
 owOpLayer)                                                                                       
                                                                                                  
 tf_op_layer_Cast_14 (TensorFlo  [(None, 4)]         0           ['tf_op_layer_Equal_14[0][0]']   
 wOpLayer)                                                                                        
                                                                                           