In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from deap import base, creator, tools, algorithms
import random

# Load data from the CSV file
data = pd.read_csv('data/DS-2023-000/proxy-sql-dataset.csv', delimiter=";")

# Convert text labels to numbers (0 or 1)
label_encoder = LabelEncoder()
data['malignant'] = label_encoder.fit_transform(data['malignant'])

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(data['query'], 
                                                    data['malignant'], 
                                                    test_size=0.2, 
                                                    random_state=42)

# Tokenization and padding of text
tokenizer = Tokenizer(num_words=5000, 
                      oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_sequences, maxlen=50, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=50, padding='post', truncating='post')

num_words = 5000
output_dim = 16
filters = 128
kernel_size = 5
dense_units = 64

In [7]:

# Construeix el model de CNN
model = Sequential([
    Embedding(input_dim=num_words, output_dim=output_dim, input_length=50),
    Conv1D(filters, kernel_size, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(dense_units, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Entrenament del model
model.fit(X_train_padded, y_train, epochs=5, validation_data=(X_test_padded, y_test))

# Avaluació del model
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f'Loss: {loss}, Accuracy: {accuracy}')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss: 0.015937848016619682, Accuracy: 0.9962499737739563


In [8]:
import random
import numpy as np
from deap import base, creator, tools, algorithms
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import clone_model

# Definim el problema d'optimizació (maximitzar la precisió)
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

# Funció d'avaluació (fitness)
# Aquest codi crea el model a partir del nostre individu (individual)
# i n'avalua la precisió (accuracy)
def evaluate(individual, X_train, y_train, X_test, y_test):
    # Transformem l'individu en paràmetres de la CNN 
    num_words, output_dim, filters, kernel_size, dense_units = individual

    # Construim el model
    model = Sequential([
        Embedding(input_dim=num_words, output_dim=output_dim, input_length=50),
        Conv1D(filters, kernel_size, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(dense_units, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    
    # Compilem el model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Tokenització i padding del text
    tokenizer = Tokenizer(num_words=num_words, oov_token='<OOV>')
    tokenizer.fit_on_texts(X_train)

    X_train_sequences = tokenizer.texts_to_sequences(X_train)
    X_test_sequences = tokenizer.texts_to_sequences(X_test)

    X_train_padded = pad_sequences(X_train_sequences, maxlen=50, padding='post', truncating='post')
    X_test_padded = pad_sequences(X_test_sequences, maxlen=50, padding='post', truncating='post')

    # Entrenament del model
    model.fit(X_train_padded, y_train, epochs=5, validation_data=(X_test_padded, y_test), verbose=0)

    # Avaluació del model
    _, accuracy = model.evaluate(X_test_padded, y_test, verbose=0)

    return accuracy,

# Funció per generar individus aleatoris
def init_individual():
    num_words = random.randint(100, 5000)
    output_dim = random.randint(8, 32)
    filters = random.randint(32, 256)
    kernel_size = random.randint(3, 10)
    dense_units = random.randint(32, 128)
    return [num_words, output_dim, filters, kernel_size, dense_units]

# Configuració de l'evolució
toolbox = base.Toolbox()
toolbox.register("individual", tools.initIterate, creator.Individual, init_individual)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", evaluate, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutUniformInt, low=[100, 8, 32, 3, 32], up=[5000, 32, 256, 10, 128], indpb=0.2)
toolbox.register("select", tools.selTournament, tournsize=3)

# Divisió de les dades d'entrenament
X_train, X_test, y_train, y_test = train_test_split(data['query'], data['malignant'], test_size=0.2, random_state=42)

# Configuració de la població
population_size = 100
generations = 10
population = toolbox.population(n=population_size)

# Algorisme Genètic Simple
algorithms.eaSimple(population, toolbox, cxpb=0.7, mutpb=0.2, ngen=generations, stats=None, halloffame=None, verbose=True)

# Millor individu després de l'evolució
best_individual = tools.selBest(population, k=1)[0]
print("Best Individual:", best_individual)


gen	nevals
0  	100   
1  	78    
2  	72    
3  	73    
4  	72    
5  	77    
6  	81    
7  	77    
8  	73    
9  	75    
10 	90    
Best Individual: [1786, 17, 202, 5, 68]


In [9]:
(   gao_num_words, 
    gao_output_dim, 
    gao_filters, 
    gao_kernel_size, 
    gao_dense_units
) = best_individual


print("Best individual num_words = {}".format(gao_num_words))
print("Best individual output_dim = {}".format(gao_output_dim))
print("Best individual filters = {}".format(gao_filters))
print("Best individual kernel_size = {}".format(gao_kernel_size))
print("Best individual dense_units = {}".format(gao_dense_units))

Best individual num_words = 1786
Best individual output_dim = 17
Best individual filters = 202
Best individual kernel_size = 5
Best individual dense_units = 68


In [17]:

# Construeix el model de CNN
model = Sequential([
    Embedding(input_dim=num_words, output_dim=output_dim, input_length=50),
    Conv1D(filters, kernel_size, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(dense_units, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Entrenament del model
model.fit(X_train_padded, y_train, epochs=5, validation_data=(X_test_padded, y_test))

# Avaluació del model
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f'Loss: {loss}, Accuracy: {accuracy}')



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss: 0.014347785152494907, Accuracy: 0.9966250061988831
