In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense

# Load the semicolon-separated dataset from the CSV file.
data = pd.read_csv('data/DS-2023-000/proxy-sql-dataset.csv', sep=";")

# Overwrite the 'malignant' column with integer class codes from a LabelEncoder.
# The fitted encoder is kept so codes can be mapped back to the original labels.
# Presumably there are exactly two classes (0/1) -- TODO confirm against the data.
label_encoder = LabelEncoder()
data['malignant'] = label_encoder.fit_transform(data['malignant'])

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle repeatable across runs.
# NOTE(review): the split is not stratified -- consider `stratify=` if the
# classes turn out to be imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    data['query'],
    data['malignant'],
    test_size=0.2,
    random_state=42,
)

# Fit the vocabulary on the training queries only, so the test split never
# influences the word index. Words outside the retained vocabulary are mapped
# to the '<OOV>' token when encoding.
# NOTE(review): Tokenizer's default `filters` strip most punctuation, which may
# discard SQL metacharacters (quotes, '=', ';', '--') -- confirm this is intended.
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# Encode both splits as lists of integer token ids using the same fitted tokenizer.
X_train_sequences, X_test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad (with zeros) or truncate at the end so every sequence holds exactly 50 ids.
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=50, padding='post', truncating='post')
    for sequences in (X_train_sequences, X_test_sequences)
)

# Build the classifier: token embedding -> bidirectional LSTM -> one sigmoid unit.
model = Sequential([
    # input_dim matches the tokenizer's num_words (5000). The sequence length is
    # inferred from the padded input on the first call, so the deprecated
    # `input_length` argument (Keras 3 warns and asks to remove it) is not passed.
    Embedding(input_dim=5000, output_dim=16),
    # 64 LSTM units per direction; only the final state of each direction is
    # forwarded because return_sequences is left at its default.
    Bidirectional(LSTM(64)),
    # Single sigmoid output, paired with the binary cross-entropy loss below.
    Dense(1, activation='sigmoid')
])

# Adam optimizer with binary cross-entropy; accuracy is reported as the only metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train for five epochs on the padded training sequences.
# NOTE(review): the test split is also passed as validation data, so the
# per-epoch validation metrics are computed on the same rows as the final
# evaluation below -- consider carving a separate validation set out of the
# training data if these metrics are ever used for tuning or early stopping.
model.fit(
    x=X_train_padded,
    y=y_train,
    epochs=5,
    validation_data=(X_test_padded, y_test),
)

# Evaluate on the held-out split. With a single compiled metric, evaluate()
# returns the loss followed by the accuracy.
loss, accuracy = model.evaluate(x=X_test_padded, y=y_test)
print(f'Loss: {loss}, Accuracy: {accuracy}')


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss: 0.01860421895980835, Accuracy: 0.9943749904632568
