In [23]:
import os
import multiprocessing
import string
import pandas as pd
import numpy as np
from matplotlib.pylab import plt
import nltk

from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.initializers import GlorotUniform
from tensorflow.keras.optimizers import Adam

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import make_pipeline

from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

#from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, GRU, Embedding, Dropout, SpatialDropout1D, SimpleRNN

import optuna

from cybnews.data import get_data, welf_join_text


os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'

In [24]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

Num GPUs Available:  1


In [25]:
# WELF dataset
#
#
DATA_PATH = '/home/tober/devel/lewagon/project/cyb-news/data'
data = welf_join_text(get_data(f'{DATA_PATH}/WELFake_Dataset.csv'))[['all_text', 'label']]

#data = data.sample(frac=0.5, random_state=42)

In [26]:
def clean(sentence):
    sentence = sentence.strip()
    sentence = sentence.lower()
    sentence = ''.join(char for char in sentence if not char.isdigit())
    
    for x in string.punctuation:
        sentence = sentence.replace(x, '')

    for x in ['’', '“', '”', '-', '"' ]:
        sentence = sentence.replace(x, '')
    return sentence

data["all_text_cleaned"] = data["all_text"].apply(clean)

In [27]:
max_seq_length = 3000  # Maximum sequence length
print(f'max_seq_length: {max_seq_length}')

max_seq_length: 3000


In [28]:
X = data.all_text_cleaned
y = data.label

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42
)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train_tokenized = tokenizer.texts_to_sequences(X_train)
X_test_tokenized = tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_tokenized, maxlen=max_seq_length, dtype='float32', padding='pre', value=0)
X_test_padded = pad_sequences(X_test_tokenized, maxlen=max_seq_length, dtype='float32', padding='pre', value=0)

In [29]:
print(X_train_padded.shape)
print(y_train.shape)

(50493, 3000)
(50493,)


In [30]:
vocab_size = len(tokenizer.index_word) + 1  # also num of features
print(f'vocab_size: {vocab_size}')

vocab_size: 319480


In [31]:
vector_size = 200
window_size = 5

word2vec_model = Word2Vec(sentences=X, vector_size=vector_size, window=window_size, min_count=1, workers=4)

embedding_matrix = np.zeros((vocab_size, word2vec_model.vector_size))
for word, i in tokenizer.word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]

In [71]:
from tensorflow.keras.layers import GRU


def create_model(trial):
    second_gru_layer = trial.suggest_int('gru_layers', 0, 1)
    deep_layers = trial.suggest_int("deep_layers", 0, 1)

    model = tf.keras.Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=word2vec_model.vector_size, weights=[embedding_matrix], input_length=max_seq_length, trainable=False))
    model.add(
        SpatialDropout1D(
            trial.suggest_float('spatial_dropout_rate', 0.1, 0.4)
        )
    )
    if bool(second_gru_layer):
        model.add(
            GRU(
                trial.suggest_int('lstm_units', 50, 200),
                return_sequences=True
            )
        )
        model.add(
            GRU(trial.suggest_int('gru_units', 50, 150))
        )
    else:
        model.add(
            GRU(trial.suggest_int('lstm_units', 50, 200))
        )

    for i in range(deep_layers):
        num_hidden = trial.suggest_int("num_neurons_hidden{}".format(i), 4, 128, log=True)
        model.add(
            tf.keras.layers.Dense(
                num_hidden,
                activation="relu",
                kernel_regularizer=tf.keras.regularizers.l2(trial.suggest_float("weight_decay", 1e-10, 1e-3, log=True)),
            )
        )
    model.add(Dense(1, activation='sigmoid'))
    
    return model


def objective(trial):
    #learning_rate = trial.suggest_loguniform('learning_rate', 0.00001, 0.001)
    #batch_size = trial.suggest_int('batch_size', 8, 16)
    #epochs = trial.suggest_int('epochs', 5, 10)

    model = create_model(trial)
    optimizer = Adam()#learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    early_stopping = EarlyStopping(
        monitor='val_accuracy',
        patience=3,
        restore_best_weights=True
    )

    history = model.fit(
        X_train_padded,
        y_train,
        epochs=5,
        batch_size=32,
        validation_split=0.2,
        verbose=1,
        callbacks=[early_stopping]
    )
    train_accuracy = history.history['accuracy']
    val_accuracy = history.history['val_accuracy']
    val_loss = min(history.history['val_loss'])  
    
    return max(val_accuracy)


In [72]:
storage = optuna.storages.RDBStorage('sqlite:///optuna_study.db', )
study = optuna.create_study(direction='maximize', storage=storage)
study.optimize(objective, n_trials=15, show_progress_bar=True)

[I 2024-07-22 18:36:37,585] A new study created in RDB with name: no-name-66927bc1-6dc5-4abb-bf63-048a40a04913


  0%|          | 0/15 [00:00<?, ?it/s]

Epoch 1/5


In [None]:

#study = optuna.load_study(study_name='cybnews', storage='sqlite:///optuna_study.db')
best_trial = study.best_trial

print(f"Best trial value (validation loss): {best_trial.value}")
print("Best hyperparameters:")
for key, value in best_trial.params.items():
    print(f"  {key}: {value}")

