In [46]:
import os
import multiprocessing
import string
from datetime import datetime
import pandas as pd
import numpy as np

from matplotlib.pylab import plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import make_pipeline

import nltk
from nltk.tokenize import word_tokenize

from gensim.models import Word2Vec

from tensorflow.config.experimental import list_physical_devices, set_memory_growth
from tensorflow.keras.initializers import GlorotUniform
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, GRU, Embedding, Dropout, SpatialDropout1D, SimpleRNN
from tensorflow.keras.regularizers import l2

import optuna

from cybnews.data import get_data, welf_join_text


In [47]:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'

print("Num GPUs Available: ", len(list_physical_devices('GPU')))

gpus = list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

Num GPUs Available:  1


In [48]:
# WELF dataset
#
#
DATA_PATH = '/home/tober/devel/lewagon/project/cyb-news/data'
data = welf_join_text(get_data(f'{DATA_PATH}/WELFake_Dataset.csv'))[['all_text', 'label']]

#data = data.sample(frac=0.5, random_state=42)

In [49]:
def clean(sentence):
    sentence = sentence.strip()
    sentence = sentence.lower()
    sentence = ''.join(char for char in sentence if not char.isdigit())
    
    for x in string.punctuation:
        sentence = sentence.replace(x, '')

    for x in ['’', '“', '”', '-', '"' ]:
        sentence = sentence.replace(x, '')
    return sentence

data["all_text_cleaned"] = data["all_text"].apply(clean)

In [50]:
max_seq_length = 5000  # Maximum sequence length
print(f'max_seq_length: {max_seq_length}')

max_seq_length: 5000


In [51]:
X = data.all_text_cleaned
y = data.label

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42
)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

X_train_tokenized = tokenizer.texts_to_sequences(X)
X_test_tokenized = tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_tokenized, maxlen=max_seq_length, dtype='float32', padding='pre', value=0)
X_test_padded = pad_sequences(X_test_tokenized, maxlen=max_seq_length, dtype='float32', padding='pre', value=0)

In [52]:
print(X_train_padded.shape)
print(y_train.shape)

print(X_train[0])
print(X_train_tokenized[0])

(50493, 5000)
(50493,)
law enforcement on high alert following threats against cops and whites on by blacklivesmatter and fyf terrorists video no comment is expected from barack obama members of the fyf or fukyoflag and blacklivesmatter movements called for the lynching and hanging of white people and cops they encouraged others on a radio show tuesday night to  turn the tide  and kill white people and cops to send a message about the killing of black people in americaone of the fyoflag organizers is called  sunshine  she has a radio blog show hosted from texas called  sunshine s fing opinion radio show a snapshot of her fyf lolatwhitefear twitter page at  pm shows that she was urging supporters to  call now fyf tonight we continue to dismantle the illusion of white below is a snapshot twitter radio call invite   fyfthe radio show aired at  pm eastern standard timeduring the show callers clearly call for  lynching  and  killing  of white peoplea  minute clip from the radio show can be 

In [53]:
vocab_size = len(tokenizer.index_word) + 1  # also num of features
print(f'vocab_size: {vocab_size}')

vocab_size: 319480


In [54]:
vector_size = 400
window_size = 4

word_sentences = [word_tokenize(sentence) for sentence in X]
word2vec_model = Word2Vec(sentences=word_sentences, vector_size=vector_size, window=window_size, min_count=1, workers=4)

embedding_matrix = np.zeros((vocab_size, word2vec_model.vector_size))
for word, i in tokenizer.word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]

In [55]:
def create_model(trial):
    second_gru_layer = trial.suggest_int('second_gru_layer', 0, 1)
    deep_layers = trial.suggest_int("deep_layer", 0, 1)
    gru_units = trial.suggest_int("gru_units", 50, 500)

    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=word2vec_model.vector_size, weights=[embedding_matrix], input_length=max_seq_length, trainable=False))
    model.add(
        SpatialDropout1D(
            trial.suggest_float('spatial_dropout_rate', 0.1, 0.4)
        )
    )
    if bool(second_gru_layer):
        model.add(
            GRU(
                gru_units,
                return_sequences=True
            )
        )
        model.add(
            GRU(trial.suggest_int('second_gru_units', 50, 200))
        )
    else:
        model.add(
            GRU(gru_units)
        )

    for i in range(deep_layers):
        num_hidden = trial.suggest_int("num_neurons_hidden{}".format(i), 4, 128, log=True)
        model.add(
            Dense(
                num_hidden,
                activation="relu",
                kernel_regularizer=l2(trial.suggest_float("weight_decay", 1e-10, 1e-3, log=True)),
            )
        )
    model.add(Dense(1, activation='sigmoid'))
    
    return model


def objective(trial):
    #learning_rate = trial.suggest_loguniform('learning_rate', 0.00001, 0.001)
    #batch_size = trial.suggest_int('batch_size', 8, 16)
    #epochs = trial.suggest_int('epochs', 5, 10)

    model = create_model(trial)
    optimizer = Adam()#learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    early_stopping = EarlyStopping(
        monitor='val_accuracy',
        patience=3,
        restore_best_weights=True
    )

    history = model.fit(
        X_train_padded,
        y_train,
        epochs=3,
        batch_size=32,
        validation_split=0.2,
        verbose=1,
        callbacks=[early_stopping]
    )
    loss = min(history.history['loss'])
    val_loss = min(history.history['val_loss'])
    train_accuracy = history.history['accuracy']
    val_accuracy = history.history['val_accuracy']
    print(f'loss: {str(loss)}, val_loss: {str(val_loss)}, train_accuracy: {str(train_accuracy)}, val_accuracy: {str(val_accuracy)}')
    
    return max(val_accuracy)


In [56]:
study_name = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')

storage = optuna.storages.RDBStorage('sqlite:///optuna_study.db')
study = optuna.create_study(direction='maximize', storage=storage, study_name=study_name)
study.optimize(objective, n_trials=50, show_progress_bar=True)

[I 2024-07-23 11:50:22,816] A new study created in RDB with name: 2024-07-23_11:50:22


  0%|          | 0/50 [00:00<?, ?it/s]

Epoch 1/3
Epoch 2/3
 153/1263 [==>...........................] - ETA: 7:49 - loss: 0.0382 - accuracy: 0.9855[W 2024-07-23 12:01:18,071] Trial 0 failed with parameters: {'second_gru_layer': 1, 'deep_layer': 0, 'gru_units': 472, 'spatial_dropout_rate': 0.2187475676378805, 'second_gru_units': 74} because of the following error: InternalError().
Traceback (most recent call last):
  File "/home/tober/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_10969/3773704726.py", line 57, in objective
    history = model.fit(
  File "/home/tober/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "/home/tober/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/tensorflow/python/eager/execute.py", line 54, in quick_execute
    tensors = py

InternalError: Graph execution error:

Failed to call ThenRnnBackward with model config: [rnn_mode, rnn_input_mode, rnn_direction_mode]: 3, 0, 0 , [num_layers, input_size, num_units, dir_count, max_seq_length, batch_size, cell_num_units]: [1, 400, 472, 1, 5000, 32, 0] 
	 [[{{node gradients/CudnnRNN_grad/CudnnRNNBackprop}}]]
	 [[Adam/gradients/PartitionedCall_1]] [Op:__inference_train_function_29317]

In [None]:

#study = optuna.load_study(study_name='cybnews', storage='sqlite:///optuna_study.db')
best_trial = study.best_trial

print(f"Best trial value (validation loss): {best_trial.value}")
print("Best hyperparameters:")
for key, value in best_trial.params.items():
    print(f"  {key}: {value}")

