In [1]:
import os
import string

import nltk
import numpy as np
import optuna
import pandas as pd
from gensim.models import Word2Vec
from matplotlib.pylab import plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.datasets import imdb
from tensorflow.keras.initializers import GlorotUniform
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, SpatialDropout1D
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
#from scikeras.wrappers import KerasClassifier
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

from cybnews.data import get_data, welf_join_text


# TensorFlow runtime knobs, handed over through the process environment:
# a minimum C++ log level and the GPU allocator to use.
# NOTE(review): tensorflow.keras has already been imported by the time this
# runs; these variables are conventionally exported before the first
# tensorflow import -- confirm they still take effect at this position.
os.environ.update({
    'TF_CPP_MIN_LOG_LEVEL': '1',
    'TF_GPU_ALLOCATOR': 'cuda_malloc_async',
})

In [2]:
import tensorflow as tf
# Look up the visible GPUs once; the same list drives the report and the
# memory-growth configuration below.
gpus = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpus))

if gpus:
    try:
        # Enable on-demand memory growth on every GPU that was found.
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)
    except RuntimeError as err:
        # Best effort: report the failure and carry on with the defaults.
        print(err)

Num GPUs Available:  1


In [3]:
# WELF dataset
#
# The data directory can be overridden with the CYBNEWS_DATA_PATH environment
# variable so the notebook also runs outside the original author's machine;
# when the variable is unset the original location is used.
DATA_PATH = os.environ.get(
    'CYBNEWS_DATA_PATH',
    '/home/tober/devel/lewagon/project/cyb-news/data',
)

# Work on a reproducible 10% sample of the rows.
SAMPLE_FRAC = 0.1
SAMPLE_SEED = 42

# Load the CSV, let welf_join_text build the combined text column, and keep
# only the two columns used downstream.
data = welf_join_text(get_data(f'{DATA_PATH}/WELFake_Dataset.csv'))[['all_text', 'label']]
data = data.sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED)

In [4]:
def clean(sentence):
    """Normalise one text string for tokenisation.

    The input is stripped of surrounding whitespace and lower-cased, then
    every digit character, every ASCII punctuation character and the
    typographic quotes ’ “ ” are deleted. Nothing is inserted in their place,
    so characters on either side of a removed symbol end up adjacent.

    Args:
        sentence: the raw text (a ``str``).

    Returns:
        The cleaned string.
    """
    # Everything that gets deleted outright, besides digits.
    dropped = set(string.punctuation) | {'’', '“', '”', '-', '"'}

    lowered = sentence.strip().lower()
    return ''.join(
        ch for ch in lowered
        if not (ch.isdigit() or ch in dropped)
    )

# Add a cleaned copy of the text column; the raw column is kept as is.
data["all_text_cleaned"] = data["all_text"].map(clean)

In [5]:
# Every tokenised document is padded or truncated to this many tokens.
max_seq_length = 500
print(f'max_seq_length: {max_seq_length}')

max_seq_length: 500


In [6]:
# Features (cleaned text) and target.
X = data.all_text_cleaned
y = data.label

# 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42
)

# The vocabulary is fitted on the training texts only, so nothing from the
# test split influences the word index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train_tokenized = tokenizer.texts_to_sequences(X_train)
X_test_tokenized = tokenizer.texts_to_sequences(X_test)

# Token ids are integer indices into the embedding table, so they are padded
# as int32 rather than float32. 0 is the padding value, added at the front.
X_train_padded = pad_sequences(X_train_tokenized, maxlen=max_seq_length, dtype='int32', padding='pre', value=0)
X_test_padded = pad_sequences(X_test_tokenized, maxlen=max_seq_length, dtype='int32', padding='pre', value=0)



In [7]:
# Sanity check: padded features and labels should have the same row count.
print(X_train_padded.shape, y_train.shape, sep='\n')

(5049, 500)
(5049,)


In [8]:
# One slot per word known to the tokenizer, plus one extra slot for index 0,
# which is the value used for padding.
vocab_size = 1 + len(tokenizer.word_index)
print(f'vocab_size: {vocab_size}')

vocab_size: 82829


In [9]:
vector_size = 50
window_size = 5

# Word2Vec needs an iterable of *token lists*. Passing the raw strings makes
# it iterate each document character by character, so it learns vectors for
# single characters and almost no tokenizer word is ever found in `wv`.
# Rebuild the word sequences from the already-tokenised training documents:
# this guarantees the tokens match `tokenizer.word_index` exactly and keeps
# the test texts out of the embedding training.
train_token_lists = [
    [tokenizer.index_word[idx] for idx in seq]
    for seq in X_train_tokenized
]

word2vec_model = Word2Vec(
    sentences=train_token_lists,
    vector_size=vector_size,
    window=window_size,
    min_count=1,
    workers=4,
)

# Row i holds the vector of the word whose tokenizer index is i. Row 0 (the
# padding index) and any word without a trained vector stay all-zero.
embedding_matrix = np.zeros((vocab_size, word2vec_model.vector_size))
for word, i in tokenizer.word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]

In [14]:
def create_model(units=50, neurons=50, dropout_rate=0.2):#, learning_rate=0.001):
    """Build and compile the recurrent binary classifier.

    Architecture: a frozen Embedding layer initialised from the module-level
    ``embedding_matrix``, two stacked SimpleRNN layers, a Dropout layer, a
    ReLU Dense layer and a single sigmoid output unit.

    Reads the module-level names ``vocab_size``, ``word2vec_model``,
    ``embedding_matrix`` and ``max_seq_length``.

    Args:
        units: width of each SimpleRNN layer.
        neurons: width of the hidden Dense layer.
        dropout_rate: fraction passed to the Dropout layer that sits between
            the recurrent stack and the Dense head. It was previously
            accepted but ignored, so searching over it had no effect.

    Returns:
        The compiled ``Sequential`` model (Adam with default settings,
        binary cross-entropy loss, accuracy metric).
    """
    model = Sequential([
        Embedding(
            input_dim=vocab_size,
            output_dim=word2vec_model.vector_size,
            weights=[embedding_matrix],
            input_length=max_seq_length,
            trainable=False,  # keep the pre-trained vectors fixed
        ),
        # Earlier variant, kept for reference:
        #SpatialDropout1D(dropout_rate),
        #LSTM(units, dropout=dropout_rate, recurrent_dropout=dropout_rate),
        # The first recurrent layer returns the full sequence so the second
        # one can consume it step by step.
        SimpleRNN(units=units, return_sequences=True),
        SimpleRNN(units=units, activation='tanh'),
        Dropout(dropout_rate),
        Dense(neurons, activation='relu'),
        Dense(1, activation='sigmoid', kernel_initializer=GlorotUniform())
    ])
    optimizer = Adam()#learning_rate=learning_rate, clipvalue=1.0)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Wrap the model builder so scikit-learn can treat it as an estimator.
model = KerasClassifier(build_fn=create_model, verbose=1)

# Hyper-parameters to search: builder arguments (units, neurons,
# dropout_rate) plus fit arguments (batch_size, epochs).
param_grid = {
    'units': [20],
    'neurons': [12, 24],
    'dropout_rate': [0.1, 0.3],
    'batch_size': [6, 8],
    'epochs': [5, 10],
    #'learning_rate': [0.001, 0.01],
}

grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    # Run candidates sequentially: parallel workers would all compete for the
    # same GPU, and the worker count should not be tied to one machine.
    n_jobs=1,
    # An integer cv is a number of folds and must be at least 2; cv=1 makes
    # fit() raise ValueError. (A single hold-out split would need an explicit
    # splitter such as ShuffleSplit(n_splits=1) instead.)
    cv=3,
    scoring='accuracy',
    verbose=3,
    error_score="raise",
)


# The search itself is left disabled because it is expensive:
#grid_result = grid.fit(X_train_padded, y_train, validation_split=0.2)
# print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")

# `.model` is the underlying Keras model; evaluate it on the padded test set.
# best_model = grid_result.best_estimator_.model
# loss, accuracy = best_model.evaluate(X_test_padded, y_test)
# print(f"Test Accuracy: {accuracy:.2f}")

# best_model.save('best_model.h5')

  model = KerasClassifier(build_fn=create_model, verbose=1)


In [18]:
#def create_model(units=100, neurons=50, dropout_rate=0.2, learning_rate=0.001)

# param_grid = {
#     'units': [20],
#     'neurons': [12, 24],
#     'dropout_rate': [0.1, 0.3],
#     'batch_size': [6, 8],
#     'epochs': [5, 10],
#     #'learning_rate': [0.001, 0.01],
# }

# Quick manual run outside the grid search: build one model with 20 recurrent
# units and fit it once. No epochs or batch size are passed, so the Keras
# defaults apply. The fit call is the cell's last expression, so the returned
# History object is what the cell displays.
manual_model = create_model(units=20)
manual_model.fit(
    X_train_padded,
    y_train,
    validation_split=0.2,
)



<keras.callbacks.History at 0x7e25c5672920>