In [1]:
import os
import string
import pandas as pd
import numpy as np
from matplotlib.pylab import plt

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import make_pipeline

from tensorflow.keras import models, layers
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

#from scikeras.wrappers import KerasClassifier
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

from cybnews.data import get_data, welf_join_text


os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

In [2]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

Num GPUs Available:  1


In [3]:
# WELF
#
# 0 = real
# 1 = fake
#
DATA_PATH = '/home/tober/devel/lewagon/project/cyb-news/data'
data = welf_join_text(get_data(f'{DATA_PATH}/WELFake_Dataset.csv'))[['all_text', 'label']]
data = data.sample(frac=0.05, random_state=42)

In [4]:
def clean(sentence):
    sentence = sentence.strip()
    sentence = sentence.lower()
    sentence = ''.join(char for char in sentence if not char.isdigit())
    
    for x in string.punctuation:
        sentence = sentence.replace(x, '')

    for x in ['’', '“', '”', '-', '"' ]:
        sentence = sentence.replace(x, '')
    return sentence

data["all_text_cleaned"] = data["all_text"].apply(clean)

In [5]:
max_len = 5000  # Maximum sequence length

In [6]:
X = data.all_text_cleaned
y = data.label

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42
)

tk = Tokenizer()
tk.fit_on_texts(X_train)

X_train_token = tk.texts_to_sequences(X_train)
X_test_token = tk.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_token, maxlen=max_len, dtype='float32', padding='pre', value=0)
X_test_padded = pad_sequences(X_test_token, maxlen=max_len, dtype='float32', padding='pre', value=0)

In [7]:
vocab_size = len(tk.index_word) + 1  # also num of features
vocab_size

56153

In [8]:
embedding_dim = 16

def simple_model(X, y):
    model = models.Sequential()
    
    model.add(
        layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            input_length=max_len,
            mask_zero=True
        )
    )
    
    model.add(layers.LSTM(embedding_dim))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X, y, verbose=1)

    return model

#m = simple_model(X_train_padded, y_train)

In [9]:
def evaluate_model(model, X, y, batch_size=64):
    metrics = model.evaluate(
        x=X,
        y=y,
        batch_size=batch_size,
        verbose=1,
        callbacks=None,
        return_dict=True
    )
    return metrics

#metrics = evaluate_model(m, X_test_padded, y_test)

In [12]:
def create_model(
    embedding_dim=50,
    dense_units=50,
):

    model = Sequential()

    model.add(
        layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            input_length=max_len,
            mask_zero=True
        )
    )
    model.add(LSTM(embedding_dim))
    model.add(Dense(dense_units, activation='relu'))
#    model.add(layers.Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))       
    
    model.compile(
        loss='binary_crossentropy',
        metrics=['accuracy'],
        optimizer='adam',
    )
    print(model.summary())
    return model


model = KerasClassifier(
    build_fn = create_model,
    batch_size = 8,
    verbose = 1,
    dense_units = [10],
    embedding_dim = [12],
    epochs = [10],
)

param_grid = {
    'epochs': [10],
    'dense_units': [12, 24],
    'embedding_dim': [15, 30],
}

model.get_params()

  model = KerasClassifier(


{'batch_size': 8,
 'verbose': 1,
 'dense_units': [10],
 'embedding_dim': [12],
 'epochs': [10],
 'build_fn': <function __main__.create_model(embedding_dim=50, dense_units=50)>}

In [11]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    n_jobs=28,
    cv=3,
    verbose=3,
    error_score="raise"
    )

grid_result = grid.fit(X_train_padded, y_train, validation_split=0.2, callbacks=[early_stopping])
#grid_result = grid.fit(X_train_padded, y_train, validation_split=0.2)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
best_model = grid_result.best_estimator_

import joblib
joblib.dump(model, "/home/tober/devel/lewagon/project/cyb-news/models/deep_model.pkl")