In [19]:
import os
import string

import pandas as pd
import numpy as np
from matplotlib.pylab import plt

from sklearn.model_selection import train_test_split

from tensorflow.keras import models, layers
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from cybnews.data import get_data, welf_join_text

In [20]:
# WELF
#
# Label encoding:
#   0 = real
#   1 = fake
#
# The dataset directory can be overridden through the CYBNEWS_DATA_PATH
# environment variable so the notebook is not tied to one machine; when the
# variable is unset the original local path is used.
DATA_PATH = os.environ.get(
    'CYBNEWS_DATA_PATH',
    '/home/tober/devel/lewagon/project/cyb-news/data',
)

# Keep only the joined text column and the label column.
data = welf_join_text(get_data(f'{DATA_PATH}/WELFake_Dataset.csv'))[['all_text', 'label']]

In [21]:
def clean(sentence):
    """Normalize a raw text string before tokenization.

    Steps, in order: strip surrounding whitespace, lowercase, then drop every
    digit character (as reported by ``str.isdigit``), every character in
    ``string.punctuation`` and the typographic quotes ‘ ’ “ ”.

    Whitespace is stripped *before* characters are removed, so whitespace that
    sat next to a removed leading or trailing character is kept.

    Args:
        sentence: Text to clean (a ``str``).

    Returns:
        The cleaned string.
    """
    # '-' and '"' are already part of string.punctuation; only the curly
    # quotes need to be added. '‘' is included so that both single curly
    # quotes are removed, not just the closing one.
    unwanted = set(string.punctuation) | {'‘', '’', '“', '”'}

    lowered = sentence.strip().lower()

    # A single filtering pass instead of one full-string replace per character.
    return ''.join(
        char for char in lowered
        if not (char.isdigit() or char in unwanted)
    )

# Store the normalized text next to the raw text, one cleaned string per row.
data["all_text_cleaned"] = data.all_text.apply(clean)

In [22]:
# Length (in tokens) that every tokenized text is padded or truncated to by
# pad_sequences; the same value is passed to the Embedding layer as input_length.
max_len = 3000  # Maximum sequence length

In [23]:
# Model inputs are the cleaned texts, targets are the label column.
X = data.all_text_cleaned
y = data.label

# Hold out half of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42
)

# The vocabulary is fitted on the training texts only, so the test split does
# not influence the word index.
tk = Tokenizer()
tk.fit_on_texts(X_train)

X_train_token = tk.texts_to_sequences(X_train)
X_test_token = tk.texts_to_sequences(X_test)

# Token ids are integer indices into the embedding table, so they are stored as
# int32 (the pad_sequences default) rather than float32. Sequences are
# left-padded with 0, the value the Embedding layer masks via mask_zero=True.
X_train_padded = pad_sequences(X_train_token, maxlen=max_len, dtype='int32', padding='pre', value=0)
X_test_padded = pad_sequences(X_test_token, maxlen=max_len, dtype='int32', padding='pre', value=0)

In [24]:
# Used as the Embedding layer's input_dim: one entry per word known to the
# tokenizer, plus one extra slot because index 0 is the padding value.
num_words = 1 + len(tk.index_word)

In [25]:
# Size of each word vector produced by the Embedding layer (its output_dim);
# simple_model also uses this value to size the LSTM layer.
embedding_dim = 16

def simple_model(X, y, epochs=1, batch_size=None, lstm_units=None):
    """Build, compile and fit an Embedding -> LSTM -> Dense binary classifier.

    The embedding layer is sized from the notebook-level variables
    ``num_words`` (input_dim), ``embedding_dim`` (output_dim) and ``max_len``
    (input_length), which must be defined before this function is called.
    Index 0 is treated as padding (``mask_zero=True``).

    Args:
        X: Training inputs handed to ``model.fit``.
        y: Training targets handed to ``model.fit``.
        epochs: Number of training epochs. Defaults to 1, which is also the
            Keras default.
        batch_size: Batch size forwarded to ``model.fit``; ``None`` lets Keras
            use its own default.
        lstm_units: Number of units in the LSTM layer; ``None`` means "same as
            ``embedding_dim``".

    Returns:
        The fitted ``Sequential`` model, compiled with binary cross-entropy
        loss, the Adam optimizer and the accuracy metric.
    """
    if lstm_units is None:
        lstm_units = embedding_dim

    model = models.Sequential()
    model.add(
        layers.Embedding(
            input_dim=num_words,
            output_dim=embedding_dim,
            input_length=max_len,
            mask_zero=True
        )
    )

    model.add(layers.LSTM(lstm_units))
    # A single sigmoid unit yields one value in [0, 1] per input sequence.
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X, y, epochs=epochs, batch_size=batch_size, verbose=1)

    return model

# Train the classifier on the padded training sequences.
m = simple_model(X=X_train_padded, y=y_train)



In [35]:
def evaluate_model(model, X, y, batch_size=64):
    """Evaluate ``model`` on ``X``/``y`` and return its metrics.

    Args:
        model: Object exposing an ``evaluate`` method that accepts the keyword
            arguments used below.
        X: Inputs to evaluate on.
        y: Targets matching ``X``.
        batch_size: Batch size forwarded to ``model.evaluate`` (default 64).

    Returns:
        Whatever ``model.evaluate`` returns when called with
        ``return_dict=True``.
    """
    eval_kwargs = dict(
        x=X,
        y=y,
        batch_size=batch_size,
        verbose=1,
        callbacks=None,
        return_dict=True,
    )
    return model.evaluate(**eval_kwargs)

# Score the trained model on the held-out padded test sequences.
metrics = evaluate_model(model=m, X=X_test_padded, y=y_test)

