In [47]:
import ast
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

In [3]:
train_data = pd.read_csv("./test_data.csv")
test_data = pd.read_csv("./test_data.csv")

# convert `text` column back to list
train_data["text"] = train_data["text"].apply(ast.literal_eval)
test_data["text"] = test_data["text"].apply(ast.literal_eval)

### 1. Naive Bayes - based only on the content of the articles

In [12]:
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(train_data["text"].apply(lambda words: " ".join(words)))
y = train_data["label"]

In [18]:
NB_classifier = MultinomialNB()
NB_classifier.fit(X, y)

pass

In [25]:
X_test = vectorizer.transform(test_data["text"].apply(lambda words: " ".join(words)))
y_pred = NB_classifier.predict(X_test)

accuracy = accuracy_score(test_data["label"], y_pred)

accuracy

0.9035385361124576

In [29]:
print(classification_report(test_data["label"], y_pred))

              precision    recall  f1-score   support

           0       0.86      0.96      0.91      2037
           1       0.96      0.85      0.90      2089

    accuracy                           0.90      4126
   macro avg       0.91      0.90      0.90      4126
weighted avg       0.91      0.90      0.90      4126



### 2. Logistic Regression - based on author

In [42]:
onehot = OneHotEncoder()

X = onehot.fit_transform(np.array(train_data["author"]).reshape(-1, 1))
y = test_data["label"]

LR = LogisticRegression()

LR.fit(X, y)

pass

In [43]:
X_test = onehot.transform(np.array(test_data["author"]).reshape(-1, 1))
y_pred = LR.predict(X_test)

accuracy = accuracy_score(test_data["label"], y_pred)

accuracy

0.8691226369365003

In [44]:
print(classification_report(test_data["label"], y_pred))

              precision    recall  f1-score   support

           0       0.88      0.85      0.87      2037
           1       0.86      0.89      0.87      2089

    accuracy                           0.87      4126
   macro avg       0.87      0.87      0.87      4126
weighted avg       0.87      0.87      0.87      4126



### 3. Custom Neural Network - based on both conent and author

In [50]:
vocab_file = open("./data/text_vocab.json", mode="r")
text_vocab = json.load(vocab_file)


vocab_size

140914

In [87]:
def padding_post(sequence, length):
    return sequence + ['<pad>'] * (length - len(sequence))

In [88]:
def encode_text(sequence):
    return [text_vocab[word] for word in sequence]

In [118]:
vocab_size = len(text_vocab)
embedding_dim = 2
max_len = max(train_data['text'].str.len())
drop_value = 0.2
batch_size = 32
epoch=10

In [101]:
def get_data(df, batch_size, max_len):
    for i in range(0, len(df["text"]), batch_size):
        x = [encode_text(padding_post(sequence, max_len)) for sequence in df["text"].iloc[i:i+batch_size]]
        y = [label for label in df["label"].iloc[i:i+batch_size]]

        yield x, y

In [120]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

token = Tokenizer(num_words=vocab_size, oov_token='<pad>')
token.fit_on_texts(train_data["text"])

Trainning_seq = token.texts_to_sequences(train_data["text"])
Trainning_pad = pad_sequences(Trainning_seq, maxlen=max_len, padding='post')

model=tf.keras.models.Sequential([tf.keras.layers.Embedding(vocab_size, 16, input_length=max_len),
                                  tf.keras.layers.GlobalAveragePooling1D(),
                                  tf.keras.layers.Dropout(0.3),
                                  tf.keras.layers.Dense(32,activation='relu'),
                                  tf.keras.layers.Dropout(0.3),
                                  tf.keras.layers.Dense(1,activation='sigmoid')])


model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),metrics=['accuracy'],optimizer='adam')

early_stop = tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=3)

history = model.fit(Trainning_pad, train_data["label"], epochs=epoch, callbacks=[early_stop], verbose=2)


Epoch 1/10
129/129 - 5s - loss: 0.6142 - accuracy: 0.6842 - 5s/epoch - 38ms/step
Epoch 2/10
129/129 - 4s - loss: 0.3293 - accuracy: 0.9072 - 4s/epoch - 31ms/step
Epoch 3/10
129/129 - 4s - loss: 0.1602 - accuracy: 0.9569 - 4s/epoch - 31ms/step
Epoch 4/10
129/129 - 4s - loss: 0.1008 - accuracy: 0.9716 - 4s/epoch - 31ms/step
Epoch 5/10
129/129 - 4s - loss: 0.0707 - accuracy: 0.9840 - 4s/epoch - 30ms/step
Epoch 6/10
129/129 - 4s - loss: 0.0559 - accuracy: 0.9859 - 4s/epoch - 30ms/step
Epoch 7/10
129/129 - 4s - loss: 0.0419 - accuracy: 0.9908 - 4s/epoch - 30ms/step
Epoch 8/10
129/129 - 4s - loss: 0.0332 - accuracy: 0.9932 - 4s/epoch - 31ms/step
Epoch 9/10
129/129 - 4s - loss: 0.0242 - accuracy: 0.9971 - 4s/epoch - 32ms/step
Epoch 10/10
129/129 - 4s - loss: 0.0189 - accuracy: 0.9981 - 4s/epoch - 32ms/step


In [83]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping



# Define the model architecture
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(LSTM(128))
model.add(Dense(64, activation='relu'))
model.add(Dropout(drop_value))
model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print("model compiled")

# Set up early stopping
early_stop = EarlyStopping(patience=3, restore_best_weights=True)

print("start training")
# Train the model
model.fit(
    tf.convert_to_tensor(test_data["text"].apply(lambda words: " ".join(words)).tolist()),
    tf.convert_to_tensor(train_data['label']),
    epochs=10, batch_size=32, callbacks=[early_stop]
)

print("model trained")

# Evaluate the model
loss, accuracy = model.evaluate(test_data['text'], test_data['label'])
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')

model compiled
start training
Epoch 1/10


ValueError: in user code:

    File "c:\Users\hurub\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\hurub\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1268, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\hurub\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1249, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\hurub\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1050, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\hurub\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\hurub\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\input_spec.py", line 235, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer 'sequential_14' (type Sequential).
    
    Input 0 of layer "lstm_13" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 2)
    
    Call arguments received by layer 'sequential_14' (type Sequential):
      • inputs=tf.Tensor(shape=(None,), dtype=string)
      • training=True
      • mask=None


Here's a step-by-step explanation of the model architecture:

    tf.keras.layers.Embedding: This layer represents word embeddings. It takes an integer-encoded vocabulary and converts it into dense vector representations. It has vocab_size as the input dimension, 16 as the embedding dimension, and input_length=50 indicates that each input sequence has a length of 50.

    tf.keras.layers.GlobalAveragePooling1D: This layer performs average pooling across the sequence dimension. It takes the embedded sequences and calculates the average value for each feature dimension. This reduces the dimensionality of the data.

    tf.keras.layers.Dense: This layer is a fully connected (dense) layer with 32 units/neurons. It applies the ReLU activation function, which introduces non-linearity to the model.

    tf.keras.layers.Dropout: This layer applies dropout regularization to prevent overfitting. It randomly sets a fraction (0.3) of input units to 0 at each update during training time.

    tf.keras.layers.Dense: This is the final output layer with a single neuron. It uses the sigmoid activation function to produce a binary classification output (0 or 1), indicating the likelihood of a sample belonging to a specific class (e.g., fake news detection).

Overall, this model architecture consists of an embedding layer, a pooling layer, two dense layers with activation functions, and a dropout layer. It is suitable for tasks like sentiment analysis or binary classification, where the input consists of text sequences of fixed length.