In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [6]:
# Load the news dataset; malformed CSV rows are skipped instead of aborting the read.
df = pd.read_csv(filepath_or_buffer="news.csv", on_bad_lines="skip")
df.head(5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


### Preprocessing Dataset

In [7]:
# Drop the leftover index column written by a previous `to_csv`.
# `errors="ignore"` keeps this cell idempotent: re-running it after the column
# has already been removed no longer raises a KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")
df.head(5)

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


### Data Encoding
Label encoding converts the categorical column (`label` in our case) into numerical values.  
`LabelEncoder` assigns codes in sorted (alphabetical) class order, so: 0 for FAKE, 1 for REAL

In [9]:
# Replace the string labels with integer codes.
# LabelEncoder numbers classes in sorted order, so with the FAKE/REAL labels
# shown above FAKE -> 0 and REAL -> 1.
le = preprocessing.LabelEncoder()
df['label'] = le.fit_transform(df['label'])

### Variable Setup

In [10]:
embedding_dim = 50       # width of each word vector; matches the glove.6B.50d file loaded below
max_length = 54          # sequence length the model is built for and the prediction input is padded to
padding_type = 'post'    # pad_sequences: append zeros after the tokens
trunc_type = 'post'      # pad_sequences: cut overly long sequences at the end
oov_tok = "<OOV>"        # placeholder token intended for out-of-vocabulary words
training_size = 3000     # number of rows taken from the start of the dataset
test_portion = 0.1       # fraction of those rows held out for validation

### Tokenization

In [11]:
# Take the first `training_size` rows positionally instead of appending
# row by row in a Python loop.
subset = df.iloc[:training_size]
title = subset['title'].tolist()
text = subset['text'].tolist()
labels = subset['label'].tolist()

# Pass the configured OOV token so that words unseen during fitting map to a
# dedicated index at prediction time instead of being silently dropped.
tokenizer1 = Tokenizer(oov_token=oov_tok)
tokenizer1.fit_on_texts(title)
word_index1 = tokenizer1.word_index
vocab_size1 = len(word_index1)
sequences1 = tokenizer1.texts_to_sequences(title)

# Pad/truncate to exactly `max_length`. Without `maxlen` the width would be the
# longest title in the data, which only matches the model's fixed input length
# (and the prediction cell's padding) by coincidence.
padded1 = pad_sequences(
    sequences1,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

### Splitting Data for Training and Testing


In [12]:
# The first `test_portion` of the selected rows is held out for validation;
# everything after it (up to `training_size`) is used for training.
split = int(test_portion * training_size)
test_sequences1, training_sequences1 = padded1[:split], padded1[split:training_size]
test_labels, training_labels = labels[:split], labels[split:training_size]

### Reshaping Data for LSTM
We use an LSTM (Long Short-Term Memory) model for prediction, so the padded sequences are converted to NumPy arrays before being fed to the network.

In [13]:
# Materialise both splits as standalone NumPy arrays before handing them to Keras.
training_sequences1, test_sequences1 = (
    np.array(part) for part in (training_sequences1, test_sequences1)
)

### Generating Word Embedding
Using GloVe to generate embeddings

In [14]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2025-03-27 06:18:42--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-03-27 06:18:43--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-03-27 06:18:43--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [15]:
# Parse the GloVe text file: each line is a word followed by its vector components.
embedding_index = {}
with open('glove.6B.50d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

# Row 0 is reserved for padding; Tokenizer indices run from 1 to vocab_size1
# inclusive, hence vocab_size1 + 1 rows.
embedding_matrix = np.zeros((vocab_size1 + 1, embedding_dim))

# Every index in word_index1 is a valid row, so no bound check is needed.
# (The previous `i < vocab_size1` guard skipped the word with the highest
# index, leaving its row all zeros even when GloVe had a vector for it.)
for word, i in word_index1.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        # Words missing from GloVe keep their all-zero row.
        embedding_matrix[i] = embedding_vector


### Model Architecture
We use the Keras Embedding layer to map the original input data (word indices) into a set of real-valued dimensions.

Embedding: The embedding layer uses pre-trained GloVe embeddings.  
Conv1D: A 1D convolutional layer to detect patterns in the text.  
LSTM(64): An LSTM layer to capture long-term dependencies in the data.

In [19]:
# Frozen GloVe embeddings -> dropout -> Conv1D + max-pooling -> LSTM -> sigmoid.
# `tf` comes from the imports cell at the top of the notebook.
model = tf.keras.Sequential([
    # An explicit Input replaces the deprecated `input_length` argument and
    # builds the model immediately, so no separate `model.build(...)` is needed.
    tf.keras.Input(shape=(max_length,)),
    tf.keras.layers.Embedding(
        input_dim=vocab_size1 + 1,
        output_dim=embedding_dim,
        # Documented way to load pretrained vectors (replaces legacy `weights=[...]`).
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False,  # keep the pretrained GloVe vectors fixed
    ),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv1D(64, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=4),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(1, activation='sigmoid')  # single probability output
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()


### Training the model

In [20]:
# Train for 50 epochs; the held-out split is evaluated after every epoch and
# one summary line per epoch is printed (verbose=2).
history = model.fit(
    x=training_sequences1,
    y=np.array(training_labels),
    validation_data=(test_sequences1, np.array(test_labels)),
    epochs=50,
    verbose=2,
)


Epoch 1/50
85/85 - 6s - 76ms/step - accuracy: 0.6037 - loss: 0.6488 - val_accuracy: 0.6900 - val_loss: 0.5669
Epoch 2/50
85/85 - 1s - 14ms/step - accuracy: 0.7159 - loss: 0.5621 - val_accuracy: 0.6867 - val_loss: 0.5629
Epoch 3/50
85/85 - 1s - 16ms/step - accuracy: 0.7389 - loss: 0.5087 - val_accuracy: 0.7033 - val_loss: 0.5284
Epoch 4/50
85/85 - 1s - 14ms/step - accuracy: 0.7663 - loss: 0.4890 - val_accuracy: 0.7200 - val_loss: 0.5333
Epoch 5/50
85/85 - 1s - 15ms/step - accuracy: 0.7900 - loss: 0.4491 - val_accuracy: 0.7267 - val_loss: 0.5021
Epoch 6/50
85/85 - 3s - 32ms/step - accuracy: 0.8178 - loss: 0.3987 - val_accuracy: 0.7200 - val_loss: 0.5027
Epoch 7/50
85/85 - 3s - 38ms/step - accuracy: 0.8470 - loss: 0.3536 - val_accuracy: 0.7500 - val_loss: 0.4891
Epoch 8/50
85/85 - 2s - 19ms/step - accuracy: 0.8722 - loss: 0.3081 - val_accuracy: 0.7467 - val_loss: 0.5328
Epoch 9/50
85/85 - 1s - 14ms/step - accuracy: 0.8681 - loss: 0.2953 - val_accuracy: 0.7533 - val_loss: 0.5226
Epoch 10/5

### Sample Prediction

In [21]:
X = "Karry to go to France in gesture of sympathy"

# Encode the headline with the fitted tokenizer and pad it to the model's input length.
sequences = pad_sequences(
    tokenizer1.texts_to_sequences([X]),
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# The sigmoid output of the single sample is thresholded at 0.5.
score = model.predict(sequences, verbose=0)[0][0]
print("This news is True" if score >= 0.5 else "This news is False")


This news is False


Improvement: This model can be further improved by fine-tuning the hyperparameters, trying different architectures, or using more advanced techniques such as attention mechanisms.