In [1]:
# Install required packages
!pip install -q scikit-learn

# Import libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm

In [2]:
# Mount Google Drive so files under /content/drive are readable from this runtime
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [3]:
# Read both source CSVs from the mounted Drive in one pass:
# `true` comes from True.csv, `fake` from Fake.csv.
true, fake = map(
    pd.read_csv,
    ('/content/drive/MyDrive/True.csv', '/content/drive/MyDrive/Fake.csv'),
)


In [4]:
# Heading followed by the first rows of the true-news frame, one per line
print("True News Sample:", true.head(), sep="\n")

True News Sample:
                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                                text       subject  \
0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1  WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2  WASHINGTON (Reuters) - The special counsel inv...  politicsNews   
3  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews   
4  SEATTLE/WASHINGTON (Reuters) - President Donal...  politicsNews   

                 date  
0  December 31, 2017   
1  December 29, 2017   
2  December 31, 2017   
3  December 30, 2017   
4  December 29, 2017   


In [5]:
# Heading (preceded by a blank line) followed by the first rows of the fake-news frame
print("\nFake News Sample:", fake.head(), sep="\n")


Fake News Sample:
                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  
0  December 31, 2017  
1  December 31, 2017  
2  December 30, 2017  
3  December 29, 2017  
4  December 25, 2017  


In [6]:
# Attach the target column to each frame: 1 = Genuine, 0 = Fake
for frame, class_id in ((true, 1), (fake, 0)):
    frame['label'] = class_id

In [7]:
# Stack the two labelled frames row-wise and keep only the columns that are
# not dropped here (title, subject and date are discarded).
news = (
    pd.concat([true, fake], axis=0)
    .drop(columns=['title', 'subject', 'date'])
)


In [8]:
# Shuffle the dataset.
# A fixed random_state makes the row order (and therefore everything derived
# from it) identical on every Restart & Run All; 42 matches the seed used for
# the train/test split.
news = news.sample(frac=1, random_state=42).reset_index(drop=True)


In [9]:
# Preprocessing function
def wordopt(text):
    """Normalize raw article text before it is vectorized or tokenized.

    The cleaning steps run in this order: lowercase, remove URLs, remove
    HTML-like tags, remove punctuation, remove digits, and replace newlines
    with spaces.

    Args:
        text: Raw text. ``None`` and float NaN (what pandas produces for an
            empty CSV cell) are treated as empty text; any other non-string
            value is converted with ``str()`` before cleaning.

    Returns:
        The cleaned, lowercased string (possibly empty).
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = text.lower()
    # Order matters: URLs and tags must be stripped before punctuation is,
    # otherwise their delimiters would already be gone.
    for pattern, replacement in (
        (r'https?://\S+|www\.\S+', ''),  # URLs
        (r'<.*?>', ''),                  # HTML-like tags
        (r'[^\w\s]', ''),                # punctuation
        (r'\d', ''),                     # digits
        (r'\n', ' '),                    # newlines become spaces
    ):
        text = re.sub(pattern, replacement, text)
    return text

In [10]:
# Clean every article body with wordopt, showing a progress bar.
# tqdm.pandas() is what makes `.progress_apply` available on pandas objects.
tqdm.pandas()
cleaned_column = news['text'].progress_apply(wordopt)
news['text'] = cleaned_column


100%|██████████| 44898/44898 [00:05<00:00, 7872.35it/s]


In [11]:
# Model inputs (cleaned text) and targets (0/1 label)
x, y = news['text'], news['label']

In [12]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)


In [13]:
# TF-IDF features capped at 5000 terms.
# The vocabulary is learned from the training texts only; the test texts are
# merely transformed with that fitted vocabulary.
vectorization = TfidfVectorizer(max_features=5000)
xv_train, xv_test = (
    vectorization.fit_transform(x_train),
    vectorization.transform(x_test),
)

In [14]:
# Train and Evaluate Models
def train_model(model, name):
    """Fit a classifier on the TF-IDF training data and print its test metrics.

    Reads the notebook-level ``xv_train``/``y_train`` for fitting and
    ``xv_test``/``y_test`` for scoring.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        name: Display name used in the printed header.

    Returns:
        The same ``model`` object, now fitted.
    """
    model.fit(xv_train, y_train)
    test_pred = model.predict(xv_test)
    test_accuracy = accuracy_score(y_test, test_pred)

    print(f"\nModel: {name}")
    print("Accuracy:", test_accuracy)
    print("Classification Report:", classification_report(y_test, test_pred), sep="\n")
    return model

In [15]:
# Candidate classifiers keyed by the display name used in reports.
# Every estimator with a stochastic component gets a fixed random_state so the
# printed metrics are identical across runs (42 matches the train/test split).
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Passive Aggressive Classifier": PassiveAggressiveClassifier(max_iter=1000, random_state=42),
}


In [16]:
# Fit and evaluate every candidate in insertion order, keeping each fitted
# estimator under its display name.
trained_models = {
    label: train_model(estimator, label)
    for label, estimator in models.items()
}



Model: Logistic Regression
Accuracy: 0.9883444691907943
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      7047
           1       0.99      0.99      0.99      6423

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470


Model: Decision Tree
Accuracy: 0.9949517446176689
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      7047
           1       1.00      0.99      0.99      6423

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470


Model: Random Forest
Accuracy: 0.9977728285077951
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7047
         

In [17]:
# Label Output Function
def output_label(n):
    """Return the display message for a class label.

    Args:
        n: Predicted label; a value equal to 1 means genuine.

    Returns:
        "It is a Genuine News" when ``n == 1``, otherwise "It is a Fake News".
    """
    if n == 1:
        return "It is a Genuine News"
    return "It is a Fake News"

In [18]:
# Manual Testing Function
def manual_testing(news_text):
    """Classify one piece of text with every fitted classical model.

    The text is cleaned with ``wordopt``, converted with the already-fitted
    ``vectorization``, and passed to each estimator in ``trained_models``.

    Args:
        news_text: Raw article text to classify.

    Returns:
        One "<model name> Prediction: <message>" line per model, joined by
        newlines.
    """
    cleaned = pd.Series([news_text], name='text').apply(wordopt)
    tfidf_row = vectorization.transform(cleaned)
    return "\n".join(
        f"{name} Prediction: {output_label(clf.predict(tfidf_row)[0])}"
        for name, clf in trained_models.items()
    )

In [19]:


# Show the third row of the true-news frame under a heading
print("\nThird Test Case (True News):", true.iloc[2], sep="\n")



Third Test Case (True News):
title      Senior U.S. Republican senator: 'Let Mr. Muell...
text       WASHINGTON (Reuters) - The special counsel inv...
subject                                         politicsNews
date                                      December 31, 2017 
label                                                      1
Name: 2, dtype: object


In [20]:


# Print the third row of the fake-news frame.
# The heading names the frame that is actually printed (it previously said
# "True News" while showing a row from `fake`).
print("\nThird Test Case (Fake News):")
print(fake.iloc[2])



Third Test Case (True News):
title       Sheriff David Clarke Becomes An Internet Joke...
text       On Friday, it was revealed that former Milwauk...
subject                                                 News
date                                       December 30, 2017
label                                                      0
Name: 2, dtype: object


In [21]:
# Take the body text of the third true-news row ...
user_news = true['text'].iloc[2]

# ... and print what manual_testing reports for it
print(manual_testing(user_news))



Logistic Regression Prediction: It is a Genuine News
Decision Tree Prediction: It is a Genuine News
Random Forest Prediction: It is a Genuine News
Gradient Boosting Prediction: It is a Genuine News
Passive Aggressive Classifier Prediction: It is a Genuine News


In [22]:
# Same check for the body text of the third fake-news row
user_news = fake['text'].iloc[2]
print(manual_testing(user_news))

Logistic Regression Prediction: It is a Fake News
Decision Tree Prediction: It is a Fake News
Random Forest Prediction: It is a Fake News
Gradient Boosting Prediction: It is a Fake News
Passive Aggressive Classifier Prediction: It is a Fake News


In [23]:
# Install TensorFlow if not available
!pip install -q tensorflow

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [24]:
# Parameters for the neural model:
#   vocab_size    - word cap given to the Tokenizer and size of the Embedding input
#   max_len       - length every sequence is padded/truncated to
#   embedding_dim - width of the Embedding output
vocab_size, max_len, embedding_dim = 10000, 300, 128

In [25]:
# Tokenization: build the word index from the training texts only.
# Words outside the index are mapped to the "<OOV>" token.
tokenizer = Tokenizer(
    oov_token="<OOV>",
    num_words=vocab_size,
)
tokenizer.fit_on_texts(x_train)

In [26]:
# Convert both splits to lists of integer token ids with the fitted tokenizer
x_train_seq, x_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (x_train, x_test)
)


In [27]:
# Bring every sequence to exactly max_len ids: short ones are padded at the
# end, long ones are cut at the end.
x_train_pad, x_test_pad = (
    pad_sequences(seqs, maxlen=max_len, padding='post', truncating='post')
    for seqs in (x_train_seq, x_test_seq)
)

In [28]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout

# Define the model without `input_length`, adding one layer at a time:
# embedding -> bidirectional LSTM (final state only) -> dropout -> dense ReLU
# -> dropout -> single sigmoid unit for the binary label.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(Bidirectional(LSTM(64, return_sequences=False)))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))


In [29]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is tracked per epoch
model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


In [30]:
# Train the model.
# NOTE(review): the held-out test split doubles as validation data here, so
# the validation metrics are not independent of the final test score.
history = model.fit(
    x_train_pad,
    y_train,
    validation_data=(x_test_pad, y_test),
    epochs=20,
    batch_size=64,
)


Epoch 1/20
[1m492/492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 36ms/step - accuracy: 0.9457 - loss: 0.1594 - val_accuracy: 0.9989 - val_loss: 0.0060
Epoch 2/20
[1m492/492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 30ms/step - accuracy: 0.9995 - loss: 0.0036 - val_accuracy: 0.9990 - val_loss: 0.0085
Epoch 3/20
[1m492/492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 29ms/step - accuracy: 0.9996 - loss: 0.0020 - val_accuracy: 0.9989 - val_loss: 0.0076
Epoch 4/20
[1m492/492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 32ms/step - accuracy: 0.9996 - loss: 0.0018 - val_accuracy: 0.9990 - val_loss: 0.0080
Epoch 5/20
[1m492/492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 31ms/step - accuracy: 0.9999 - loss: 8.1404e-04 - val_accuracy: 0.9991 - val_loss: 0.0093
Epoch 6/20
[1m492/492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 30ms/step - accuracy: 1.0000 - loss: 2.3460e-04 - val_accuracy: 0.9990 - val_loss: 0.0091
Epoch 7/

In [31]:
# Evaluate on the test split; with one compiled metric the result unpacks
# into (loss, accuracy).
loss, accuracy = model.evaluate(x=x_test_pad, y=y_test)
print("\nBiLSTM Accuracy:", accuracy)

[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.9990 - loss: 0.0121

BiLSTM Accuracy: 0.9988864064216614


In [39]:
def manual_testing(news_text):
    """Classify one piece of text with every classical model and the BiLSTM.

    The text is cleaned once with ``wordopt``. The cleaned text is then
    (a) TF-IDF transformed for each estimator in ``trained_models`` and
    (b) tokenized and padded to ``max_len`` for the Keras ``model``.

    Args:
        news_text: Raw article text to classify.

    Returns:
        One "<model name> Prediction: <message>" line per classical model,
        followed by a "BiLSTM Prediction: <message>" line, joined by newlines.
    """
    cleaned = pd.Series([news_text], name='text').apply(wordopt)

    # Classical models share a single TF-IDF row
    tfidf_row = vectorization.transform(cleaned)
    lines = [
        f"{name} Prediction: {output_label(clf.predict(tfidf_row)[0])}"
        for name, clf in trained_models.items()
    ]

    # BiLSTM: same tokenization and padding settings as used for training
    token_ids = tokenizer.texts_to_sequences([cleaned.iloc[0]])
    padded = pad_sequences(token_ids, maxlen=max_len, padding='post', truncating='post')
    prob_genuine = model.predict(padded, verbose=0)[0][0]
    # Sigmoid output at or above 0.5 counts as label 1
    lines.append(f"BiLSTM Prediction: {output_label(int(prob_genuine >= 0.5))}")

    return "\n".join(lines)


In [40]:
# Re-run the check on the third true-news article with the current manual_testing
print(manual_testing(true.iloc[2]['text']))


Logistic Regression Prediction: It is a Genuine News
Decision Tree Prediction: It is a Genuine News
Random Forest Prediction: It is a Genuine News
Gradient Boosting Prediction: It is a Genuine News
Passive Aggressive Classifier Prediction: It is a Genuine News
BiLSTM Prediction: It is a Genuine News


In [45]:
# Interactive check: read text from the user and print every model's verdict.
# This cell blocks until input is supplied.
user_input = input("Enter news text for testing:\n")
verdicts = manual_testing(user_input)
print(verdicts)

Enter news text for testing:
Sheriff David Clarke Becomes An Internet Joke
Logistic Regression Prediction: It is a Fake News
Decision Tree Prediction: It is a Fake News
Random Forest Prediction: It is a Fake News
Gradient Boosting Prediction: It is a Fake News
Passive Aggressive Classifier Prediction: It is a Fake News
BiLSTM Prediction: It is a Fake News
