In [19]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
!pip install tensorflow keras nltk



In [6]:
import os
import numpy as np
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout


In [7]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("clmentbisaillon/fake-and-real-news-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/fake-and-real-news-dataset


In [8]:
# Load datasets
fake_df = pd.read_csv(os.path.join(path, "Fake.csv"))
real_df = pd.read_csv(os.path.join(path, "True.csv"))

fake_df["label"] = 0
real_df["label"] = 1

In [9]:
df = pd.concat([fake_df, real_df]).sample(frac=1, random_state=42).reset_index(drop=True)
df

Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",0
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",1
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",1
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",0
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",1
...,...,...,...,...,...
44893,UNREAL! CBS’S TED KOPPEL Tells Sean Hannity He...,,politics,"Mar 27, 2017",0
44894,PM May seeks to ease Japan's Brexit fears duri...,LONDON/TOKYO (Reuters) - British Prime Ministe...,worldnews,"August 29, 2017",1
44895,Merkel: Difficult German coalition talks can r...,BERLIN (Reuters) - Chancellor Angela Merkel sa...,worldnews,"November 16, 2017",1
44896,Trump Stole An Idea From North Korean Propaga...,Jesus f*cking Christ our President* is a moron...,News,"July 14, 2017",0


In [10]:
# Combine title and text
df["content"] = df["title"] + " " + df["text"]
df

Unnamed: 0,title,text,subject,date,label,content
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",0,Ben Stein Calls Out 9th Circuit Court: Committ...
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",1,Trump drops Steve Bannon from National Securit...
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",1,Puerto Rico expects U.S. to lift Jones Act shi...
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",0,OOPS: Trump Just Accidentally Confirmed He Le...
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",1,Donald Trump heads for Scotland to reopen a go...
...,...,...,...,...,...,...
44893,UNREAL! CBS’S TED KOPPEL Tells Sean Hannity He...,,politics,"Mar 27, 2017",0,UNREAL! CBS’S TED KOPPEL Tells Sean Hannity He...
44894,PM May seeks to ease Japan's Brexit fears duri...,LONDON/TOKYO (Reuters) - British Prime Ministe...,worldnews,"August 29, 2017",1,PM May seeks to ease Japan's Brexit fears duri...
44895,Merkel: Difficult German coalition talks can r...,BERLIN (Reuters) - Chancellor Angela Merkel sa...,worldnews,"November 16, 2017",1,Merkel: Difficult German coalition talks can r...
44896,Trump Stole An Idea From North Korean Propaga...,Jesus f*cking Christ our President* is a moron...,News,"July 14, 2017",0,Trump Stole An Idea From North Korean Propaga...


In [11]:

# Preprocessing
nltk.download('punkt')
X = df["content"].astype(str).values
y = df["label"].values


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [14]:
df["content"].astype(str).values

array(['Ben Stein Calls Out 9th Circuit Court: Committed a ‘Coup d’état’ Against the Constitution 21st Century Wire says Ben Stein, reputable professor from, Pepperdine University (also of some Hollywood fame appearing in TV shows and films such as Ferris Bueller s Day Off) made some provocative statements on Judge Jeanine Pirro s show recently. While discussing the halt that was imposed on President Trump s Executive Order on travel. Stein referred to the judgement by the 9th Circuit Court in Washington state as a  Coup d tat against the executive branch and against the constitution.  Stein went on to call the Judges in Seattle  political puppets  and the judiciary  political pawns. Watch the interview below for the complete statements and note the stark contrast to the rhetoric of the leftist media and pundits who neglect to note that no court has ever blocked any Presidential orders in immigration in the past or discuss the legal efficacy of the halt or the actual text of the Execut

In [16]:
df["label"].values

array([0, 1, 1, ..., 1, 0, 0])

In [17]:
# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=50000, oov_token="<OOV>")
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
padded_sequences = pad_sequences(sequences, maxlen=300, padding='post', truncating='post')


In [18]:
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

In [22]:
vocab_size

147710

**GloVe** (Global Vectors for Word Representation) is a **pretrained word embedding** technique developed by researchers at Stanford. It’s used to convert words into meaningful numerical vectors so that machine learning models can understand and process text data.

---

### 🔍 What Exactly Is GloVe?
- GloVe creates a **vector space** where **words with similar meaning are close together**.
- Unlike one-hot encoding, where each word is just a unique index, GloVe captures **semantic relationships** between words.
- Example:
  ```
  vector("king") - vector("man") + vector("woman") ≈ vector("queen")
  ```

---

### ⚙️ How Does GloVe Work?
- It uses **word co-occurrence statistics** from a large corpus.
- Instead of just learning based on neighboring words (like Word2Vec), GloVe builds a **global co-occurrence matrix**.
- It learns embeddings such that the **dot product of two word vectors equals the log of their co-occurrence probability**.

---

### 🧠 Why Use GloVe in NLP Projects?
- **Pretrained** on massive datasets like Wikipedia + Gigaword.
- Captures **both syntactic and semantic meaning**.
- Helps deep learning models **train faster** and generalize better on small datasets.
- Works well with models like **LSTM, BiLSTM, GRU, and CNNs for text**.

---

### 📦 Popular GloVe Versions
| File | Dimension | Size | Corpus |
|------|-----------|------|--------|
| `glove.6B.50d.txt` | 50 | ~822 MB | Wikipedia 2014 + Gigaword 5 |
| `glove.6B.100d.txt` | 100 | ~1.3 GB | Same |
| `glove.6B.300d.txt` | 300 | ~3.6 GB | Same |

---

### 📌 Where to Download?
You can download GloVe from:  
➡️ https://nlp.stanford.edu/projects/glove/

---

In [25]:
# Load GloVe embeddings
embedding_index = {}
with open("/content/drive/Othercomputers/Vignesh MacBook Air/Fake_News_Detection/glove/glove.6B.100d.txt", encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coeffs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coeffs

In [26]:
# Create embedding matrix
embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    if word in embedding_index:
        embedding_matrix[i] = embedding_index[word]

In [28]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, y, test_size=0.2, random_state=42)

# Build LSTM model
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False))
model.add(Bidirectional(LSTM(64, return_sequences=False)))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [29]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [30]:
# Train the model
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=128)


Epoch 1/5
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 227ms/step - accuracy: 0.8962 - loss: 0.2389 - val_accuracy: 0.9977 - val_loss: 0.0104
Epoch 2/5
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 225ms/step - accuracy: 0.9982 - loss: 0.0091 - val_accuracy: 0.9986 - val_loss: 0.0073
Epoch 3/5
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 225ms/step - accuracy: 0.9982 - loss: 0.0081 - val_accuracy: 0.9983 - val_loss: 0.0075
Epoch 4/5
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 224ms/step - accuracy: 0.9990 - loss: 0.0048 - val_accuracy: 0.9987 - val_loss: 0.0065
Epoch 5/5
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 226ms/step - accuracy: 0.9990 - loss: 0.0042 - val_accuracy: 0.9984 - val_loss: 0.0066


In [31]:
# Evaluate
loss, accuracy = model.evaluate(X_test, y_test)
print(f"\nTest Accuracy: {accuracy:.4f}")

[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 31ms/step - accuracy: 0.9985 - loss: 0.0068

Test Accuracy: 0.9984
