In [1]:
pip install sentence-transformers scikit-learn pandas numpy

Collecting sentence-transformers
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.6.0-cp311-cp311-win_amd64.whl.metadata (28 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.29.1-py3-none-any.whl.metadata (13 kB)
Collecting filelock (from huggingface-hub>=0.20.0->sentence-transformers)
  Downloading filelock-3.17.0-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub>=0.20.0->sentence-transformers)
  Downloading fsspec-2025.2.0-py3-none-any.whl.metadata (11 kB)
Collecting pyyaml>=5.1 (from huggingface-hub>=0.20.0->sentence-transformers)
  Downloading PyYAML-6.0.2-cp311-cp311-win_amd64.whl.metadata (2.1 kB)
Collecting networkx (from torch>=1.11.0->sentence-transformers)


In [6]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [7]:
# Pre-trained sentence encoder (free checkpoint) used to turn text into vectors
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Read both corpora as lists of raw lines (trailing newlines are kept);
# each line is treated as one article.
# NOTE(review): the files carry a .csv extension but are read as plain lines,
# so a header row or a multi-line quoted field would each become a separate
# sample -- confirm the file format.
with open("Fake.csv", mode="r", encoding="utf-8") as fake_file:
    fake_news = list(fake_file)

with open("True.csv", mode="r", encoding="utf-8") as real_file:
    real_news = list(real_file)


In [8]:
# Build one labelled frame: every line from the fake file gets label 1,
# every line from the real file gets label 0.
df_fake = pd.DataFrame({"text": fake_news, "label": 1})  # Fake news = 1
df_real = pd.DataFrame({"text": real_news, "label": 0})  # Real news = 0

df = pd.concat([df_fake, df_real], ignore_index=True)

# Drop whitespace-only lines. They would otherwise all be encoded as the
# empty string and enter the training set as identical vectors, possibly
# under both labels.
clean_text = df["text"].str.strip()
keep = clean_text != ""
df = df[keep].reset_index(drop=True)

# Generate embeddings in one batched call instead of one encode() call per
# row. encode() accepts a list of sentences and returns one row per sentence,
# in input order.
embeddings = model.encode(clean_text[keep].tolist(), show_progress_bar=True)

# Store one 1-D vector per row so downstream np.vstack(...) keeps working.
df["embeddings"] = list(embeddings)

In [9]:
# Feature matrix: stack the per-row embedding vectors into one 2-D array;
# the labels become the target vector.
X = np.vstack(df["embeddings"].to_numpy())
y = df["label"].to_numpy()

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Fit a 100-tree random forest on the training embeddings (seeded so the
# fitted model is repeatable).
forest_params = {"n_estimators": 100, "random_state": 42}
clf = RandomForestClassifier(**forest_params)
clf.fit(X_train, y_train)

In [11]:
# Label the held-out rows with the trained forest
y_pred = clf.predict(X_test)

# Fraction of held-out rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.2f}")


Model Accuracy: 0.93


In [26]:
# Example Prediction
def predict_fake_news(text):
    """Classify a single piece of text as fake or real news.

    The text is stripped of surrounding whitespace, embedded with the
    notebook-level sentence encoder ``model`` and passed to the trained
    classifier ``clf``.

    Args:
        text: The headline or article text to classify (a single string).

    Returns:
        "Fake News" when the classifier predicts label 1, otherwise
        "Real News".
    """
    # Strip before encoding, exactly as the training rows are stripped, so
    # inference sees text pre-processed the same way as the training data.
    embedding = model.encode(text.strip())
    # predict() expects a batch: wrap the single vector, unwrap the result.
    prediction = clf.predict([embedding])[0]
    return "Fake News" if prediction == 1 else "Real News"

# Spot check: classify one sample headline and print the resulting label
example_text = "Trump says Russia probe will be fair, but timeline unclear: NYT"
example_label = predict_fake_news(example_text)
print(example_label)


Real News


In [28]:
# Spot check: classify a short general-knowledge statement and print the label
# NOTE(review): the classifier is fitted on embeddings of news text, so its
# verdict on a sentence like this is presumably not a fact check -- interpret
# with care.
example_text = "Rainbow has 7 colors"
example_label = predict_fake_news(example_text)
print(example_label)

Real News


In [29]:
# Spot check: same statement with a different number, then print the label
# NOTE(review): the classifier is fitted on embeddings of news text, so its
# verdict on a sentence like this is presumably not a fact check -- interpret
# with care.
example_text = "Rainbow has 9 colors"
example_label = predict_fake_news(example_text)
print(example_label)

Fake News
