In [1]:
import os
import re
import string
from pathlib import Path

import numpy as np
import pandas as pd
import pyarrow
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Record the library versions this notebook was executed with.
for version_label, package in (
    ("PyArrow version:", pyarrow),
    ("NumPy version:", np),
    ("Pandas version:", pd),
):
    print(version_label, package.__version__)

PyArrow version: 19.0.1
NumPy version: 2.0.2
Pandas version: 2.2.3


In [3]:
import pandas as pd
import numpy as np
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [4]:
# Load the datasets
# Both CSVs are read from DATA_DIR. To run this on another machine, set the
# FAKE_NEWS_DATA_DIR environment variable to the folder holding Fake.csv and
# True.csv; when it is unset the original author's location is used.
DATA_DIR = Path(
    os.environ.get(
        "FAKE_NEWS_DATA_DIR",
        "/Users/temkaisea/Desktop/Uni/CodeWorks/News Predictor/Fake News Detection",
    )
)

# Seed for the row shuffle below. Without it every run produces a different
# row order, so the downstream train/test split and metrics change per run.
SHUFFLE_SEED = 42

df_fake = pd.read_csv(DATA_DIR / "Fake.csv")
df_true = pd.read_csv(DATA_DIR / "True.csv")

# Add labels (0 for fake, 1 for true)
df_fake["label"] = 0
df_true["label"] = 1

# Combine both datasets, keeping only the 'text' and 'label' columns
df = pd.concat([df_fake, df_true], ignore_index=True)[["text", "label"]]

# Shuffle the dataset reproducibly and renumber the rows
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Display sample
df.head()


Unnamed: 0,text,label
0,"Buzzfeed reports, that, from the start, Megyn ...",0
1,"According to the New York Times, outlet store...",0
2,"COX S BAZAR, Bangladesh (Reuters) - More than ...",1
3,NEW YORK/WASHINGTON (Reuters) - New York Attor...,1
4,"Barack Obama s legacy will be a divided, lawle...",0


In [5]:
# Translation table that deletes every ASCII punctuation character.
# Built once here instead of on every clean_text call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise raw article text before it is vectorised.

    Steps, in order: lowercase, delete digit runs, delete ASCII punctuation
    (``string.punctuation``), then collapse every run of whitespace into a
    single space and trim the ends.

    Parameters
    ----------
    text : str or scalar
        Raw text. ``None``/NaN (how pandas represents a missing cell) yields
        an empty string; any other non-string scalar is converted with
        ``str()`` first.

    Returns
    -------
    str
        The cleaned text; may be empty.
    """
    if not isinstance(text, str):
        # A missing cell would otherwise fail on .lower() with AttributeError.
        if pd.isna(text):
            return ""
        text = str(text)

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(_PUNCTUATION_TABLE)  # Remove punctuation
    # Deleting digits/punctuation leaves gaps ("a 1 b" -> "a  b"); split/join
    # removes those as well as leading/trailing whitespace, which strip()
    # alone did not.
    return " ".join(text.split())

# Run clean_text over every article and store the result back in the column.
df["text"] = df["text"].map(clean_text)


In [6]:
# Hold out 20% of the rows for evaluation. random_state pins the split for a
# given row order of df.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    random_state=42,
)


In [7]:
# TF-IDF features: English stop words removed, vocabulary capped at 5000
# terms for efficiency.
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words="english",
)

# The vectorizer is fitted on the training split only; the test split is
# transformed with that already-fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [8]:
# Logistic regression with the library's default settings, trained on the
# TF-IDF features of the training split.
model = LogisticRegression()
model.fit(X=X_train_tfidf, y=y_train)


In [9]:
# Predict labels for the held-out split.
y_pred = model.predict(X_test_tfidf)

# Compute each metric once, then report them together.
test_accuracy = accuracy_score(y_test, y_pred)
per_class_report = classification_report(y_test, y_pred)
label_confusion = confusion_matrix(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", per_class_report)
print("\nConfusion Matrix:\n", label_confusion)


Accuracy: 0.9875278396436525

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4760
           1       0.98      0.99      0.99      4220

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980


Confusion Matrix:
 [[4696   64]
 [  48 4172]]


In [10]:
def predict_news(text):
    """Classify one piece of news text as real or fake.

    The text is cleaned with ``clean_text``, converted to TF-IDF features by
    the notebook-level ``vectorizer`` and scored by the notebook-level
    ``model``; both must already be fitted.

    Parameters
    ----------
    text : str
        Raw headline or article text.

    Returns
    -------
    str
        ``"Real News"`` when the predicted label is 1, otherwise
        ``"Fake News"``.
    """
    # transform() expects a collection of documents, hence the one-item list.
    features = vectorizer.transform([clean_text(text)])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "Real News"
    return "Fake News"

# Example test: classify a couple of made-up headlines.
sample_headlines = (
    "Breaking news: Scientists discover a new planet with signs of life!",
    "Government officials confirm that the earth is flat, shocking discovery!",
)
for headline in sample_headlines:
    print(predict_news(headline))


Fake News
Fake News
