In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("clmentbisaillon/fake-and-real-news-dataset")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'fake-and-real-news-dataset' dataset.
Path to dataset files: /kaggle/input/fake-and-real-news-dataset


In [2]:
import pandas as pd
import re
import nltk

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load the datasets
true_df = pd.read_csv(f"{path}/True.csv")
fake_df = pd.read_csv(f"{path}/Fake.csv")

# 1. Add labels
true_df['label'] = 1
fake_df['label'] = 0

# 2. Combine datasets
df = pd.concat([true_df, fake_df], ignore_index=True)

# 3. Remove missing values
df.dropna(inplace=True)

# 4. Text cleaning function
def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\n', ' ', text)
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

# 5. Apply cleaning (change 'text' if column name differs)
df['text'] = df['text'].apply(clean_text)

# 6. Split features and labels
X = df['text']
y = df['label']

# 7. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 8. TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_df=0.7)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

print("Preprocessing completed successfully")
print("Training data shape:", X_train_vec.shape)
print("Testing data shape:", X_test_vec.shape)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Preprocessing completed successfully
Training data shape: (35918, 186587)
Testing data shape: (8980, 186587)


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

X = df['text']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

model = Pipeline([
    ("tfidf", TfidfVectorizer(
        stop_words="english",
        ngram_range=(1,2),
        min_df=3,
        max_df=0.9
    )),
    ("clf", LogisticRegression(max_iter=1000))
])

model.fit(X_train, y_train)
print("Model trained")


Model trained


In [4]:
import joblib

# Save the trained model (includes TF-IDF + classifier)
joblib.dump(model, "fake_news_model.pkl")

print("‚úÖ Trained model saved successfully")



‚úÖ Trained model saved successfully


In [5]:
!apt-get install -y tesseract-ocr
!pip install pytesseract pillow


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 37 not upgraded.
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [6]:
import pytesseract
from PIL import Image
from google.colab import files


In [7]:
def predict_news(text):
    if len(text.split()) < 20:
        return "NOT A NEWS ARTICLE"

    probs = model.predict_proba([text])[0]
    confidence = max(probs)

    if confidence < 0.65:
        return "UNCERTAIN / GENERIC NEWS"

    label = model.predict([text])[0]
    return "REAL NEWS" if label == 1 else "FAKE NEWS"


In [8]:
from IPython.display import display, HTML

def show_prediction(result):
    if result == "REAL NEWS":
        color, emoji = "green", "‚úÖ"
    elif result == "FAKE NEWS":
        color, emoji = "red", "‚ùå"
    elif "UNCERTAIN" in result:
        color, emoji = "orange", "‚ö†Ô∏è"
    else:
        color, emoji = "blue", "‚ÑπÔ∏è"

    display(HTML(f"""
        <div style="font-size:22px;font-weight:bold;
        color:{color};border:2px solid {color};
        padding:10px;border-radius:10px;">
        {emoji} Prediction: {result}
        </div>
    """))


In [9]:
import joblib

# Load the trained model
model = joblib.load("fake_news_model.pkl")

print("Choose input method:")
print("1Ô∏è‚É£ Upload text file")
print("2Ô∏è‚É£ Enter text manually")
print("3Ô∏è‚É£ Upload article image")

choice = input("Enter your choice (1 / 2 / 3): ")

# -------- OPTION 1: TEXT FILE --------
if choice == "1":
    uploaded = files.upload()
    filename = list(uploaded.keys())[0]

    with open(filename, "r", encoding="utf-8") as f:
        text = f.read()

    result = predict_news(text)
    show_prediction(result)

# -------- OPTION 2: MANUAL TEXT --------
elif choice == "2":
    print("\nPaste or type the news article:")
    text = input()

    result = predict_news(text)
    show_prediction(result)

# -------- OPTION 3: IMAGE (OCR) --------
elif choice == "3":
    uploaded = files.upload()
    image_name = list(uploaded.keys())[0]

    img = Image.open(image_name)
    extracted_text = pytesseract.image_to_string(img)

    print("\nüìù Extracted Text from Image:\n")
    print(extracted_text)

    result = predict_news(extracted_text)
    show_prediction(result)

else:
    print("Invalid option selected")

Choose input method:
1Ô∏è‚É£ Upload text file
2Ô∏è‚É£ Enter text manually
3Ô∏è‚É£ Upload article image
Enter your choice (1 / 2 / 3): 3


Saving real.jpeg to real.jpeg

üìù Extracted Text from Image:

 

   

ee SSR nee reee BL reer net ener

 

oe es

Title: Government Announces New Infrastructure Development Plan

The Government of India today announced a new infrastructure development plan aimed at improving
transportation and employment opportunities across several states. The Prime Minister said the initiative will

focus on highway expansion, railway modernization, and renewable energy projects.

According to official sources, the project is expected to generate over one million jobs during its first phase.
The Ministry of Finance confirmed that funding will be allocated in the upcoming budget session. Officials

stated that the plan will strengthen economic growth and improve regional connectivity.

