<a href="https://colab.research.google.com/github/yyashaswini736/program2026/blob/main/Welcome_To_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
# ==============================
# Fake News Detection Project
# ==============================

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Download required NLTK data.
# 'stopwords' backs the stopwords.words('english') call used later in this
# script; 'wordnet' is fetched for WordNetLemmatizer. When a package is already
# present and up to date, nltk.download only reports that and moves on.
nltk.download('stopwords')
nltk.download('wordnet')

# ==============================
# 1. Load Dataset
# ==============================

# Read the CSV with pandas' Python parsing engine (engine='python').
# Note that low_memory is NOT set on this call; doublequote=True is passed
# explicitly.
data = pd.read_csv("/content/fake and real news.csv", engine='python', doublequote=True)   # <-- your dataset file

# Drop rows whose 'label' value is NaN. This script currently treats the
# 'label' column as the news text (it is the column fed to preprocessing),
# not as the prediction target.
# NOTE(review): the printed column list also contains 'text' and 'label.1' --
# confirm which column really holds the article text and which holds the
# real/fake target before trusting any results.
data.dropna(subset=['label'], inplace=True)

# ==============================
# 2. Preprocessing Function
# ==============================

# Module-level helpers shared by every preprocess_text call:
# - stop_words: the English stopword list converted to a set, so the
#   per-token "word not in stop_words" check is a hash lookup.
# - lemmatizer: a single WordNetLemmatizer instance reused for all tokens.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize raw text into a space-joined string of lemmatized tokens.

    Steps, in order:
      1. Coerce ``text`` to ``str`` and lowercase it.
      2. Delete every character that is not ``a-z``, ``0-9`` or whitespace
         (this removes punctuation and also any non-ASCII characters).
      3. Split on whitespace.
      4. Discard tokens present in the module-level ``stop_words`` set.
      5. Lemmatize each remaining token with the module-level ``lemmatizer``.

    Returns the surviving tokens joined by single spaces (an empty string
    when nothing survives).
    """
    lowered = str(text).lower()
    stripped = re.sub(r'[^a-z0-9\s]', '', lowered)

    kept_tokens = []
    for token in stripped.split():
        if token in stop_words:
            continue  # stopwords are dropped before lemmatization
        kept_tokens.append(lemmatizer.lemmatize(token))

    return " ".join(kept_tokens)

# Run preprocess_text over every row and store the result in a new
# 'clean_text' column. The 'label' column is used as the news text here.
# NOTE(review): that is an assumption -- TODO confirm 'label' is the article
# body and not the real/fake target column.
data['clean_text'] = data['label'].apply(preprocess_text) # Changed 'text' to 'label' here

# ==============================
# 3. Convert Text to Numerical (TF-IDF)
# ==============================

# Fit a TF-IDF vectorizer (vocabulary capped at 5000 features) on the cleaned
# text and transform that same text into the feature matrix X.
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data['clean_text'])


# Print the column names to help identify the real target column.
print("Available columns in your dataset:", data.columns.tolist())
# WARNING: y is a PLACEHOLDER. It holds random 0/1 integers, one per row,
# aligned to data.index and unrelated to the dataset contents. Any model
# trained on it, and every metric computed from it, is meaningless (accuracy
# will hover around chance). The draw is also unseeded, so it changes on each
# run. Replace this with the dataset's actual real/fake label column.
y = pd.Series(np.random.randint(0, 2, size=len(data)), index=data.index) # Temporary, needs replacement


# ==============================
# 4. Train-Test Split
# ==============================

# Hold out 20% of the rows as the test set; random_state=42 fixes the shuffle
# so the same rows land in train/test on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ==============================
# 5. Train Model
# ==============================

# Fit a logistic-regression classifier with scikit-learn's default settings
# on the training split.
# NOTE(review): y_train is currently derived from the random placeholder y
# defined earlier in this script, so this fit cannot learn a real signal until
# y is replaced with the true labels.
model = LogisticRegression()
model.fit(X_train, y_train)

# ==============================
# 6. Evaluate Model
# ==============================

# Predict labels for the held-out test split, then print the overall accuracy
# followed by scikit-learn's classification report for the same predictions.
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

# ==============================
# 7. Predict New News
# ==============================

def predict_news(news_text):
    """Classify a single piece of news text as "Real News" or "Fake News".

    The text is cleaned with ``preprocess_text``, converted to features with
    the already-fitted module-level ``vectorizer``, and scored by the
    module-level ``model``.

    Returns "Real News" when the predicted class equals 1 and "Fake News"
    for any other predicted value.
    """
    features = vectorizer.transform([preprocess_text(news_text)])
    predicted_class = model.predict(features)[0]

    # Mapping assumption: class 1 means real, everything else means fake.
    # Adjust if the actual label encoding differs.
    return "Real News" if predicted_class == 1 else "Fake News"

# Example: read one news text from the user and print the predicted class.
# input() blocks until text is submitted; interrupting the prompt raises
# KeyboardInterrupt and stops the cell.
sample_news = input("\nEnter news text to predict: ")
result = predict_news(sample_news)

print("Prediction:", result)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Available columns in your dataset: ['Unnamed: 0', 'label', 'text', 'label.1', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29', 'Unnamed: 30', 'Unnamed: 31', 'Unnamed: 32', 'Unnamed: 33', 'Unnamed: 34', 'Unnamed: 35', 'Unnamed: 36', 'Unnamed: 37', 'Unnamed: 38', 'Unnamed: 39', 'Unnamed: 40', 'Unnamed: 41', 'Unnamed: 42', 'Unnamed: 43', 'Unnamed: 44', 'Unnamed: 45', 'Unnamed: 46', 'Unnamed: 47', 'Unnamed: 48', 'Unnamed: 49', 'Unnamed: 50', 'Unnamed: 51', 'Unnamed: 52', 'Unnamed: 53', 'Unnamed: 54', 'Unnamed: 55', 'Unnamed: 56', 'Unnamed: 57', 'Unnamed: 58', 'Unnamed: 59', 'Unnamed: 60', 'Unnamed: 61', 'Unnamed: 62', 'Unnamed: 63', 'Unnamed: 64', 'Unnamed: 65'

KeyboardInterrupt: Interrupted by user