<a href="https://colab.research.google.com/github/yuvasri09-art/NLP_FAKE_NEWS_DETECTOR-/blob/main/Pahalga_Fake_News_Detection_UI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gradio
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('vader_lexicon')
from IPython import get_ipython
from IPython.display import display

import os
import json
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils import class_weight
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout, concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
import nlpaug.augmenter.word as naw
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords  # Import stopwords
from nltk.stem import WordNetLemmatizer # Import WordNetLemmatizer
import gradio as gr


# Create the word-level augmenters used by the training script's
# augmentation loop:
#   aug_syn - synonym replacement with WordNet as the synonym source
#   aug_del - random word deletion
aug_syn = naw.SynonymAug(aug_src='wordnet')
aug_del = naw.RandomWordAug(action="delete")

# Initialize the VADER sentiment analyzer once at module level;
# get_sentiment_features() reads it as a global.
sia = SentimentIntensityAnalyzer()

def clean_text(text):
    """Normalise raw text into a space-joined string of lemmatised tokens.

    Processing steps, in order: remove every character that is neither an
    ASCII letter nor whitespace, lower-case, split on whitespace, drop NLTK
    English stop words, and lemmatise the remaining tokens with WordNet.

    Args:
        text: Raw input string.

    Returns:
        The surviving tokens joined by single spaces, or an empty string
        when no token survives.
    """
    # Keep *all* whitespace (not only the literal space) so that words
    # separated by a newline or tab are not glued into one token; the
    # str.split() below already splits on any whitespace run.
    text = re.sub(r"[^a-zA-Z\s]+", '', text)
    text = text.lower()
    tokens = text.split()
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

def main():
    """Build the balanced, cleaned demo dataset and persist it as CSV.

    The hard-coded REAL and FAKE samples are down-sampled to the size of the
    smaller class, concatenated, shuffled, and given a ``clean_text`` column.
    Rows whose cleaned text is empty are dropped. The result is written to
    ``data/processed/cleaned_news.csv`` (directories are created on demand).

    Returns:
        The shuffled DataFrame with columns ``text``, ``label`` and
        ``clean_text``.
    """
    real_records = [
        {"text": "Security forces neutralized two terrorists in Pahalgam, Jammu and Kashmir. Officials confirmed no civilian casualties.", "label": "REAL"},
        {"text": "ANI reports: Joint operation by Indian Army and J&K Police foils terror plot in Pahalgam. Explosives recovered.", "label": "REAL"},
        {"text": "The Home Ministry confirmed that the Pahalgam attack was orchestrated by a banned outfit. Investigation ongoing.", "label": "REAL"},
        {"text": "Local authorities in Pahalgam urge residents to avoid crowded areas temporarily. Security measures heightened.", "label": "REAL"},
        {"text": "Jammu and Kashmir DGP stated: 'Situation in Pahalgam is under control. No further threats detected.'", "label": "REAL"},
        {"text": "Increased security presence in Pahalgam following the recent incident.", "label": "REAL"},
        {"text": "Officials are investigating the Pahalgam attack to determine the motives and those responsible.", "label": "REAL"},
        {"text": "The Pahalgam attack is a reminder of the ongoing security challenges in the region.", "label": "REAL"},
        {"text": "Authorities are working to ensure the safety and security of residents and visitors in Pahalgam.", "label": "REAL"},
        {"text": "The government has condemned the Pahalgam attack and vowed to bring the perpetrators to justice.", "label": "REAL"},
    ]
    fake_records = [
        {"text": "BREAKING: 50 killed in Pahalgam massacre! Government hiding the truth!", "label": "FAKE"},
        {"text": "Secret documents reveal Pahalgam attack was staged by the army. SHARE NOW!", "label": "FAKE"},
        {"text": "Exclusive: Pahalgam victims' families claim no bodies were returned. Cover-up exposed!", "label": "FAKE"},
        {"text": "Shocking: Pahalgam attack linked to foreign spies. Prime Minister silent!", "label": "FAKE"},
        {"text": "Hidden truth: Pahalgam terrorists were paid by political parties. VIRAL VIDEO!", "label": "FAKE"},
        {"text": "Pahalgam attack was an inside job, claims controversial blogger.", "label": "FAKE"},
        {"text": "Leaked video shows Pahalgam terrorists escaping unharmed.", "label": "FAKE"},
        {"text": "Government using Pahalgam attack to distract from economic woes.", "label": "FAKE"},
        {"text": "Pahalgam attack a hoax to justify military buildup.", "label": "FAKE"},
        {"text": "Aliens responsible for Pahalgam attack, claims conspiracy theorist.", "label": "FAKE"},
    ]

    real_frame = pd.DataFrame(real_records)
    fake_frame = pd.DataFrame(fake_records)

    # Balance the classes by sampling each down to the smaller class size.
    per_class = min(len(fake_frame), len(real_frame))
    fake_frame = fake_frame.sample(n=per_class, random_state=42)
    real_frame = real_frame.sample(n=per_class, random_state=42)

    # FAKE rows first, then REAL, followed by a full shuffle.
    dataset = pd.concat([fake_frame, real_frame])
    dataset = dataset.sample(frac=1, random_state=42)

    dataset['clean_text'] = dataset['text'].apply(clean_text)
    has_content = dataset['clean_text'].str.strip() != ''
    dataset = dataset[has_content]

    output_dir = os.path.join('data', 'processed')
    os.makedirs(output_dir, exist_ok=True)
    dataset.to_csv(os.path.join(output_dir, 'cleaned_news.csv'), index=False)
    return dataset

def predict_fake_news(text, tokenizer, model, scaler):
    """Classify a piece of text as ``"REAL"`` or ``"FAKE"``.

    Args:
        text: Raw headline / news text entered by the user.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        model: Trained two-input model; it is called as
            ``model.predict([padded_tokens, scaled_sentiment])``.
        scaler: Fitted scaler exposing ``transform`` for the sentiment
            feature columns produced by ``get_sentiment_features``.

    Returns:
        ``"REAL"`` when the predicted probability is at least 0.5, otherwise
        ``"FAKE"``. ``"FAKE (Invalid Input)"`` is returned for non-string,
        blank, or fully-filtered input, and ``"FAKE (No Tokens)"`` when the
        tokenizer yields no tokens.
    """
    # Reject missing / blank input before it reaches the regex-based cleaner,
    # which would raise on a non-string value.
    if not isinstance(text, str) or not text.strip():
        return "FAKE (Invalid Input)"

    cleaned = clean_text(text)
    if not cleaned.strip():
        return "FAKE (Invalid Input)"

    sequence = tokenizer.texts_to_sequences([cleaned])
    if not sequence or not sequence[0]:
        return "FAKE (No Tokens)"

    padded = pad_sequences(sequence, maxlen=300, padding='post', truncating='post')

    # Sentiment must come from the RAW text: the training script computes
    # these features on the raw text column, and cleaning strips the
    # punctuation and capitalisation the sentiment scores are sensitive to.
    sentiment_features = pd.DataFrame([get_sentiment_features(text)])
    # Pass the DataFrame (not .values) so the column names match those the
    # scaler was fitted with.
    scaled_sentiment = scaler.transform(sentiment_features)

    proba = model.predict([padded, scaled_sentiment], verbose=0)[0][0]
    return "REAL" if proba >= 0.5 else "FAKE"

def get_sentiment_features(text):
    """Return the six sentiment features for ``text`` as a dict.

    Two values come from TextBlob (polarity, subjectivity) and four from the
    module-level VADER analyzer ``sia`` (neg, neu, pos, compound). The key
    order is fixed because callers turn the dict into feature columns.
    """
    vader_scores = sia.polarity_scores(text)
    tb_sentiment = TextBlob(text).sentiment

    features = {}
    features['textblob_polarity'] = tb_sentiment.polarity
    features['textblob_subjectivity'] = tb_sentiment.subjectivity
    features['vader_neg'] = vader_scores['neg']
    features['vader_neu'] = vader_scores['neu']
    features['vader_pos'] = vader_scores['pos']
    features['vader_compound'] = vader_scores['compound']
    return features

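# Gradio callback. It reads `tokenizer`, `model` and `scaler` from module
# globals, which are created in the `__main__` block below.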

def predict_ui(headline):
    """Gradio handler: classify ``headline`` with the module-level model.

    Delegates to ``predict_fake_news`` using the global ``tokenizer``,
    ``model`` and ``scaler`` objects and returns its verdict string.
    """
    verdict = predict_fake_news(headline, tokenizer, model, scaler)
    return verdict

if __name__ == '__main__':
    # End-to-end script: build the dataset, split it, augment the training
    # rows, fit the tokenizer/scaler/model, report accuracy, and launch the
    # Gradio UI. `tokenizer`, `scaler` and `model` stay module-level globals
    # because predict_ui() reads them.

    MAX_WORDS = 50000  # vocabulary cap shared by the Tokenizer and Embedding
    MAX_LEN = 300      # must equal the maxlen used in predict_fake_news

    combined_df = main()

    # LabelEncoder sorts class names, so FAKE -> 0 and REAL -> 1.
    labels = LabelEncoder().fit_transform(combined_df['label'])

    # Data splitting: split row POSITIONS rather than feature matrices so the
    # training texts and their labels can be recovered exactly. (Trying to
    # recover them from feature values pairs texts with the wrong labels.)
    train_idx, test_idx = train_test_split(
        np.arange(len(combined_df)),
        test_size=0.2,
        stratify=labels,
        random_state=42,
    )
    train_df = combined_df.iloc[train_idx]
    test_df = combined_df.iloc[test_idx]
    y_train = labels[train_idx]
    y_test = labels[test_idx]

    # Data augmentation (training rows only). The RAW text is augmented so
    # that augmented rows get their sentiment features from the same kind of
    # text as the original rows; tokens are derived by cleaning afterwards.
    augmented_texts = []
    augmented_labels = []
    for raw_text, label in zip(train_df['text'], y_train):
        if label == 1:  # REAL: one synonym variant + one deletion variant
            candidates = [aug_syn.augment(raw_text), aug_del.augment(raw_text)]
        else:  # FAKE: two synonym variants
            candidates = [aug_syn.augment(raw_text), aug_syn.augment(raw_text)]
        for candidate in candidates:
            # Depending on the nlpaug version, augment() returns a str or a
            # list of str.
            if isinstance(candidate, list):
                if not candidate:
                    continue
                candidate = candidate[0]
            # Skip variants that would tokenise to nothing.
            if not clean_text(candidate):
                continue
            augmented_texts.append(candidate)
            augmented_labels.append(label)

    train_raw_texts = train_df['text'].tolist() + augmented_texts
    train_clean_texts = (
        train_df['clean_text'].tolist()
        + [clean_text(text) for text in augmented_texts]
    )
    y_train_full = np.concatenate(
        [y_train, np.asarray(augmented_labels, dtype=y_train.dtype)]
    )

    # Tokenization and padding. The tokenizer is fitted on training texts
    # only so no test vocabulary leaks into training; unseen test words map
    # to the OOV token. Truncation side matches predict_fake_news.
    tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='<OOV>')
    tokenizer.fit_on_texts(train_clean_texts)
    X_train_seq = pad_sequences(
        tokenizer.texts_to_sequences(train_clean_texts),
        maxlen=MAX_LEN, padding='post', truncating='post',
    )
    X_test_seq = pad_sequences(
        tokenizer.texts_to_sequences(test_df['clean_text']),
        maxlen=MAX_LEN, padding='post', truncating='post',
    )

    # Sentiment features: the scaler is fitted on training rows only and
    # then applied to the test rows.
    train_sentiment = pd.DataFrame(
        [get_sentiment_features(text) for text in train_raw_texts]
    )
    test_sentiment = pd.DataFrame(
        [get_sentiment_features(text) for text in test_df['text']]
    )
    scaler = StandardScaler()
    X_train_sent = scaler.fit_transform(train_sentiment)
    X_test_sent = scaler.transform(test_sentiment)

    # Model architecture: BiLSTM over token ids, concatenated with the
    # scaled sentiment vector, followed by a small dense head.
    text_input = Input(shape=(MAX_LEN,))
    embedding = Embedding(MAX_WORDS, 128)(text_input)
    lstm = Bidirectional(LSTM(64))(embedding)
    sentiment_input = Input(shape=(X_train_sent.shape[1],))
    merged = concatenate([lstm, sentiment_input])
    dense = Dense(32, activation='relu')(merged)
    output = Dense(1, activation='sigmoid')(dense)

    model = Model(inputs=[text_input, sentiment_input], outputs=output)
    model.compile(optimizer=Adam(0.001), loss='binary_crossentropy', metrics=['accuracy'])

    # Class weights, keyed by the class ids actually present in the data.
    present_classes = np.unique(y_train_full)
    balanced = class_weight.compute_class_weight(
        'balanced', classes=present_classes, y=y_train_full
    )
    class_weights = {int(c): float(w) for c, w in zip(present_classes, balanced)}

    # Training.
    # NOTE: the held-out split doubles as the early-stopping validation set,
    # so the accuracy printed below is an optimistic estimate.
    history = model.fit(
        [X_train_seq, X_train_sent],
        y_train_full,
        validation_data=([X_test_seq, X_test_sent], y_test),
        epochs=20,
        batch_size=32,
        class_weight=class_weights,
        callbacks=[EarlyStopping(patience=3)]
    )

    # Evaluation
    loss, accuracy = model.evaluate([X_test_seq, X_test_sent], y_test, verbose=0)
    print(f'\nFinal Prediction Accuracy: {accuracy:.2f}')

    # Gradio UI
    iface = gr.Interface(
        fn=predict_ui,
        inputs=gr.Textbox(lines=2, placeholder="Enter news headline here..."),
        outputs="text",
        title="Fake News Detection",
        description="Enter a news headline to check if it's real or fake."
    )
    iface.launch()



[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6s/step - accuracy: 0.4643 - loss: 0.7270 - val_accuracy: 0.5000 - val_loss: 0.7456
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step - accuracy: 0.5000 - loss: 0.7133 - val_accuracy: 0.5000 - val_loss: 0.7344
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 169ms/step - accuracy: 0.5000 - loss: 0.7009 - val_accuracy: 0.5000 - val_loss: 0.7220
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 162ms/step - accuracy: 0.5000 - loss: 0.6897 - val_accuracy: 0.5000 - val_loss: 0.7095
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 302ms/step - accuracy: 0.5000 - loss: 0.6793 - val_accuracy: 0.5000 - val_loss: 0.6969
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 158ms/step - accuracy: 0.5357 - loss: 0.6692 - val_accuracy: 0.5000 - val_loss: 0.6860
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━