In [41]:
import os
import pickle
import re
import string
import warnings

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Download the NLTK resources used below (a no-op when they are already present).
# 'punkt' serves older NLTK releases; newer releases load 'punkt_tab' inside
# word_tokenize, so both are fetched to keep a fresh-kernel "Run All" working
# regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [42]:
# Read the PHEME tweets from 'dataset.csv' and normalise the target column in a
# single chained expression:
#   1. rename the rumor flag (assumed to be called 'is_rumor') to 'label',
#   2. discard rows that have no label,
#   3. cast the remaining labels to int.
df = (
    pd.read_csv("dataset.csv")
    .rename(columns={"is_rumor": "label"})
    .dropna(subset=["label"])
    .astype({"label": int})
)

# Build the English stop-word set once so it can be reused by later cells.
english_stopwords = set(stopwords.words("english"))

# Define an optimized clean_text function that preserves key domain-specific words
# Domain terms that must survive cleaning verbatim, digits and hyphen included.
_PRESERVED_WORDS = frozenset({"5g", "covid-19", "nasa", "rover", "perseverance"})
# Deletes every ASCII punctuation character except "-" so hyphenated terms stay whole.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation.replace("-", ""))
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_DIGITS_PATTERN = re.compile(r'\d+')
# Filled on first use by _get_english_stopwords().
_STOPWORDS_CACHE = None


def _get_english_stopwords():
    """Return the NLTK English stop-word set, loading it only on the first call."""
    global _STOPWORDS_CACHE
    if _STOPWORDS_CACHE is None:
        _STOPWORDS_CACHE = frozenset(stopwords.words("english"))
    return _STOPWORDS_CACHE


def clean_text(text):
    """Normalise a piece of text for TF-IDF vectorisation.

    Steps: coerce to ``str`` and lower-case, strip URLs, strip punctuation
    (hyphens are kept), tokenise with ``word_tokenize``, then for every token
    that is not a preserved domain term remove its digits and drop it if it is
    empty or an English stop word.

    Preserved terms are matched *before* digit stripping, so ``"5g"`` and
    ``"covid-19"`` come through intact instead of being reduced to ``"g"`` and
    ``"covid-"``. Because this changes the cleaned text for such inputs, a
    model fitted on the old output should be re-fitted.

    Args:
        text: Any value; it is converted with ``str()`` first, so ``None``
            becomes the token ``"none"``.

    Returns:
        The surviving tokens joined by single spaces (possibly an empty string).
    """
    # URLs are removed on the raw lowered text, before any character stripping,
    # so the pattern sees them intact.
    text = _URL_PATTERN.sub('', str(text).lower())
    text = text.translate(_PUNCTUATION_TABLE)

    stop_words = _get_english_stopwords()
    kept_tokens = []
    for token in word_tokenize(text):
        if token in _PRESERVED_WORDS:
            kept_tokens.append(token)
            continue
        token = _DIGITS_PATTERN.sub('', token)
        # A purely numeric token collapses to "" here and is discarded.
        if token and token not in stop_words:
            kept_tokens.append(token)
    return " ".join(kept_tokens)

# Run every tweet through clean_text (missing texts are treated as empty strings)
# and store the result next to the raw text in a new 'cleaned_text' column.
raw_text = df["text"].fillna("")
df["cleaned_text"] = raw_text.apply(clean_text)

# Sanity check: show raw text, cleaned text and label for the first rows.
preview_columns = ["text", "cleaned_text", "label"]
print(df[preview_columns].head())


                                                text  \
0  Charlie Hebdo became well known for publishing...   
1  Now 10 dead in a shooting there today RT "@BBC...   
2  @BBCDanielS @BBCWorld I'm guessing this is bei...   
3  @BBCDanielS @BBCWorld why would you mention th...   
4            @BBCDanielS @BBCWorld perps identified?   

                                        cleaned_text  label  
0  charlie hebdo became well known publishing muh...      0  
1  dead shooting today rt bbcdaniels charlie hebd...      0  
2  bbcdaniels bbcworld im guessing considered ter...      0  
3  bbcdaniels bbcworld would mention knowing fact...      0  
4               bbcdaniels bbcworld perps identified      0  


In [43]:
# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class ratio the same in the training and test partitions.
labels = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    labels,
    stratify=labels,
    test_size=0.2,
    random_state=42,
)

# TF-IDF over unigrams, bigrams and trigrams, limited to at most 5000 features.
# The vocabulary is learned from the training split only and then reused for
# the test split.
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a Logistic Regression classifier on the TF-IDF features.
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Score the held-out split.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Model Accuracy:", accuracy)
print("Classification Report:\n", report)


Model Accuracy: 0.8700456401633437
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.97      0.92      9724
           1       0.84      0.51      0.63      2765

    accuracy                           0.87     12489
   macro avg       0.86      0.74      0.78     12489
weighted avg       0.87      0.87      0.86     12489



In [44]:
# Pickle the fitted classifier and the fitted TF-IDF vectorizer, one file each.
# NOTE: the first file is still called "naive_bayes_model.pkl", but the object
# stored in it is the LogisticRegression model trained above.
artifacts = {
    "naive_bayes_model.pkl": model,
    "tfidf_vectorizer.pkl": vectorizer,
}
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)

print("Model and vectorizer saved successfully.")


Model and vectorizer saved successfully.


In [45]:
import requests

FACT_CHECK_ENDPOINT = "https://factchecktools.googleapis.com/v1alpha1/claims:search"
# Name of the environment variable that holds the Fact Check Tools API key.
FACT_CHECK_API_KEY_ENV = "GOOGLE_FACT_CHECK_API_KEY"


def google_fact_check(query, api_key=None, timeout=10):
    """Look up fact-check claims matching ``query``.

    The API key is taken from ``api_key`` or, when that is omitted, from the
    ``GOOGLE_FACT_CHECK_API_KEY`` environment variable, so the secret never
    lives in the notebook source or its saved outputs.

    Args:
        query: Free text to search for. It is passed through ``params`` so
            ``requests`` URL-encodes it; characters such as ``&``, ``=`` or
            ``#`` in user text can no longer corrupt or inject query
            parameters.
        api_key: Optional explicit API key; overrides the environment variable.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The non-empty ``"claims"`` list from the JSON response, or ``None``
        when no key is configured, the request fails, the status is not 200,
        the body is not a JSON object, or no claims were returned.
    """
    api_key = api_key or os.environ.get(FACT_CHECK_API_KEY_ENV)
    if not api_key:
        warnings.warn(
            f"No API key supplied and {FACT_CHECK_API_KEY_ENV} is not set; "
            "skipping fact-check lookup."
        )
        return None

    try:
        response = requests.get(
            FACT_CHECK_ENDPOINT,
            params={"query": query, "key": api_key},
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # The lookup is best-effort: surface the problem but let the caller
        # fall back to the model prediction instead of crashing.
        warnings.warn(f"Fact-check request failed: {exc}")
        return None

    if response.status_code != 200:
        return None

    try:
        data = response.json()
    except ValueError:
        # Body was not valid JSON.
        return None

    claims = data.get("claims") if isinstance(data, dict) else None
    return claims if claims else None

def _first_textual_rating(claims):
    """Return the lower-cased textual rating of the first claim's first review, or ""."""
    first_claim = claims[0] if claims else None
    if not isinstance(first_claim, dict):
        return ""
    reviews = first_claim.get("claimReview") or []
    first_review = reviews[0] if reviews else None
    if not isinstance(first_review, dict):
        return ""
    rating = first_review.get("textualRating")
    return rating.strip().lower() if isinstance(rating, str) else ""


def combine_results(user_input):
    """Combine the classifier's prediction with a fact-check lookup.

    The text is cleaned with ``clean_text``, vectorised with the module-level
    ``vectorizer`` and classified with the module-level ``model`` (label 1 is
    reported as "Rumor", anything else as "True News"). The raw text is also
    sent to ``google_fact_check``; only the first review of the first returned
    claim is consulted.

    A claim that has no review, or a review without a textual rating, is
    treated like an unrecognised rating instead of raising ``KeyError`` or
    ``IndexError``.

    Args:
        user_input: The statement to assess.

    Returns:
        A human-readable verdict string:
        * "Verified Rumor 🚫" when the rating is "false" or "misleading",
        * "Verified True News ✅" when it is "true" or "mostly true",
        * the model prediction flagged for further review for any other (or
          missing) rating,
        * the model prediction prefixed with "No fact-check found." when the
          lookup returned nothing.
    """
    # Model prediction
    cleaned_input = clean_text(user_input)
    features = vectorizer.transform([cleaned_input])
    model_result = "Rumor" if model.predict(features)[0] == 1 else "True News"

    # Fact-check lookup
    fact_results = google_fact_check(user_input)
    if not fact_results:
        return f"No fact-check found. Model Prediction: {model_result}"

    rating = _first_textual_rating(fact_results)
    if rating in ("false", "misleading"):
        return "Verified Rumor 🚫"
    if rating in ("true", "mostly true"):
        return "Verified True News ✅"
    return f"Model Prediction: {model_result} (Further review needed)"

# Example test (uncomment to test)
# test_input = "NASA confirms that the Perseverance rover has successfully landed on Mars."
# print("Combined Result for test input:", combine_results(test_input))
