In [None]:
import os
import pickle
import re
import string
import warnings

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Fetch the NLTK resources this notebook relies on: the tokenizer models used
# by word_tokenize and the stop-word lists. 'punkt_tab' is requested as well
# because recent NLTK releases (3.9+) load the tokenizer from that resource
# and raise LookupError when only the older 'punkt' package is present;
# 'punkt' is kept for older NLTK installations.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)


In [None]:
# Read the PHEME dataset from 'dataset.csv', expose its 'is_rumor' column
# under the name 'label' used by the rest of the notebook, and keep only the
# rows that actually carry a label.
df = (
    pd.read_csv("dataset.csv")
    .rename(columns={"is_rumor": "label"})
    .dropna(subset=["label"])
)

# Clean the text data

# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every clean_text call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

# Filled on first use so the stop-word corpus is only read when text is
# actually cleaned, and only once.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE["english"]


def clean_text(text):
    """Normalize raw text into a space-joined string of lower-case tokens.

    Steps, in order: lower-case, remove URLs, remove digit runs, remove ASCII
    punctuation, tokenize with ``word_tokenize`` and drop English stop words.

    Args:
        text: Value to clean. Non-string values are converted with ``str``.
            ``None`` and float NaN (how pandas represents a missing cell)
            are treated as empty text.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` for missing input.
    """
    # A missing value would otherwise be stringified to the token "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    # URLs are removed before digits: stripping digits first can shorten
    # something like "www.123" to "www.", which the URL pattern no longer
    # matches, leaving a stray "www" token behind.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.translate(_PUNCTUATION_TABLE)

    stop_words = _english_stopwords()
    return " ".join(word for word in word_tokenize(text) if word not in stop_words)

# Run every entry of the 'text' column through clean_text and store the
# result alongside it as 'cleaned_text'.
df['cleaned_text'] = df['text'].map(clean_text)


In [None]:
# Hold out 20% of the rows for testing. The split is stratified on the label
# and uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Fit the TF-IDF vocabulary (capped at 5000 features) on the training text
# only, then encode the test text with that same fitted vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a multinomial Naive Bayes classifier on the TF-IDF features.
model = MultinomialNB().fit(X_train_tfidf, y_train)

# Report accuracy and the per-class metrics on the held-out split.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))


In [None]:
# Pickle the fitted classifier and the fitted vectorizer to the working
# directory, one file per object.
for artifact, artifact_path in (
    (model, "naive_bayes_model.pkl"),
    (vectorizer, "tfidf_vectorizer.pkl"),
):
    with open(artifact_path, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)

print("Model and vectorizer saved successfully.")


In [None]:
import requests

def google_fact_check(query, api_key=None):
    """Search the Google Fact Check Tools API for claims matching ``query``.

    Args:
        query: Free-text claim to look up.
        api_key: API key for the service. When omitted, the key is read from
            the ``GOOGLE_FACT_CHECK_API_KEY`` environment variable so that no
            credential has to be stored in the notebook.

    Returns:
        The value under the response's ``"claims"`` key, or ``None`` when no
        key is configured, the request fails, the response status is not 200,
        or the response contains no ``"claims"`` entry.
    """
    # NOTE(review): a key used to be hardcoded here; if it was a real
    # credential it should be treated as leaked and rotated.
    api_key = api_key or os.environ.get("GOOGLE_FACT_CHECK_API_KEY")
    if not api_key:
        warnings.warn(
            "No Fact Check API key configured; set GOOGLE_FACT_CHECK_API_KEY "
            "or pass api_key. Skipping the fact-check lookup."
        )
        return None

    try:
        # Passing the query through `params` lets requests URL-encode it;
        # interpolating it into the URL breaks on spaces, '&', '#', etc.
        response = requests.get(
            "https://factchecktools.googleapis.com/v1alpha1/claims:search",
            params={"query": query, "key": api_key},
            timeout=10,  # without a timeout a stalled connection hangs the cell
        )
        if response.status_code != 200:
            return None
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Network failures and malformed JSON are reported and mapped to the
        # same "no result" value the function uses for every other failure.
        warnings.warn(f"Fact Check API request failed: {exc}")
        return None

    if isinstance(data, dict) and "claims" in data:
        return data["claims"]
    return None

def _first_review_ratings(claims):
    """Return the lower-cased textualRating of each claim's first review, skipping claims that have none."""
    ratings = []
    for claim in claims:
        reviews = claim.get("claimReview") or []
        if not reviews:
            continue
        rating = reviews[0].get("textualRating")
        if rating:
            ratings.append(str(rating).lower())
    return ratings


def _rating_says_true(rating):
    """Return True when a lower-cased rating contains the whole word "true" not directly preceded by "not "."""
    # A plain substring test would also accept "untrue" and "not true".
    return re.search(r"(?<!not )\btrue\b", rating) is not None


def combine_results(user_input):
    """Combine the classifier's prediction with the fact-check lookup.

    The text is cleaned, vectorized and classified with the notebook's
    ``vectorizer`` and ``model``; the raw text is also sent to
    ``google_fact_check``. Fact-check ratings take precedence over the model.

    Args:
        user_input: The claim text to assess.

    Returns:
        ``"Verified True News ✅"`` when at least one returned claim's first
        review is rated true, ``"Verified Rumor 🚫"`` when ratings exist but
        none is rated true, and otherwise ``"No fact-check found. Model
        Prediction: "`` followed by ``"Rumor"`` (model predicted 1) or
        ``"True"``.
    """
    # Model prediction
    user_input_cleaned = clean_text(user_input)
    user_vectorized = vectorizer.transform([user_input_cleaned])
    model_prediction = model.predict(user_vectorized)[0]

    # Fact Check API results; claims without a review or rating are ignored
    # instead of raising KeyError/IndexError.
    fact_results = google_fact_check(user_input)
    ratings = _first_review_ratings(fact_results) if fact_results else []

    if ratings:
        if any(_rating_says_true(rating) for rating in ratings):
            return "Verified True News ✅"
        return "Verified Rumor 🚫"

    # No usable rating came back: fall back to the model's prediction.
    return "No fact-check found. Model Prediction: " + ("Rumor" if model_prediction == 1 else "True")

# Smoke-check the combined pipeline on a single sample claim
test_input = "COVID-19 vaccines cause infertility."
combined_output = combine_results(test_input)
print("Combined Result:", combined_output)
