In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import torch
import logging
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

In [2]:
# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [3]:
# Load your dataset (assuming a CSV file with 'title', 'text', 'label' columns)
logger.info("Loading dataset...")
df = pd.read_csv("final_data.csv")

df['label'] = df['label'].apply(lambda x: 1 if x == 'fake' else 0)
df = df.dropna(subset='title')
df = df.dropna(axis=1)

# Preprocess the data by concatenating title and text
logger.info("Preprocessing data...")
df['input_text'] = "Title: " + df['title'] + " Text: " + df['text']
texts = df['input_text'].tolist()
labels = df['label'].astype(int).tolist()
df.head(5)

INFO:__main__:Loading dataset...
INFO:__main__:Preprocessing data...


Unnamed: 0.1,Unnamed: 0,title,text,label,input_text
0,0,Obama says it is possible Russia would try to ...,WASHINGTON (Reuters) - U.S. President Barack O...,0,Title: Obama says it is possible Russia would ...
1,1,U.S. challenged by rising North Korea tensions...,UNITED NATIONS (Reuters) - Russia urged “hot h...,0,Title: U.S. challenged by rising North Korea t...
2,2,THE WOMAN WHO Moved Freedom Loving Americans T...,MARCH 1st is the day! VOTE FOR BECKY GERRITSON...,1,Title: THE WOMAN WHO Moved Freedom Loving Amer...
3,3,"Kenya president lifts travel restrictions, say...",NAIROBI (Reuters) - Kenyan President Uhuru Ken...,0,Title: Kenya president lifts travel restrictio...
4,4,"Philippine ferry capsizes with 251 on board, f...",MANILA (Reuters) - Four people were killed and...,0,Title: Philippine ferry capsizes with 251 on b...


In [4]:
# Tokenize the dataset with optimization for smaller batch size and sequence length
logger.info("Tokenizing data...")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenized_inputs = tokenizer(texts, padding=True, truncation=True, max_length=256, return_tensors="pt")

INFO:__main__:Tokenizing data...


In [5]:
# Convert to Hugging Face Dataset format
input_ids = tokenized_inputs["input_ids"].tolist()
attention_mask = tokenized_inputs["attention_mask"].tolist()

# Convert to Hugging Face Dataset format
dataset = Dataset.from_dict({
    "input_ids": input_ids,
    "attention_mask": attention_mask,
    "labels": labels  # `labels` should already be in list format
})

# Split dataset for training and validation
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

In [6]:
# Load the DistilBERT model for sequence classification
logger.info("Loading model...")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Define training arguments optimized for MacBook M1 Air
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,  # Smaller batch size to fit within M1 Air's memory limits
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    use_mps_device=True  # Use Apple M1's Metal API for acceleration
)

INFO:__main__:Loading model...
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Initialize the Trainer
logger.info("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

# Train the model
logger.info("Starting training...")
trainer.train()

# Evaluate the model
logger.info("Evaluating model...")
metrics = trainer.evaluate()
logger.info(f"Evaluation metrics: {metrics}")

# Save the fine-tuned model and tokenizer

INFO:__main__:Initializing Trainer...
INFO:__main__:Starting training...


Epoch,Training Loss,Validation Loss
1,0.1116,0.15257
2,0.1351,0.163473
3,0.0769,0.228265


INFO:__main__:Evaluating model...


INFO:__main__:Evaluation metrics: {'eval_loss': 0.228265181183815, 'eval_runtime': 332.1053, 'eval_samples_per_second': 33.218, 'eval_steps_per_second': 8.305, 'epoch': 3.0}


In [8]:
logger.info("Saving model and tokenizer...")
model.save_pretrained("./fake_news_detector")
tokenizer.save_pretrained("./fake_news_detector")
logger.info("Training complete.")

INFO:__main__:Saving model and tokenizer...
INFO:__main__:Training complete.


In [19]:
pip install python-docx

Note: you may need to restart the kernel to use updated packages.


In [None]:
basic - check this 

In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import requests
from textblob import TextBlob  # For basic sentiment analysis

# Initialize tokenizer and model (assuming you have a pre-trained DiOSBERT model for news classification)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('/Users/vineetahuja/Downloads/fake_news_detector_model')  # Update with your DiOSBERT model path

# Define your API keys
aqi_api_key = '0c55aeba-af32-4ced-989a-d4d5b0b535e9'  # Replace with your AQI API key
serp_api_key = '9f1822f370a6c6757264cf88cd1dd65ee80243de84259ab94c413cd093c4c64a'  # Replace with your SerpAPI key
weather_api_key = '3463038f0c9c76354f386c470f87824f'  # Replace with your Weather API key
sports_api_key = '07bdb0e0aamsh7475b04bc326ccdp10fad6jsnb83610355df6'  # Replace with your Sports API key

# Function to fetch AQI data
def fetch_aqi_data(city):
    url = f'http://api.airvisual.com/v2/city?city={city}&state=Delhi&country=India&key={aqi_api_key}'
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        return None

# Function to fetch weather data
def fetch_weather_data(city):
    url = f'http://api.weatherapi.com/v1/current.json?key={weather_api_key}&q={city}'
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        return None

# Function to fetch news using SerpAPI
def fetch_serp_data(query):
    url = f'https://serpapi.com/search?q={query}&api_key={serp_api_key}'
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        return None

# Function to fetch sports data using ESPN Rapid API
def fetch_sports_data():
    url = f'https://espn-api.com/sportsdata/{sports_api_key}/sports'  # Replace with the correct endpoint if needed
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        return None

# Function to predict news validity using DiOSBERT
def predict_news(news_text):
    inputs = tokenizer(news_text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    predicted_class_id = torch.argmax(probabilities, dim=-1).item()
    confidence = probabilities[0][predicted_class_id].item()
    return predicted_class_id, confidence

# Function to analyze sentiment of snippets
def analyze_sentiment(snippet):
    # Using TextBlob for sentiment analysis (returns polarity)
    analysis = TextBlob(snippet)
    return analysis.sentiment.polarity

# Function to determine news validity
def validate_news(news_text, query):
    # Predict the news validity using DiOSBERT
    prediction, confidence = predict_news(news_text)
    if prediction == 0:
        prediction_label = 'Fake News'
    else:
        prediction_label = 'Real News'

    # Fetch relevant news snippets using SerpAPI
    serp_data = fetch_serp_data(query)
    snippets = serp_data.get('organic_results', [])

    # Sentiment analysis of snippets to determine if they support the prediction
    positive_snippets = []
    negative_snippets = []
    for snippet in snippets:
        sentiment = analyze_sentiment(snippet['snippet'])
        if sentiment > di0:
            positive_snippets.append(snippet['snippet'])
        else:
            negative_snippets.append(snippet['snippet'])

    # Handle AQI specific query
    aqi_data = None
    if 'aqi' in query.lower():
        aqi_data = fetch_aqi_data('Delhi')

    # Handle Weather specific query
    weather_data = None
    if 'weather' in query.lower():
        weather_data = fetch_weather_data('Delhi')

    # Fetch sports data if the query is about sports
    sports_data = None
    if 'sports' in query.lower():
        sports_data = fetch_sports_data()

    # Prepare validation message
    validation_message = f"Prediction: {prediction_label} (Confidence: {confidence:.2f})\n"

    if aqi_data:
        aqi_value = aqi_data['data']['current']['pollution']['aqius']
        validation_message += f"AQI in Delhi is {aqi_value}.\n"

    if weather_data:
        temp = weather_data['current']['temp_c']
        validation_message += f"Current temperature in Delhi is {temp}°C.\n"

    if sports_data:
        validation_message += f"Sports Snippet: {sports_data.get('sports_headline', 'No relevant data available')}\n"

    # Adding categorized snippets to the validation message
    validation_message += "\nPositive Snippets:\n"
    for snippet in positive_snippets:
        validation_message += f"- {snippet}\n"

    validation_message += "\nNegative Snippets:\n"
    for snippet in negative_snippets:
        validation_message += f"- {snippet}\n"

    # Final validation message based on the number of positive and negative snippets
    if len(positive_snippets) > len(negative_snippets):
        validation_message += "\nValidation: Real News."
    else:
        validation_message += "\nValidation: Fake News."

    return validation_message

# Interactive chat loop
def start_chatbot():
    print("Welcome to the Universal Smart News Chatbot!")
    print("Type 'exit' to quit.")
    
    while True:
        user_input = input("You: ")
        if user_input.lower() == 'exit':
            break
        
        response = validate_news(user_input, user_input)
        print("Bot:", response)

if __name__ == "__main__":
    start_chatbot()
    

Welcome to the Universal Smart News Chatbot!
Type 'exit' to quit.


You:  delhi aqi is 800 today


Bot: Prediction: Fake News (Confidence: 0.97)
AQI in Delhi is 453.

Positive Snippets:
- ITI Jahangirpuri, Delhi AQI: ITI Jahangirpuri, Delhi Real-time Air Quality Index (AQI). 372. Hazardous. Updated on Thursday 6:00. temperature: 21°C. current ...
- Current New Delhi Air Quality Index (AQI) is 339 Severe level with real-time air pollution PM2.5 (168µg/m³), PM10 (316µg/m³), Temperature (17.7°C) in Delhi.
- The AQI in some areas of Delhi stood between 800-1100 - 'hazardous' category - at the time of last update in this report, as per Swiss air ...
- AQI values at or below 100 are generally thought of as satisfactory. When AQI values are above 100, air quality is unhealthy: at first for certain sensitive ...

Negative Snippets:
- Delhi Air Quality Index (AQI) is now Very unhealthy. Get real-time, historical and forecast PM2.5 and weather data. Read the air pollution in Delhi, ...
- Current Delhi Air Quality Index (AQI) is 308 Severe level with real-time air pollution PM2.5 (133µg/m³), P

You:  donald trump wins the elections


Bot: Prediction: Fake News (Confidence: 0.96)

Positive Snippets:
- It was clear, but not a landslide by historical standards. Trump won both the Electoral College and the popular vote; in fact, Trump this year ...
- View live election results from the 2024 presidential race as Kamala Harris and Donald Trump face off. See the map of votes by state as results are tallied.
- Republican President-elect Donald Trump has said his election victory handed him an “unprecedented and powerful” mandate to govern.
- There were 93 electoral votes at stake among the seven swing states. Mr. Trump needed at least 51 electoral votes from these states to secure ...
- Donald Trump can claim a lot out of his 2024 election win. What Trump cannot claim is a landslide victory, although that's how he will ...
- In a decisive victory, Trump wins a series of swing states, while his Republican Party also gains control of the Senate.
- Harris' message to supporters: Vice President Kamala Harris urged supporters t

You:  ISRAEL LOST WAE FROM GAZA


Bot: Prediction: Real News (Confidence: 0.91)

Positive Snippets:
- The 7 October Hamas attack overwhelmed Israel and utterly changed its face. The country experienced a tactical defeat after a colossal failure by Israeli ...
- After more than a year, Israel and Lebanon have reached a ceasefire agreement that effectively ends the war that Hezbollah began on October 8, ...

Negative Snippets:
- Failure to Achieve War Aims in Gaza​​ Israel is not achieving its war aims against Hamas. First, it has only obtained a handful of the hostages.
- Some argue that Israel has already lost the war in Gaza. Why? First, despite its stated aim of eliminating Hamas, Israel appears unlikely or ...
- Hamas already won on Oct. 7 when it embarrassed the Israeli military by overrunning bases, killing Israelis, and taking hostages.
- 62 soldiers have been killed in combat, and 15 civilians and two policemen have been killed in missile strikes and attacks inside Israel.
- Israel has lost its war on the besieg

In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import requests
from textblob import TextBlob

# Initialize tokenizer and model (update with your DiOSBERT model path)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('/Users/vineetahuja/Downloads/fake_news_detector_model')

# Define API keys
aqi_api_key = '0c55aeba-af32-4ced-989a-d4d5b0b535e9'
serp_api_key = '9f1822f370a6c6757264cf88cd1dd65ee80243de84259ab94c413cd093c4c64a'
weather_api_key = '3463038f0c9c76354f386c470f87824f'
sports_api_key = '07bdb0e0aamsh7475b04bc326ccdp10fad6'

# Function to fetch snippets using SerpAPI
def fetch_snippets(query):
    url = f'https://serpapi.com/search?q={query}&api_key={serp_api_key}'
    response = requests.get(url)
    if response.status_code == 200:
        return response.json().get('organic_results', [])
    return []

# Function for sentiment analysis of snippets
def analyze_sentiment(snippet):
    analysis = TextBlob(snippet)
    return analysis.sentiment.polarity

# Function to predict news validity
def predict_news(news_text):
    inputs = tokenizer(news_text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    predicted_class_id = torch.argmax(probabilities, dim=-1).item()
    confidence = probabilities[0][predicted_class_id].item()
    return predicted_class_id, confidence

# Function to validate news and respond with categorized snippets
def validate_news(news_text):
    # Predict using the model
    prediction, confidence = predict_news(news_text)
    prediction_label = 'Real News' if prediction == 1 else 'Fake News'
    
    # Fetch relevant snippets
    snippets = fetch_snippets(news_text)
    
    positive_snippets = []
    negative_snippets = []

    for snippet in snippets:
        snippet_text = snippet.get('snippet', '')
        sentiment = analyze_sentiment(snippet_text)
        if sentiment > 0:
            positive_snippets.append(snippet_text)
        else:
            negative_snippets.append(snippet_text)
    
    # Prepare the response
    response = f"Prediction: {prediction_label} (Confidence: {confidence:.2f})\n"
    response += "\n--- Positive Snippets ---\n"
    response += "\n".join([f"- {s}" for s in positive_snippets]) if positive_snippets else "No positive snippets found.\n"
    response += "\n\n--- Negative Snippets ---\n"
    response += "\n".join([f"- {s}" for s in negative_snippets]) if negative_snippets else "No negative snippets found.\n"
    
    # Final validation
    if len(positive_snippets) > len(negative_snippets):
        response += "\nFinal Validation: Real News."
    else:
        response += "\nFinal Validation: Fake News."
    
    return response

# BuzzBot Chat Interface
def start_buzzbot():
    print("Welcome to BuzzBot! A smart fake news detector.")
    print("Ask me about any news, and I'll validate it for you.")
    print("Type 'exit' to end the chat.\n")
    
    while True:
        user_input = input("You: ")
        if user_input.lower() == 'exit':
            print("BuzzBot: Thank you for using BuzzBot. Stay informed!")
            break
        else:
            try:
                bot_response = validate_news(user_input)
                print(f"BuzzBot:\n{bot_response}\n")
            except Exception as e:
                print(f"BuzzBot: Sorry, I encountered an error: {str(e)}\n")

# Run the chatbot
if __name__ == "__main__":
    start_buzzbot()

Welcome to BuzzBot! A smart fake news detector.
Ask me about any news, and I'll validate it for you.
Type 'exit' to end the chat.

