In [22]:
import asyncio
import aiohttp
import async_timeout
import os
import logging
import re
import nest_asyncio
import pandas as pd
from bs4 import BeautifulSoup
from transformers import pipeline
from pymongo import MongoClient
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Apply nest_asyncio to make async functions work in Jupyter.
# This must run before the asyncio entry point at the bottom of the file.
nest_asyncio.apply()

# Configure logging: INFO level and above, each record formatted as
# "<timestamp> [<LEVEL>] <message>", emitted through two handlers -- a stream
# handler and a file handler writing to "scraper.log" (a relative path).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler(), logging.FileHandler("scraper.log")]
)

# MongoDB Setup
# The connection string is taken from the MONGO_URI environment variable when
# it is set to a non-empty value, so credentials do not have to live in source.
# SECURITY NOTE(review): the fallback literal below embeds a username and
# password in plain text. It is kept only so existing runs keep working;
# rotate that credential and drop the literal once MONGO_URI is exported
# everywhere this file runs.
MONGO_URI = os.environ.get("MONGO_URI") or (
    "mongodb+srv://user:pass123ynm@cluster0.16nk9hq.mongodb.net/sentiment-analysis"
)
client = MongoClient(MONGO_URI)
# Database and collection used by store_results() and show_scraped_data().
db = client["sentiment-analysis"]
collection = db["scraped_articles"]

# Initialize the transformers pipelines once at import time so that
# summarize_text() and analyze_sentiment() can reuse them on every call.
logging.info("Initializing models...")
try:
    # Summarization pipeline backed by the "t5-small" model.
    summarizer = pipeline("summarization", model="t5-small")
    # Sentiment pipeline; its labels are translated through LABEL_MAP in
    # analyze_sentiment().
    transformer_classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
    logging.info("Models initialized successfully")
except Exception as e:
    # Log the failure, then re-raise: nothing below can work without the models.
    logging.error(f"Error initializing models: {e}")
    raise

# Custom headers to avoid blocking.
# fetch() passes this mapping as the request headers on every GET it issues.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/110.0.0.0 Safari/537.36"
}

# Define sentiment label mapping: classifier label -> lowercase label stored
# and displayed by this tool. analyze_sentiment() returns any label that is
# not a key here unchanged.
LABEL_MAP = {"NEGATIVE": "negative", "POSITIVE": "positive"}

# Function to clean text
def clean_text(text):
    """Normalize ``text`` for downstream text processing.

    The input is lowercased, then URLs, digit runs and punctuation are
    removed (in that order), whitespace is collapsed, and every word found in
    ``ENGLISH_STOP_WORDS`` is dropped.

    Args:
        text: The string to clean.

    Returns:
        The remaining words joined by single spaces.
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r'https?://\S+|www\.\S+', ''),  # URLs
        (r'\d+', ''),                    # numbers
        (r'[^\w\s]', ''),                # punctuation
        (r'\s+', ' '),                   # runs of whitespace
    )
    normalized = text.lower()
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)

    kept_words = (
        word
        for word in normalized.strip().split()
        if word not in ENGLISH_STOP_WORDS
    )
    return " ".join(kept_words)

# Async function to fetch a URL with retry logic
async def fetch(session, url, retries=3):
    """Download ``url`` and return the response body as text.

    Every attempt is capped at 10 seconds and sends the module-level
    ``HEADERS``. An attempt that fails with an aiohttp client error (which
    includes the error raised by ``raise_for_status()``) or a timeout is
    logged as a warning and retried after a 2-second pause.

    Args:
        session: Session object used to issue the GET request (the caller in
            this file passes an ``aiohttp.ClientSession``).
        url: Address to fetch.
        retries: Maximum number of attempts before giving up.

    Returns:
        The response text, or ``None`` when every attempt failed.
    """
    for attempt in range(retries):
        try:
            async with async_timeout.timeout(10):
                async with session.get(url, headers=HEADERS) as response:
                    response.raise_for_status()
                    return await response.text()
        # The timeout context manager raises asyncio.TimeoutError on expiry,
        # so catch that class directly instead of looking the name up on the
        # async_timeout module (a missing attribute there would turn every
        # failure into an AttributeError and defeat the retry loop).
        except (aiohttp.ClientError, asyncio.TimeoutError) as e:
            logging.warning(f"Attempt {attempt+1} for {url} failed: {e}")
            # Back off only when another attempt will actually follow.
            if attempt + 1 < retries:
                await asyncio.sleep(2)
    logging.error(f"Failed to fetch {url} after {retries} attempts.")
    return None

# Async function to scrape a URL
async def scrape_url(session, url):
    """Fetch ``url`` and pull the heading, body text and meta description.

    Args:
        session: Session object forwarded to ``fetch``.
        url: Address of the page to scrape.

    Returns:
        ``[heading_text, body_text, meta_desc, url]``, or ``None`` when the
        page could not be fetched (or came back empty).
    """
    page_source = await fetch(session, url)
    if not page_source:
        return None

    document = BeautifulSoup(page_source, "html.parser")

    # Heading: first <h1>, falling back to <title>, else a placeholder.
    title_node = document.find("h1") or document.find("title")
    if title_node:
        heading_text = title_node.get_text(strip=True)
    else:
        heading_text = "No Heading"

    # Body: the text of every <p>, joined by single spaces.
    body_text = " ".join(
        node.get_text(strip=True) for node in document.find_all("p")
    )

    # Meta description: stripped content attribute, or "" when absent/empty.
    description_node = document.find("meta", attrs={"name": "description"})
    description = description_node.get("content") if description_node else None
    meta_desc = description.strip() if description else ""

    return [heading_text, body_text, meta_desc, url]

# Function to summarize text
def summarize_text(text):
    """Return a short summary of ``text``.

    Only the first 512 characters are handed to the module-level
    ``summarizer`` pipeline. If summarization raises, the failure is logged
    and a plain preview of the text is returned instead.

    Args:
        text: The article body to summarize.

    Returns:
        The model summary; ``""`` for empty or whitespace-only input; or, on
        failure, the first 300 characters of ``text`` followed by ``"..."``
        only when the text was actually longer than that.
    """
    # Nothing to summarize: skip the model call entirely.
    if not text or not text.strip():
        return ""

    max_input_length = 512
    preview_length = 300
    model_input = text[:max_input_length]

    try:
        summary = summarizer(model_input, max_length=50, min_length=30, do_sample=False)
        return summary[0]['summary_text']
    except Exception as e:
        # Best-effort fallback: never let a model failure abort the pipeline.
        logging.warning(f"Summarization failed: {e}")
        preview = text[:preview_length]
        # Mark the preview as cut off only when something was really dropped.
        return preview + "..." if len(text) > preview_length else preview

# Function to analyze sentiment
def analyze_sentiment(text):
    """Classify the sentiment of ``text``.

    At most the first 512 characters are passed to the module-level
    ``transformer_classifier``. The predicted label is translated through
    ``LABEL_MAP``; labels missing from the map are returned unchanged.

    Args:
        text: The text to classify.

    Returns:
        A ``(sentiment, score)`` tuple, or ``("Error", None)`` when anything
        in the process raises (the error is logged).
    """
    char_limit = 512
    try:
        snippet = text if len(text) <= char_limit else text[:char_limit]
        prediction = transformer_classifier(snippet)[0]
        raw_label = prediction['label']
        return LABEL_MAP.get(raw_label, raw_label), prediction['score']
    except Exception as e:
        logging.error(f"Sentiment analysis failed: {e}")
        return "Error", None

# Store results in MongoDB
def store_results(data):
    """Insert ``data`` into the MongoDB collection unless its URL is present.

    Args:
        data: Article record; must contain a ``"url"`` key, which is used to
            look for an existing document.

    Returns:
        None. An info message is logged for either outcome.
    """
    already_stored = collection.find_one({"url": data["url"]})
    if not already_stored:
        collection.insert_one(data)
        logging.info("Data stored in MongoDB.")
        return
    logging.info("Article already exists in MongoDB. Skipping insertion.")

# Function to display the scraped data
def display_results(data):
    """Print a framed, human-readable report for one analysed article.

    Args:
        data: Mapping with the keys ``heading``, ``url``,
            ``meta_description``, ``summary`` and ``overall_sentiment``.
    """
    # (label shown to the user, key read from ``data``), in print order.
    report_fields = (
        ("Heading", "heading"),
        ("URL", "url"),
        ("Meta Description", "meta_description"),
        ("Summary", "summary"),
        ("Overall Sentiment", "overall_sentiment"),
    )
    print("\n===== Scraped Article Analysis =====")
    for label, key in report_fields:
        print(f"{label}: {data[key]}")
    print("====================================\n")

# Function to display previously scraped articles
def show_scraped_data():
    """Print heading, URL and sentiment for every stored article.

    Prints a notice and returns early when the collection holds no documents.
    """
    cursor = collection.find({})
    if not collection.count_documents({}):
        print("No articles found in the database.")
        return

    # (label shown to the user, key read from each document), in print order.
    listed_fields = (
        ("Heading", "heading"),
        ("URL", "url"),
        ("Overall Sentiment", "overall_sentiment"),
    )
    print("\n==== Scraped Articles ====")
    for position, document in enumerate(cursor, start=1):
        print(f"\n--- Article {position} ---")
        for label, key in listed_fields:
            print(f"{label}: {document[key]}")
    print("\n")

# Main function to process a URL
async def process_url(url):
    """Scrape, summarize, classify, store and display one article.

    Args:
        url: Address of the article to analyse.

    Returns:
        The record dict handed to ``store_results`` and ``display_results``
        (keys: ``heading``, ``meta_description``, ``url``, ``summary``,
        ``overall_sentiment``), or ``None`` when scraping failed.
    """
    logging.info(f"Processing URL: {url}")

    # The HTTP session is only needed for the scrape itself.
    async with aiohttp.ClientSession() as session:
        scraped = await scrape_url(session, url)

    if not scraped:
        logging.error(f"Failed to scrape URL: {url}")
        return

    heading, body, meta_desc, url = scraped

    summary = summarize_text(body)
    # Only the label is kept; the confidence score is discarded.
    sentiment_label = analyze_sentiment(body)[0]

    record = dict(
        heading=heading,
        meta_description=meta_desc,
        url=url,
        summary=summary,
        overall_sentiment=sentiment_label,
    )

    # Store first, then display.
    store_results(record)
    display_results(record)
    return record

# Interactive menu
async def interactive_menu():
    """Run the text menu until the user chooses to exit.

    Option 1 prompts for a URL and analyses it, option 2 lists stored
    articles, option 3 leaves the loop; anything else prints an error and
    shows the menu again.
    """
    menu_lines = (
        "\n==== Web Scraping and Sentiment Analysis Tool ====",
        "1. Analyze a new article",
        "2. View previously scraped articles",
        "3. Exit",
    )
    while True:
        for line in menu_lines:
            print(line)
        choice = input("Enter your choice (1-3): ").strip()

        if choice == "3":
            print("Exiting. Goodbye!")
            break

        if choice == "2":
            show_scraped_data()
            continue

        if choice != "1":
            print("Invalid choice. Please try again.")
            continue

        # choice == "1": analyse a new article.
        url = input("Enter the URL of the article to analyze: ").strip()
        if not url:
            print("No URL provided.")
            continue
        await process_url(url)

# Entry point
if __name__ == "__main__":
    # Decide up front how to drive the coroutine rather than catching
    # RuntimeError around the whole run: a RuntimeError raised from inside
    # the menu must be reported, not treated as "a loop is already running"
    # (which would silently start the menu a second time).
    try:
        asyncio.get_running_loop()
        _loop_already_running = True
    except RuntimeError:
        _loop_already_running = False

    try:
        if _loop_already_running:
            # A loop is already running; re-entering it relies on the
            # nest_asyncio.apply() call made at the top of this file.
            asyncio.get_event_loop().run_until_complete(interactive_menu())
        else:
            asyncio.run(interactive_menu())
    except Exception as e:
        logging.error(f"Unexpected error: {e}")
        print(f"An unexpected error occurred: {e}")
    finally:
        # Close the MongoDB client exactly once, whatever happened above.
        client.close()
        logging.info("MongoDB connection closed.")
        


2025-03-31 14:10:19,631 [INFO] Initializing models...
Device set to use cpu
Device set to use cpu
2025-03-31 14:10:20,658 [INFO] Models initialized successfully



==== Web Scraping and Sentiment Analysis Tool ====
1. Analyze a new article
2. View previously scraped articles
3. Exit


2025-03-31 14:10:28,192 [INFO] Processing URL: https://edition.cnn.com/2025/03/30/europe/europe-defense-wake-up-ukraine-russia-trump-intl/index.html
2025-03-31 14:10:30,844 [INFO] Article already exists in MongoDB. Skipping insertion.



===== Scraped Article Analysis =====
Heading: ‘PATHETIC’ Europe may finally be waking up from its military slumber
URL: https://edition.cnn.com/2025/03/30/europe/europe-defense-wake-up-ukraine-russia-trump-intl/index.html
Meta Description: It was a televised ambush that many in Europe hope will stop a war.
Summary: a televised ambush that many in Europe hope will stop a war . it was a lightning strike to the transatlantic alliance, dispelling lingering illusions . "it is as if
Overall Sentiment: positive


==== Web Scraping and Sentiment Analysis Tool ====
1. Analyze a new article
2. View previously scraped articles
3. Exit


2025-03-31 14:10:55,425 [INFO] MongoDB connection closed.


Exiting. Goodbye!
