<a href="https://colab.research.google.com/github/yashaswini-cyber/news-aggregator-summarizer/blob/main/news_summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
import sys
import os
import time

# Debugging Information: report which interpreter and process are running this script
print(f"Python executable being used: {sys.executable}")
print(f"Process ID: {os.getpid()}")

# NewsAPI credentials.
# The NEWSAPI_KEY environment variable takes precedence so a key never has to be
# edited into this file.
# NOTE(review): the fallback below is a key committed to source control; it is kept
# only so existing runs keep working — rotate it and remove the literal.
API_KEY = os.environ.get('NEWSAPI_KEY') or '98afa421e8784abd93f1eb00510347d1'
# Top-headlines endpoint (country=us), authenticated through the apiKey query parameter
URL = f'https://newsapi.org/v2/top-headlines?country=us&apiKey={API_KEY}'

# Fetch the news articles
try:
    print("Fetching news articles...")
    # A timeout keeps the script from hanging indefinitely on a stalled connection
    response = requests.get(URL, timeout=10)
    response.raise_for_status()  # Check for HTTP request errors
    news_data = response.json()
except (requests.exceptions.RequestException, ValueError) as e:
    # ValueError covers a non-JSON response body on requests versions whose
    # JSON decoding error is not a RequestException subclass
    raise SystemExit(f"Failed to fetch news articles: {e}")

# `or []` also covers a payload that carries "articles": null
articles = news_data.get('articles') or []
if not articles:
    raise Exception("No articles found")

# Function to extract article content using BeautifulSoup
def get_article_content(url, max_chars=4000):
    """Download a page and return the text of its <p> elements.

    Args:
        url: Address of the article page to download.
        max_chars: Maximum number of characters of text to return
            (defaults to 4000, the cap used before this became a parameter).

    Returns:
        The whitespace-normalised paragraph text, truncated to ``max_chars``
        characters, or None when the page could not be fetched/parsed or
        contains no paragraph text.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract text from paragraphs
        raw_text = ' '.join(para.get_text() for para in soup.find_all('p'))

        # Collapse newlines, tabs and repeated spaces: this keeps layout
        # whitespace from consuming the character budget and turns
        # whitespace-only text into '' so it is reported as "no content"
        article_text = ' '.join(raw_text.split())

        # Ensure article is not empty and truncate to bound downstream input size
        return article_text[:max_chars] if article_text else None
    except Exception as e:
        # Deliberately broad: extraction is best-effort, and one bad page
        # must not abort processing of the remaining articles
        print(f"Failed to extract article content from {url}: {e}")
        return None

# Extract content for all articles: collect (title, url, content) for every
# article whose page yielded text
article_contents = []
for entry in articles:
    headline = entry['title']
    print(f"Fetching article: {headline}")
    link = entry['url']
    body = get_article_content(link)
    if body:
        article_contents.append((headline, link, body))
    # Pause after every request, successful or not, to avoid request rate limits
    time.sleep(1)

# Nothing to summarize if every extraction failed
if not article_contents:
    raise Exception("No valid article contents extracted")

# Initialize the summarization pipeline (device=-1 selects the CPU)
try:
    print("Initializing summarizer...")
    summarizer = pipeline(
        task='summarization',
        model='facebook/bart-large-cnn',
        device=-1,
    )
except Exception as init_error:
    # Without a summarizer the rest of the script cannot do anything useful
    raise SystemExit(f"Failed to initialize summarization pipeline: {init_error}")

# Function to summarize article content
def summarize_article(content):
    """Summarize article text with the module-level ``summarizer`` pipeline.

    Args:
        content: Plain article text to summarize.

    Returns:
        The generated summary string, or None when ``content`` is empty or
        blank, or when the pipeline raises.
    """
    # Nothing to summarize: skip the model call entirely
    if not content or not content.strip():
        return None
    try:
        # Let the tokenizer clip the input to the model's maximum length.
        # The limit is expressed in tokens, so slicing the string to 1024
        # characters both discarded most of the usable input and did not
        # actually bound the token count.
        summary = summarizer(
            content,
            max_length=130,
            min_length=30,
            do_sample=False,
            truncation=True,
        )
        return summary[0]['summary_text']
    except Exception as e:
        # Best-effort: report the failure and let the caller skip this article
        print(f"Failed to summarize article content: {e}")
        return None

# Summarize all articles: collect (title, summary, url) for each article the
# summarizer handled successfully
summaries = []
for headline, link, body in article_contents:
    print(f"Summarizing article: {headline}")
    digest = summarize_article(body)
    if not digest:
        print(f"Skipping article due to summarization failure: {headline}")
        continue
    summaries.append((headline, digest, link))

# Abort when not a single article could be summarized
if not summaries:
    raise Exception("No summaries generated")

# Print titles, summaries, and links, one blank-line-separated entry per article
print("\n--- News Summaries ---\n")
for headline, digest, link in summaries:
    print(f"Title: {headline}\nSummary: {digest}\nLink: {link}\n")


Python executable being used: /usr/bin/python3
Process ID: 3555
Fetching news articles...
Fetching article: Live Briefing: Gaza death toll crosses 45,000, Health Ministry says - The Washington Post
Fetching article: What we learned in NFL Week 15: MVP is Josh Allen’s to lose, Packers stay hot - The Athletic - The Athletic
Fetching article: The federal EV tax credit has an uncertain fate. What car shoppers should know - NPR
Fetching article: Kraven the Hunter Has Worst Ever Box Office Opening Weekend for a Sony Spider-Man Universe Movie, Even Below Madame Web - IGN
Fetching article: Memphis man convicted of killing his bride on their honeymoon in Fiji in 2022 - CBS News
Fetching article: Seven hotel guests suffer suspected poisoning after drinking cocktails at 5-star Fiji resort - CNN
Fetching article: Expect an Icy Commute for Parts of the Northeast on Monday Morning - The New York Times
Fetching article: Schumer urges Biden admin to deploy "special drone-detection tech" to N.Y. and N.