<H1>Project 7</H1>
Load RSS content and then extract content from each link. Do this in multiple threads<br/>
<H3>Requirements</H3>
Load an RSS xml file (Format: https://www.w3schools.com/xml/xml_rss.asp)<br/>
Loop through each link<br/>
Extract content from each link and write to “output.txt”<br/>
Execute reading from multiple links in parallel<br/>
<H3>Error Handling</H3>
Take care of case where no RSS xml file is available<br/>
Take care of case where xml file is empty<br/>


In [None]:
#RSS files

#https://feeds.washingtonpost.com/rss/business/technology
#https://news.yahoo.com/rss/science
#https://feeds.feedburner.com/TechCrunch/

In [23]:
import requests
import bs4
import xml.etree.ElementTree as ET
import os
from concurrent.futures import ThreadPoolExecutor,as_completed

RSS_URL = "https://news.yahoo.com/rss/science"
MAX_THREADS = 5
OUTPUT_FILE = "output.txt"

def fetch_rss_feed_content(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = bs4.BeautifulSoup(response.text,"lxml")
        
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching RSS feed: {e}")
        return None

# Function to fetch content from a link
def fetch_content_from_link(link):
    try:
        response = requests.get(link)
        response.raise_for_status()
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        
        content = soup.get_text()
        return content.strip()  # Strip to avoid extra whitespaces
    except requests.RequestException as e:
        print(f"Error fetching content from {link}: {e}")
        return None
        
def parse_rss_feed_contents(rss_feed_content):
    try:
        tree = ET.fromstring(rss_feed_content)
    
        links = []
        for item_list in tree.findall('.//item'):
            for link in item_list.iter('link'):
                links.append(link.text)
        return links
    except Exception as e:
        print(f"Error parsing RSS feed: {e}")
        return []
        
def write_to_output_file(output_file, content, link):
    try:
        with open(output_file, "a", encoding="utf-8") as f:
            f.write(f"\nThe contents from link [{link}] : \n")
            f.write(content+"\n\n")
    except Exception as e:
        print("Error while writing to file", e.args('msg'))
        return None

def process_rss_feed():
    rss_content = fetch_rss_feed_content(RSS_URL)
    if not rss_content:
        print("Failed to fetch RSS feed.")
        return

    links = parse_rss_feed_contents(rss_content)
    if not links:
        print("No links found in the RSS feed.")
        return

    # Create a ThreadPoolExecutor to fetch content in parallel
    with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
        futures = {executor.submit(fetch_content_from_link, link): link for link in links}
        print(f"\nfutures : {futures}")
        print(f"\nas_completed(futures) : {as_completed(futures)}")
        for future in as_completed(futures):
            print(f"\nfuture : {future}")
            print(f"\nfutures[future] : {futures[future]}")

            link = futures[future]
            try:
                content = future.result()
                if content:
                    print(f"Writing content from {link} to file.")
                    write_to_output_file(OUTPUT_FILE, content, link)
            except Exception as e:
                print(f"Error processing {link}: {e}")
    
if __name__ == "__main__":
    if os.path.exists(OUTPUT_FILE):
        os.remove(OUTPUT_FILE)  # Clear output file if it already exists

    process_rss_feed()

print("completed processing!")

  soup = bs4.BeautifulSoup(response.text,"lxml")



futures : {<Future at 0x169e2eba0 state=running>: 'https://www.yahoo.com/news/3-dozen-high-rise-buildings-173141377.html', <Future at 0x17a121370 state=running>: 'https://www.yahoo.com/news/two-stars-may-orbiting-other-160052730.html', <Future at 0x173f048f0 state=running>: 'https://www.yahoo.com/news/water-destruction-deadly-heat-associated-120220230.html', <Future at 0x179ae43b0 state=running>: 'https://www.yahoo.com/news/meet-endurance-pioneering-nasa-moon-110000420.html', <Future at 0x179ab2ab0 state=running>: 'https://www.yahoo.com/news/attackers-cannibalized-victims-early-bronze-231343442.html'}

as_completed(futures) : <generator object as_completed at 0x167613940>

future : <Future at 0x17a121370 state=finished returned str>

futures[future] : https://www.yahoo.com/news/two-stars-may-orbiting-other-160052730.html
Writing content from https://www.yahoo.com/news/two-stars-may-orbiting-other-160052730.html to file.

future : <Future at 0x169e2eba0 state=finished returned str>

fu