In [23]:
import feedparser
import re
from bs4 import BeautifulSoup

def parse_rss_feed(rss_url):
    """
    Parse an RSS feed and extract specific fields from each item.

    Args:
        rss_url (str): The URL of the RSS feed to parse

    Returns:
        list: A list of dictionaries containing the extracted fields
    """
    # Parse the RSS feed
    feed = feedparser.parse(rss_url)

    extracted_data = []

    # Iterate through each item in the feed
    for entry in feed.entries:
        item_data = {
            "title": clean_text(entry.get('title', '')),
            "link": entry.get('link', ''),
            "description": clean_html(entry.get('description', '')),
            "pub_date": entry.get('published', ''),
            "content": clean_html(get_content(entry))
        }
        extracted_data.append(item_data)

    return extracted_data

def get_content(entry):
    """
    Extract content from RSS entry, prioritizing content module if available.

    Args:
        entry: A feedparser entry object

    Returns:
        str: The content text
    """
    # Try to get content from content module first
    if hasattr(entry, 'content'):
        for content in entry.content:
            if hasattr(content, 'value'):
                return content.value

    # Fallback to description if content is not available
    return entry.get('description', '')

def clean_html(html_text):
    """
    Remove HTML tags and clean the text.

    Args:
        html_text (str): Text containing HTML tags

    Returns:
        str: Clean text without HTML tags
    """
    if not html_text:
        return ""

    # Use BeautifulSoup to remove HTML tags
    soup = BeautifulSoup(html_text, 'html.parser')
    clean_text = soup.get_text(separator=' ', strip=True)

    # Remove extra whitespace and clean up
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()

    return clean_text

def clean_text(text):
    """
    Basic text cleaning (remove extra whitespace, etc.)

    Args:
        text (str): Text to clean

    Returns:
        str: Cleaned text
    """
    if not text:
        return ""

    return re.sub(r'\s+', ' ', text).strip()

# Example usage
if __name__ == "__main__":
    # Extract the RSS URL from the snippet
    rss_url = "https://nawaat.org/feed/"

    # Parse the feed
    data = parse_rss_feed(rss_url)

    # Print the extracted data in a clean format
    for i, item in enumerate(data, 1):
        print(f"Item {i}:")
        print(f"  Title: {item['title']}")
        print(f"  Link: {item['link']}")
        print(f"  Description: {(item['description'])}")
        print(f"  Publication Date: {item['pub_date']}")
        print(f"  Content: {(item['content'])}")
        print("-" * 80)

def truncate_text(text, max_length):
    """
    Truncate text to specified length and add ellipsis if needed.

    Args:
        text (str): Text to truncate
        max_length (int): Maximum length before truncation

    Returns:
        str: Truncated text
    """
    if len(text) <= max_length:
        return text
    return text[:max_length].rsplit(' ', 1)[0] + '...'

Item 1:
  Title: لن نخشى دفع أي ثمن لمواجهة الاحتلال الصهيوني، حوار مع عمر الخطيب
  Link: https://nawaat.org/2025/09/18/%d9%84%d9%86-%d9%86%d8%ae%d8%b4%d9%89-%d8%af%d9%81%d8%b9-%d8%a3%d9%8a-%d8%ab%d9%85%d9%86-%d9%84%d9%85%d9%88%d8%a7%d8%ac%d9%87%d8%a9-%d8%a7%d9%84%d8%a7%d8%ad%d8%aa%d9%84%d8%a7%d9%84-%d8%a7%d9%84%d8%b5/
  Description: لم تكتف آلة القتل والاجرام الصهيونية بتدمير قطاع غزّة وإبادة سكّانه، بل جعلت معتقلاتها قبورا للأحياء مارست خلف أسوارها شتّى أنواع الجرائم والتنكيل في حقّ الأسرى الفلسطينيّين. بعد اعتقال دام 16 شهرا في سجن النقب، تستضيف نواة الباحث والأسير المحرّر عمر الخطيب ليقدّم شهادة حيّة عن المسالخ البشريّة للاحتلال وليتحدًث عن واقع الحراك الشبابي في فلسطين المحتلة.
  Publication Date: Thu, 18 Sep 2025 17:45:34 +0000
  Content: لم تكتف آلة القتل والاجرام الصهيونية بتدمير قطاع غزّة وإبادة سكّانه، بل جعلت معتقلاتها قبورا للأحياء مارست خلف أسوارها شتّى أنواع الجرائم والتنكيل في حقّ الأسرى الفلسطينيّين. بعد اعتقال دام 16 شهرا في سجن النقب، تستضيف نواة الباحث والأسير المحرّ

In [24]:
import feedparser
from bs4 import BeautifulSoup
import html

def parse_rss_feed(url):
    """
    Parse RSS feed and extract required fields with HTML cleaning
    """
    # Parse the RSS feed
    feed = feedparser.parse(url)

    extracted_data = []

    for entry in feed.entries:
        # Extract basic fields
        title = getattr(entry, 'title', '')
        link = getattr(entry, 'link', '')
        pub_date = getattr(entry, 'published', getattr(entry, 'pubDate', ''))

        # Extract and clean description
        description = getattr(entry, 'description', '')
        description = clean_html_content(description)

        # Extract content (try multiple possible fields)
        content = getattr(entry, 'content', [{}])[0].get('value', '') if hasattr(entry, 'content') else ''
        if not content:
            content = getattr(entry, 'summary', '')
        content = clean_html_content(content)

        # Create entry dictionary
        entry_data = {
            "title": clean_text(title),
            "link": link,
            "description": description,
            "pub_date": pub_date,
            "content": content
        }

        extracted_data.append(entry_data)

    return extracted_data

def clean_html_content(html_content):
    """
    Clean HTML content using BeautifulSoup to extract plain text
    """
    if not html_content:
        return ""

    try:
        # Parse HTML content
        soup = BeautifulSoup(html_content, 'html.parser')

        # Remove unwanted tags but keep text content
        for element in soup(['script', 'style', 'img', 'iframe', 'form', 'input', 'button']):
            element.decompose()

        # Get clean text
        clean_text = soup.get_text(separator=' ', strip=True)

        # Decode HTML entities and clean up whitespace
        clean_text = html.unescape(clean_text)
        clean_text = ' '.join(clean_text.split())

        return clean_text

    except Exception as e:
        print(f"Error cleaning HTML content: {e}")
        # Fallback: return original content with basic cleaning
        return clean_text(html_content)

def clean_text(text):
    """
    Basic text cleaning for non-HTML fields
    """
    if not text:
        return ""

    text = html.unescape(text)
    text = ' '.join(text.split())
    return text

# Example usage with the provided RSS feed URL
if __name__ == "__main__":
    rss_url = "http://assarih.com/feed/"

    try:
        data = parse_rss_feed(rss_url)

        # Print extracted data
        for i, entry in enumerate(data, 1):
            print(f"Entry {i}:")
            print(f"Title: {entry['title']}")
            print(f"Link: {entry['link']}")
            print(f"Description: {entry['description']}")
            print(f"Publication Date: {entry['pub_date']}")
            print(f"Content: {entry['content']}")
            print("-" * 80)

    except Exception as e:
        print(f"Error parsing RSS feed: {e}")

Entry 1:
Title: جون أفريك: 2 من كل 3 مسلمين في فرنسا ضحايا للعنصرية
Link: http://assarih.com/%d8%ac%d9%88%d9%86-%d8%a3%d9%81%d8%b1%d9%8a%d9%83-2-%d9%85%d9%86-%d9%83%d9%84-3-%d9%85%d8%b3%d9%84%d9%85%d9%8a%d9%86-%d9%81%d9%8a-%d9%81%d8%b1%d9%86%d8%b3%d8%a7-%d8%b6%d8%ad%d8%a7%d9%8a%d8%a7-%d9%84/
Description: كشفت دراسة أجراها معهد “إيفوب” بدعم من جامع باريس الكبير عن أشكال التمييز المختلفة التي يواجهها المسلمون المقيمون في فرنسا، والنتيجة كانت صادمة.. حيث أكدت نتائج الدراسة أن ‘الإسلاموفوبيا’ في تصاعد مطرد، وذلك بحسب ما أوردته مجلة جون أفريك الفرنسية. سؤال مباشر وأوضحت المجلة أن سؤال الدراسة كان مباشرا: “هل تعتقد أنك كنت ضحية سلوكيات …
Publication Date: Thu, 18 Sep 2025 22:11:54 +0000
Content: كشفت دراسة أجراها معهد “إيفوب” بدعم من جامع باريس الكبير عن أشكال التمييز المختلفة التي يواجهها المسلمون المقيمون في فرنسا، والنتيجة كانت صادمة.. حيث أكدت نتائج الدراسة أن ‘الإسلاموفوبيا’ في تصاعد مطرد، وذلك بحسب ما أوردته مجلة جون أفريك الفرنسية. سؤال مباشر وأوضحت المجلة أن سؤال الدراسة كان مباشرا: 

In [27]:
import feedparser
from bs4 import BeautifulSoup
import re

def extract_rss_feed_data(rss_url):
    """
    Extract and clean RSS feed data from the given URL
    """
    # Parse the RSS feed
    feed = feedparser.parse(rss_url)

    extracted_data = []

    for entry in feed.entries:
        # Extract basic fields
        title = entry.get('title', '')
        link = entry.get('link', '')
        pub_date = entry.get('published', entry.get('pubDate', ''))

        # Clean description using BeautifulSoup
        description_html = entry.get('description', '')
        description_clean = clean_html(description_html)
        description_clean = remove_boilerplate(description_clean, title)

        # Extract content (try multiple possible fields)
        content_html = entry.get('content', [{}])[0].get('value', '') if 'content' in entry else ''
        if not content_html:
            content_html = entry.get('content:encoded', '')

        content_clean = clean_html(content_html)
        content_clean = remove_boilerplate(content_clean, title)

        # Create data dictionary
        item_data = {
            "title": title,
            "link": link,
            "description": description_clean,
            "pub_date": pub_date,
            "content": content_clean
        }

        extracted_data.append(item_data)

    return extracted_data

def clean_html(html_content):
    """
    Clean HTML content using BeautifulSoup to extract text only
    """
    if not html_content:
        return ""

    # Parse HTML with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove unwanted tags (script, style, etc.)
    for unwanted_tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
        unwanted_tag.decompose()

    # Get clean text
    clean_text = soup.get_text(separator=' ', strip=True)

    # Clean up extra whitespace
    clean_text = ' '.join(clean_text.split())

    return clean_text

def remove_boilerplate(text, title):
    """
    Remove repetitive boilerplate text from the content
    """
    if not text:
        return ""

    # Pattern to match the repetitive boilerplate text
    patterns = [
        r"L'article.*est apparu en premier sur WMC\..*$",
        r"L'article.*est apparu en premier sur WMC.*$",
        r"est apparu en premier sur WMC\..*$",
        r"est apparu en premier sur WMC.*$"
    ]

    # Try each pattern to remove boilerplate
    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE)

    # Also remove any trailing "L'article" references that might be left
    text = re.sub(r"L'article.*$", '', text)

    # Clean up any extra whitespace created by the removal
    text = text.strip()

    # Remove any trailing ellipsis or incomplete sentences
    text = re.sub(r'\[…\]$|\.\.\.$|…$', '', text).strip()
    text = re.sub(r'\[\…\]$', '', text).strip()

    return text

# Example usage
if __name__ == "__main__":
    rss_url = "https://www.webmanagercenter.com/feed/"

    try:
        # Extract data from RSS feed
        data = extract_rss_feed_data(rss_url)

        # Print the extracted data
        for i, item in enumerate(data, 1):
            print(f"Item {i}:")
            print(f"Title: {item['title']}")
            print(f"Link: {item['link']}")
            print(f"Description: {item['description']}")
            print(f"Publication Date: {item['pub_date']}")
            print(f"Content: {item['content']}")
            print("-" * 80)

    except Exception as e:
        print(f"Error parsing RSS feed: {e}")

Item 1:
Title: Flottille mondiale Soumoud : la Grèce envoie six bateaux vers Gaza
Link: https://www.webmanagercenter.com/2025/09/19/551857/flottille-mondiale-soumoud-la-grece-envoie-six-bateaux-vers-gaza/
Description: Le Comité international pour briser le blocus de Gaza a annoncé le départ de six navires grecs pour rejoindre la Flottille de la résistance mondiale en route vers la Palestine. À bord se trouvent 26 militants grecs et 20 militants internationaux venus apporter leur solidarité. Les bateaux portent les noms de : « Oxygène », […] L’article Flottille mondiale Soumoud : la Grèce envoie six bateaux vers Gaza
Publication Date: Fri, 19 Sep 2025 14:07:49 +0000
Content: Le Comité international pour briser le blocus de Gaza a annoncé le départ de six navires grecs pour rejoindre la Flottille de la résistance mondiale en route vers la Palestine. À bord se trouvent 26 militants grecs et 20 militants internationaux venus apporter leur solidarité. Les bateaux portent les noms de : « Oxy

In [28]:
import feedparser
from bs4 import BeautifulSoup
import re

def clean_html_content(text):
    """Remove HTML tags and clean the text using BeautifulSoup"""
    if not text:
        return ""

    # Parse with BeautifulSoup
    soup = BeautifulSoup(text, 'html.parser')

    # Get clean text
    clean_text = soup.get_text(separator=' ', strip=False)

    # Clean up any remaining HTML entities and extra whitespace
    clean_text = re.sub(r'\s+', ' ', clean_text)  # Replace multiple spaces with single space
    clean_text = clean_text.strip()

    return clean_text

def extract_rss_feed_data(url):
    """Extract and clean data from RSS feed"""
    # Parse the RSS feed
    feed = feedparser.parse(url)

    results = []

    for entry in feed.entries:
        # Extract the required fields
        title = clean_html_content(entry.get('title', ''))
        link = entry.get('link', '')

        # Handle description (could be in different fields)
        description = clean_html_content(entry.get('description', ''))
        if not description:
            description = clean_html_content(entry.get('summary', ''))

        # Handle publication date
        pub_date = entry.get('published', '')
        if not pub_date:
            pub_date = entry.get('pubDate', '')

        # Handle content (could be in different fields)
        content = ''
        if hasattr(entry, 'content'):
            content = clean_html_content(entry.content[0].value)
        elif hasattr(entry, 'content_encoded'):
            content = clean_html_content(entry.content_encoded)
        elif hasattr(entry, 'description'):
            # Fallback to description if no specific content field
            content = clean_html_content(entry.description)

        # Create result dictionary
        result = {
            "title": title,
            "link": link,
            "description": description,
            "pub_date": pub_date,
            "content": content
        }

        results.append(result)

    return results

# Extract the RSS URL from the snippet
rss_url = "https://www.leconomistemaghrebin.com/feed/"

# Parse and extract data
try:
    extracted_data = extract_rss_feed_data(rss_url)

    # Print the extracted data without truncation
    for i, item in enumerate(extracted_data, 1):
        print(f"Item {i}:")
        print(f"Title: {item['title']}")
        print(f"Link: {item['link']}")
        print(f"Description: {item['description']}")
        print(f"Publication Date: {item['pub_date']}")
        print(f"Content: {item['content']}")
        print("-" * 80)

except Exception as e:
    print(f"Error parsing RSS feed: {e}")

Item 1:
Title: Leith Ben Becher : « Notre système fiscal agricole est injuste »
Link: https://www.leconomistemaghrebin.com/2025/09/19/leith-ben-beche-systeme-fiscal-agricole/
Description: Le système fiscal appliqué à l’agriculture ne favorise pas les exploitants agricoles, principalement parce qu’il ne prend pas en compte les spécificités fondamentales du métier d’agriculteur. C’est le constat dressé par Leith Ben Becher, président d’honneur de l’association pour l’agriculture durable et fondateur du Syndicat des agriculteurs de Tunisie. Notre invité dénonce le système fiscal […] L’article Leith Ben Becher : « Notre système fiscal agricole est injuste » est apparu en premier sur Leconomiste Maghrebin .
Publication Date: Fri, 19 Sep 2025 17:07:53 +0000
Content: Le système fiscal appliqué à l’agriculture ne favorise pas les exploitants agricoles, principalement parce qu’il ne prend pas en compte les spécificités fondamentales du métier d’agriculteur. C’est le constat dressé par Leith Ben

In [34]:
import feedparser
from bs4 import BeautifulSoup
import html
import re

def clean_html_content(text):
    """
    Remove HTML tags and clean text content using BeautifulSoup
    """
    if not text:
        return ""

    # Parse with BeautifulSoup to remove HTML tags
    soup = BeautifulSoup(text, 'html.parser')
    clean_text = soup.get_text(separator=' ', strip=True)

    # Decode HTML entities and clean up whitespace
    clean_text = html.unescape(clean_text)
    clean_text = ' '.join(clean_text.split())

    return clean_text

def extract_description_and_content(text):
    """
    Extract description (after "The post") and content (before "[...]") from HTML text
    """
    # Parse the HTML content
    soup = BeautifulSoup(text, 'html.parser')

    # Extract all text content
    full_text = soup.get_text(separator=' ', strip=True)

    # Extract content before "[...]" - look for the main content
    content_match = re.search(r'^(.*?)\s*\[\.\.\.\]', full_text)
    content = content_match.group(1).strip() if content_match else full_text

    # Extract description from the "The post" line
    description_match = re.search(r'The post\s+(.*?)\s+appeared first', full_text)
    if description_match:
        description = description_match.group(1).strip()
    else:
        # Fallback: if no "The post" pattern found, try to get the title from the link
        post_link = soup.find('a', rel='nofollow')
        if post_link:
            description = post_link.get_text(strip=True)
        else:
            # Final fallback: use first part of content
            words = content.split()
            description = ' '.join(words[:min(10, len(words))]) + ('...' if len(words) > 10 else '')

    return description, content

def extract_rss_feed_data(url):
    """
    Extract and clean data from RSS feed
    """
    # Parse the RSS feed
    feed = feedparser.parse(url)

    extracted_data = []

    for entry in feed.entries:
        # Extract basic fields
        title = clean_html_content(entry.get('title', ''))
        link = entry.get('link', '')
        pub_date = entry.get('published', entry.get('pubDate', ''))

        # Get the raw description (HTML content)
        raw_description = entry.get('description', '')

        # Extract description and content from the HTML description field
        description, content = extract_description_and_content(raw_description)

        # Create result dictionary
        result = {
            "title": title,
            "link": link,
            "description": description,
            "pub_date": pub_date,
            "content": content
        }

        extracted_data.append(result)

    return extracted_data

# Example usage with the RSS URL from your snippet
if __name__ == "__main__":
    rss_url = "https://radioexpressfm.com/fr/feed/"

    try:
        # Extract data from RSS feed
        data = extract_rss_feed_data(rss_url)

        # Print the extracted data without truncation
        for i, item in enumerate(data, 1):
            print(f"Item {i}:")
            print(f"Title: {item['title']}")
            print(f"Link: {item['link']}")
            print(f"Description: {item['description']}")
            print(f"Publication Date: {item['pub_date']}")
            print(f"Content: {item['content']}")
            print("-" * 80)

    except Exception as e:
        print(f"Error parsing RSS feed: {e}")

Item 1:
Title: Baisse des recettes des exportations de l’huile d’olive et des dattes
Link: https://radioexpressfm.com/fr/actualites/baisse-des-recettes-des-exportations-de-lhuile-dolive-et-des-dattes/
Description: Baisse des recettes des exportations de l’huile d’olive et des dattes
Publication Date: Fri, 19 Sep 2025 14:25:12 +0000
Content: Les recettes des exportations de l’huile d’olive et des dattes ont baissé respectivement de 29,5% et de 6%, selon le bulletin mensuel « ONAGRI vigilance », publié, vendredi, par l’Observatoire National de l’Agriculture. Huile d’olive : baisse de de 29,5% des recettes des exportations Les recettes des exportations de l’huile d’olive tunisienne
--------------------------------------------------------------------------------
Item 2:
Title: Produits agricoles biologiques : les recettes des exportations atteignent 558,3 MD
Link: https://radioexpressfm.com/fr/actualites/produits-agricoles-biologiques-les-recettes-des-exportations-atteignent-5583-md/
Descr

In [35]:
import feedparser
from bs4 import BeautifulSoup
import html

def clean_html_content(text):
    """Remove HTML tags and clean the text content"""
    if not text:
        return ""

    # Parse HTML content
    soup = BeautifulSoup(text, 'html.parser')

    # Remove unwanted tags but keep text content
    for element in soup(['script', 'style', 'img', 'iframe', 'form', 'input', 'button']):
        element.decompose()

    # Get clean text and unescape HTML entities
    clean_text = soup.get_text(separator=' ', strip=True)
    clean_text = html.unescape(clean_text)

    # Remove extra whitespace
    clean_text = ' '.join(clean_text.split())

    return clean_text

def parse_rss_feed(rss_url):
    """Parse RSS feed and extract required fields"""
    feed = feedparser.parse(rss_url)

    articles = []

    for entry in feed.entries:
        article = {
            "title": clean_html_content(entry.get('title', '')),
            "link": entry.get('link', ''),
            "description": clean_html_content(entry.get('description', '')),
            "pub_date": entry.get('published', entry.get('pubDate', '')),
            "content": clean_html_content(entry.get('content', [{}])[0].get('value', '')) if entry.get('content') else clean_html_content(entry.get('description', ''))
        }
        articles.append(article)

    return articles

def main():
    # RSS feed URL from the snippet
    rss_url = "https://realites.com.tn/feed/"

    try:
        articles = parse_rss_feed(rss_url)

        for i, article in enumerate(articles, 1):
            print(f"=== Article {i} ===")
            print(f"Title: {article['title']}")
            print(f"Link: {article['link']}")
            print(f"Description: {article['description']}")
            print(f"Publication Date: {article['pub_date']}")
            print(f"Content: {article['content']}")
            print("\n" + "-" * 80 + "\n")

    except Exception as e:
        print(f"Error parsing RSS feed: {e}")

if __name__ == "__main__":
    main()

=== Article 1 ===
Title: CAN 2025 au Maroc : l’accès pour les Tunisiens désormais soumis à un visa
Link: https://realites.com.tn/fr/can-2025-au-maroc-lacces-pour-les-tunisiens-desormais-soumis-a-un-visa/
Description: A l’approche de la Coupe d’Afrique des Nations de football 2025, qui se tiendra au Maroc, les autorités marocaines ont décidé d’instaurer une obligation de visa pour les Tunisiens et…
Publication Date: Fri, 19 Sep 2025 17:16:05 +0000
Content: A l’approche de la Coupe d’Afrique des Nations de football 2025, qui se tiendra au Maroc, les autorités marocaines ont décidé d’instaurer une obligation de visa pour les Tunisiens et ce, pour la première fois dans l’histoire des relations entre les deux pays. Jusque-là exemptés de visa pour des séjours de moins de 90 jours, les Tunisiens devront désormais obtenir un visa électronique pour assister aux matchs ou participer aux événements liés au tournoi. Pour simplifier cette démarche, le Maroc a lancé l’application mobile « Yalla », q

In [37]:
import feedparser
from bs4 import BeautifulSoup
import html

def clean_html_content(text):
    """Remove HTML tags and clean the text content"""
    if not text:
        return ""

    # Parse HTML content
    soup = BeautifulSoup(text, 'html.parser')

    # Get clean text without HTML tags
    clean_text = soup.get_text(separator=' ', strip=True)

    # Decode HTML entities
    clean_text = html.unescape(clean_text)

    return clean_text

def parse_rss_feed(rss_url):
    """Parse RSS feed and extract required fields with swapped description/content"""
    # Parse the RSS feed
    feed = feedparser.parse(rss_url)

    results = []

    # Extract information from each item
    for entry in feed.entries:
        # Get the raw values before cleaning
        raw_title = entry.get('title', '')
        raw_description = entry.get('description', '')

        # Clean the content
        clean_title = clean_html_content(raw_title)
        clean_description = clean_html_content(raw_description)

        # For content field, try multiple sources and use description if content is empty or same as title
        content_candidates = []

        # Try content field first
        if entry.get('content'):
            content_value = clean_html_content(entry.get('content', [{}])[0].get('value', ''))
            if content_value and content_value != clean_title:
                content_candidates.append(content_value)

        # Try description as content (if it's different from title)
        if clean_description and clean_description != clean_title:
            content_candidates.append(clean_description)

        # Try summary field
        if entry.get('summary'):
            summary_value = clean_html_content(entry.get('summary', ''))
            if summary_value and summary_value != clean_title:
                content_candidates.append(summary_value)

        # Use the first available content candidate, or fallback to description
        final_content = content_candidates[0] if content_candidates else clean_description

        # For description, use title if content was taken from description
        final_description = clean_title if final_content == clean_description else clean_description

        item_data = {
            "title": clean_title,
            "link": entry.get('link', ''),
            "description": final_description,
            "pub_date": entry.get('published', entry.get('pubDate', '')),
            "content": final_content
        }
        results.append(item_data)

    return results

# RSS URL from the snippet
rss_url = "https://www.radiotunisienne.tn/articles/rss"

# Parse the RSS feed
parsed_data = parse_rss_feed(rss_url)

# Print the extracted data without truncation
for i, item in enumerate(parsed_data, 1):
    print(f"Item {i}:")
    print(f"Title: {item['title']}")
    print(f"Link: {item['link']}")
    print(f"Description: {item['description']}")
    print(f"Publication Date: {item['pub_date']}")
    print(f"Content: {item['content']}")
    print("-" * 80)
    print()

Item 1:
Title: الإذاعة التونسية تحتفي بأبنائها المتوجين في المهرجان العربي للإذاعة والتلفزيون 2025
Link: https://www.radiotunisienne.tn/article/6870dac3fa439dee1a8c6653/%D8%A7%D9%84%D8%A5%D8%B0%D8%A7%D8%B9%D8%A9-%D8%A7%D9%84%D8%AA%D9%88%D9%86%D8%B3%D9%8A%D8%A9-%D8%AA%D8%AD%D8%AA%D9%81%D9%8A-%D8%A8%D8%A3%D8%A8%D9%86%D8%A7%D8%A6%D9%87%D8%A7-%D8%A7%D9%84%D9%85%D8%AA%D9%88%D8%AC%D9%8A%D9%86-%D9%81%D9%8A-%D8%A7%D9%84%D9%85%D9%87%D8%B1%D8%AC%D8%A7%D9%86-%D8%A7%D9%84%D8%B9%D8%B1%D8%A8%D9%8A-%D9%84%D9%84%D8%A5%D8%B0%D8%A7%D8%B9%D8%A9-%D9%88%D8%A7%D9%84%D8%AA%D9%84%D9%81%D8%B2%D9%8A%D9%88%D9%86-2025
Description: الإذاعة التونسية تحتفي بأبنائها المتوجين في المهرجان العربي للإذاعة والتلفزيون 2025
Publication Date: Fri, 11 Jul 2025 09:16:00 +0100
Content: نظمت الإذاعة التونسية الإثنين 30 جوان 2025 لقاءا خصص لتكريم ابناء المؤسسة المتوجين مؤخرا بجوائز ضمن مسابقات المهرجان العربي للإذاعة و التلفزيون في دورته الخامسة و العشرين التي ينظمها سنويا اتحاد اذاعات الدول العربية بتونس.
-----------------------

In [42]:
import feedparser
from bs4 import BeautifulSoup
import re

def clean_html_content(text):
    """Remove HTML tags and clean the text using BeautifulSoup"""
    if not text:
        return ""

    # Parse with BeautifulSoup
    soup = BeautifulSoup(text, 'html.parser')

    # Get clean text
    clean_text = soup.get_text(separator=' ', strip=True)

    # Remove extra whitespace and normalize
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()

    return clean_text

def extract_rss_feed_data(rss_url):
    """Extract and clean data from RSS feed"""
    # Parse the RSS feed
    feed = feedparser.parse(rss_url)

    extracted_data = []

    # Process each item in the feed
    for entry in feed.entries:
        # Get description content
        description = clean_html_content(entry.get('description', ''))

        # Get content - if not available, use description as fallback
        content = ''
        if entry.get('content'):
            content = clean_html_content(entry.get('content', [{}])[0].get('value', ''))
        else:
            content = description  # Use description as content

        item_data = {
            "title": clean_html_content(entry.get('title', '')),
            "link": entry.get('link', ''),
            "description": description,
            "pub_date": entry.get('published', entry.get('pubDate', '')),
            "content": content
        }
        extracted_data.append(item_data)

    return extracted_data

# RSS URL from the snippet
rss_url = "https://www.leaders.com.tn/rss"

try:
    # Extract data from the RSS feed
    feed_data = extract_rss_feed_data(rss_url)

    # Print the extracted data without truncation
    for i, item in enumerate(feed_data, 1):
        print(f"Item {i}:")
        print(f"Title: {item['title']}")
        print(f"Link: {item['link']}")
        print(f"Description: {item['description']}")
        print(f"Publication Date: {item['pub_date']}")
        print(f"Content: {item['content']}")
        print("-" * 80)

except Exception as e:
    print(f"Error parsing RSS feed: {e}")

Item 1:
Title: Reconnaissance de l’État palestinien : une illusion diplomatique qui masque l’urgence des sanctions ?
Link: http://www.leaders.com.tn/article/37315-reconnaissance-de-l-etat-palestinien-une-illusion-diplomatique-qui-masque-l-urgence-des-sanctions
Description: L’Assemblée Générale des Nations Unies a adopté, le 12 septembre 2025, à une large majorité, une résolution appelée «Déclaration de New York sur le règlement pacifique de la question de Palestine et la mise en œuvre de la solution à deux États». Présentée comme une avancée diplomatique majeure, cette reconnaissance n’est pourtant qu’un trompe-l’œil. Elle ne met fin ni à l’occupation, ni aux bombardements, ni au processus d’extermination à Gaza. Elle ne protège pas le peuple palestinien et ne contraint pas l’entité sioniste à respecter le droit international. La fiction des deux États: de 1967 aux accords bilatéraux de normalisation La «solution des ...
Publication Date: Fri, 19 Sep 2025 14:23:00 GMT
Content: L’Assemb

In [43]:
import feedparser
from bs4 import BeautifulSoup
import html

def clean_html_content(text):
    """Remove HTML tags and decode HTML entities from text"""
    if not text:
        return ""

    # Decode HTML entities first
    cleaned_text = html.unescape(text)

    # Remove HTML tags using BeautifulSoup
    soup = BeautifulSoup(cleaned_text, 'html.parser')
    cleaned_text = soup.get_text(separator=' ', strip=True)

    return cleaned_text

def extract_rss_feed_data(rss_url):
    """Extract and clean data from RSS feed"""

    # Parse the RSS feed
    feed = feedparser.parse(rss_url)

    extracted_data = []

    for entry in feed.entries:
        # Extract and clean each field
        item_data = {
            "title": clean_html_content(entry.get('title', '')),
            "link": entry.get('link', ''),
            "description": clean_html_content(entry.get('description', '')),
            "pub_date": entry.get('published', entry.get('pubDate', '')),
            "content": clean_html_content(entry.get('content', [{}])[0].get('value', '')) if entry.get('content') else ''
        }

        # If content is empty, try other common content fields
        if not item_data['content']:
            item_data['content'] = clean_html_content(
                entry.get('summary', '') or
                entry.get('content:encoded', '') or
                entry.get('description', '')
            )

        extracted_data.append(item_data)

    return extracted_data

# Example usage with the provided RSS URL
rss_url = "https://africanmanager.com/feed/"

try:
    data = extract_rss_feed_data(rss_url)

    for i, item in enumerate(data, 1):
        print(f"Item {i}:")
        print(f"Title: {item['title']}")
        print(f"Link: {item['link']}")
        print(f"Description: {item['description']}")
        print(f"Publication Date: {item['pub_date']}")
        print(f"Content: {item['content']}")
        print("-" * 80)
        print()

except Exception as e:
    print(f"Error parsing RSS feed: {e}")

Item 1:
Title: Tourisme : La Tunisie dans le gotha des vacances en famille et des voyages de luxe
Link: https://africanmanager.com/tourisme-la-tunisie-dans-le-gotha-des-vacances-en-famille-et-des-voyages-de-luxe/
Description: La Tunisie, la Turquie, l’Italie, l’Égypte, l’Espagne, la Thaïlande, l’Allemagne, les Émirats arabes unis et la Grèce se sont imposées comme les destinations phares de la dernière étude Dertour, confirmant leur popularité continue pour les vacances en famille et les voyages de luxe. Si la Grèce domine le secteur du luxe, d’autres pays restent de […]
Publication Date: Fri, 19 Sep 2025 15:41:18 +0000
Content: La Tunisie, la Turquie, l’Italie, l’Égypte, l’Espagne, la Thaïlande, l’Allemagne, les Émirats arabes unis et la Grèce se sont imposées comme les destinations phares de la dernière étude Dertour, confirmant leur popularité continue pour les vacances en famille et les voyages de luxe. Si la Grèce domine le secteur du luxe, d’autres pays restent de […]
-----------

In [45]:
import feedparser
from bs4 import BeautifulSoup
import html
import re

def extract_rss_fields(rss_url):
    """
    Extract and clean RSS feed fields using feedparser and BeautifulSoup
    """
    # Parse the RSS feed
    feed = feedparser.parse(rss_url)

    results = []

    for entry in feed.entries:
        # Extract basic fields
        title = getattr(entry, 'title', '')
        link = getattr(entry, 'link', '')
        pub_date = getattr(entry, 'published', getattr(entry, 'pubDate', ''))

        # Extract and clean description - remove author and date information
        description = getattr(entry, 'description', '')
        description = clean_description(description)

        # For content, use the cleaned description (since content field may not be available)
        content = description

        # Create result dictionary
        result = {
            "title": clean_text(title),
            "link": clean_text(link),
            "description": description,
            "pub_date": clean_text(pub_date),
            "content": content
        }

        results.append(result)

    return results

def clean_description(description):
    """
    Clean description by extracting only the main text content
    and removing author and date information
    """
    if not description:
        return ""

    try:
        # Parse HTML content
        soup = BeautifulSoup(description, 'html.parser')

        # Find the span with property="schema:name" which contains the main title
        title_span = soup.find('span', {'property': 'schema:name'})

        if title_span:
            # Extract text from the title span
            main_text = title_span.get_text(strip=True)
            return clean_text(main_text)
        else:
            # Fallback: get all text and try to extract the main content
            all_text = soup.get_text(separator=' ', strip=True)
            # Remove author and date information (simple pattern matching)
            # This regex removes everything after the main title text
            cleaned_text = re.sub(r'(\s+\w+_\w+\s+.*|\s+ven\s+\d{2}/\d{2}/\d{4}.*)', '', all_text)
            return clean_text(cleaned_text)

    except Exception as e:
        print(f"Error cleaning description: {e}")
        return clean_text(description)

def clean_text(text):
    """
    Clean plain text by unescaping HTML entities and normalizing whitespace
    """
    if not text:
        return ""

    try:
        # Unescape HTML entities
        text = html.unescape(text)

        # Normalize whitespace
        text = ' '.join(text.split())

        return text.strip()
    except Exception as e:
        print(f"Error cleaning text: {e}")
        return str(text)

def print_results(results):
    """
    Print the extracted results in the exact format requested
    """
    for i, result in enumerate(results, 1):
        print(f"=== Entry {i} ===")
        print(f"Title: {result['title']}")
        print(f"Link: {result['link']}")
        print(f"Description: {result['description']}")
        print(f"Publication Date: {result['pub_date']}")
        print(f"Content: {result['content']}")
        print("=" * 50)
        print()

# Example usage with the provided RSS snippet URL
if __name__ == "__main__":
    # The URL from the RSS snippet
    rss_url = "https://www.alchourouk.com/rss"

    # Extract fields from the RSS feed
    extracted_data = extract_rss_fields(rss_url)

    # Print the results
    print_results(extracted_data)

=== Entry 1 ===
Title: بطولة افريقيا لكرة اليد صغريات: المنتخب التونسي يفوز على نظيره الأنغولي
Link: https://www.alchourouk.com/article/%D8%A8%D8%B7%D9%88%D9%84%D8%A9-%D8%A7%D9%81%D8%B1%D9%8A%D9%82%D9%8A%D8%A7-%D9%84%D9%83%D8%B1%D8%A9-%D8%A7%D9%84%D9%8A%D8%AF-%D8%B5%D8%BA%D8%B1%D9%8A%D8%A7%D8%AA-%D8%A7%D9%84%D9%85%D9%86%D8%AA%D8%AE%D8%A8-%D8%A7%D9%84%D8%AA%D9%88%D9%86%D8%B3%D9%8A-%D9%8A%D9%81%D9%88%D8%B2-%D8%B9%D9%84%D9%89-%D9%86%D8%B8%D9%8A%D8%B1%D9%87-%D8%A7%D9%84%D8%A3%D9%86%D8%BA%D9%88%D9%84%D9%8A
Description: بطولة افريقيا لكرة اليد صغريات: المنتخب التونسي يفوز على نظيره الأنغولي
Publication Date: Fri, 19 Sep 2025 17:52:17 +0000
Content: بطولة افريقيا لكرة اليد صغريات: المنتخب التونسي يفوز على نظيره الأنغولي

=== Entry 2 ===
Title: الاتحاد الدولي للنقل الجوي يؤكد استعداده لدعم تونس في تنفيذ مشاريعها ذات الصلة
Link: https://www.alchourouk.com/article/%D8%A7%D9%84%D8%A7%D8%AA%D8%AD%D8%A7%D8%AF-%D8%A7%D9%84%D8%AF%D9%88%D9%84%D9%8A-%D9%84%D9%84%D9%86%D9%82%D9%84-%D8%A7%D9%84%D8%AC%D9%

In [46]:
import feedparser
from bs4 import BeautifulSoup
import html

def extract_rss_feed_content(rss_url):
    """
    Extract and clean RSS feed content from the given URL
    """
    # Parse the RSS feed
    feed = feedparser.parse(rss_url)

    results = []

    for entry in feed.entries:
        # Extract basic fields
        title = getattr(entry, 'title', '')
        link = getattr(entry, 'link', '')
        pub_date = getattr(entry, 'published', getattr(entry, 'pubDate', ''))

        # Extract and clean description
        description = getattr(entry, 'description', '')
        description = clean_html_content(description)

        # Extract and clean content (try multiple possible fields)
        content = ''
        if hasattr(entry, 'content'):
            content = entry.content[0].value if entry.content else ''
        elif hasattr(entry, 'content_encoded'):
            content = entry.content_encoded
        elif hasattr(entry, 'summary'):
            content = entry.summary

        content = clean_html_content(content)

        # Create result dictionary
        result = {
            "title": title,
            "link": link,
            "description": description,
            "pub_date": pub_date,
            "content": content
        }

        results.append(result)

    return results

def clean_html_content(text):
    """
    Clean HTML content using BeautifulSoup and handle HTML entities
    """
    if not text:
        return ""

    # Decode HTML entities first
    text = html.unescape(text)

    # Use BeautifulSoup to extract text content
    soup = BeautifulSoup(text, 'html.parser')

    # Get clean text without HTML tags
    clean_text = soup.get_text(separator=' ', strip=False)

    # Remove extra whitespace but preserve paragraph structure
    clean_text = ' '.join(clean_text.split())

    return clean_text

def print_extracted_content(results):
    """
    Print the extracted content without truncation
    """
    for i, result in enumerate(results, 1):
        print(f"=== Entry {i} ===")
        print(f"Title: {result['title']}")
        print(f"Link: {result['link']}")
        print(f"Description: {result['description']}")
        print(f"Publication Date: {result['pub_date']}")
        print(f"Content: {result['content']}")
        print("\n" + "="*50 + "\n")

# Main execution
if __name__ == "__main__":
    rss_url = "https://realites.com.tn/feed/"

    try:
        extracted_data = extract_rss_feed_content(rss_url)
        print_extracted_content(extracted_data)
    except Exception as e:
        print(f"Error parsing RSS feed: {e}")

=== Entry 1 ===
Title: Ridha Charfeddine : report de l’examen en appel
Link: https://realites.com.tn/fr/ridha-charfeddine-report-de-lexamen-en-appel/
Description: La chambre judiciaire de la Cour d’appel de Tunis a décidé, ce vendredi, de reporter au 10 octobre 2025 l’examen de l’affaire concernant l’homme d’affaires et ancien député du Parlement…
Publication Date: Fri, 19 Sep 2025 17:48:33 +0000
Content: La chambre judiciaire de la Cour d’appel de Tunis a décidé, ce vendredi, de reporter au 10 octobre 2025 l’examen de l’affaire concernant l’homme d’affaires et ancien député du Parlement dissous, Ridha Charfeddine. Cette procédure porte sur des accusations de constitution de gains à l’étranger sans en avoir informé la Banque centrale de Tunisie ni obtenu son autorisation. Le report intervient dans le cadre de l’examen en appel, après que Charfeddine a été condamné en première instance à trois ans de prison et à une amende de 72 millions de dinars. L’affaire concerne un compte bancaire 

In [47]:
import feedparser
from bs4 import BeautifulSoup
import html

def clean_html_content(text):
    """Remove HTML tags and clean the text using BeautifulSoup"""
    if not text:
        return ""

    # Parse HTML content
    soup = BeautifulSoup(text, 'html.parser')

    # Remove unwanted elements (social media buttons, scripts, styles, etc.)
    for element in soup.find_all(['script', 'style', 'a', 'div', 'span', 'class']):
        if 'a2a_button' in str(element.get('class', [])):
            element.decompose()

    # Get clean text
    clean_text = soup.get_text(separator=' ', strip=True)

    # Decode HTML entities
    clean_text = html.unescape(clean_text)

    # Remove extra whitespace
    clean_text = ' '.join(clean_text.split())

    return clean_text

def extract_rss_feed_data(url):
    """Extract and clean data from RSS feed"""
    # Parse the RSS feed
    feed = feedparser.parse(url)

    results = []

    for entry in feed.entries:
        # Extract basic fields
        title = clean_html_content(entry.get('title', ''))
        link = entry.get('link', '')

        # Handle description (could be in different fields)
        description = ''
        if hasattr(entry, 'description'):
            description = clean_html_content(entry.description)
        elif hasattr(entry, 'summary'):
            description = clean_html_content(entry.summary)

        # Handle publication date
        pub_date = entry.get('published', '') or entry.get('pubDate', '') or entry.get('updated', '')

        # Handle content (could be in different fields)
        content = ''
        if hasattr(entry, 'content'):
            # If multiple content entries, take the first one
            if entry.content:
                content = clean_html_content(entry.content[0].value)
        elif hasattr(entry, 'content:encoded'):
            content = clean_html_content(entry.get('content:encoded', ''))
        elif hasattr(entry, 'summary_detail'):
            content = clean_html_content(entry.summary_detail.value)

        # If content is empty but description exists, use description
        if not content and description:
            content = description

        results.append({
            'title': title,
            'link': link,
            'description': description,
            'pub_date': pub_date,
            'content': content
        })

    return results

# Extract data from the RSS feed URL found in the snippet
rss_url = "https://www.webdo.tn/fr/feed/"
extracted_data = extract_rss_feed_data(rss_url)

# Print the extracted data without truncation
for i, item in enumerate(extracted_data, 1):
    print(f"=== Item {i} ===")
    print(f"Title: {item['title']}")
    print(f"Link: {item['link']}")
    print(f"Description: {item['description']}")
    print(f"Publication Date: {item['pub_date']}")
    print(f"Content: {item['content']}")
    print("\n" + "="*80 + "\n")

=== Item 1 ===
Title: CAN 2025 au Maroc : Un e-visa exceptionnel imposé aux Tunisiens
Link: https://www.webdo.tn/fr/actualite/sport/can-2025-au-maroc-un-e-visa-exceptionnel-impose-aux-tunisiens/383890/
Description: Le Maroc et la Confédération africaine de football (CAF) ont annoncé, vendredi, la mise en place d’une nouvelle application baptisée Yalla pour la Coupe d’Afrique des Nations 2025. Ce guichet numérique regroupera la billetterie électronique, le Fan-ID obligatoire et un module e-visa pour les supporters étrangers. Mesure inédite : les citoyens tunisiens devront, eux aussi, […] L’article CAN 2025 au Maroc : Un e-visa exceptionnel imposé aux Tunisiens est apparu en premier sur webdo .
Publication Date: Fri, 19 Sep 2025 17:52:05 +0000
Content: Le Maroc et la Confédération africaine de football (CAF) ont annoncé, vendredi, la mise en place d’une nouvelle application baptisée Yalla pour la Coupe d’Afrique des Nations 2025. Ce guichet numérique regroupera la billetterie électroniqu

In [48]:
import feedparser
from bs4 import BeautifulSoup
import html

def clean_html_content(text):
    """Remove HTML tags and clean the text content"""
    if not text:
        return ""

    # Parse with BeautifulSoup to remove HTML tags
    soup = BeautifulSoup(text, 'html.parser')
    clean_text = soup.get_text(separator=' ', strip=True)

    # Decode HTML entities and clean extra whitespace
    clean_text = html.unescape(clean_text)
    clean_text = ' '.join(clean_text.split())

    return clean_text

def parse_rss_feed(rss_url):
    """Parse RSS feed and extract required fields"""
    # Parse the RSS feed
    feed = feedparser.parse(rss_url)

    results = []

    for entry in feed.entries:
        # Extract fields with fallbacks for missing data
        title = clean_html_content(entry.get('title', ''))
        link = entry.get('link', '')
        description = clean_html_content(entry.get('description', ''))

        # Handle pubDate with different possible field names
        pub_date = entry.get('published', entry.get('pubDate', entry.get('updated', '')))
        pub_date = clean_html_content(pub_date)

        # Handle content - try different possible content fields
        content = ''
        if hasattr(entry, 'content'):
            content = clean_html_content(entry.content[0].value if entry.content else '')
        elif hasattr(entry, 'summary'):
            content = clean_html_content(entry.summary)
        elif hasattr(entry, 'description'):
            content = clean_html_content(entry.description)

        # Create result dictionary
        result = {
            "title": title,
            "link": link,
            "description": description,
            "pub_date": pub_date,
            "content": content
        }

        results.append(result)

    return results

# RSS URL from the snippet
rss_url = "https://www.babnet.net/feed.php"

# Parse the feed
parsed_data = parse_rss_feed(rss_url)

# Print the results without truncation
for i, item in enumerate(parsed_data, 1):
    print(f"Item {i}:")
    print(f"Title: {item['title']}")
    print(f"Link: {item['link']}")
    print(f"Description: {item['description']}")
    print(f"Publication Date: {item['pub_date']}")
    print(f"Content: {item['content']}")
    print("-" * 80)
    print()

Item 1:
Title: سليانة: متابعة مشاريع القطاع الصحي من قبل اللجنة الجهوية للتسريع في إنجاز المشاريع العمومية
Link: https://www.babnet.net/cadredetail-315108.asp
Description: مثلت متابعة مشاريع القطاع الصحي، محور جلسة عمل اللجنة الجهوية للتسريع في إنجاز المشاريع العمومية المنعقدة اليوم الجمعة بمقر ولاية سليانة. وبين المدير الجهوي للتجهيز و الإسكان مهدي العوني في تصريح لصحفية وكالة تونس إفريقيا على هامش الجلسة، أن أبرز الإشكاليات إدارية تتعلق بتعيين مكاتب مراقبة بعد تخل
Publication Date: Fri, 19 Sep 2025 19:01:28 +0100
Content: مثلت متابعة مشاريع القطاع الصحي، محور جلسة عمل اللجنة الجهوية للتسريع في إنجاز المشاريع العمومية المنعقدة اليوم الجمعة بمقر ولاية سليانة. وبين المدير الجهوي للتجهيز و الإسكان مهدي العوني في تصريح لصحفية وكالة تونس إفريقيا على هامش الجلسة، أن أبرز الإشكاليات إدارية تتعلق بتعيين مكاتب مراقبة بعد تخل
--------------------------------------------------------------------------------

Item 2:
Title: بطولة الرابطة الاولى : برنامج مباريات الجولة الثامنة
Link: https://www.bab

In [49]:
import feedparser
from bs4 import BeautifulSoup
import re

def clean_html_content(text):
    """Remove HTML tags and clean the text using BeautifulSoup"""
    if not text:
        return ""

    # Parse with BeautifulSoup and get clean text
    soup = BeautifulSoup(text, 'html.parser')
    clean_text = soup.get_text(separator=' ', strip=True)

    # Clean up any remaining HTML entities and extra spaces
    clean_text = re.sub(r'\s+', ' ', clean_text)
    clean_text = clean_text.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>')
    clean_text = clean_text.replace('&quot;', '"').replace('&#039;', "'")

    return clean_text.strip()

def extract_rss_feed(rss_url):
    """Extract and parse RSS feed content"""
    # Parse the RSS feed
    feed = feedparser.parse(rss_url)

    results = []

    for entry in feed.entries:
        # Extract fields
        title = clean_html_content(entry.get('title', ''))
        link = entry.get('link', '')
        description = clean_html_content(entry.get('description', ''))
        pub_date = entry.get('published', entry.get('pubDate', ''))

        # Try to get content from different possible fields
        content = ''
        if hasattr(entry, 'content'):
            content = clean_html_content(entry.content[0].value if entry.content else '')
        elif hasattr(entry, 'summary'):
            content = clean_html_content(entry.summary)
        else:
            content = description  # Fallback to description

        # Create result dictionary
        result = {
            "title": title,
            "link": link,
            "description": description,
            "pub_date": pub_date,
            "content": content
        }

        results.append(result)

    return results

# RSS URL from the snippet
rss_url = "https://www.nessma.tv/fr/rss/news/7"

# Extract and process the feed
feed_data = extract_rss_feed(rss_url)

# Print the results without truncation
for i, item in enumerate(feed_data, 1):
    print(f"Item {i}:")
    print(f"Title: {item['title']}")
    print(f"Link: {item['link']}")
    print(f"Description: {item['description']}")
    print(f"Publication Date: {item['pub_date']}")
    print(f"Content: {item['content']}")
    print("-" * 80)
    print()

Item 1:
Title: Le président Saïed dénonce une campagne de déstabilisation depuis l’étranger
Link: https://www.nessma.tv/fr/nationale/actu/le-president-saied-denonce-une-campagne-de-destabilisation-depuis-l-etranger/554720
Description: Le président de la République, Kaïs Saïed, a souligné que « la Tunisie a été visée dans son existence et dans son unité, assurant qu'elle demeurera à l'abri des traîtres ». Présidant, jeudi, la réunion du Conseil des ministres, le chef de l'Etat a affirmé que « plusieurs crises sont orchestrées depuis l'étranger à travers certaines parties qui ne cherchent qu'à fragiliser l'État tunisien et à accaparer ses ressources », selon une vidéo publiée, vendredi, sur la page de la Présidence.
Publication Date: Fri, 19 Sep 2025 17:16:42 UTC
Content: Le président de la République, Kaïs Saïed, a souligné que « la Tunisie a été visée dans son existence et dans son unité, assurant qu'elle demeurera à l'abri des traîtres ». Présidant, jeudi, la réunion du Conseil des mi

In [53]:
import feedparser
from bs4 import BeautifulSoup
import re

def clean_html_content(text):
    """Remove HTML tags and clean the text content"""
    if not text:
        return ""

    # Handle case where text might be a list
    if isinstance(text, list):
        # If it's a list, take the first element or join them
        if text and isinstance(text[0], dict) and 'value' in text[0]:
            text = text[0]['value']
        else:
            text = ' '.join([str(item) for item in text])

    # Handle case where text might be a dictionary
    if isinstance(text, dict):
        if 'value' in text:
            text = text['value']
        else:
            text = str(text)

    # Parse with BeautifulSoup and get text
    soup = BeautifulSoup(str(text), 'html.parser')
    cleaned_text = soup.get_text(separator=' ', strip=True)

    # Clean up any remaining HTML entities and extra whitespace
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    return cleaned_text

def get_field_value(entry, field_names):
    """Safely get field value from entry, handling lists, dicts and multiple field names"""
    for field_name in field_names:
        if field_name in entry:
            value = entry[field_name]

            # Handle lists
            if isinstance(value, list):
                if value:
                    # If list contains dictionaries with 'value' key
                    if isinstance(value[0], dict) and 'value' in value[0]:
                        return value[0]['value']
                    else:
                        return ' '.join([str(item) for item in value])
                else:
                    return ""

            # Handle dictionaries
            elif isinstance(value, dict):
                if 'value' in value:
                    return value['value']
                else:
                    return str(value)

            # Handle strings and other types
            else:
                return value
    return ""

def parse_rss_feed(rss_url):
    """Parse RSS feed and extract required fields"""
    # Parse the RSS feed
    feed = feedparser.parse(rss_url)

    results = []

    for entry in feed.entries:
        # Extract fields with proper handling for lists and dictionaries
        title = clean_html_content(get_field_value(entry, ['title']))
        link = get_field_value(entry, ['link'])

        # Get description from multiple possible fields
        description = clean_html_content(get_field_value(entry, ['description', 'summary', 'subtitle']))

        # Get content from multiple possible fields - handle content:encoded specifically
        content = ""
        if 'content' in entry:
            content = clean_html_content(entry['content'])
        elif 'content:encoded' in entry:
            content = clean_html_content(entry['content:encoded'])
        else:
            content = description  # Fallback

        # Get publication date from multiple possible fields
        pub_date = get_field_value(entry, ['published', 'pubDate', 'dc:date', 'updated'])

        # Create result dictionary
        result = {
            "title": title,
            "link": link,
            "description": description,
            "pub_date": pub_date,
            "content": content
        }

        results.append(result)

    return results

def main():
    # The actual RSS feed URL
    rss_url = "https://www.tunisienumerique.com/feed-actualites-tunisie.xml"

    print(f"Parsing RSS feed from: {rss_url}")
    print("=" * 100)

    try:
        # Parse the feed
        parsed_data = parse_rss_feed(rss_url)

        print(f"Found {len(parsed_data)} items in the RSS feed\n")

        # Print the results without truncation
        for i, item in enumerate(parsed_data, 1):
            print(f"ITEM {i}:")
            print(f"Title: {item['title']}")
            print(f"Link: {item['link']}")
            print(f"Description: {item['description']}")
            print(f"Publication Date: {item['pub_date']}")
            print(f"Content: {item['content']}")
            print("-" * 100)
            print()

    except Exception as e:
        print(f"Error parsing RSS feed: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

Parsing RSS feed from: https://www.tunisienumerique.com/feed-actualites-tunisie.xml
Found 40 items in the RSS feed

ITEM 1:
Title: Avis aux supporters des Aigles de Carthage : le Maroc impose un e-visa pour assister à la CAN
Link: https://www.tunisienumerique.com/avis-aux-supporters-des-aigles-de-carthage-le-maroc-impose-un-e-visa-pour-assister-a-la-can/
Description: Avis aux supporters des Aigles de Carthage : le Maroc impose un e-visa pour assister à la CAN L’article Avis aux supporters des Aigles de Carthage : le Maroc impose un e-visa pour assister à la CAN est apparu en premier sur Tunisie numerique .
Publication Date: Fri, 19 Sep 2025 18:05:14 +0000
Content: Avis aux supporters des Aigles de Carthage : le Maroc impose un e-visa pour assister à la CAN L’article Avis aux supporters des Aigles de Carthage : le Maroc impose un e-visa pour assister à la CAN est apparu en premier sur Tunisie numerique .
-----------------------------------------------------------------------------------

In [55]:
import feedparser
from bs4 import BeautifulSoup
import re

def clean_html_content(text):
    """Remove HTML tags and clean the text content"""
    if text is None:
        return ""

    # Parse with BeautifulSoup to remove HTML tags
    soup = BeautifulSoup(text, 'html.parser')
    clean_text = soup.get_text(separator=' ', strip=True)

    # Clean up any remaining HTML entities and special characters
    clean_text = re.sub(r'\s+', ' ', clean_text)  # Replace multiple spaces with single space
    clean_text = clean_text.replace('&#8217;', "'")  # Replace HTML apostrophe
    clean_text = clean_text.replace('&amp;', '&')  # Replace HTML ampersand
    clean_text = clean_text.replace('&lt;', '<')  # Replace HTML less than
    clean_text = clean_text.replace('&gt;', '>')  # Replace HTML greater than
    clean_text = clean_text.replace('&quot;', '"')  # Replace HTML quotes
    clean_text = clean_text.replace('&#8230;', '...')  # Replace HTML ellipsis

    return clean_text.strip()

def extract_rss_feed_data(rss_url):
    """Extract and parse RSS feed data"""
    # Parse the RSS feed
    feed = feedparser.parse(rss_url)

    # Check if feed was parsed successfully
    if feed.bozo:
        print(f"Error parsing feed: {feed.bozo_exception}")
        return []

    extracted_data = []

    # Extract data from each item
    for entry in feed.entries:
        # Get description content
        description_content = clean_html_content(entry.get('description', ''))

        # Get main content (if available)
        main_content = ''
        if entry.get('content'):
            main_content = clean_html_content(entry.get('content', [{}])[0].get('value', ''))

        # Combine description with main content if both exist
        full_content = description_content
        if main_content and main_content != description_content:
            full_content = f"{description_content} {main_content}"

        item_data = {
            "title": clean_html_content(entry.get('title', '')),
            "link": entry.get('link', ''),
            "description": description_content,
            "pub_date": entry.get('published', entry.get('pubDate', '')),
            "content": full_content
        }
        extracted_data.append(item_data)

    return extracted_data

# RSS URL from the snippet
rss_url = "https://africanmanager.com/feed/"

# Extract data from the RSS feed
feed_data = extract_rss_feed_data(rss_url)

# Print the extracted data without truncation
for i, item in enumerate(feed_data, 1):
    print(f"=== Item {i} ===")
    print(f"Title: {item['title']}")
    print(f"Link: {item['link']}")
    print(f"Description: {item['description']}")
    print(f"Publication Date: {item['pub_date']}")
    print(f"Content: {item['content']}")
    print("\n" + "="*50 + "\n")

=== Item 1 ===
Title: Tourisme : La Tunisie dans le gotha des vacances en famille et des voyages de luxe
Link: https://africanmanager.com/tourisme-la-tunisie-dans-le-gotha-des-vacances-en-famille-et-des-voyages-de-luxe/
Description: La Tunisie, la Turquie, l’Italie, l’Égypte, l’Espagne, la Thaïlande, l’Allemagne, les Émirats arabes unis et la Grèce se sont imposées comme les destinations phares de la dernière étude Dertour, confirmant leur popularité continue pour les vacances en famille et les voyages de luxe. Si la Grèce domine le secteur du luxe, d’autres pays restent de […]
Publication Date: Fri, 19 Sep 2025 15:41:18 +0000
Content: La Tunisie, la Turquie, l’Italie, l’Égypte, l’Espagne, la Thaïlande, l’Allemagne, les Émirats arabes unis et la Grèce se sont imposées comme les destinations phares de la dernière étude Dertour, confirmant leur popularité continue pour les vacances en famille et les voyages de luxe. Si la Grèce domine le secteur du luxe, d’autres pays restent de […]


==

In [60]:
import feedparser
from bs4 import BeautifulSoup
import re

def clean_html_content(text):
    """Remove HTML tags and clean text using BeautifulSoup"""
    if not text:
        return ""

    # Parse with BeautifulSoup to remove HTML tags
    soup = BeautifulSoup(text, 'html.parser')
    clean_text = soup.get_text(separator=' ', strip=True)

    # Remove extra whitespace and normalize
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()

    return clean_text

def parse_rss_feed(rss_url):
    """Parse RSS feed and extract required fields"""
    # Parse the RSS feed
    feed = feedparser.parse(rss_url)

    results = []

    for entry in feed.entries:
        # Extract and clean each field
        title = clean_html_content(entry.get('title', ''))
        link = entry.get('link', '')
        description = clean_html_content(entry.get('description', ''))
        pub_date = entry.get('published', entry.get('pubDate', ''))

        # Get content - use description if content is empty
        content = clean_html_content(entry.get('content', [{}])[0].get('value', '')) if entry.get('content') else ''
        if not content:
            content = description

        item_data = {
            "title": title,
            "link": link,
            "description": description,
            "pub_date": pub_date,
            "content": content
        }
        results.append(item_data)

    return results

# RSS URL from the snippet
rss_url = "http://www.businessnews.com.tn/rss.xml"

# Parse the RSS feed
parsed_data = parse_rss_feed(rss_url)

# Print the extracted data without truncation
for i, item in enumerate(parsed_data, 1):
    print(f"ITEM {i}:")
    print(f"Title: {item['title']}")
    print(f"Link: {item['link']}")
    print(f"Description: {item['description']}")
    print(f"Publication Date: {item['pub_date']}")
    print(f"Content: {item['content']}")
    print("-" * 80)

ITEM 1:
Title: Bassem Ennaifer : l'amélioration a touché les indicateurs économiques, mais pas le panier des Tunisiens
Link: http://www.businessnews.com.tn/bassem-ennaifer--lamelioration-a-toue-les-indicateurs-economiques-mais-pas-le-panier-des-tunisiens,520,151433,3
Description: « La clé, c'est la productivité », a-t-il rappelé aux Tunisiens
Publication Date: 2025-09-15T14:52+01:00
Content: « La clé, c'est la productivité », a-t-il rappelé aux Tunisiens
--------------------------------------------------------------------------------
ITEM 2:
Title: ESET Research identifie GhostRedirector, un APT utilisant des outils inédits pour du référencement frauduleux
Link: http://www.businessnews.com.tn/eset-resear-identifie-ghostredirector-un-apt-utilisant-des-outils-inedits-pour-du-referencement-frauduleux,524,151432,3
Description: GhostRedirector présente les caractéristiques d\'un groupe APT aligné sur les intérêts de la Chine, utilisant Gamshen pour du SEO fraud as-a-service afin de manipule

In [61]:
import feedparser
from bs4 import BeautifulSoup
import html

def clean_html_content(text):
    """Remove HTML tags and clean the text content"""
    if not text:
        return ""

    # Parse with BeautifulSoup to remove HTML tags
    soup = BeautifulSoup(text, 'html.parser')
    cleaned_text = soup.get_text(separator=' ', strip=True)

    # Decode HTML entities and clean up whitespace
    cleaned_text = html.unescape(cleaned_text)
    cleaned_text = ' '.join(cleaned_text.split())

    return cleaned_text

def parse_rss_feed(rss_url):
    """Parse RSS feed and extract required fields"""
    # Parse the RSS feed
    feed = feedparser.parse(rss_url)

    results = []

    for entry in feed.entries:
        # Extract and clean each field
        title = clean_html_content(entry.get('title', ''))
        link = entry.get('link', '')
        description = clean_html_content(entry.get('description', ''))
        pub_date = entry.get('published', entry.get('pubDate', ''))

        # Try to get content from different possible fields
        content = ''
        if hasattr(entry, 'content'):
            content = clean_html_content(entry.content[0].value if entry.content else '')
        elif hasattr(entry, 'summary'):
            content = clean_html_content(entry.summary)
        elif hasattr(entry, 'description'):
            content = clean_html_content(entry.description)

        # Create result dictionary
        result = {
            "title": title,
            "link": link,
            "description": description,
            "pub_date": pub_date,
            "content": content
        }

        results.append(result)

    return results

# RSS URL from the snippet
rss_url = "https://www.radiotataouine.tn/articles/rss"

# Parse the feed
parsed_data = parse_rss_feed(rss_url)

# Print the results without truncation
for i, item in enumerate(parsed_data, 1):
    print(f"Item {i}:")
    print(f"Title: {item['title']}")
    print(f"Link: {item['link']}")
    print(f"Description: {item['description']}")
    print(f"Publication Date: {item['pub_date']}")
    print(f"Content: {item['content']}")
    print("-" * 80)
    print()

Item 1:
Title: ممثلون عن الاتحاد الدولي للنقل الجوي يؤكدون استعداد المنظمة لدعم تونس في تنفيذ مشاريعها ذات الصلة
Link: https://www.radiotataouine.tn/article/68cd8bfe2b5755c3b5abd7b1/%D9%85%D9%85%D8%AB%D9%84%D9%88%D9%86-%D8%B9%D9%86-%D8%A7%D9%84%D8%A7%D8%AA%D8%AD%D8%A7%D8%AF-%D8%A7%D9%84%D8%AF%D9%88%D9%84%D9%8A-%D9%84%D9%84%D9%86%D9%82%D9%84-%D8%A7%D9%84%D8%AC%D9%88%D9%8A-%D9%8A%D8%A4%D9%83%D8%AF%D9%88%D9%86-%D8%A7%D8%B3%D8%AA%D8%B9%D8%AF%D8%A7%D8%AF-%D8%A7%D9%84%D9%85%D9%86%D8%B8%D9%85%D8%A9-%D9%84%D8%AF%D8%B9%D9%85-%D8%AA%D9%88%D9%86%D8%B3-%D9%81%D9%8A-%D8%AA%D9%86%D9%81%D9%8A%D8%B0-%D9%85%D8%B4%D8%A7%D8%B1%D9%8A%D8%B9%D9%87%D8%A7-%D8%B0%D8%A7%D8%AA-%D8%A7%D9%84%D8%B5%D9%84%D8%A9
Description: ممثلون عن الاتحاد الدولي للنقل الجوي يؤكدون استعداد المنظمة لدعم تونس في تنفيذ مشاريعها ذات الصلة
Publication Date: Fri, 19 Sep 2025 16:56:00 +0100
Content: ممثلون عن الاتحاد الدولي للنقل الجوي يؤكدون استعداد المنظمة لدعم تونس في تنفيذ مشاريعها ذات الصلة
------------------------------------------

In [62]:
import feedparser
from bs4 import BeautifulSoup
import html

def clean_html_content(text):
    """Remove HTML tags and clean the text content"""
    if not text:
        return ""

    # Parse with BeautifulSoup to remove HTML tags
    soup = BeautifulSoup(text, 'html.parser')
    clean_text = soup.get_text(separator=' ', strip=True)

    # Decode HTML entities and clean up whitespace
    clean_text = html.unescape(clean_text)
    clean_text = ' '.join(clean_text.split())

    return clean_text

def parse_rss_feed(url):
    """Parse RSS feed and extract required fields"""
    # Parse the RSS feed
    feed = feedparser.parse(url)

    results = []

    for entry in feed.entries:
        # Extract fields with fallbacks
        title = clean_html_content(entry.get('title', ''))
        link = entry.get('link', '')
        description = clean_html_content(entry.get('description', ''))

        # Handle publication date (prefer published, then updated, then current date)
        pub_date = entry.get('published', entry.get('updated', ''))
        pub_date = clean_html_content(pub_date)

        # Extract content - try multiple possible fields
        content = ''
        if hasattr(entry, 'content'):
            content = clean_html_content(entry.content[0].value if entry.content else '')
        elif hasattr(entry, 'summary'):
            content = clean_html_content(entry.summary)
        elif hasattr(entry, 'description'):
            content = clean_html_content(entry.description)

        # Create result dictionary
        result = {
            "title": title,
            "link": link,
            "description": description,
            "pub_date": pub_date,
            "content": content
        }

        results.append(result)

    return results

# Extract RSS URL from the snippet and parse
rss_url = "https://www.radiogafsa.tn/articles/rss"
parsed_data = parse_rss_feed(rss_url)

# Print the results without truncation
for i, item in enumerate(parsed_data, 1):
    print(f"Item {i}:")
    print(f"Title: {item['title']}")
    print(f"Link: {item['link']}")
    print(f"Description: {item['description']}")
    print(f"Publication Date: {item['pub_date']}")
    print(f"Content: {item['content']}")
    print("-" * 80)
    print()

Item 1:
Title: الإحاطة بأكثر من 600 طفل عبر مركبات الطفولة في سيدي بوزيد
Link: https://www.radiogafsa.tn/article/68cd50922b5755c3b59ad648/%D8%A7%D9%84%D8%A5%D8%AD%D8%A7%D8%B7%D8%A9-%D8%A8%D8%A3%D9%83%D8%AB%D8%B1-%D9%85%D9%86-600-%D8%B7%D9%81%D9%84-%D8%B9%D8%A8%D8%B1-%D9%85%D8%B1%D9%83%D8%A8%D8%A7%D8%AA-%D8%A7%D9%84%D8%B7%D9%81%D9%88%D9%84%D8%A9-%D9%81%D9%8A-%D8%B3%D9%8A%D8%AF%D9%8A-%D8%A8%D9%88%D8%B2%D9%8A%D8%AF
Description: الإحاطة بأكثر من 600 طفل عبر مركبات الطفولة في سيدي بوزيد
Publication Date: Fri, 19 Sep 2025 12:44:00 +0100
Content: الإحاطة بأكثر من 600 طفل عبر مركبات الطفولة في سيدي بوزيد
--------------------------------------------------------------------------------

Item 2:
Title: القصرين: مشروع نموذجي لاستغلال مياه الأمطار لفائدة المؤسسات التربوية والفلاحية
Link: https://www.radiogafsa.tn/article/68cd4d2f2b5755c3b599a116/%D8%A7%D9%84%D9%82%D8%B5%D8%B1%D9%8A%D9%86-%D9%85%D8%B4%D8%B1%D9%88%D8%B9-%D9%86%D9%85%D9%88%D8%B0%D8%AC%D9%8A-%D9%84%D8%A7%D8%B3%D8%AA%D8%BA%D9%84%D8%A7%D

In [63]:
import feedparser
from bs4 import BeautifulSoup
import html

def extract_rss_feed_data(rss_url):
    """
    Extract and clean RSS feed data from the given URL
    """
    # Parse the RSS feed
    feed = feedparser.parse(rss_url)

    results = []

    for entry in feed.entries:
        # Extract basic fields
        title = get_clean_text(entry.get('title', ''))
        link = entry.get('link', '')
        description = get_clean_text(entry.get('description', ''))
        pub_date = entry.get('published', entry.get('pubDate', ''))

        # Extract content - try multiple possible fields
        content = ''
        if hasattr(entry, 'content'):
            content = get_clean_text(entry.content[0].value if entry.content else '')
        elif hasattr(entry, 'summary'):
            content = get_clean_text(entry.summary)
        elif hasattr(entry, 'description'):
            content = get_clean_text(entry.description)

        # Create result dictionary
        result = {
            "title": title,
            "link": link,
            "description": description,
            "pub_date": pub_date,
            "content": content
        }

        results.append(result)

    return results

def get_clean_text(html_content):
    """
    Clean HTML content and extract plain text using BeautifulSoup
    """
    if not html_content:
        return ""

    # Decode HTML entities first
    decoded_content = html.unescape(html_content)

    # Use BeautifulSoup to extract text and remove HTML tags
    soup = BeautifulSoup(decoded_content, 'html.parser')
    clean_text = soup.get_text(separator=' ', strip=True)

    # Remove extra whitespace
    clean_text = ' '.join(clean_text.split())

    return clean_text

def print_results(results):
    """
    Print the extracted results without truncation
    """
    for i, result in enumerate(results, 1):
        print(f"=== Item {i} ===")
        print(f"Title: {result['title']}")
        print(f"Link: {result['link']}")
        print(f"Description: {result['description']}")
        print(f"Publication Date: {result['pub_date']}")
        print(f"Content: {result['content']}")
        print("-" * 80)
        print()

# RSS URL from the snippet
rss_url = "https://www.radiokef.tn/articles/rss"

# Extract and process the RSS feed
try:
    extracted_data = extract_rss_feed_data(rss_url)

    if extracted_data:
        print_results(extracted_data)
    else:
        print("No data extracted from the RSS feed.")

except Exception as e:
    print(f"Error processing RSS feed: {e}")

=== Item 1 ===
Title: المدير الجهوي للتجارة بالكاف: توفر كافة المواد الاستهلاكية والمنتوجات الفلاحية وانتظامية التزويد
Link: https://www.radiokef.tn/article/68cd97fd2b5755c3b5b0b42b/%D8%A7%D9%84%D9%85%D8%AF%D9%8A%D8%B1-%D8%A7%D9%84%D8%AC%D9%87%D9%88%D9%8A-%D9%84%D9%84%D8%AA%D8%AC%D8%A7%D8%B1%D8%A9-%D8%A8%D8%A7%D9%84%D9%83%D8%A7%D9%81-%D8%AA%D9%88%D9%81%D8%B1-%D9%83%D8%A7%D9%81%D8%A9-%D8%A7%D9%84%D9%85%D9%88%D8%A7%D8%AF-%D8%A7%D9%84%D8%A7%D8%B3%D8%AA%D9%87%D9%84%D8%A7%D9%83%D9%8A%D8%A9-%D9%88%D8%A7%D9%84%D9%85%D9%86%D8%AA%D9%88%D8%AC%D8%A7%D8%AA-%D8%A7%D9%84%D9%81%D9%84%D8%A7%D8%AD%D9%8A%D8%A9-%D9%88%D8%A7%D9%86%D8%AA%D8%B8%D8%A7%D9%85%D9%8A%D8%A9-%D8%A7%D9%84%D8%AA%D8%B2%D9%88%D9%8A%D8%AF
Description: المدير الجهوي للتجارة بالكاف: توفر كافة المواد الاستهلاكية والمنتوجات الفلاحية وانتظامية التزويد
Publication Date: Fri, 19 Sep 2025 18:30:00 +0100
Content: المدير الجهوي للتجارة بالكاف: توفر كافة المواد الاستهلاكية والمنتوجات الفلاحية وانتظامية التزويد
------------------------------------

In [66]:
import feedparser
from bs4 import BeautifulSoup
import re

def clean_html_content(text):
    """Remove HTML tags and clean unwanted content using BeautifulSoup"""
    if not text:
        return ""

    # Parse with BeautifulSoup to remove HTML tags
    soup = BeautifulSoup(text, 'html.parser')
    clean_text = soup.get_text(separator=' ', strip=True)

    # Remove extra whitespace and clean up
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()

    return clean_text

def parse_rss_feed(rss_url):
    """Parse RSS feed and extract required fields"""
    # Parse the RSS feed
    feed = feedparser.parse(rss_url)

    extracted_data = []

    for entry in feed.entries:
        # Extract fields with fallbacks for missing data
        title = clean_html_content(entry.get('title', ''))
        link = entry.get('link', '')
        pub_date = entry.get('published', entry.get('pubDate', ''))

        # Extract content from multiple possible sources
        content_text = ''

        # First try to get description
        description = clean_html_content(entry.get('description', ''))
        if description:
            content_text = description

        # If no description or it's empty, try other content fields
        if not content_text:
            if hasattr(entry, 'content'):
                content_text = clean_html_content(entry.content[0].value if entry.content else '')
            elif hasattr(entry, 'summary'):
                content_text = clean_html_content(entry.summary)

        # If still no content, use title as fallback
        if not content_text:
            content_text = title

        # Ensure both description and content have the same text
        description = content_text

        # Create dictionary with extracted data
        item_data = {
            "title": title,
            "link": link,
            "description": description,
            "pub_date": pub_date,
            "content": content_text
        }

        extracted_data.append(item_data)

    return extracted_data

# RSS URL from the snippet
rss_url = "https://www.rtci.tn/articles/rss"

# Parse the feed
try:
    results = parse_rss_feed(rss_url)

    # Print the extracted data without truncation
    for i, item in enumerate(results, 1):
        print(f"=== Item {i} ===")
        print(f"Title: {item['title']}")
        print(f"Link: {item['link']}")
        print(f"Description: {item['description']}")
        print(f"Publication Date: {item['pub_date']}")
        print(f"Content: {item['content']}")
        print("\n" + "-" * 80 + "\n")

except Exception as e:
    print(f"Error parsing RSS feed: {e}")

=== Item 1 ===
Title: Médecine hyperbare : un premier congrès national pour structurer l’avenir en Tunisie
Link: https://www.rtci.tn/article/68cd6f062b5755c3b5a2cf69/m%C3%A9decine-hyperbare--un-premier-congr%C3%A8s-national-pour-structurer-lavenir-en-tunisie
Description: La médecine hyperbare à l’honneur à Tunis : un premier congrès national pour structurer l’avenir de la discipline en Tunisie La médecine hyperbare fait un pas décisif en Tunisie avec la tenue du premier Congrès National de Médecine Hyperbare, organisé par la Société Tunisienne de Médecine Hyperbare à la Cité des Sciences de Tunis.
Publication Date: Fri, 19 Sep 2025 14:52:00 +0100
Content: La médecine hyperbare à l’honneur à Tunis : un premier congrès national pour structurer l’avenir de la discipline en Tunisie La médecine hyperbare fait un pas décisif en Tunisie avec la tenue du premier Congrès National de Médecine Hyperbare, organisé par la Société Tunisienne de Médecine Hyperbare à la Cité des Sciences de Tunis.

--

In [69]:
import feedparser
from bs4 import BeautifulSoup
import html

def extract_rss_feed_content(rss_url):
    """
    Extract and clean RSS feed content from the given URL
    """
    # Parse the RSS feed
    feed = feedparser.parse(rss_url)

    results = []

    for entry in feed.entries:
        # Extract basic fields
        title = get_clean_text(entry.get('title', ''))
        link = entry.get('link', '')
        description = get_clean_text(entry.get('description', ''))
        pub_date = entry.get('published', entry.get('pubDate', ''))

        # Extract content - combine all available content sources
        content_parts = []

        # Add description content first
        if description and description != title:
            content_parts.append(description)

        # Add other content sources if available
        if hasattr(entry, 'content'):
            content_text = get_clean_text(entry.content[0].value)
            if content_text and content_text != description and content_text != title:
                content_parts.append(content_text)

        elif hasattr(entry, 'content_encoded'):
            content_text = get_clean_text(entry.content_encoded)
            if content_text and content_text != description and content_text != title:
                content_parts.append(content_text)

        elif hasattr(entry, 'summary_detail') and hasattr(entry.summary_detail, 'value'):
            content_text = get_clean_text(entry.summary_detail.value)
            if content_text and content_text != description and content_text != title:
                content_parts.append(content_text)

        elif hasattr(entry, 'summary'):
            content_text = get_clean_text(entry.summary)
            if content_text and content_text != description and content_text != title:
                content_parts.append(content_text)

        # Combine all content parts
        if content_parts:
            content = ' '.join(content_parts)
        else:
            content = description  # Fallback to description

        # Create result dictionary
        result = {
            "title": title,
            "link": link,
            "description": description,
            "pub_date": pub_date,
            "content": content
        }

        results.append(result)

    return results

def get_clean_text(html_content):
    """
    Clean HTML content and extract plain text using BeautifulSoup
    """
    if not html_content:
        return ""

    # Decode HTML entities first
    decoded_content = html.unescape(str(html_content))

    # Parse with BeautifulSoup and get text
    soup = BeautifulSoup(decoded_content, 'html.parser')
    clean_text = soup.get_text(separator=' ', strip=True)

    # Remove extra whitespace
    clean_text = ' '.join(clean_text.split())

    return clean_text

# Example usage with the RSS URL from your snippet
if __name__ == "__main__":
    rss_url = "https://www.radiomonastir.tn/articles/rss"

    try:
        entries = extract_rss_feed_content(rss_url)

        for i, entry in enumerate(entries, 1):
            print(f"Entry {i}:")
            print(f"Title: {entry['title']}")
            print(f"Link: {entry['link']}")
            print(f"Description: {entry['description']}")
            print(f"Publication Date: {entry['pub_date']}")
            print(f"Content: {entry['content']}")
            print("-" * 80)
            print()

    except Exception as e:
        print(f"Error parsing RSS feed: {e}")

Entry 1:
Title: اجتماع لجنة قيادة برنامج للسكن الاجتماعي.
Link: https://www.radiomonastir.tn/article/68cda36e2b5755c3b5b48287/%D8%A7%D8%AC%D8%AA%D9%85%D8%A7%D8%B9-%D9%84%D8%AC%D9%86%D8%A9-%D9%82%D9%8A%D8%A7%D8%AF%D8%A9-%D8%A8%D8%B1%D9%86%D8%A7%D9%85%D8%AC-%D9%84%D9%84%D8%B3%D9%83%D9%86-%D8%A7%D9%84%D8%A7%D8%AC%D8%AA%D9%85%D8%A7%D8%B9%D9%8A
Description: في إطار مواصلة اجراءات توزيع المساكن الاجتماعية الجاهزة لمستحقيها، التأمت اليوم الجمعة 19-09-2025، تحت إشراف وزير التجهيز والإسكان صلاح الزواري، لجنة قيادة برنامج السكن الاجتماعي والتي خصصت للإعلام بالقائمات النهائية للمترشحين للانتفاع بمساكن اجتماعية والمصادقة على جداول تحديد الأثمان بالنسبة لكل منتفع بالمساكن الاجتماعية
Publication Date: Fri, 19 Sep 2025 18:39:00 +0100
Content: في إطار مواصلة اجراءات توزيع المساكن الاجتماعية الجاهزة لمستحقيها، التأمت اليوم الجمعة 19-09-2025، تحت إشراف وزير التجهيز والإسكان صلاح الزواري، لجنة قيادة برنامج السكن الاجتماعي والتي خصصت للإعلام بالقائمات النهائية للمترشحين للانتفاع بمساكن اجتماعية والمصادقة 

In [71]:
import feedparser
from bs4 import BeautifulSoup
import html

def clean_html_content(text):
    """Remove HTML tags and clean the text content"""
    if not text:
        return ""

    # Parse with BeautifulSoup to remove HTML tags
    soup = BeautifulSoup(text, 'html.parser')
    clean_text = soup.get_text(separator=' ', strip=True)

    # Unescape HTML entities
    clean_text = html.unescape(clean_text)

    return clean_text

def parse_rss_feed(url):
    """Parse RSS feed and extract required fields"""
    # Parse the RSS feed
    feed = feedparser.parse(url)

    results = []

    for entry in feed.entries:
        # Extract fields with fallbacks
        title = clean_html_content(entry.get('title', ''))
        link = entry.get('link', '')
        description = clean_html_content(entry.get('description', ''))

        # Handle publication date (prefer published, then updated, then current date)
        pub_date = entry.get('published', '') or entry.get('updated', '') or entry.get('date', '')

        # Handle content - try multiple possible content fields and include description
        content_parts = []

        # First try to get main content
        if hasattr(entry, 'content') and entry.content:
            content_parts.append(clean_html_content(entry.content[0].value))
        elif hasattr(entry, 'summary'):
            content_parts.append(clean_html_content(entry.summary))

        # Always include description in content
        if description and description not in content_parts:
            content_parts.append(description)

        # Combine all content parts
        content = ' '.join(content_parts).strip()

        # If no content was found from other sources, use description as fallback
        if not content and description:
            content = description

        # Create result dictionary
        result = {
            "title": title,
            "link": link,
            "description": description,
            "pub_date": pub_date,
            "content": content
        }

        results.append(result)

    return results

# Example usage with the provided RSS URL
if __name__ == "__main__":
    rss_url = "https://www.radiojeunes.tn/articles/rss"

    try:
        articles = parse_rss_feed(rss_url)

        for i, article in enumerate(articles, 1):
            print(f"=== Article {i} ===")
            print(f"Title: {article['title']}")
            print(f"Link: {article['link']}")
            print(f"Description: {article['description']}")
            print(f"Publication Date: {article['pub_date']}")
            print(f"Content: {article['content']}")
            print("\n" + "-" * 80 + "\n")

    except Exception as e:
        print(f"Error parsing RSS feed: {e}")

=== Article 1 ===
Title: اللجنة الإستشارية لبرنامج التأهيل الصناعي للمؤسسات تُصادق على 4 ملفات جديدة باستثمارات جملية تفوق 110 مليون دينار
Link: https://www.radiojeunes.tn/article/68cda3ef2b5755c3b5b4b8be/%D8%A7%D9%84%D9%84%D8%AC%D9%86%D8%A9-%D8%A7%D9%84%D8%A5%D8%B3%D8%AA%D8%B4%D8%A7%D8%B1%D9%8A%D8%A9-%D9%84%D8%A8%D8%B1%D9%86%D8%A7%D9%85%D8%AC-%D8%A7%D9%84%D8%AA%D8%A3%D9%87%D9%8A%D9%84-%D8%A7%D9%84%D8%B5%D9%86%D8%A7%D8%B9%D9%8A-%D9%84%D9%84%D9%85%D8%A4%D8%B3%D8%B3%D8%A7%D8%AA-%D8%AA%D8%B5%D8%A7%D8%AF%D9%82-%D8%B9%D9%84%D9%89-4-%D9%85%D9%84%D9%81%D8%A7%D8%AA-%D8%AC%D8%AF%D9%8A%D8%AF%D8%A9-%D8%A8%D8%A7%D8%B3%D8%AA%D8%AB%D9%85%D8%A7%D8%B1%D8%A7%D8%AA-%D8%AC%D9%85%D9%84%D9%8A%D8%A9-%D8%AA%D9%81%D9%88%D9%82-110-%D9%85%D9%84%D9%8A%D9%88%D9%86-%D8%AF%D9%8A%D9%86%D8%A7%D8%B1
Description: صادقت اللّجنة الاستشارية لبرنامج التأهيل الصناعي للمؤسسات على 4 ملفات لمؤسسات صناعية ناشطة في مجالات الصّناعات الغذائيّة والنسيج والملابس وقطاع الصناعات الميكانيكية والكهربائية وصناعات مختلفة، باستثمارات جملية

In [73]:
import feedparser
from bs4 import BeautifulSoup
import html

def extract_rss_content(rss_url):
    """
    Extract and clean content from RSS feed using feedparser and BeautifulSoup
    """
    # Parse the RSS feed
    feed = feedparser.parse(rss_url)

    results = []

    for entry in feed.entries:
        # Extract basic fields
        title = getattr(entry, 'title', '')
        link = getattr(entry, 'link', '')
        description = getattr(entry, 'description', '')
        pub_date = getattr(entry, 'published', getattr(entry, 'pubDate', ''))

        # Extract content - try multiple possible fields
        content = ''
        if hasattr(entry, 'content'):
            content = entry.content[0].value if entry.content else ''
        elif hasattr(entry, 'summary'):
            content = entry.summary

        # Combine content with description if both exist
        if content and description:
            combined_content = f"{description} {content}"
        elif content:
            combined_content = content
        else:
            combined_content = description

        # Clean HTML and unwanted strings using BeautifulSoup
        title_clean = clean_html_content(title)
        description_clean = clean_html_content(description)
        content_clean = clean_html_content(combined_content)

        # Create result dictionary
        result = {
            "title": title_clean,
            "link": link,
            "description": description_clean,
            "pub_date": pub_date,
            "content": content_clean
        }

        results.append(result)

    return results

def clean_html_content(text):
    """
    Clean HTML content and extract plain text using BeautifulSoup
    """
    if not text:
        return ""

    # Decode HTML entities first
    text = html.unescape(text)

    # Use BeautifulSoup to extract text and clean HTML tags
    soup = BeautifulSoup(text, 'html.parser')

    # Remove script and style elements
    for script in soup(["script", "style"]):
        script.decompose()

    # Get clean text
    clean_text = soup.get_text()

    # Clean up whitespace
    clean_text = ' '.join(clean_text.split())

    return clean_text

def print_results(results):
    """
    Print the extracted results without truncation
    """
    for i, result in enumerate(results, 1):
        print(f"=== Item {i} ===")
        print(f"Title: {result['title']}")
        print(f"Link: {result['link']}")
        print(f"Description: {result['description']}")
        print(f"Publication Date: {result['pub_date']}")
        print(f"Content: {result['content']}")
        print("\n" + "-"*80 + "\n")

# Main execution
if __name__ == "__main__":
    rss_url = "https://www.radionationale.tn/articles/rss"

    try:
        results = extract_rss_content(rss_url)
        print_results(results)

    except Exception as e:
        print(f"Error parsing RSS feed: {e}")

=== Item 1 ===
Title: معاقبة ماهر الكنزاري مدرب الترجي بــ 4 مباريات وتخطئته ب7 الاف دينار
Link: https://www.radionationale.tn/article/68cda6dc2b5755c3b5b5a6e7/%D9%85%D8%B9%D8%A7%D9%82%D8%A8%D8%A9-%D9%85%D8%A7%D9%87%D8%B1-%D8%A7%D9%84%D9%83%D9%86%D8%B2%D8%A7%D8%B1%D9%8A-%D9%85%D8%AF%D8%B1%D8%A8-%D8%A7%D9%84%D8%AA%D8%B1%D8%AC%D9%8A-%D8%A8%D9%80%D9%80-4-%D9%85%D8%A8%D8%A7%D8%B1%D9%8A%D8%A7%D8%AA-%D9%88%D8%AA%D8%AE%D8%B7%D8%A6%D8%AA%D9%87-%D8%A87-%D8%A7%D9%84%D8%A7%D9%81-%D8%AF%D9%8A%D9%86%D8%A7%D8%B1
Description: قررت الهيئة المكلفة بتسيير شؤون الرابطة الوطنية لكرة القدم المحترفة خلال اجتماعها اليوم الجمعة معاقبة مدرب الترجي الرياضي ماهر الكنزاري بمنعه من الجلوس على دكة البدلاء ب4 مباريات
Publication Date: Fri, 19 Sep 2025 18:54:00 +0100
Content: قررت الهيئة المكلفة بتسيير شؤون الرابطة الوطنية لكرة القدم المحترفة خلال اجتماعها اليوم الجمعة معاقبة مدرب الترجي الرياضي ماهر الكنزاري بمنعه من الجلوس على دكة البدلاء ب4 مباريات معاقبة ماهر الكنزاري مدرب الترجي بــ 4 مباريات وتخطئته ب7 الاف دين

In [74]:
import feedparser
from bs4 import BeautifulSoup
import html

def clean_html_content(text):
    """Remove HTML tags and clean the text content"""
    if not text:
        return ""

    # Parse with BeautifulSoup to remove HTML tags
    soup = BeautifulSoup(text, 'html.parser')
    clean_text = soup.get_text(separator=' ', strip=True)

    # Decode HTML entities and clean up
    clean_text = html.unescape(clean_text)
    clean_text = ' '.join(clean_text.split())  # Normalize whitespace

    return clean_text

def parse_rss_feed(rss_url):
    """Parse RSS feed and extract required fields"""
    # Parse the RSS feed
    feed = feedparser.parse(rss_url)

    results = []

    for entry in feed.entries:
        # Extract fields with fallbacks
        title = clean_html_content(entry.get('title', ''))
        link = entry.get('link', '')
        description = clean_html_content(entry.get('description', ''))

        # Handle publication date (try multiple possible fields)
        pub_date = entry.get('published', '') or entry.get('pubDate', '') or entry.get('updated', '')

        # Handle content (try multiple possible fields)
        content = ''
        if hasattr(entry, 'content'):
            content = clean_html_content(entry.content[0].value if entry.content else '')
        elif hasattr(entry, 'summary'):
            content = clean_html_content(entry.summary)
        elif hasattr(entry, 'description'):
            content = clean_html_content(entry.description)

        # Create result dictionary
        result = {
            "title": title,
            "link": link,
            "description": description,
            "pub_date": pub_date,
            "content": content
        }

        results.append(result)

    return results

# Example usage
if __name__ == "__main__":
    rss_url = "https://www.radiosfax.tn/articles/rss"

    try:
        entries = parse_rss_feed(rss_url)

        for i, entry in enumerate(entries, 1):
            print(f"Entry {i}:")
            print(f"Title: {entry['title']}")
            print(f"Link: {entry['link']}")
            print(f"Description: {entry['description']}")
            print(f"Publication Date: {entry['pub_date']}")
            print(f"Content: {entry['content']}")
            print("-" * 80)
            print()

    except Exception as e:
        print(f"Error parsing RSS feed: {e}")

Entry 1:
Title: رئيسة قسم الأعصاب بمستشفى الحبيب بورقية صفاقس: تونس تسجل سنويا بين 18 و24 ألف حالة جلطة دماغية
Link: https://www.radiosfax.tn/article/68cd365a2b5755c3b5917205/%D8%B1%D8%A6%D9%8A%D8%B3%D8%A9-%D9%82%D8%B3%D9%85-%D8%A7%D9%84%D8%A3%D8%B9%D8%B5%D8%A7%D8%A8-%D8%A8%D9%85%D8%B3%D8%AA%D8%B4%D9%81%D9%89-%D8%A7%D9%84%D8%AD%D8%A8%D9%8A%D8%A8-%D8%A8%D9%88%D8%B1%D9%82%D9%8A%D8%A9-%D8%B5%D9%81%D8%A7%D9%82%D8%B3-%D8%AA%D9%88%D9%86%D8%B3-%D8%AA%D8%B3%D8%AC%D9%84-%D8%B3%D9%86%D9%88%D9%8A%D8%A7-%D8%A8%D9%8A%D9%86-18-%D9%8824-%D8%A3%D9%84%D9%81-%D8%AD%D8%A7%D9%84%D8%A9-%D8%AC%D9%84%D8%B7%D8%A9-%D8%AF%D9%85%D8%A7%D8%BA%D9%8A%D8%A9
Description: رئيسة قسم الأعصاب بمستشفى الحبيب بورقية صفاقس: تونس تسجل سنويا بين 18 و24 ألف حالة جلطة دماغية
Publication Date: Fri, 19 Sep 2025 10:20:00 +0100
Content: رئيسة قسم الأعصاب بمستشفى الحبيب بورقية صفاقس: تونس تسجل سنويا بين 18 و24 ألف حالة جلطة دماغية
--------------------------------------------------------------------------------

Entry 2:
Title: جلسة ح

In [78]:
import feedparser
from bs4 import BeautifulSoup
import html

def clean_html_content(text):
    """Clean HTML content and extract plain text"""
    if not text:
        return ""

    # Parse HTML content
    soup = BeautifulSoup(text, 'html.parser')

    # Remove unwanted tags but preserve text content
    for element in soup(['script', 'style', 'iframe', 'noscript', 'header', 'footer', 'nav', 'aside']):
        element.decompose()

    # Get clean text
    clean_text = soup.get_text(separator=' ', strip=True)

    # Decode HTML entities and normalize whitespace
    clean_text = html.unescape(clean_text)
    clean_text = ' '.join(clean_text.split())

    return clean_text

def parse_rss_feed(rss_url):
    """Parse RSS feed and extract required fields"""
    # Parse the RSS feed
    feed = feedparser.parse(rss_url)

    results = []

    # Process each item in the feed
    for entry in feed.entries:
        # Extract and clean each field
        title = clean_html_content(entry.get('title', ''))
        link = entry.get('link', '')
        description = clean_html_content(entry.get('description', ''))

        # Handle publication date
        pub_date = entry.get('published', entry.get('pubDate', ''))

        # Handle content - try different possible fields
        content = ''
        if hasattr(entry, 'content'):
            content = clean_html_content(entry.content[0].value if entry.content else '')
        elif hasattr(entry, 'summary'):
            content = clean_html_content(entry.summary)

        # Combine content with description if both exist
        if content and description:
            full_content = f"{content} {description}"
        elif content:
            full_content = content
        elif description:
            full_content = description
        else:
            full_content = ''

        # Create result dictionary
        result = {
            "title": title,
            "link": link,
            "description": description,
            "pub_date": pub_date,
            "content": full_content
        }

        results.append(result)

    return results

# Extract RSS URL from the provided snippet
rss_url = "https://www.mosaiquefm.net/ar/rss"

# Parse the RSS feed
extracted_data = parse_rss_feed(rss_url)

# Print the extracted data without truncation
for i, item in enumerate(extracted_data, 1):
    print(f"Item {i}:")
    print(f"Title: {item['title']}")
    print(f"Link: {item['link']}")
    print(f"Description: {item['description']}")
    print(f"Publication Date: {item['pub_date']}")
    print(f"Content: {item['content']}")
    print("-" * 80)
    print()

Item 1:
Title: الفيتو.. كيف يُوقف صوت واحد قرارات العالم؟
Link: https://www.mosaiquefm.net/ar/%D8%A7%D9%84%D8%A3%D8%AE%D8%A8%D8%A7%D8%B1-%D8%A7%D9%84%D8%B9%D8%A7%D9%84%D9%85%D9%8A%D8%A9/1459368/%D8%A7%D9%84%D9%81%D9%8A%D8%AA%D9%88-%D9%83%D9%8A%D9%81-%D9%8A-%D9%88%D9%82%D9%81-%D8%B5%D9%88%D8%AA-%D9%88%D8%A7%D8%AD%D8%AF-%D9%82%D8%B1%D8%A7%D8%B1%D8%A7%D8%AA-%D8%A7%D9%84%D8%B9%D8%A7%D9%84%D9%85
Description: للمرّة السادسة تستخدم الولايات المتحدة الأمريكية حق النقض (الفيتو) في مجلس الأمن بشأن الحرب المستمرة في غزة.
Publication Date: Fri, 19 Sep 2025 19:57:09 +0100
Content: الفيتو.. كيف يُوقف صوت واحد قرارات العالم؟ للمرّة السادسة تستخدم الولايات المتحدة الأمريكية حق النقض (الفيتو) في مجلس الأمن بشأن الحرب المستمرة في غزة.
--------------------------------------------------------------------------------

Item 2:
Title: أكثر من 100 ألف حالة زهايمر في تونس
Link: https://www.mosaiquefm.net/ar/%D8%AA%D9%88%D9%86%D8%B3-%D9%88%D8%B7%D9%86%D9%8A%D8%A9/1459374/%D8%A3%D9%83%D8%AB%D8%B1-%D9%85%D9%86-10

In [80]:
import feedparser
from bs4 import BeautifulSoup
import html

def extract_rss_content(rss_url):
    """
    Extract and clean content from RSS feed using feedparser and BeautifulSoup
    """
    # Parse the RSS feed
    print(f"Parsing RSS feed from: {rss_url}")
    feed = feedparser.parse(rss_url)

    # Check if feed was parsed successfully
    if feed.bozo and feed.bozo_exception:
        print(f"Error parsing RSS feed: {feed.bozo_exception}")
        return

    print(f"Found {len(feed.entries)} items in the feed\n")

    # Extract and process each item
    for i, item in enumerate(feed.entries, 1):
        print(f"Item {i}:")
        print("-" * 60)

        # Extract basic fields with fallbacks
        title = getattr(item, 'title', 'No title available')
        link = getattr(item, 'link', 'No link available')
        pub_date = getattr(item, 'published', getattr(item, 'pubDate', 'No date available'))

        # Extract and clean description
        description = getattr(item, 'description', '')
        cleaned_description = clean_html_content(description)

        # Extract content - try multiple possible fields
        content = ''
        if hasattr(item, 'content'):
            content = item.content[0].get('value', '') if item.content else ''
        elif hasattr(item, 'summary'):
            content = item.summary
        elif hasattr(item, 'description'):
            content = item.description

        cleaned_content = clean_html_content(content)

        # Print all extracted information
        print(f"Title: {title}")
        print(f"Link: {link}")
        print(f"Publication Date: {pub_date}")
        print(f"Description: {cleaned_description}")
        print(f"Content: {cleaned_content}")
        print("-" * 60)
        print("\n")

def clean_html_content(html_content):
    """
    Clean HTML content and extract plain text using BeautifulSoup
    """
    if not html_content:
        return "No content available"

    try:
        # Parse HTML content with BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')

        # Remove unwanted tags but keep text content
        for unwanted_tag in soup(['script', 'style', 'img', 'br', 'link', 'meta', 'iframe', 'form', 'input', 'button']):
            unwanted_tag.decompose()

        # Remove all attributes from remaining tags
        for tag in soup.find_all(True):
            tag.attrs = {}

        # Get clean text and unescape HTML entities
        clean_text = soup.get_text(separator='\n', strip=True)
        clean_text = html.unescape(clean_text)

        # Remove extra whitespace and normalize line breaks
        lines = [line.strip() for line in clean_text.split('\n') if line.strip()]
        clean_text = '\n'.join(lines)

        return clean_text if clean_text else "No text content found after cleaning"

    except Exception as e:
        print(f"Error cleaning HTML content: {e}")
        # Fallback: return original content with basic cleaning
        return html.unescape(html_content).strip()

def main():
    """
    Main function to execute the RSS parsing
    """
    # The actual RSS feed URL provided
    rss_url = "https://www.jawharafm.net/ar/rss/showRss/88/1/1"

    print("Starting RSS Feed Extraction")
    print("=" * 60)

    try:
        extract_rss_content(rss_url)
    except Exception as e:
        print(f"An error occurred: {e}")

    print("Extraction completed!")

if __name__ == "__main__":
    main()

Starting RSS Feed Extraction
Parsing RSS feed from: https://www.jawharafm.net/ar/rss/showRss/88/1/1
Found 30 items in the feed

Item 1:
------------------------------------------------------------
Title: البعثة الدائمة لتونس بجنيف تدين خلال جلسة حوار عاجل الاعتداء على قطر
Link: https://www.jawharafm.net/ar/article/البعثة-الدائمة-لتونس-بجنيف-تدين-خلال-جلسة-حوار-عاجل-الاعتداء-على-قطر/92/281484
Publication Date: 2025-09-16T16:16:00+01:00
Description: أكّدت البعثة التونسية الدائمة بمجلس حقوق الإنسان بجنيف، إدانة تونس بأشدّ العبارات للاعتداء الغادر والجبان الذي أقدم عليه الكيان الصّهيوني المحتلّ يوم التاسع من سبتمبر 2025 ضدّ قطر في انتهاك سافر لكل القوانين والمواثيق الدّوليّة.
Content: أكّدت البعثة التونسية الدائمة بمجلس حقوق الإنسان بجنيف، إدانة تونس بأشدّ العبارات للاعتداء الغادر والجبان الذي أقدم عليه الكيان الصّهيوني المحتلّ يوم التاسع من سبتمبر 2025 ضدّ قطر في انتهاك سافر لكل القوانين والمواثيق الدّوليّة.
------------------------------------------------------------


Item 2:
----------

In [81]:
import feedparser
from bs4 import BeautifulSoup
import html

def clean_html_content(text):
    """Remove HTML tags and clean the text content"""
    if not text:
        return ""

    # Parse HTML content
    soup = BeautifulSoup(text, 'html.parser')

    # Remove script and style elements
    for script in soup(["script", "style"]):
        script.decompose()

    # Get text and clean it
    clean_text = soup.get_text()

    # Clean up whitespace and special characters
    clean_text = ' '.join(clean_text.split())
    clean_text = html.unescape(clean_text)

    return clean_text

def extract_rss_feed_data(url):
    """Extract and clean data from RSS feed"""
    # Parse the RSS feed
    feed = feedparser.parse(url)

    extracted_data = []

    for entry in feed.entries:
        # Extract basic fields
        title = clean_html_content(entry.get('title', ''))
        link = entry.get('link', '')

        # Extract description and clean it
        description = clean_html_content(entry.get('description', ''))

        # Extract publication date
        pub_date = entry.get('published', '') or entry.get('pubDate', '')

        # Extract content - try different possible fields
        content = ''
        if hasattr(entry, 'content'):
            content = clean_html_content(entry.content[0].value if entry.content else '')
        elif hasattr(entry, 'summary'):
            content = clean_html_content(entry.summary)
        elif hasattr(entry, 'description'):
            content = clean_html_content(entry.description)

        # For some feeds, content might be in different namespaces
        if not content and hasattr(entry, 'content_encoded'):
            content = clean_html_content(entry.content_encoded)

        item_data = {
            "title": title,
            "link": link,
            "description": description,
            "pub_date": pub_date,
            "content": content
        }

        extracted_data.append(item_data)

    return extracted_data

def main():
    # Extract the RSS URL from the provided snippet
    rss_url = "https://essahafa.tn/feed/"

    try:
        # Extract data from the RSS feed
        data = extract_rss_feed_data(rss_url)

        # Print the extracted data
        for i, item in enumerate(data, 1):
            print(f"Item {i}:")
            print(f"Title: {item['title']}")
            print(f"Link: {item['link']}")
            print(f"Description: {item['description']}")
            print(f"Publication Date: {item['pub_date']}")
            print(f"Content: {item['content']}")
            print("-" * 80)
            print()

    except Exception as e:
        print(f"Error parsing RSS feed: {e}")

if __name__ == "__main__":
    main()

Item 1:
Title: فواصل رياضية: كـيــروش لـمـاذا جــاء ولـمـاذا ذهــب؟ …ولــن يعود
Link: https://essahafa.tn/2025/01/14/%d9%81%d9%88%d8%a7%d8%b5%d9%84-%d8%b1%d9%8a%d8%a7%d8%b6%d9%8a%d8%a9-%d9%83%d9%80%d9%8a%d9%80%d9%80%d8%b1%d9%88%d8%b4-%d9%84%d9%80%d9%85%d9%80%d8%a7%d8%b0%d8%a7-%d8%ac%d9%80%d9%80%d8%a7%d8%a1-%d9%88/
Description: ‭ ‬يكتبها‭ ‬عبد‭ ‬السلام‭ ‬ضيف‭ ‬الله عاد‭ ‬كارلوس‭ ‬كيروش‭ ‬من‭ ‬حيث‭ ‬أتى،‭ ‬ربما‭ ‬كان‭ ‬يكفي‭ ‬قدومه‭ ‬الذي‭ ‬فهمت‭ ‬انه‭ ‬كان‭ ‬بطلب‭ ‬منه‭ ‬ليفتح‭ ‬ملف‭ ‬المنتخب‭ ‬الوطني‭ ‬دون‭ ‬أن‭ ‬يقفل،‭ ‬وأكثر‭ ‬من‭ ‬ذلك‭ ‬ليفتح‭ ‬الأعين‭ ‬التي‭ ‬نامت‭ ‬أو‭ ‬اتعامتب‭ ‬عن‭ ‬وضع‭ ‬اكرتناب‭ ‬وحالها‭ ‬وأحوالها‭. ‬مجيء‭ ‬الفني‭ ‬البرتغالي‭ ‬إلى‭ ‬تونس‭ ‬كأنه‭ ‬رسالة‭ ‬أخرى‭ ‬تدق‭ ‬باب‭ ‬كرة‭ ‬القدم‭ … ظهرت المقالة فواصل رياضية: كـيــروش لـمـاذا جــاء ولـمـاذا ذهــب؟ …ولــن يعود أولاً على الصحافة اليوم - يومية اخبارية جامعة.
Publication Date: Tue, 14 Jan 2025 09:31:55 +0000
Content: ‭ ‬يكتبها‭ ‬عبد‭ ‬السلام‭ ‬ضيف‭ ‬الله عاد‭ ‬كارلوس‭ ‬كيروش‭ ‬من‭ ‬حيث‭ ‬أتى،‭ ‬ربما‭ ‬كان‭ ‬ي

In [82]:
import feedparser
from bs4 import BeautifulSoup
import html

def clean_html_content(text):
    """Remove HTML tags and decode HTML entities"""
    if not text:
        return ""

    # Parse with BeautifulSoup to remove HTML tags
    soup = BeautifulSoup(text, 'html.parser')
    cleaned_text = soup.get_text(separator=' ', strip=True)

    # Decode HTML entities
    cleaned_text = html.unescape(cleaned_text)

    return cleaned_text

def extract_rss_feed_data(url):
    """Extract and clean data from RSS feed"""
    # Parse the RSS feed
    feed = feedparser.parse(url)

    extracted_data = []

    for entry in feed.entries:
        # Extract and clean each field
        item_data = {
            "title": clean_html_content(entry.get('title', '')),
            "link": entry.get('link', ''),
            "description": clean_html_content(entry.get('description', '')),
            "pub_date": entry.get('published', entry.get('pubDate', '')),
            "content": clean_html_content(entry.get('content', [{}])[0].get('value', '') if entry.get('content') else '')
        }

        # If content is empty, try alternative content fields
        if not item_data['content']:
            item_data['content'] = clean_html_content(entry.get('summary', ''))

        extracted_data.append(item_data)

    return extracted_data

# RSS feed URL from the snippet
rss_url = "https://lapresse.tn/feed/"

# Extract data from the RSS feed
data = extract_rss_feed_data(rss_url)

# Print the extracted data without truncation
for i, item in enumerate(data, 1):
    print(f"Item {i}:")
    print(f"Title: {item['title']}")
    print(f"Link: {item['link']}")
    print(f"Description: {item['description']}")
    print(f"Publication Date: {item['pub_date']}")
    print(f"Content: {item['content']}")
    print("-" * 80)
    print()

Item 1:
Title: Maher Kanzari, suspendu 4 matchs et écope de 7 000 dinars d’amende
Link: https://lapresse.tn/2025/09/19/maher-kanzari-suspendu-4-matchs-et-ecope-de-7-000-dinars-damende/
Description: La commission de la Ligue Nationale de Football Professionnel a rendu ses décisions concernant les récentes rencontres du championnat. L’entraîneur de l’Espérance Sportive de Tunis, Maher Kanzari, a été lourdement sanctionné pour son comportement lors du match contre le Stade Tunisien le 13 septembre dernier. Il a écopé d’une interdiction de s’asseoir sur le banc …
Publication Date: Fri, 19 Sep 2025 19:04:43 +0000
Content: La commission de la Ligue Nationale de Football Professionnel a rendu ses décisions concernant les récentes rencontres du championnat. L’entraîneur de l’Espérance Sportive de Tunis, Maher Kanzari , a été lourdement sanctionné pour son comportement lors du match contre le Stade Tunisien le 13 septembre dernier. Il a écopé d’une interdiction de s’asseoir sur le banc de touch

In [83]:
import feedparser
from bs4 import BeautifulSoup
import html

def extract_rss_content(rss_url):
    """
    Extract and clean RSS feed content from the given URL
    """
    # Parse the RSS feed
    feed = feedparser.parse(rss_url)

    # List to store extracted items
    extracted_items = []

    # Process each item in the feed
    for entry in feed.entries:
        item_data = {}

        # Extract title
        item_data['title'] = clean_html_content(entry.get('title', ''))

        # Extract link
        item_data['link'] = entry.get('link', '')

        # Extract description
        description = entry.get('description', '')
        if not description and hasattr(entry, 'summary'):
            description = entry.summary
        item_data['description'] = clean_html_content(description)

        # Extract publication date
        item_data['pub_date'] = entry.get('published', entry.get('pubDate', ''))

        # Extract content - try multiple possible fields
        content = ''
        if hasattr(entry, 'content'):
            content = entry.content[0].value if entry.content else ''
        elif hasattr(entry, 'content_encoded'):
            content = entry.content_encoded
        elif hasattr(entry, 'summary_detail'):
            content = entry.summary_detail.value

        item_data['content'] = clean_html_content(content)

        extracted_items.append(item_data)

    return extracted_items

def clean_html_content(text):
    """
    Clean HTML content using BeautifulSoup and return plain text
    """
    if not text:
        return ""

    # Decode HTML entities first
    decoded_text = html.unescape(text)

    # Parse with BeautifulSoup to extract text only
    soup = BeautifulSoup(decoded_text, 'html.parser')

    # Get clean text without HTML tags
    clean_text = soup.get_text(separator=' ', strip=False)

    # Clean up extra whitespace but preserve meaningful spacing
    clean_text = ' '.join(clean_text.split())

    return clean_text

def print_extracted_content(items):
    """
    Print extracted content without truncation
    """
    for i, item in enumerate(items, 1):
        print(f"=== Item {i} ===")
        print(f"Title: {item['title']}")
        print(f"Link: {item['link']}")
        print(f"Description: {item['description']}")
        print(f"Publication Date: {item['pub_date']}")
        print(f"Content: {item['content']}")
        print("\n" + "-"*80 + "\n")

# Main execution
if __name__ == "__main__":
    # Extract RSS URL from the provided snippet
    rss_url = "https://radiomedtunisie.com/feed/"

    try:
        # Extract content from the RSS feed
        extracted_items = extract_rss_content(rss_url)

        # Print the extracted content
        print_extracted_content(extracted_items)

    except Exception as e:
        print(f"Error processing RSS feed: {e}")

=== Item 1 ===
Title: انشطة دينية سياحية و ثقافية احتفالا بالمولد النبوي الشريف
Link: https://radiomedtunisie.com/%d8%a7%d9%86%d8%b4%d8%b7%d8%a9-%d8%af%d9%8a%d9%86%d9%8a%d8%a9-%d8%b3%d9%8a%d8%a7%d8%ad%d9%8a%d8%a9-%d9%88-%d8%ab%d9%82%d8%a7%d9%81%d9%8a%d8%a9-%d8%a7%d8%ad%d8%aa%d9%81%d8%a7%d9%84%d8%a7-%d8%a8%d8%a7/
Description: انعقدت يوم امس الخميس 28 أوت 2025 باحد الفضاء ات الخاصة بولاية القيروان الندوة الصحفية لجمعية الاحتفالات الدولية بمناسبة المولد النبوي الشريف للاعلان عن برنامج الدورة الثامنة من الاحتفالات ، و التي ستتضمن برمجة متنوعة ترتكز أساسا على الاأنشطة الدينية من مسابقات في تلاوة و ترتيل القرآن و رفع الآذان على طريقة الشيخ علي البراق […]
Publication Date: Fri, 29 Aug 2025 09:50:00 +0000
Content: انعقدت يوم امس الخميس 28 أوت 2025 باحد الفضاء ات الخاصة بولاية القيروان الندوة الصحفية لجمعية الاحتفالات الدولية بمناسبة المولد النبوي الشريف للاعلان عن برنامج الدورة الثامنة من الاحتفالات ، و التي ستتضمن برمجة متنوعة ترتكز أساسا على الاأنشطة الدينية من مسابقات في تلاوة و ترتيل القرآ

In [86]:
import feedparser
from bs4 import BeautifulSoup
import re

def clean_html_content(text):
    """Remove HTML tags and clean unwanted content using BeautifulSoup"""
    if not text:
        return ""

    # Parse with BeautifulSoup
    soup = BeautifulSoup(text, 'html.parser')

    # Remove script, style, and other non-content elements
    for element in soup(["script", "style", "aside", "div", "span"]):
        element.decompose()

    # Get text and clean up
    clean_text = soup.get_text()

    # Remove extra whitespace and newlines
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()

    return clean_text

def remove_boilerplate_text(text):
    """Remove WordPress boilerplate text from the content"""
    if not text:
        return ""

    # Patterns to remove WordPress boilerplate
    patterns = [
        r'The post.*?appeared first on.*?\.?$',
        r'ظهر أولاً على.*?\.?$',
        r'المصدر:.*?\.?$',
        r'Source:.*?\.?$',
        r'أوازيس أف أم\.?$',
        r'The post.*?first on.*?\.?$'
    ]

    # Remove each pattern
    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.DOTALL)

    # Clean up any trailing punctuation or whitespace
    text = re.sub(r'[\.\s]+$', '', text)
    text = text.strip()

    return text

def extract_article_content(content):
    """Extract the main article content by removing WordPress boilerplate"""
    if not content:
        return ""

    soup = BeautifulSoup(content, 'html.parser')

    # Remove common WordPress boilerplate patterns
    for element in soup.find_all(['p', 'div']):
        text = element.get_text().strip()
        # Remove elements that contain boilerplate text
        if any(phrase in text for phrase in ['The post', 'appeared first on', 'first on', 'Source:']):
            element.decompose()
        if any(phrase in text for phrase in ['أوازيس أف أم', 'المصدر:', 'ظهر أولاً على']):
            element.decompose()

    # Get clean text
    clean_content = soup.get_text()

    # Remove any remaining boilerplate
    clean_content = remove_boilerplate_text(clean_content)

    return clean_content

def extract_rss_feed_data(url):
    """Extract and clean RSS feed data"""
    # Parse the RSS feed
    feed = feedparser.parse(url)

    extracted_data = []

    for entry in feed.entries:
        # Extract basic fields
        title = entry.get('title', '')
        link = entry.get('link', '')

        # Handle description - clean HTML content and remove boilerplate
        description = entry.get('description', '')
        cleaned_description = clean_html_content(description)
        cleaned_description = remove_boilerplate_text(cleaned_description)

        # Handle publication date
        pub_date = entry.get('published', entry.get('pubDate', ''))

        # Handle content - try different possible content fields
        content = ''
        if hasattr(entry, 'content'):
            content = entry.content[0].value if entry.content else ''
        elif hasattr(entry, 'content_encoded'):
            content = entry.content_encoded
        elif hasattr(entry, 'summary'):
            content = entry.summary
        else:
            # Fallback to description if no content field found
            content = description

        # Extract actual article content by removing boilerplate
        cleaned_content = extract_article_content(content)

        # If content extraction failed, fall back to cleaned description
        if not cleaned_content or len(cleaned_content) < 20:
            cleaned_content = cleaned_description

        # Create data dictionary
        item_data = {
            "title": title,
            "link": link,
            "description": cleaned_description,
            "pub_date": pub_date,
            "content": cleaned_content
        }

        extracted_data.append(item_data)

    return extracted_data

def main():
    # RSS feed URL from the snippet
    rss_url = "https://oasis-fm.net/feed/"

    try:
        # Extract data from RSS feed
        data = extract_rss_feed_data(rss_url)

        # Print extracted data
        for i, item in enumerate(data, 1):
            print(f"Item {i}:")
            print(f"Title: {item['title']}")
            print(f"Link: {item['link']}")
            print(f"Description: {item['description']}")
            print(f"Publication Date: {item['pub_date']}")
            print(f"Content: {item['content']}")
            print("-" * 80)

    except Exception as e:
        print(f"Error parsing RSS feed: {e}")

if __name__ == "__main__":
    main()

Item 1:
Title: الرابطة2: زياد العلوي ينتقل إلى التقدم الرياضي بساقية الدائر
Link: https://oasis-fm.net/%d8%a7%d9%84%d8%b1%d8%a7%d8%a8%d8%b7%d8%a92-%d8%b2%d9%8a%d8%a7%d8%af-%d8%a7%d9%84%d8%b9%d9%84%d9%88%d9%8a-%d9%8a%d9%86%d8%aa%d9%82%d9%84-%d8%a5%d9%84%d9%89-%d8%a7%d9%84%d8%aa%d9%82%d8%af%d9%85-%d8%a7/
Description: أمضى المهاجم السابق للنادي البنزرتي زياد العلوي عقداً لمدة موسم مع التقدم الرياضي بساقية الدائر ومن المنتظر ان يباشر العلوي تحضيراته مع التقدم الرياضي هذا الاسبوع إستعداداً لبداية بطولة الرابطة المحترفة الثانية يشار إلى أن زياد العلوي سبق له اللعب في الإتحاد المنستيري والنادي الإفريقي ونادي النصر بنغازي
Publication Date: Tue, 16 Sep 2025 08:57:36 +0000
Content: أمضى المهاجم السابق للنادي البنزرتي زياد العلوي عقداً لمدة موسم مع التقدم الرياضي بساقية الدائر

ومن المنتظر ان يباشر العلوي تحضيراته مع التقدم الرياضي هذا الاسبوع إستعداداً لبداية بطولة الرابطة المحترفة الثانية
يشار إلى أن زياد العلوي سبق له اللعب في الإتحاد المنستيري والنادي الإفريقي ونادي النصر بنغازي
-------------

In [87]:
import feedparser
from bs4 import BeautifulSoup
import re

def clean_html_content(text):
    """Remove HTML tags and clean the text using BeautifulSoup"""
    if not text:
        return ""

    # Parse with BeautifulSoup
    soup = BeautifulSoup(text, 'html.parser')

    # Remove script and style elements
    for script in soup(["script", "style", "nav", "footer", "header", "aside"]):
        script.decompose()

    # Get text and clean it
    text = soup.get_text()

    # Clean up whitespace and newlines
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def extract_feed_data(rss_url):
    """Extract and clean data from RSS feed"""
    # Parse the RSS feed
    feed = feedparser.parse(rss_url)

    extracted_data = []

    for entry in feed.entries:
        # Extract basic fields
        title = clean_html_content(entry.get('title', ''))
        link = entry.get('link', '')

        # Handle description (could be in different fields)
        description = clean_html_content(entry.get('description', ''))
        if not description:
            description = clean_html_content(entry.get('summary', ''))

        # Handle publication date
        pub_date = entry.get('published', '')
        if not pub_date:
            pub_date = entry.get('updated', '')
        if not pub_date:
            pub_date = entry.get('dc_date', '')

        # Handle content (could be in different fields)
        content = ''
        if hasattr(entry, 'content'):
            content = clean_html_content(entry.content[0].value if entry.content else '')
        elif hasattr(entry, 'content_encoded'):
            content = clean_html_content(entry.content_encoded)
        elif hasattr(entry, 'summary'):
            content = clean_html_content(entry.summary)

        # Create data dictionary
        item_data = {
            "title": title,
            "link": link,
            "description": description,
            "pub_date": pub_date,
            "content": content
        }

        extracted_data.append(item_data)

    return extracted_data

# Example usage with the provided RSS URL
if __name__ == "__main__":
    rss_url = "https://inkyfada.com/en/feed/"

    try:
        data = extract_feed_data(rss_url)

        for i, item in enumerate(data, 1):
            print(f"Item {i}:")
            print(f"Title: {item['title']}")
            print(f"Link: {item['link']}")
            print(f"Description: {item['description']}")
            print(f"Publication Date: {item['pub_date']}")
            print(f"Content: {item['content']}")
            print("-" * 80)

    except Exception as e:
        print(f"Error parsing RSS feed: {e}")

Item 1:
Title: Tunisia’s descent into dictatorship: HRW condemns arbitrary detentions and repressive justice in Tunisia
Link: https://inkyfada.com/en/2025/04/16/human-rights-watch-repressive-justice-tunisia/
Description: Human Rights Watch has released a damning report on the authoritarian drift of Kaïs Saïed’s regime, denouncing arbitrary detentions and a repressive justice system.
Publication Date: Tue, 22 Apr 2025 12:10:18 +0000
--------------------------------------------------------------------------------
Item 2:
Title: Leaked data reveals the extent of Tunisia and the European Union’s cooperation on migration
Link: https://inkyfada.com/en/2025/04/03/leaks-migration-tunisia-europe/
Description: In 2023, Tunisia, the pivotal migration crossroads between Africa and Europe, emerged as the primary route to Italy, surpassing Libya. As Tunisia deepens its cooperation with the European Union, leaked confidential documents shed light on the scale of the migration crisis, its key figures,

In [88]:
import feedparser
from bs4 import BeautifulSoup
import re

def clean_html_content(text):
    """Remove HTML tags and clean up whitespace"""
    if not text:
        return ""

    # Parse with BeautifulSoup
    soup = BeautifulSoup(text, 'html.parser')

    # Remove script and style elements
    for script in soup(["script", "style", "nav", "header", "footer", "aside"]):
        script.decompose()

    # Get text content
    text = soup.get_text()

    # Clean up whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def extract_rss_feed_data(url):
    """Extract and clean data from RSS feed"""
    # Parse the RSS feed
    feed = feedparser.parse(url)

    results = []

    for entry in feed.entries:
        # Extract basic fields
        title = clean_html_content(entry.get('title', ''))
        link = entry.get('link', '')

        # Handle description (may contain HTML)
        description = clean_html_content(entry.get('description', ''))

        # Handle publication date
        pub_date = entry.get('published', entry.get('pubDate', ''))

        # Handle content - try different possible content fields
        content = ''
        if hasattr(entry, 'content'):
            content = clean_html_content(entry.content[0].value)
        elif hasattr(entry, 'summary'):
            content = clean_html_content(entry.summary)
        elif hasattr(entry, 'description'):
            content = clean_html_content(entry.description)

        # For some feeds, content might be in other fields
        if not content:
            for key in entry.keys():
                if 'content' in key.lower() and isinstance(entry[key], str):
                    content = clean_html_content(entry[key])
                    break

        # Create result dictionary
        result = {
            "title": title,
            "link": link,
            "description": description,
            "pub_date": pub_date,
            "content": content
        }

        results.append(result)

    return results

def main():
    # Extract the URL from the provided snippet
    rss_url = "https://ftdes.net/feed/"

    try:
        # Extract data from RSS feed
        feed_data = extract_rss_feed_data(rss_url)

        # Print the extracted data
        for i, item in enumerate(feed_data, 1):
            print(f"=== Item {i} ===")
            print(f"Title: {item['title']}")
            print(f"Link: {item['link']}")
            print(f"Description: {item['description']}")
            print(f"Publication Date: {item['pub_date']}")
            print(f"Content: {item['content']}")
            print("\n" + "="*80 + "\n")

    except Exception as e:
        print(f"Error parsing RSS feed: {e}")

if __name__ == "__main__":
    main()

=== Item 1 ===
Title: Colloque international: « Vulnérabilités des jeunes, sens des études et sens de la vie »
Link: https://ftdes.net/colloque-international-vulnerabilites-des-jeunes-sens-des-etudes-et-sens-de-la-vie/?utm_source=rss&utm_medium=rss&utm_campaign=colloque-international-vulnerabilites-des-jeunes-sens-des-etudes-et-sens-de-la-vie
Description: Colloque international: « Vulnérabilités des jeunes, sens des études et sens de la vie » 10-11 octobre 2025 à la Bibliothèque Nationale de Tunis Argumentaire Nos sociétés connaissent aujourd’hui de fortes mutations technologiques, environnementales, socioculturelles et idéologiques qui ne sont pas sans conséquences sur notre développement personnel et sur notre fonctionnement psychologique. En effet, vivre […] L’article Colloque international: « Vulnérabilités des jeunes, sens des études et sens de la vie » est apparu en premier sur FTDES.
Publication Date: Wed, 17 Sep 2025 13:48:20 +0000
Content: Colloque international: « Vulnérabili

In [90]:
import feedparser
from bs4 import BeautifulSoup
import html

def extract_rss_content(rss_url):
    """
    Extract and clean content from RSS feed using feedparser and BeautifulSoup
    """
    # Parse the RSS feed
    feed = feedparser.parse(rss_url)

    results = []

    for entry in feed.entries:
        # Extract basic fields
        title = getattr(entry, 'title', '')
        link = getattr(entry, 'link', '')
        pub_date = getattr(entry, 'published', getattr(entry, 'pubDate', ''))

        # Extract and clean description
        description = getattr(entry, 'description', '')
        description_clean = clean_html_content(description)

        # Extract and clean content (try multiple possible fields)
        content = ''
        if hasattr(entry, 'content'):
            content = entry.content[0].value if entry.content else ''
        elif hasattr(entry, 'summary'):
            content = entry.summary
        content_clean = clean_html_content(content)

        # If content is empty but description has content, use description
        if not content_clean.strip() and description_clean.strip():
            content_clean = description_clean

        result = {
            "title": clean_text(title),
            "link": clean_text(link),
            "description": description_clean,
            "pub_date": clean_text(pub_date),
            "content": content_clean
        }

        results.append(result)

    return results

def clean_html_content(html_content):
    """
    Clean HTML content using BeautifulSoup to remove all HTML tags
    while preserving the text content
    """
    if not html_content:
        return ""

    # Unescape HTML entities first
    unescaped_content = html.unescape(html_content)

    # Parse with BeautifulSoup and extract text
    soup = BeautifulSoup(unescaped_content, 'html.parser')

    # Remove script and style elements
    for script in soup(["script", "style", "head", "title", "meta", "[document]"]):
        script.decompose()

    # Get text and clean up
    text = soup.get_text()

    # Clean up whitespace
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = ' '.join(chunk for chunk in chunks if chunk)

    return text

def clean_text(text):
    """
    Basic text cleaning (remove extra whitespace, unescape HTML entities)
    """
    if not text:
        return ""

    cleaned = html.unescape(text).strip()
    return cleaned

# Example usage with your RSS snippet URL
if __name__ == "__main__":
    # Replace this with the actual RSS URL from your snippet
    rss_url = "https://www.jawharafm.net/ar/rss/showRss/88/1/4"  # Assuming this is the base URL

    try:
        entries = extract_rss_content(rss_url)

        for i, entry in enumerate(entries, 1):
            print(f"=== Entry {i} ===")
            print(f"Title: {entry['title']}")
            print(f"Link: {entry['link']}")
            print(f"Description: {entry['description']}")
            print(f"Publication Date: {entry['pub_date']}")
            print(f"Content: {entry['content']}")
            print("=" * 50)
            print()

    except Exception as e:
        print(f"Error parsing RSS feed: {e}")

=== Entry 1 ===
Title: بديل واتساب وتيليغرام.. ما هو تطبيق "ماكس" ؟
Link: https://www.jawharafm.net/ar/article/بديل-واتساب-وتيليغرام..-ما-هو-تطبيق-ماكس-؟/94/281083
Description: في خطوة جديدة ضمن سعيها لتعزيز السيطرة الرقمية، أطلقت السلطات الروسية تطبيق المراسلة الجديد "ماكس"، الذي تصفه بأنه "المُرسِل الوطني" المخصص لمنافسة تطبيقات مثل واتساب وتيليغرام، بل وتحويله إلى منصة شاملة على غرار "وي تشات" الصيني.
Publication Date: 2025-09-08T12:28:00+01:00
Content: في خطوة جديدة ضمن سعيها لتعزيز السيطرة الرقمية، أطلقت السلطات الروسية تطبيق المراسلة الجديد "ماكس"، الذي تصفه بأنه "المُرسِل الوطني" المخصص لمنافسة تطبيقات مثل واتساب وتيليغرام، بل وتحويله إلى منصة شاملة على غرار "وي تشات" الصيني.

=== Entry 2 ===
Title: أوبن إيه آي تطلق التحديث الخامس لـ'تشات جي بي تي'
Link: https://www.jawharafm.net/ar/article/أوبن-إيه-آي-تطلق-التحديث-الخامس-لـتشات-جي-بي-تي/94/279770
Description: أطلقت شركة "أوبن إيه آي" الرائدة في مجال الذكاء الاصطناعي التوليدي منذ إطلاق "تشات جي بي تي" أواخر عام 2022، نموذجا جديد

In [91]:
import feedparser
from bs4 import BeautifulSoup
import html

def extract_rss_content(rss_url):
    """
    Extract and clean content from RSS feed using feedparser and BeautifulSoup
    """
    # Parse the RSS feed
    feed = feedparser.parse(rss_url)

    results = []

    for entry in feed.entries:
        # Extract basic fields
        title = getattr(entry, 'title', '')
        link = getattr(entry, 'link', '')
        pub_date = getattr(entry, 'published', getattr(entry, 'pubDate', ''))

        # Extract and clean description
        description = getattr(entry, 'description', '')
        description_clean = clean_html_content(description)

        # Extract and clean content (try multiple possible fields)
        content = ''
        if hasattr(entry, 'content'):
            content = entry.content[0].value if entry.content else ''
        elif hasattr(entry, 'summary'):
            content = entry.summary
        content_clean = clean_html_content(content)

        # If content is empty but description has content, use description
        if not content_clean.strip() and description_clean.strip():
            content_clean = description_clean

        result = {
            "title": clean_text(title),
            "link": clean_text(link),
            "description": description_clean,
            "pub_date": clean_text(pub_date),
            "content": content_clean
        }

        results.append(result)

    return results

def clean_html_content(html_content):
    """
    Clean HTML content using BeautifulSoup to remove all HTML tags
    while preserving the text content
    """
    if not html_content:
        return ""

    # Unescape HTML entities first
    unescaped_content = html.unescape(html_content)

    # Parse with BeautifulSoup and extract text
    soup = BeautifulSoup(unescaped_content, 'html.parser')

    # Remove script and style elements
    for script in soup(["script", "style", "head", "title", "meta", "[document]"]):
        script.decompose()

    # Get text and clean up
    text = soup.get_text()

    # Clean up whitespace
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = ' '.join(chunk for chunk in chunks if chunk)

    return text

def clean_text(text):
    """
    Basic text cleaning (remove extra whitespace, unescape HTML entities)
    """
    if not text:
        return ""

    cleaned = html.unescape(text).strip()
    return cleaned

# Example usage with your RSS snippet URL
if __name__ == "__main__":
    # Replace this with the actual RSS URL from your snippet
    rss_url = "https://www.jawharafm.net/ar/rss/showRss/88/1/2"  # Assuming this is the base URL

    try:
        entries = extract_rss_content(rss_url)

        for i, entry in enumerate(entries, 1):
            print(f"=== Entry {i} ===")
            print(f"Title: {entry['title']}")
            print(f"Link: {entry['link']}")
            print(f"Description: {entry['description']}")
            print(f"Publication Date: {entry['pub_date']}")
            print(f"Content: {entry['content']}")
            print("=" * 50)
            print()

    except Exception as e:
        print(f"Error parsing RSS feed: {e}")

=== Entry 1 ===
Title: رقم الأسبوع: 1 أورو يساوي 3.4 دينار في سوق التعاملات البنكية
Link: https://www.jawharafm.net/ar/article/رقم-الأسبوع-1-أورو-يساوي-3.4-دينار-في-سوق-التعاملات-البنكية/93/281519
Description: شهد الدينار التونسي تراجعا أمام العملة الأوروبية الأورو، إذ بات يتداول، منذ أكثر من أسبوع في سوق التعاملات البنكية بـ3,4 دينار للأورو الواحد وفق البيانات المنشورة على موقع البنك المركزي التونسي.
Publication Date: 2025-09-17T10:56:00+01:00
Content: شهد الدينار التونسي تراجعا أمام العملة الأوروبية الأورو، إذ بات يتداول، منذ أكثر من أسبوع في سوق التعاملات البنكية بـ3,4 دينار للأورو الواحد وفق البيانات المنشورة على موقع البنك المركزي التونسي.

=== Entry 2 ===
Title: الدولار عند أدنى مستوى في 4 سنوات مقابل اليورو
Link: https://www.jawharafm.net/ar/article/الدولار-عند-أدنى-مستوى-في-4-سنوات-مقابل-اليورو/93/281501
Description: انخفض الدولار بشدة اليوم الثلاثاء مسجلا أدنى مستوى في 4 سنوات مقابل اليورو، مع زيادة رهانات المستثمرين على أن مجلس الاحتياطي الاتحادي (البنك المركزي الأميركي) سيخف