In [6]:
import xml.etree.ElementTree as ET

def parse_opml(file_path):
    '''
    Description: Takes the file path of an OPML file and extracts the RSS feed URLs.
    Args:
        file_path: A string representing the path to the OPML file.
    Returns: A list of RSS feed URLs.
    '''
    
    # Parse the OPML file using ElementTree
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Initialize an empty list to store the feed URLs
    feeds = []

    # Iterate through the 'outline' elements in the OPML file
    for elem in root.iter('outline'):
        # Extract the 'xmlUrl' attribute containing the RSS feed URL
        url = elem.get('xmlUrl')

        # If the URL exists, append it to the feeds list
        if url:
            feeds.append(url)

    # Return the list of feed URLs
    return feeds

In [10]:
feeds = parse_opml('../data/tyler-rss-subscriptions.opml')
feeds

['http://mikudb.moe/feed/',
 'https://frozenstarfall.net/feed/',
 'http://bunkai-kei.com/feed/',
 'https://kill-the-newsletter.com/feeds/cfp02mmxp2rhdpfq.xml',
 'https://www.animenewsnetwork.com/review/rss.xml',
 'https://www.animenewsnetwork.com/feature/rss.xml',
 'https://www.evageeks.org/feed/',
 'https://fullfrontal.moe/feed/',
 'https://buttercupfestival.tumblr.com/rss',
 'http://feeds.feedburner.com/Buttersafe',
 'https://qwantz.com/rssfeed.php',
 'https://www.paranatural.net/comic/rss',
 'http://feeds.penny-arcade.com/pa-mainsite/',
 'http://sssscomic.com/ssss-feed.xml',
 'https://www.oglaf.com/feeds/rss/',
 'https://www.smbc-comics.com/comic/rss',
 'https://pbfcomics.com/feed/',
 'http://feeds.feedburner.com/wondermark',
 'https://xkcd.com/atom.xml',
 'http://thesecretknots.com/feed/',
 'http://dresdencodak.com/feed/',
 'http://askakorean.blogspot.com/feeds/posts/default',
 'http://populargusts.blogspot.com/feeds/posts/default',
 'http://freekorea.us/feed/',
 'http://www.econta

In [56]:
import feedparser
import pandas as pd

# Make a function out of this
def rss_to_df(feed, source):
    '''
    Description: Takes a feedparser object and converts it to a Pandas DataFrame
    Args:
        feed: A feedparser object
    Returns: A Pandas DataFrame
    '''
    #df = pd.DataFrame({'title': feed.entries[0].title, 'link': feed.entries[0].link, 'published': feed.entries[0].published}, index=[0])
    df = pd.DataFrame()
    for i in range(len(feed.entries)):
        entry = feed.entries[i]
        curr = pd.DataFrame({'title': entry.title, 'link': entry.link, 'summary': entry.summary, 'published': entry.published}, index=[i])
        df = pd.concat([df, curr], ignore_index=True)
    df['source'] = source

    # Convert the published column to a datetime object
    df['published'] = pd.to_datetime(df['published'])

    # Fix the "- AP news - en español" at the end of the title
    df['title'] = df['title'].str.replace(' - The Associated Press - en Español', '')
    return df

# Iterate over the feeds and process them
feed = feedparser.parse(feeds[0])
feed.entries[0].author

'Kyrozen'

In [58]:
df = pd.DataFrame()
for feed_url in feeds[1:5]:
    try:
        feed = feedparser.parse(feed_url)
        out = rss_to_df(feed, "")
        df = pd.concat([df, out], ignore_index=True)
    except Exception as e:
        print(f"Error processing feed URL: {feed_url}. Error: {e}")

Error processing feed URL: https://kill-the-newsletter.com/feeds/cfp02mmxp2rhdpfq.xml. Error: object has no attribute 'published'


In [60]:
df

Unnamed: 0,title,link,summary,published,source
0,Comiket 99 announcement,https://frozenstarfall.net/comiket-99-announce...,Hi everyone! we are finally back with a new al...,2021-12-12 19:28:03+00:00,
1,LUNARTANZ (2021 Ver.) & Nhato Remix,https://frozenstarfall.net/lunartanz-2021-ver-...,Hi everyone! We remasted our song LUNARTANZ fr...,2021-09-24 09:35:20+00:00,
2,Hiatus,https://frozenstarfall.net/hiatus/,Hi everyone! I&#8217;m sorry to inform you tha...,2021-04-04 16:49:22+00:00,
3,New works for Air Comiket 2,https://frozenstarfall.net/new-works-for-air-c...,Hi everyone! 2020 has been a tough year for ev...,2020-12-14 18:59:08+00:00,
4,Comiket 97 announcements,https://frozenstarfall.net/comiket-97-announce...,Hi everyone! We are happy to finally announce ...,2019-12-18 16:59:12+00:00,
5,Kotoba,http://bunkai-kei.com/release/bk-k_051/,track06 Bass : Jun Toyoda (Nyan-Nyan-Orchestra...,2017-04-27 13:58:20+00:00,
6,Leave,http://bunkai-kei.com/release/bk-k_050/,2010年のBunkai-Keiからのリリースから5年の歳月を経てfladyによる2作目のリ...,2015-12-10 14:00:03+00:00,
7,Lucoq,http://bunkai-kei.com/release/bk-k_049/,,2015-03-19 13:30:55+00:00,
8,Dragoon,http://bunkai-kei.com/release/bk-k_048/,クワインゴースト。,2015-02-26 13:58:06+00:00,
9,dendel voile,http://bunkai-kei.com/release/bk-k_047/,SHARP personal workstation X68000 series BIOS ...,2014-12-18 13:58:26+00:00,


In [64]:
import feedparser
import xml.etree.ElementTree as ET
import pandas as pd

def parse_opml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    feeds = []
    for elem in root.iter('outline'):
        url = elem.get('xmlUrl')
        if url:
            feeds.append(url)
    return feeds

def rss_reader(feed_url):
    feed = feedparser.parse(feed_url)
    feed_entries = []

    for entry in feed.entries:
        feed_entries.append({
            'feed_title': feed.feed.title,
            'title': entry.title,
            'summary': entry.summary,
            'link': entry.link,
            'published': entry.published
        })

    return feed_entries

if __name__ == "__main__":
    opml_file = "../data/tyler-rss-subscriptions.opml"
    output_csv = "../output/opml_output.csv"
    
    feeds = parse_opml(opml_file)

    all_feed_entries = []

    for feed_url in feeds:
        try:
            all_feed_entries.extend(rss_reader(feed_url))
        except Exception as e:
            print(f"Error processing feed URL: {feed_url}. Error: {e}")

    df = pd.DataFrame(all_feed_entries)
    df.to_csv(output_csv, index=False)

Error processing feed URL: https://kill-the-newsletter.com/feeds/cfp02mmxp2rhdpfq.xml. Error: object has no attribute 'published'
Error processing feed URL: http://sssscomic.com/ssss-feed.xml. Error: object has no attribute 'published'
Error processing feed URL: https://xkcd.com/atom.xml. Error: object has no attribute 'published'
Error processing feed URL: https://kill-the-newsletter.com/feeds/m8q3pfxjfrtk8xhj.xml. Error: object has no attribute 'published'
Error processing feed URL: http://bactra.org/weblog/index.rss. Error: object has no attribute 'published'
Error processing feed URL: https://slatestarscratchpad.tumblr.com/rss. Error: object has no attribute 'summary'
Error processing feed URL: https://kill-the-newsletter.com/feeds/saai8csyfofi4n3z.xml. Error: object has no attribute 'published'
Error processing feed URL: https://kill-the-newsletter.com/feeds/s0xwmajqfzdcw4ys.xml. Error: object has no attribute 'published'
Error processing feed URL: https://kill-the-newsletter.com/