<b> This notebook contains the sample code for a simple RSS Feed parser </b>

In [1]:
import feedparser
import os
import pandas as pd

from datetime import datetime

In [2]:
# set the feeds as global variables
ITUNES = "http://ax.itunes.apple.com/WebObjects/MZStoreServices.woa/ws/RSS/topMovies/xml"
NETFLIX = "http://dvd.netflix.com/Top100RSS"
RSS_LIST = [ITUNES, NETFLIX]

In [3]:
# set the dataframe columns as global variables
DF_COLUMNS = ['source', 'date', 'rank', 'title', 'link', 'summary']

In [10]:
# create a function to parse data and append to a dataframe
def top10_movies(rss, df):
    
    #parse the feed
    #In order to understand feed, need to read the api. 
    #Lots of info not included in notebook (what are the feed. methods? what do they call?)
    #See cells below for details!!!
    feed = feedparser.parse(rss)
    
    #check bozo to see if your feed is well formed
    if feed.bozo == 0:
        print("%s is a well-formed feed!" % feed.feed.title)
    else:
        print("%s has flipped the bozo bit. Potential errors ahead!" % feed.feed.title)
        
    #set the feed date to be the published data, if it exists. If not, use the current date
    feed_date = feed.feed.get('published', datetime.now().strftime('%Y-%m-%d'))
    
    #set a counter for our loop
    i = 0
    
    #for the first 10 movies in our feed, append the required information to the dataframe
    #dataframe columns were set in the DF_COLUMNS global var above (source, date, rank, title, link, summary)
    while i < 10:
        feed_items = pd.Series([feed.feed.title, feed_date, i+1, feed.entries[i].title, feed.entries[i].id, \
                              feed.entries[i].summary], DF_COLUMNS)
        df = df.append(feed_items, ignore_index = True)
        i+= 1
    
    #return the dataframe
    return df

if __name__ == "__main__":
    #create an empty dataframe
    top10_df = pd.DataFrame(columns = DF_COLUMNS)
    
    #run each feed through our top 10 function
    for item in RSS_LIST:
        top10_df = top10_movies(item, top10_df)
        
    #save the dataframe to a csv. if the csv exists, append to it
    if not os.path.isfile('top10.csv'):
        top10_df.to_csv('top10.csv', header = DF_COLUMNS, index = False)
    else:
        top10_df.to_csv('top10.csv', mode = 'a', header = False, index = False)

iTunes Store: Top Movies is a well-formed feed!
Netflix Top 100 is a well-formed feed!


<b> Lets explore the feedparser api methods! </b>

In [19]:
# What elements are available in the ITunes RSS Feed?
itunes_parse = feedparser.parse('http://ax.itunes.apple.com/WebObjects/MZStoreServices.woa/ws/RSS/topMovies/xml')
itunes_parse.feed

{'author': 'iTunes Store',
 'author_detail': {'href': 'http://www.apple.com/itunes/',
  'name': 'iTunes Store'},
 'authors': [{'href': 'http://www.apple.com/itunes/', 'name': 'iTunes Store'}],
 'guidislink': True,
 'href': 'http://www.apple.com/itunes/',
 'icon': 'http://itunes.apple.com/favicon.ico',
 'id': 'http://ax.itunes.apple.com/WebObjects/MZStoreServices.woa/ws/RSS/topMovies/xml',
 'language': 'en',
 'link': 'https://itunes.apple.com/WebObjects/MZStore.woa/wa/viewTop?cc=us&id=1&popId=15',
 'links': [{'href': 'https://itunes.apple.com/WebObjects/MZStore.woa/wa/viewTop?cc=us&id=1&popId=15',
   'rel': 'alternate',
   'type': 'text/html'},
  {'href': 'http://ax.itunes.apple.com/WebObjects/MZStoreServices.woa/ws/RSS/topMovies/xml',
   'rel': 'self',
   'type': 'application/atom+xml'}],
 'rights': 'Copyright 2008 Apple Inc.',
 'rights_detail': {'base': 'http://ax.itunes.apple.com/WebObjects/MZStoreServices.woa/ws/RSS/topMovies/xml',
  'language': 'en',
  'type': 'text/plain',
  'valu

In [20]:
# Is the feed a dict?
type(itunes_parse.feed)

feedparser.FeedParserDict

In [21]:
# What are the keys available (we can clearly see from the .feed printout, but this is for funzies)
itunes_parse.feed.keys()

dict_keys(['language', 'id', 'guidislink', 'link', 'title', 'title_detail', 'updated', 'updated_parsed', 'links', 'icon', 'authors', 'author_detail', 'href', 'author', 'rights', 'rights_detail'])

In [31]:
# What is the feed.bozo????
itunes_parse.feed.bozo

AttributeError: object has no attribute 'bozo'

In [32]:
# Feedparser docs indicate the bozo bit is a value assigned when the rss feed in not well-formated XML.
wellformed = feedparser.parse('http://feedparser.org/docs/examples/atom10.xml')
notwellformed = feedparser.parse('http://feedparser.org/tests/illformed/rss/aaa_illformed.xml')

print("A well formed rss will yield {} whilst a non-well formed rss yields the value {}. \
Bozo may also not be present, but this does not always indicate poor formating".format(wellformed.bozo-1, notwellformed.bozo))

A well formed rss will yield 0 whilst a non-well formed rss yields the value 1. Bozo may also not be present, but this does not always indicate poor formating


In [8]:
# Lets have a look at Netflix!
netflix_parse = feedparser.parse('http://dvd.netflix.com/Top100RSS')
netflix_parse.feed

{'cf_treatas': 'list',
 'language': 'en-us',
 'link': 'http://dvd.netflix.com',
 'links': [{'href': 'http://dvd.netflix.com',
   'rel': 'alternate',
   'type': 'text/html'},
  {'href': 'http://dvd.netflix.com/Top100RSS',
   'rel': 'self',
   'type': 'application/rss+xml'}],
 'subtitle': 'Top 100 Netflix movies, published every 2 weeks.',
 'subtitle_detail': {'base': 'http://dvd.netflix.com/Top100RSS',
  'language': None,
  'type': 'text/html',
  'value': 'Top 100 Netflix movies, published every 2 weeks.'},
 'title': 'Netflix Top 100',
 'title_detail': {'base': 'http://dvd.netflix.com/Top100RSS',
  'language': None,
  'type': 'text/plain',
  'value': 'Netflix Top 100'},
 'ttl': '20160'}

In [11]:
# The feed method returns a python dictionary!
isinstance(netflix_parse.feed, dict)

True

In [38]:
# The entries method returns a list of dicts. Each dict contains the metadata associated with each movie
netflix_parse.entries

[{'guidislink': False,
  'id': 'https://dvd.netflix.com/Movie/Sully/80103102',
  'link': 'https://dvd.netflix.com/Movie/Sully/80103102',
  'links': [{'href': 'https://dvd.netflix.com/Movie/Sully/80103102',
    'rel': 'alternate',
    'type': 'text/html'}],
  'summary': '<a href="https://dvd.netflix.com/Movie/Sully/80103102"><img src="//secure.netflix.com/us/boxshots/small/80103102.jpg"/></a><br>Viewers around the world were astonished in 2009 when airline pilot Chesley Sullenberger safely landed an Airbus 320 on the Hudson River after both engines were disabled. This fact-based drama illuminates Sullenberger\'s life and heroic achievement.',
  'summary_detail': {'base': 'http://dvd.netflix.com/Top100RSS',
   'language': None,
   'type': 'text/html',
   'value': '<a href="https://dvd.netflix.com/Movie/Sully/80103102"><img src="//secure.netflix.com/us/boxshots/small/80103102.jpg"/></a><br>Viewers around the world were astonished in 2009 when airline pilot Chesley Sullenberger safely land

In [13]:
# The entries method returns a python list! Each item within this list is a python dict. 
isinstance(netflix_parse.entries, list)

True

In [35]:
# We can search each entry in the list, and pull out the metadata using the feed keys as methods (title, link, summary, etc) 
netflix_parse.entries[0].title

'Sully'

In [37]:
netflix_parse.entries[0].summary

'<a href="https://dvd.netflix.com/Movie/Sully/80103102"><img src="//secure.netflix.com/us/boxshots/small/80103102.jpg"/></a><br>Viewers around the world were astonished in 2009 when airline pilot Chesley Sullenberger safely landed an Airbus 320 on the Hudson River after both engines were disabled. This fact-based drama illuminates Sullenberger\'s life and heroic achievement.'

In [39]:
netflix_parse.entries[0].links

[{'href': 'https://dvd.netflix.com/Movie/Sully/80103102',
  'rel': 'alternate',
  'type': 'text/html'}]

<b> Now that we have an idea of the structure of the feedparser.parse object, we can go back to the function top10_movies... </b>
<b> We can modify it to a more generic function, able to parse other RSS feeds. Then move on to the API notebook!! </b>

In [10]:
def topX_movies(rss, dfcolumns, entrylimit):
     '''This function extracts data from an rss feed and returns that data as a pandas dataframe object. / 
             
        rss (type: str) is a link to the rss stream to parse with feedparser 
        dfcolumns (type: list) contains the values to collect from each entry in the rss stream
        entrylimit (type: int) is the number of entries to collect from the rss stream
        
        '''
    
    #step1. pull rss feed
    rss_parse = feedparser.parse(rss)
    
    #step2. check rss feed integrity. tell us the result
    if 'bozo' in rss_parse.feed and rss_parse.bozo == 0:
        print('The RSS format is good!')
    else:
        print('The RSS format is not good. Be careful when using this data!') 
      
    #step3. create global counter to wrap code in a loop. we need to pull data for each entry.
    i = 0
       
    #step4. check if passed column values exist in the rss feed. create list with valid columns names. 
    valid_columns = [_ for _ in dfcolumns if _ in rss_parse.entries[0]]
    
    #step5. create dataframe with columns names present in the rss feed. dataframe values are placeholders.
    placeholder_dict = {valid_columns[_]:'not a real entry' for _ in range(len(valid_columns))}
    df = pd.DataFrame(placeholder_dict, columns = valid_columns, index = [0])
    
    
    #step6. add desired data to dataframe. loop through each entry until limit reached.
    while i < entrylimit:
        #create dict with rss feed entries by index 
        parsed_dict = rss_parse.entries[i]  
        
        #create series object with desired rss feed values. create list with values for each entry ordered by valid_columns. 
        parsed_items = pd.Series([parsed_dict[_] for _ in valid_columns], valid_columns)

        #append series to dataframe
        df = df.append(parsed_items, ignore_index = True)   
        
        #increment counter
        i+= 1
    
    #step7. drop first row from dataframe. this row contains placeholder values.
    df = df.drop([0])
    
    #step8. return dataframe
    return df

<b> The cells below were used to prototype features of the function above </b>

In [8]:
# We need an easy way to check for a key in a dict. Use the 'in' comparator to see if 'bozo' is present!
'bozo' in netflix_parse.feed

False

In [69]:
# To create a panda's dataframe, we need to create a dict. The keys will be the column names. 
opop = ['not','what','we','want']      #this list is in place of dfcolumns
opop2 = {opop[_]:'not a real entry' for _ in range(len(opop))}
opop2

{'not': 'not a real entry',
 'want': 'not a real entry',
 'we': 'not a real entry',
 'what': 'not a real entry'}

In [71]:
# To make a dataframe with one row containing all the column names we want, we need to provide an index value.
# Without an index, pandas will throw an exception. In this case, easy to define 0 as index.
fakedf = pd.DataFrame(data = opop2, columns = opop, index = [0])

In [72]:
# Test dataframe looks good. Move this code to topX_movies to create a placeholder dataframe!
fakedf

Unnamed: 0,not,what,we,want
0,not a real entry,not a real entry,not a real entry,not a real entry


In [17]:
# Can we do essentially this psuedocode in python? [rss_parse.entries[i].item for item in dfcolumns]
# rss_parse.entries is a dict! I can store it as a dict and then call the items in dfcolumns as keys to pull the values!!!
i = 2
testdict = netflix_parse.entries[i]
testdict

{'guidislink': False,
 'id': 'https://dvd.netflix.com/Movie/The-Accountant/80071227',
 'link': 'https://dvd.netflix.com/Movie/The-Accountant/80071227',
 'links': [{'href': 'https://dvd.netflix.com/Movie/The-Accountant/80071227',
   'rel': 'alternate',
   'type': 'text/html'}],
 'summary': '<a href="https://dvd.netflix.com/Movie/The-Accountant/80071227"><img src="//secure.netflix.com/us/boxshots/small/80071227.jpg"/></a><br>Offering two very different skills to his clients, a financial forensics expert and trained assassin goes to work for a tech mogul who\'s determined to eliminate those responsible for secretly manipulating the company\'s financial records.',
 'summary_detail': {'base': 'http://dvd.netflix.com/Top100RSS',
  'language': None,
  'type': 'text/html',
  'value': '<a href="https://dvd.netflix.com/Movie/The-Accountant/80071227"><img src="//secure.netflix.com/us/boxshots/small/80071227.jpg"/></a><br>Offering two very different skills to his clients, a financial forensics exp

In [32]:
# This should pull the values in the order of DF_COLUMNS into a new list, which we can pass to pd.Series!!!
# If any key is not present, this will throw error. Need to use only for valid keys!!!
newlist_onlyvalidcolumns_values = [testdict[_] for _ in DF_COLUMNS if _ in testdict]
newlist_onlyvalidcolumns_values

['The Accountant',
 'https://dvd.netflix.com/Movie/The-Accountant/80071227',
 '<a href="https://dvd.netflix.com/Movie/The-Accountant/80071227"><img src="//secure.netflix.com/us/boxshots/small/80071227.jpg"/></a><br>Offering two very different skills to his clients, a financial forensics expert and trained assassin goes to work for a tech mogul who\'s determined to eliminate those responsible for secretly manipulating the company\'s financial records.']

In [30]:
# Cells below show that only valid keys used to pull entries, and the output had length 3 as expected
newlist_forrsspares = [_ for _ in DF_COLUMNS if _ in testdict]

In [28]:
len(testdict)

8

In [29]:
len(DF_COLUMNS)

6

In [31]:
newlist_forrsspares

['title', 'link', 'summary']

In [33]:
len(newlist_onlyvalidcolumns_values)

3

In [34]:
len(newlist_forrsspares)

3

<b> The cells below were used to check the returned value of the function topX_movies </b>

In [11]:
final_Dfcheck = topX_movies(NETFLIX, DF_COLUMNS, 10)

The RSS format is not good. Be careful when using this data!


In [12]:
final_Dfcheck

Unnamed: 0,title,link,summary
1,Sully,https://dvd.netflix.com/Movie/Sully/80103102,"<a href=""https://dvd.netflix.com/Movie/Sully/8..."
2,Hacksaw Ridge,https://dvd.netflix.com/Movie/Hacksaw-Ridge/80...,"<a href=""https://dvd.netflix.com/Movie/Hacksaw..."
3,The Accountant,https://dvd.netflix.com/Movie/The-Accountant/8...,"<a href=""https://dvd.netflix.com/Movie/The-Acc..."
4,Jason Bourne,https://dvd.netflix.com/Movie/Jason-Bourne/800...,"<a href=""https://dvd.netflix.com/Movie/Jason-B..."
5,The Magnificent Seven,https://dvd.netflix.com/Movie/The-Magnificent-...,"<a href=""https://dvd.netflix.com/Movie/The-Mag..."
6,Game of Thrones,https://dvd.netflix.com/Movie/Game-of-Thrones/...,"<a href=""https://dvd.netflix.com/Movie/Game-of..."
7,Manchester by the Sea,https://dvd.netflix.com/Movie/Manchester-by-th...,"<a href=""https://dvd.netflix.com/Movie/Manches..."
8,Deepwater Horizon,https://dvd.netflix.com/Movie/Deepwater-Horizo...,"<a href=""https://dvd.netflix.com/Movie/Deepwat..."
9,Hell or High Water,https://dvd.netflix.com/Movie/Hell-or-High-Wat...,"<a href=""https://dvd.netflix.com/Movie/Hell-or..."
10,The Girl on the Train,https://dvd.netflix.com/Movie/The-Girl-on-the-...,"<a href=""https://dvd.netflix.com/Movie/The-Gir..."


In [13]:
final20_Dfcheck = topX_movies(NETFLIX, DF_COLUMNS, 20)
final20_Dfcheck

The RSS format is not good. Be careful when using this data!


Unnamed: 0,title,link,summary
1,Sully,https://dvd.netflix.com/Movie/Sully/80103102,"<a href=""https://dvd.netflix.com/Movie/Sully/8..."
2,Hacksaw Ridge,https://dvd.netflix.com/Movie/Hacksaw-Ridge/80...,"<a href=""https://dvd.netflix.com/Movie/Hacksaw..."
3,The Accountant,https://dvd.netflix.com/Movie/The-Accountant/8...,"<a href=""https://dvd.netflix.com/Movie/The-Acc..."
4,Jason Bourne,https://dvd.netflix.com/Movie/Jason-Bourne/800...,"<a href=""https://dvd.netflix.com/Movie/Jason-B..."
5,The Magnificent Seven,https://dvd.netflix.com/Movie/The-Magnificent-...,"<a href=""https://dvd.netflix.com/Movie/The-Mag..."
6,Game of Thrones,https://dvd.netflix.com/Movie/Game-of-Thrones/...,"<a href=""https://dvd.netflix.com/Movie/Game-of..."
7,Manchester by the Sea,https://dvd.netflix.com/Movie/Manchester-by-th...,"<a href=""https://dvd.netflix.com/Movie/Manches..."
8,Deepwater Horizon,https://dvd.netflix.com/Movie/Deepwater-Horizo...,"<a href=""https://dvd.netflix.com/Movie/Deepwat..."
9,Hell or High Water,https://dvd.netflix.com/Movie/Hell-or-High-Wat...,"<a href=""https://dvd.netflix.com/Movie/Hell-or..."
10,The Girl on the Train,https://dvd.netflix.com/Movie/The-Girl-on-the-...,"<a href=""https://dvd.netflix.com/Movie/The-Gir..."


In [14]:
final20_Dfcheck.dtypes

title      object
link       object
summary    object
dtype: object

In [25]:
# We can slice a row from a dataframe by indexing the row
final20_Dfcheck.loc[1][:]

title                                                  Sully
link            https://dvd.netflix.com/Movie/Sully/80103102
summary    <a href="https://dvd.netflix.com/Movie/Sully/8...
Name: 1, dtype: object

In [26]:
# We can extract a specific value with indexing by row and column
final20_Dfcheck.loc[1]['title']

'Sully'

In [29]:
# What output do we get for the summary?
final20_Dfcheck.loc[1]['summary']

'<a href="https://dvd.netflix.com/Movie/Sully/80103102"><img src="//secure.netflix.com/us/boxshots/small/80103102.jpg"/></a><br>Viewers around the world were astonished in 2009 when airline pilot Chesley Sullenberger safely landed an Airbus 320 on the Hudson River after both engines were disabled. This fact-based drama illuminates Sullenberger\'s life and heroic achievement.'

In [30]:
# What type of object is the summary?
type(final20_Dfcheck.loc[1]['summary'])

str

<b> We could parse the summary value using regular expressions from re module to extract only the text outside the html &lt;a&gt; href tag...</b>