# Working with RSS Feeds Lab

Complete the following set of exercises to solidify your knowledge of parsing RSS feeds and extracting information from them.

In [1]:
import feedparser

### 1. Use feedparser to parse the following RSS feed URL.

In [2]:
# Address of the O'Reilly Radar Atom feed (served via FeedBurner); parsed in the next cell.
url = 'http://feeds.feedburner.com/oreilly/radar/atom'

In [3]:
# Parse the feed at `url`; later cells read the feed metadata and entries from this result.
feedburner = feedparser.parse(url)

### 2. Obtain a list of components (keys) that are available for this feed.

In [4]:
# Top-level components of the parsed result (for example 'feed' and 'entries').
feedburner.keys()

dict_keys(['feed', 'entries', 'bozo', 'headers', 'etag', 'updated', 'updated_parsed', 'href', 'status', 'encoding', 'version', 'namespaces'])

### 3. Obtain a list of components (keys) that are available for the *feed* component of this RSS feed.

In [5]:
# Keys of the feed-level metadata, as opposed to the per-entry keys.
feedburner.feed.keys()

dict_keys(['title', 'title_detail', 'links', 'link', 'subtitle', 'subtitle_detail', 'updated', 'updated_parsed', 'language', 'sy_updateperiod', 'sy_updatefrequency', 'generator_detail', 'generator', 'feedburner_info', 'geo_lat', 'geo_long', 'feedburner_emailserviceid', 'feedburner_feedburnerhostname'])

### 4. Extract and print the feed title, subtitle, author, and link.

In [6]:
# Prefer a feed-level author when the feed provides one; otherwise fall back
# to the author of the first entry. Using .get() and checking that entries
# exist avoids an IndexError/AttributeError on an empty or author-less feed.
first_entry = feedburner.entries[0] if feedburner.entries else {}
feed_author = feedburner.feed.get('author') or first_entry.get('author')

extract = {'title': feedburner.feed.title,
           'subtitle': feedburner.feed.subtitle,
           'author': feed_author,
           'link': feedburner.feed.link}

extract

{'title': 'Radar',
 'subtitle': 'Now, next, and beyond: Tracking need-to-know trends at the intersection of business and technology',
 'author': 'Nat Torkington',
 'link': 'https://www.oreilly.com/radar'}

### 5. Count the number of entries that are contained in this RSS feed.

In [7]:
# Number of entries carried by the feed.
len(feedburner.entries)

60

### 6. Obtain a list of components (keys) available for an entry.

*Hint: `entries` is a list, so index into it first (for example, `entries[0]`) before requesting the keys.*

In [8]:
# `entries` is indexed first, so this lists the fields of a single entry (the first one).
feedburner.entries[0].keys()

dict_keys(['title', 'title_detail', 'links', 'link', 'comments', 'published', 'published_parsed', 'authors', 'author', 'author_detail', 'tags', 'id', 'guidislink', 'summary', 'summary_detail', 'content', 'wfw_commentrss', 'slash_comments', 'feedburner_origlink'])

### 7. Extract a list of entry titles.

In [9]:
# Collect the title of every entry, in feed order.
titles = [entry.title for entry in feedburner.entries]
titles

['Four short links: 11 February 2020',
 'Four short links: 10 February 2020',
 'Four short links: 7 February 2020',
 'Radar trends to watch: February 2020',
 'Four short links: 6 February 2020',
 'Four short links: 5 February 2020',
 'Four short links: 4 February 2020',
 'AI meets operations',
 'Four short links: 3 February 2020',
 'Four short links: 31 January 2020',
 'Four short links: 30 January 2020',
 'Four short links: 29 January 2020',
 'Four short links: 28 January 2020',
 'Four short links: 27 January 2020',
 'Four short links: 24 January 2020',
 'Four short links: 23 January 2020',
 'Four short links: 22 January 2020',
 'Four short links: 21 January 2020',
 'Four short links: 20 January 2020',
 'Four short links: 17 January 2020',
 'Four short links: 16 January 2020',
 'Reinforcement learning for the real world',
 'Four short links: 15 January 2020',
 'Four short links: 14 January 2020',
 'Where programming languages are headed in 2020',
 'Four short links: 13 January 2020',


### 8. Calculate the percentage of "Four short links" entry titles.

In [10]:
# Share of titles that begin with "Four short links", expressed as a
# fraction between 0 and 1 (multiply by 100 for a percent figure).
counter = sum(1 for title in titles if title.startswith('Four short links'))

# Guard against an empty title list, which would otherwise raise ZeroDivisionError.
percentage = counter / len(titles) if titles else 0.0
round(percentage, 3)

0.783

### 9. Create a Pandas data frame from the feed's entries.

In [11]:
import pandas as pd

In [15]:
# Build a frame with one row per feed entry; columns come from the entry keys.
data = pd.DataFrame(feedburner.entries)
# Summarise column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 19 columns):
title                  60 non-null object
title_detail           60 non-null object
links                  60 non-null object
link                   60 non-null object
comments               60 non-null object
published              60 non-null object
published_parsed       60 non-null object
authors                60 non-null object
author                 60 non-null object
author_detail          58 non-null object
tags                   60 non-null object
id                     60 non-null object
guidislink             60 non-null bool
summary                60 non-null object
summary_detail         60 non-null object
content                60 non-null object
wfw_commentrss         60 non-null object
slash_comments         60 non-null object
feedburner_origlink    60 non-null object
dtypes: bool(1), object(18)
memory usage: 8.6+ KB


### 10. Count the number of entries per author and sort them in descending order.

In [23]:
# Tally entries per author: group on the author name, count titles, then
# relabel the two resulting columns for display.
authors = (
    data.groupby('author', as_index=False)['title']
    .count()
    .rename(columns={'author': 'authors', 'title': 'entries'})
)
# Authors with the most entries first.
authors.sort_values(by='entries', ascending=False)

Unnamed: 0,authors,entries
5,Nat Torkington,47
3,Mike Loukides,4
0,,2
2,Jenn Webb,2
1,Alison McCauley,1
4,Mike Loukides and Ben Lorica,1
6,Patrick Hall and Andrew Burt,1
7,Roger Magoulas,1
8,Zan McQuade and Amanda Quinn,1


### 11. Add a new column to the data frame that contains the length (number of characters) of each entry title. Return a data frame that contains the title, author, and title length of each entry in descending order (longest title length at the top).

In [25]:
# Record each title's character count in a new column. The column name keeps
# the existing 'title_lenght' spelling so any other reference to it still resolves.
data['title_lenght'] = data['title'].map(len)

# Show the five longest titles alongside their authors.
longest_first = data.sort_values('title_lenght', ascending=False)
longest_first[['title', 'author', 'title_lenght']].head(5)

Unnamed: 0,title,author,title_lenght
51,5 industries that demonstrate how blockchains ...,Alison McCauley,63
54,Why you should care about debugging machine le...,Patrick Hall and Andrew Burt,59
24,Where programming languages are headed in 2020,Zan McQuade and Amanda Quinn,46
48,AI is computer science disguised as hard work,Jenn Webb,45
21,Reinforcement learning for the real world,Jenn Webb,41


### 12. Create a list of entry titles whose summary includes the phrase "machine learning."

In [27]:
# Titles of the entries whose summary mentions "machine learning"
# (case-sensitive match).
# regex=False treats the phrase as literal text rather than a pattern, and
# na=False makes entries without a summary count as non-matching instead of
# producing a NaN mask that cannot be used for filtering.
# NOTE: this rebinds `titles`, which an earlier cell uses for the full title list.
mentions_ml = data['summary'].str.contains('machine learning', regex=False, na=False)
titles = data.loc[mentions_ml, 'title'].tolist()
titles

['Four short links: 28 January 2020',
 'Four short links: 13 January 2020',
 'Why you should care about debugging machine learning models',
 'The road to Software 2.0']