# Working with RSS Feeds Lab

Complete the following set of exercises to solidify your knowledge of parsing RSS feeds and extracting information from them.

In [1]:
import feedparser

### 1. Use feedparser to parse the following RSS feed URL.

In [2]:
url = 'http://feeds.feedburner.com/oreilly/radar/atom'

In [6]:
data = feedparser.parse(url)
data

{'feed': {'title': "All - O'Reilly Media",
  'title_detail': {'type': 'text/plain',
   'language': None,
   'base': 'http://feeds.feedburner.com/oreilly/radar/atom',
   'value': "All - O'Reilly Media"},
  'id': 'https://www.oreilly.com',
  'guidislink': True,
  'link': 'https://www.oreilly.com',
  'updated': '2019-08-27T12:52:10Z',
  'updated_parsed': time.struct_time(tm_year=2019, tm_mon=8, tm_mday=27, tm_hour=12, tm_min=52, tm_sec=10, tm_wday=1, tm_yday=239, tm_isdst=0),
  'subtitle': 'All of our Ideas and Learning material from all of our topics.',
  'subtitle_detail': {'type': 'text/plain',
   'language': None,
   'base': 'http://feeds.feedburner.com/oreilly/radar/atom',
   'value': 'All of our Ideas and Learning material from all of our topics.'},
  'links': [{'href': 'https://www.oreilly.com',
    'rel': 'alternate',
    'type': 'text/html'},
   {'rel': 'self',
    'type': 'application/atom+xml',
    'href': 'http://feeds.feedburner.com/oreilly/radar/atom'},
   {'rel': 'hub',
   

### 2. Obtain a list of components (keys) that are available for this feed.

In [8]:
data.keys()

dict_keys(['feed', 'entries', 'bozo', 'headers', 'etag', 'updated', 'updated_parsed', 'href', 'status', 'encoding', 'version', 'namespaces'])

### 3. Obtain a list of components (keys) that are available for the *feed* component of this RSS feed.

In [12]:
data.feed.keys()

dict_keys(['title', 'title_detail', 'id', 'guidislink', 'link', 'updated', 'updated_parsed', 'subtitle', 'subtitle_detail', 'links', 'authors', 'author_detail', 'author', 'feedburner_info', 'geo_lat', 'geo_long', 'feedburner_emailserviceid', 'feedburner_feedburnerhostname'])

### 4. Extract and print the feed title, subtitle, author, and link.

In [78]:
print("feed title: ", data.feed.title)
print("subtitle: ", data.feed.subtitle)
print("author: ", data.feed.authors[0]['name'])
links = [data.feed.links[x]['href'] for x in range(len(data.feed.links))]
print("link: ", data.feed.link)
print("links: ", ", ".join(links))



feed title:  All - O'Reilly Media
subtitle:  All of our Ideas and Learning material from all of our topics.
author:  O'Reilly Media
link:  https://www.oreilly.com
links:  https://www.oreilly.com, http://feeds.feedburner.com/oreilly/radar/atom, http://pubsubhubbub.appspot.com/


### 5. Count the number of entries that are contained in this RSS feed.

In [80]:
len(data.entries)

60

### 6. Obtain a list of components (keys) available for an entry.

*Hint: Remember to index first before requesting the keys*

In [92]:
entrie_keys = data.entries[0].keys()
entrie_keys

dict_keys(['title', 'title_detail', 'updated', 'updated_parsed', 'id', 'guidislink', 'link', 'content', 'summary', 'links', 'authors', 'author_detail', 'author', 'feedburner_origlink'])

### 7. Extract a list of entry titles.

In [97]:
titles = [data.entries[x]['title'] for x in range(len(data.entries))]
titles

['Four short links: 27 August 2019',
 'Four short links: 26 August 2019',
 'How organizations are sharpening their skills to better understand and use AI',
 'Four short links: 23 August 2019',
 'Four short links: 22 August 2019',
 'Four short links: 21 August 2019',
 'Four short links: 20 August 2019',
 'Four short links: 19 August 2019',
 'Antitrust regulators are using the wrong tools to break up Big Tech',
 'Labeling, transforming, and structuring training data sets for machine learning',
 'Four short links: 15 August 2019',
 'Four short links: 14 August 2019',
 'Four short links: 13 August 2019',
 'Four short links: 12 August 2019',
 'Blockchain solutions in enterprise',
 'Four short links: 9 August 2019',
 'Got speech? These guidelines will help you get started building voice applications',
 'Four short links: 8 August 2019',
 'New live online training courses',
 'Four short links: 7 August 2019',
 'Four short links: 6 August 2019',
 'Four short links: 5 August 2019',
 'Four short

### 8. Calculate the percentage of "Four short links" entry titles.

In [104]:
title_entries = [x for x in titles if x.startswith('Four short links')]
num, all_num = len(title_entries), len(titles)
percent = (num*100) / all_num
percent

58.333333333333336

### 9. Create a Pandas data frame from the feed's entries.

In [105]:
import pandas as pd

In [108]:
data.keys()

dict_keys(['feed', 'entries', 'bozo', 'headers', 'etag', 'updated', 'updated_parsed', 'href', 'status', 'encoding', 'version', 'namespaces'])

In [117]:
df = pd.DataFrame(data['entries'])
df.head()

Unnamed: 0,author,author_detail,authors,content,feedburner_origlink,guidislink,id,link,links,summary,title,title_detail,updated,updated_parsed
0,Nat Torkington,{'name': 'Nat Torkington'},[{'name': 'Nat Torkington'}],"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/four-short-links...,True,"tag:www.oreilly.com,2019-08-27:/ideas/four-sho...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,"<p><em>Personal Information, Research Data, Ma...",Four short links: 27 August 2019,"{'type': 'text/plain', 'language': None, 'base...",2019-08-27T11:10:00Z,"(2019, 8, 27, 11, 10, 0, 1, 239, 0)"
1,Nat Torkington,{'name': 'Nat Torkington'},[{'name': 'Nat Torkington'}],"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/four-short-links...,True,"tag:www.oreilly.com,2019-08-26:/ideas/four-sho...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,"<p><em>Avoiding Sexual Predators, YouTube Radi...",Four short links: 26 August 2019,"{'type': 'text/plain', 'language': None, 'base...",2019-08-26T11:00:00Z,"(2019, 8, 26, 11, 0, 0, 0, 238, 0)"
2,Ben Lorica,{'name': 'Ben Lorica'},[{'name': 'Ben Lorica'}],"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/how-organization...,True,"tag:www.oreilly.com,2019-08-26:/ideas/how-orga...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,<p><img src='https://d3ucjech6zwjp8.cloudfront...,How organizations are sharpening their skills ...,"{'type': 'text/plain', 'language': None, 'base...",2019-08-26T11:00:00Z,"(2019, 8, 26, 11, 0, 0, 0, 238, 0)"
3,Nat Torkington,{'name': 'Nat Torkington'},[{'name': 'Nat Torkington'}],"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/four-short-links...,True,"tag:www.oreilly.com,2019-08-23:/ideas/four-sho...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,"<p><em>Open Source Economics, Program Synthesi...",Four short links: 23 August 2019,"{'type': 'text/plain', 'language': None, 'base...",2019-08-23T08:00:00Z,"(2019, 8, 23, 8, 0, 0, 4, 235, 0)"
4,Nat Torkington,{'name': 'Nat Torkington'},[{'name': 'Nat Torkington'}],"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/four-short-links...,True,"tag:www.oreilly.com,2019-08-22:/ideas/four-sho...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,"<p><em>I Don't Know, Map Quirks, UI Toolkit, a...",Four short links: 22 August 2019,"{'type': 'text/plain', 'language': None, 'base...",2019-08-22T12:55:00Z,"(2019, 8, 22, 12, 55, 0, 3, 234, 0)"


### 10. Count the number of entries per author and sort them in descending order.

In [119]:
len(df["author"].unique().tolist())

20

In [135]:
dups = df.pivot_table(columns=['author'], aggfunc='size').reset_index()
dups2 = pd.DataFrame(dups)
dups_desc = dups2.rename(columns = {0:'entries'})
dups_desc.sort_values(by = 'entries', ascending = False)

Unnamed: 0,author,entries
12,Nat Torkington,35
5,Ben Lorica,5
11,Mike Loukides,2
0,Adam Jacob,1
10,"Michael Bradley, David Gorman, Matt Lucas, Mat...",1
17,Tim O'Reilly,1
16,Tiffani Bell,1
15,Roger Magoulas,1
14,Pete Skomoroch,1
13,"Pedro Cruz, Brad Topol",1


### 11. Add a new column to the data frame that contains the length (number of characters) of each entry title. Return a data frame that contains the title, author, and title length of each entry in descending order (longest title length at the top).

In [176]:
df = pd.DataFrame(data['entries'])
df.head()
list(df['title'])

['Four short links: 27 August 2019',
 'Four short links: 26 August 2019',
 'How organizations are sharpening their skills to better understand and use AI',
 'Four short links: 23 August 2019',
 'Four short links: 22 August 2019',
 'Four short links: 21 August 2019',
 'Four short links: 20 August 2019',
 'Four short links: 19 August 2019',
 'Antitrust regulators are using the wrong tools to break up Big Tech',
 'Labeling, transforming, and structuring training data sets for machine learning',
 'Four short links: 15 August 2019',
 'Four short links: 14 August 2019',
 'Four short links: 13 August 2019',
 'Four short links: 12 August 2019',
 'Blockchain solutions in enterprise',
 'Four short links: 9 August 2019',
 'Got speech? These guidelines will help you get started building voice applications',
 'Four short links: 8 August 2019',
 'New live online training courses',
 'Four short links: 7 August 2019',
 'Four short links: 6 August 2019',
 'Four short links: 5 August 2019',
 'Four short

In [152]:
new_df = df[['author', 'title']]
new_df['title_length'] = new_df['title'].str.count('')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [175]:
new_df_length = new_df.sort_values(by = 'title_length', ascending = False)
new_df_length
#list(new_df_length['title'])

['Got speech? These guidelines will help you get started building voice applications',
 'Managing machine learning in the enterprise: Lessons from banking and health care',
 'Labeling, transforming, and structuring training data sets for machine learning',
 "Highlights from the O'Reilly Open Source Software Conference in Portland 2019",
 'How organizations are sharpening their skills to better understand and use AI',
 'O’Reilly Radar: Open source technology trends—What our users tell us',
 'Antitrust regulators are using the wrong tools to break up Big Tech',
 'Built to last: Building and growing open source communities',
 'One simple graphic: Researchers love PyTorch and TensorFlow',
 'The role of open source in mitigating natural disasters',
 "O'Reilly Open Source and Frank Willison Awards",
 'Taming chaos: Preparing for your next incident',
 'Acquiring and sharing high-quality data',
 'Ask not what Brands™ can do for you',
 'The war for the soul of open source',
 'Why Amazon cares a

### 12. Create a list of entry titles whose summary includes the phrase "machine learning."

In [174]:
machine_learning = new_df_length[new_df_length['title'].str.contains('machine learning')]
list(machine_learning['title'])

['Managing machine learning in the enterprise: Lessons from banking and health care',
 'Labeling, transforming, and structuring training data sets for machine learning']