# Working with RSS Feeds Lab

Complete the following set of exercises to solidify your knowledge of parsing RSS feeds and extracting information from them.

In [1]:
import feedparser
import pandas as pd
import re
from collections import Counter

### 1. Use feedparser to parse the following RSS feed URL.

In [2]:
url = 'http://feeds.feedburner.com/oreilly/radar/atom'

In [3]:
feeds = feedparser.parse(url) 

### 2. Obtain a list of components (keys) that are available for this feed.

In [4]:
feeds

{'feed': {'title': 'Radar',
  'title_detail': {'type': 'text/plain',
   'language': None,
   'base': 'http://feeds.feedburner.com/oreilly/radar/atom',
   'value': 'Radar'},
  'links': [{'rel': 'alternate',
    'type': 'text/html',
    'href': 'https://www.oreilly.com/radar'},
   {'rel': 'self',
    'type': 'application/rss+xml',
    'href': 'http://feeds.feedburner.com/oreilly/radar/atom'},
   {'rel': 'hub',
    'href': 'http://pubsubhubbub.appspot.com/',
    'type': 'text/html'}],
  'link': 'https://www.oreilly.com/radar',
  'subtitle': 'Now, next, and beyond: Tracking need-to-know trends at the intersection of business and technology',
  'subtitle_detail': {'type': 'text/html',
   'language': None,
   'base': 'http://feeds.feedburner.com/oreilly/radar/atom',
   'value': 'Now, next, and beyond: Tracking need-to-know trends at the intersection of business and technology'},
  'updated': 'Tue, 05 Nov 2019 12:28:35 +0000',
  'updated_parsed': time.struct_time(tm_year=2019, tm_mon=11, tm_m

### 3. Obtain a list of components (keys) that are available for the *feed* component of this RSS feed.

In [5]:
feeds.keys()

dict_keys(['feed', 'entries', 'bozo', 'headers', 'etag', 'updated', 'updated_parsed', 'href', 'status', 'encoding', 'version', 'namespaces'])

### 4. Extract and print the feed title, subtitle, author, and link.

In [6]:
feeds.feed.keys()

dict_keys(['title', 'title_detail', 'links', 'link', 'subtitle', 'subtitle_detail', 'updated', 'updated_parsed', 'language', 'sy_updateperiod', 'sy_updatefrequency', 'generator_detail', 'generator', 'feedburner_info', 'geo_lat', 'geo_long', 'feedburner_emailserviceid', 'feedburner_feedburnerhostname'])

In [7]:
feeds.feed.title

'Radar'

In [8]:
feeds.feed.subtitle

'Now, next, and beyond: Tracking need-to-know trends at the intersection of business and technology'

In [9]:
feeds.feed.link

'https://www.oreilly.com/radar'

In [10]:
feeds.entries[0].keys()

dict_keys(['title', 'title_detail', 'links', 'link', 'comments', 'published', 'published_parsed', 'authors', 'author', 'author_detail', 'tags', 'id', 'guidislink', 'summary', 'summary_detail', 'content', 'wfw_commentrss', 'slash_comments', 'feedburner_origlink'])

In [11]:
feeds.entries[0].author

'Nat Torkington'

### 5. Count the number of entries that are contained in this RSS feed.

In [12]:
df = pd.DataFrame(feeds.entries)
df.count()

author                 18
author_detail          18
authors                18
comments               18
content                18
feedburner_origlink    18
guidislink             18
id                     18
link                   18
links                  18
published              18
published_parsed       18
slash_comments         18
summary                18
summary_detail         18
tags                   18
title                  18
title_detail           18
wfw_commentrss         18
dtype: int64

### 6. Obtain a list of components (keys) available for an entry.

*Hint: Remember to index first before requesting the keys*

In [13]:
feeds.entries

[{'title': 'Four short links: 6 November 2019',
  'title_detail': {'type': 'text/plain',
   'language': None,
   'base': 'http://feeds.feedburner.com/oreilly/radar/atom',
   'value': 'Four short links: 6 November 2019'},
  'links': [{'rel': 'alternate',
    'type': 'text/html',
    'href': 'http://feedproxy.google.com/~r/oreilly/radar/atom/~3/HApLcqKbwf8/'}],
  'link': 'http://feedproxy.google.com/~r/oreilly/radar/atom/~3/HApLcqKbwf8/',
  'comments': 'https://www.oreilly.com/radar/four-short-links-6-november-2019/#respond',
  'published': 'Wed, 06 Nov 2019 05:01:31 +0000',
  'published_parsed': time.struct_time(tm_year=2019, tm_mon=11, tm_mday=6, tm_hour=5, tm_min=1, tm_sec=31, tm_wday=2, tm_yday=310, tm_isdst=0),
  'authors': [{'name': 'Nat Torkington'}],
  'author': 'Nat Torkington',
  'author_detail': {'name': 'Nat Torkington'},
  'tags': [{'term': 'Four Short Links', 'scheme': None, 'label': None},
   {'term': 'Signals', 'scheme': None, 'label': None}],
  'id': 'https://www.oreilly

### 7. Extract a list of entry titles.

In [14]:
feeds.entries[0].keys()

dict_keys(['title', 'title_detail', 'links', 'link', 'comments', 'published', 'published_parsed', 'authors', 'author', 'author_detail', 'tags', 'id', 'guidislink', 'summary', 'summary_detail', 'content', 'wfw_commentrss', 'slash_comments', 'feedburner_origlink'])

### 8. Calculate the percentage of "Four short links" entry titles.

In [15]:
titles = [feeds.entries[i].title for i in range(len(feeds.entries))]

In [16]:
all_titles = len(titles)
links = len(list(filter(lambda x: x.startswith("Four short links"), titles)))
perc = round(links*100 / all_titles, 0)

print("The occurrence of 'Four short links'",perc, "percent")

The occurrence of 'Four short links' 28.0 percent


### 9. Create a Pandas data frame from the feed's entries.

In [17]:
import pandas as pd

In [18]:
df = pd.DataFrame(feeds.entries)
df.head(5)

Unnamed: 0,author,author_detail,authors,comments,content,feedburner_origlink,guidislink,id,link,links,published,published_parsed,slash_comments,summary,summary_detail,tags,title,title_detail,wfw_commentrss
0,Nat Torkington,{'name': 'Nat Torkington'},[{'name': 'Nat Torkington'}],https://www.oreilly.com/radar/four-short-links...,"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,False,https://www.oreilly.com/radar/?p=10648,http://feedproxy.google.com/~r/oreilly/radar/a...,"[{'rel': 'alternate', 'type': 'text/html', 'hr...","Wed, 06 Nov 2019 05:01:31 +0000","(2019, 11, 6, 5, 1, 31, 2, 310, 0)",0,Things I Wish Someone Had Explained About Func...,"{'type': 'text/html', 'language': None, 'base'...","[{'term': 'Four Short Links', 'scheme': None, ...",Four short links: 6 November 2019,"{'type': 'text/plain', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...
1,Jenn Webb,{'name': 'Jenn Webb'},[{'name': 'Jenn Webb'}],https://www.oreilly.com/radar/its-important-to...,"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/its-important-to...,False,https://www.oreilly.com/radar/?p=10231,http://feedproxy.google.com/~r/oreilly/radar/a...,"[{'rel': 'alternate', 'type': 'text/html', 'hr...","Tue, 05 Nov 2019 05:05:36 +0000","(2019, 11, 5, 5, 5, 36, 1, 309, 0)",0,In this interview from O&#8217;Reilly Foo Camp...,"{'type': 'text/html', 'language': None, 'base'...","[{'term': 'Future of the Firm', 'scheme': None...",It’s important to cultivate your organization’...,"{'type': 'text/plain', 'language': None, 'base...",https://www.oreilly.com/radar/its-important-to...
2,Nat Torkington,{'name': 'Nat Torkington'},[{'name': 'Nat Torkington'}],https://www.oreilly.com/radar/four-short-links...,"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,False,https://www.oreilly.com/radar/?p=10644,http://feedproxy.google.com/~r/oreilly/radar/a...,"[{'rel': 'alternate', 'type': 'text/html', 'hr...","Tue, 05 Nov 2019 05:01:13 +0000","(2019, 11, 5, 5, 1, 13, 1, 309, 0)",0,&#8220;Nearly All&#8221; Counter-Strike Microt...,"{'type': 'text/html', 'language': None, 'base'...","[{'term': 'Four Short Links', 'scheme': None, ...",Four short links: 5 November 2019,"{'type': 'text/plain', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...
3,Nat Torkington,{'name': 'Nat Torkington'},[{'name': 'Nat Torkington'}],https://www.oreilly.com/radar/four-short-links...,"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,False,https://www.oreilly.com/radar/?p=10612,http://feedproxy.google.com/~r/oreilly/radar/a...,"[{'rel': 'alternate', 'type': 'text/html', 'hr...","Mon, 04 Nov 2019 05:01:01 +0000","(2019, 11, 4, 5, 1, 1, 0, 308, 0)",0,Beyond Bots and Trolls: Understanding Disinfor...,"{'type': 'text/html', 'language': None, 'base'...","[{'term': 'Four Short Links', 'scheme': None, ...",Four short links: 4 November 2019,"{'type': 'text/plain', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...
4,Mike Loukides,{'name': 'Mike Loukides'},[{'name': 'Mike Loukides'}],https://www.oreilly.com/radar/quantum-computin...,"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/quantum-computin...,False,https://www.oreilly.com/radar/?p=10154,http://feedproxy.google.com/~r/oreilly/radar/a...,"[{'rel': 'alternate', 'type': 'text/html', 'hr...","Fri, 01 Nov 2019 04:05:34 +0000","(2019, 11, 1, 4, 5, 34, 4, 305, 0)",0,One of the most exciting topics we’ve been fol...,"{'type': 'text/html', 'language': None, 'base'...","[{'term': 'Innovation & Disruption', 'scheme':...",Quantum computing’s potential is still far off...,"{'type': 'text/plain', 'language': None, 'base...",https://www.oreilly.com/radar/quantum-computin...


### 10. Count the number of entries per author and sort them in descending order.

In [19]:
authors = df.groupby('author', as_index=False).agg({'title':'count'})
authors.columns = ['author', 'entries']
authors.sort_values('entries', ascending=False)

Unnamed: 0,author,entries
10,Nat Torkington,5
0,Ankur Narang,1
1,Anna Roth,1
2,Chris Lattner and Tatiana Shpeisman,1
3,Jared Duke and Sarah Sirajuddin,1
4,Jenn Webb,1
5,Konstantinos Katsiapis and Anusha Ramesh,1
6,Mac Slocum,1
7,Megan Kacholia,1
8,Mike Liang,1


### 11. Add a new column to the data frame that contains the length (number of characters) of each entry title. Return a data frame that contains the title, author, and title length of each entry in descending order (longest title length at the top).

In [20]:
length = lambda x: len(x)

lenght_of_entry_title = list(map(length, df.title))
df['lenght_of_entry_title'] = lenght_of_entry_title

lenght_1 = df[['author', 'lenght_of_entry_title']].copy()
lenght_1.sort_values(['lenght_of_entry_title'], ascending=False)

Unnamed: 0,author,lenght_of_entry_title
4,Mike Loukides,100
13,Mike Liang,83
7,Ankur Narang,80
8,Anna Roth,75
1,Jenn Webb,65
6,Mac Slocum,64
10,Sandeep Gupta and Joseph Paul Cohen,54
9,Tony Jebara,46
14,Jared Duke and Sarah Sirajuddin,46
11,Konstantinos Katsiapis and Anusha Ramesh,43


### 12. Create a list of entry titles whose summary includes the phrase "machine learning."

In [39]:
#machine_learning = [feeds.entries[i].summary for i in range(len(feeds.entries))]

In [40]:
machine_learning_list = [i for i in df["title"] if "machine learning" in i]
machine_learning_list

['TensorFlow.js: Bringing machine learning to JavaScript']