# Working with RSS Feeds Lab

Complete the following set of exercises to solidify your knowledge of parsing RSS feeds and extracting information from them.

In [40]:
import feedparser
import re
import pandas as pd

### 1. Use feedparser to parse the following RSS feed URL.

In [3]:
url = 'http://feeds.feedburner.com/oreilly/radar/atom'

In [None]:
#feed dot autor is not working

In [4]:
feeds = feedparser.parse(url)

In [None]:
#feeds

### 2. Obtain a list of components (keys) that are available for this feed.

In [7]:
feeds.keys()

dict_keys(['feed', 'entries', 'bozo', 'headers', 'etag', 'updated', 'updated_parsed', 'href', 'status', 'encoding', 'version', 'namespaces'])

### 3. Obtain a list of components (keys) that are available for the *feed* component of this RSS feed.

In [8]:
feeds.feed.keys()

dict_keys(['title', 'title_detail', 'links', 'link', 'subtitle', 'subtitle_detail', 'updated', 'updated_parsed', 'language', 'sy_updateperiod', 'sy_updatefrequency', 'generator_detail', 'generator', 'feedburner_info', 'geo_lat', 'geo_long', 'feedburner_emailserviceid', 'feedburner_feedburnerhostname'])

### 4. Extract and print the feed title, subtitle, author, and link.

### 5. Count the number of entries that are contained in this RSS feed.

In [20]:
n_entries = len(feeds.entries)
n_entries

18

### 6. Obtain a list of components (keys) available for an entry.

*Hint: Remember to index first before requesting the keys*

In [23]:
feeds.entries[0].keys()

dict_keys(['title', 'title_detail', 'links', 'link', 'comments', 'published', 'published_parsed', 'authors', 'author', 'author_detail', 'tags', 'id', 'guidislink', 'summary', 'summary_detail', 'content', 'wfw_commentrss', 'slash_comments', 'feedburner_origlink'])

### 7. Extract a list of entry titles.

In [26]:
feeds.entries[0].keys()

{'type': 'text/plain',
 'language': None,
 'base': 'http://feeds.feedburner.com/oreilly/radar/atom',
 'value': 'It’s important to cultivate your organization’s collective genius'}

In [27]:
titles = [feeds.entries[i].title for i in range(len(feeds.entries))]
print(titles)

['It’s important to cultivate your organization’s collective genius', 'Four short links: 5 November 2019', 'Four short links: 4 November 2019', 'Quantum computing’s potential is still far off, but quantum supremacy shows we’re on the right track', 'Four short links: 1 November 2019', 'Highlights from TensorFlow World in Santa Clara, California 2019', 'Sticker recommendations and AI-driven innovations on the Hike messaging platform', '“Human error”: How can we help people build models that do what they expect', 'Personalization of Spotify Home and TensorFlow', 'TensorFlow.js: Bringing machine learning to JavaScript', 'TFX: An end-to-end ML platform for everyone', 'MLIR: Accelerating AI', 'TensorFlow Hub: The platform to share and discover pretrained models for TensorFlow', 'TensorFlow Lite: ML for mobile and IoT devices', 'Four short links: 31 October 2019', 'Accelerating ML at Twitter', 'The latest from TensorFlow', 'TensorFlow World 2019 opening keynote']


### 8. Calculate the percentage of "Four short links" entry titles.

In [71]:
n_str = data[data['title'].str.contains("Four short links")]

In [75]:
n_n_str = len(n_str.index)

In [76]:
percentage = n_n_str / n_entries
percentage

0.2222222222222222

### 9. Create a Pandas data frame from the feed's entries.

In [48]:
data = pd.DataFrame(feeds.entries)
data.head(2)

Unnamed: 0,title,title_detail,links,link,comments,published,published_parsed,authors,author,author_detail,tags,id,guidislink,summary,summary_detail,content,wfw_commentrss,slash_comments,feedburner_origlink
0,It’s important to cultivate your organization’...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/its-important-to...,"Tue, 05 Nov 2019 05:05:36 +0000","(2019, 11, 5, 5, 5, 36, 1, 309, 0)",[{'name': 'Jenn Webb'}],Jenn Webb,{'name': 'Jenn Webb'},"[{'term': 'Future of the Firm', 'scheme': None...",https://www.oreilly.com/radar/?p=10231,False,In this interview from O&#8217;Reilly Foo Camp...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/its-important-to...,0,https://www.oreilly.com/radar/its-important-to...
1,Four short links: 5 November 2019,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Tue, 05 Nov 2019 05:01:13 +0000","(2019, 11, 5, 5, 1, 13, 1, 309, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,{'name': 'Nat Torkington'},"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=10644,False,&#8220;Nearly All&#8221; Counter-Strike Microt...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,0,https://www.oreilly.com/radar/four-short-links...


### 10. Count the number of entries per author and sort them in descending order.

In [54]:
df_author = data.groupby('author', as_index=False).agg({"title": "count"})
df_author.sort_values('title', ascending=False)

Unnamed: 0,author,title
11,Nat Torkington,4
0,Ankur Narang,1
1,Anna Roth,1
2,Chris Lattner and Tatiana Shpeisman,1
3,Jared Duke and Sarah Sirajuddin,1
4,Jeff Dean,1
5,Jenn Webb,1
6,Konstantinos Katsiapis and Anusha Ramesh,1
7,Mac Slocum,1
8,Megan Kacholia,1


### 11. Add a new column to the data frame that contains the length (number of characters) of each entry title. Return a data frame that contains the title, author, and title length of each entry in descending order (longest title length at the top).

In [64]:
data["len_str"]= data["title"].str.len() 
data.head(2)

Unnamed: 0,title,title_detail,links,link,comments,published,published_parsed,authors,author,author_detail,tags,id,guidislink,summary,summary_detail,content,wfw_commentrss,slash_comments,feedburner_origlink,len_str
0,It’s important to cultivate your organization’...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/its-important-to...,"Tue, 05 Nov 2019 05:05:36 +0000","(2019, 11, 5, 5, 5, 36, 1, 309, 0)",[{'name': 'Jenn Webb'}],Jenn Webb,{'name': 'Jenn Webb'},"[{'term': 'Future of the Firm', 'scheme': None...",https://www.oreilly.com/radar/?p=10231,False,In this interview from O&#8217;Reilly Foo Camp...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/its-important-to...,0,https://www.oreilly.com/radar/its-important-to...,65
1,Four short links: 5 November 2019,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Tue, 05 Nov 2019 05:01:13 +0000","(2019, 11, 5, 5, 1, 13, 1, 309, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,{'name': 'Nat Torkington'},"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=10644,False,&#8220;Nearly All&#8221; Counter-Strike Microt...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,0,https://www.oreilly.com/radar/four-short-links...,33


### 12. Create a list of entry titles whose summary includes the phrase "machine learning."

In [68]:
data[data['summary'].str.contains("machine learning")]

Unnamed: 0,title,title_detail,links,link,comments,published,published_parsed,authors,author,author_detail,tags,id,guidislink,summary,summary_detail,content,wfw_commentrss,slash_comments,feedburner_origlink,len_str
5,Highlights from TensorFlow World in Santa Clar...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/highlights-from-...,"Fri, 01 Nov 2019 00:30:39 +0000","(2019, 11, 1, 0, 30, 39, 4, 305, 0)",[{'name': 'Mac Slocum'}],Mac Slocum,{'name': 'Mac Slocum'},"[{'term': 'AI & ML', 'scheme': None, 'label': ...",https://www.oreilly.com/radar/?p=10168,False,People from across the TensorFlow community ca...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/highlights-from-...,0,https://www.oreilly.com/radar/highlights-from-...,64
