# Working with RSS Feeds Lab

Complete the following set of exercises to solidify your knowledge of parsing RSS feeds and extracting information from them.

In [1]:
import feedparser

import re
import requests
import xml.etree.ElementTree as ET

import pandas as pd
from bs4 import BeautifulSoup

import random as rd

### 1. Use feedparser to parse the following RSS feed URL.

In [2]:
url = 'http://feeds.feedburner.com/oreilly/radar/atom'

In [3]:
url_parsed = feedparser.parse(url)

### 2. Obtain a list of components (keys) that are available for this feed.

In [4]:
component_keys = list(url_parsed.keys())
component_keys

['bozo',
 'entries',
 'feed',
 'headers',
 'etag',
 'updated',
 'updated_parsed',
 'href',
 'status',
 'encoding',
 'version',
 'namespaces']

### 3. Obtain a list of components (keys) that are available for the *feed* component of this RSS feed.

In [5]:
component_keys_feed = list(url_parsed['feed'].keys())
component_keys_feed

['title',
 'title_detail',
 'links',
 'link',
 'subtitle',
 'subtitle_detail',
 'updated',
 'updated_parsed',
 'language',
 'sy_updateperiod',
 'sy_updatefrequency',
 'generator_detail',
 'generator',
 'feedburner_info',
 'geo_lat',
 'geo_long',
 'feedburner_emailserviceid',
 'feedburner_feedburnerhostname']

### 4. Extract and print the feed title, subtitle, author, and link.

In [6]:
title = url_parsed['feed']['title']
subtitle = url_parsed['feed']['subtitle']
author = url_parsed['entries'][0]['author']
link = url_parsed['feed']['link']

print(title)
print(subtitle)
print(author)
print(link)

Radar
Now, next, and beyond: Tracking need-to-know trends at the intersection of business and technology
Mike Loukides
https://www.oreilly.com/radar


In [7]:
entries = list(url_parsed['entries'])
print(len(set(entries)))

60


In [8]:
entries[1]['title']

'Communal Computing’s Many Problems'

### 6. Obtain a list of components (keys) available for an entry.

*Hint: Remember to index first before requesting the keys*

In [9]:
component_keys_entry = list(entries[rd.randint(0, 60)].keys())
component_keys_entry

['title',
 'title_detail',
 'links',
 'link',
 'comments',
 'published',
 'published_parsed',
 'authors',
 'author',
 'author_detail',
 'tags',
 'id',
 'guidislink',
 'summary',
 'summary_detail',
 'content',
 'wfw_commentrss',
 'slash_comments',
 'feedburner_origlink']

### 7. Extract a list of entry titles.

In [10]:
titles = [entries[entry]['title'] for entry in list(range(len(entries)))]
titles

['Radar trends to watch: August 2021',
 'Communal Computing’s Many Problems',
 'Thinking About Glue',
 'Radar trends to watch: July 2021',
 'Hand Labeling Considered Harmful',
 'Two economies. Two sets of rules.',
 'Communal Computing',
 'Code as Infrastructure',
 'Radar trends to watch: June 2021',
 'AI Powered Misinformation and Manipulation at Scale #GPT-3',
 'DeepCheapFakes',
 'Radar trends to watch: May 2021',
 'Checking Jeff Bezos’s Math',
 'AI Adoption in the Enterprise 2021',
 'NFTs: Owning Digital Art',
 'Radar trends to watch: April 2021',
 'InfoTribes, Reality Brokers',
 'The End of Silicon Valley as We Know It?',
 'The Next Generation of AI',
 'Radar trends to watch: March 2021',
 'Product Management for AI',
 '5 things on our data and AI radar for 2021',
 '5 infrastructure and operations trends to watch in 2021',
 'The Wrong Question',
 'Radar trends to watch: February 2021',
 'Where Programming, Ops, AI, and the Cloud are Headed in 2021',
 'Seven Legal Questions for Data 

### 8. Calculate the percentage of "Four short links" entry titles.

In [11]:
four_short_links = [i for i in [re.findall(r'^[Four short links].*', title) for title in titles] if i != []]
percentage_four_short_links = round(len(four_short_links)/len(titles), 2)*100
percentage_four_short_links

35.0

### 9. Create a Pandas data frame from the feed's entries.

In [12]:
import pandas as pd

In [13]:
entries_df = pd.DataFrame(url_parsed['entries'])
entries_df.head(5)

Unnamed: 0,title,title_detail,links,link,comments,published,published_parsed,authors,author,author_detail,tags,id,guidislink,summary,summary_detail,content,wfw_commentrss,slash_comments,feedburner_origlink
0,Radar trends to watch: August 2021,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/radar-trends-to-...,"Mon, 02 Aug 2021 14:27:43 +0000","(2021, 8, 2, 14, 27, 43, 0, 214, 0)",[{'name': 'Mike Loukides'}],Mike Loukides,{'name': 'Mike Loukides'},"[{'term': 'Radar Trends', 'scheme': None, 'lab...",https://www.oreilly.com/radar/?p=13892,False,Security continues to be in the news: most not...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/radar-trends-to-...,0,https://www.oreilly.com/radar/radar-trends-to-...
1,Communal Computing’s Many Problems,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/communal-computi...,"Tue, 20 Jul 2021 11:37:15 +0000","(2021, 7, 20, 11, 37, 15, 1, 201, 0)",[{'name': 'Chris Butler'}],Chris Butler,{'name': 'Chris Butler'},"[{'term': 'AI & ML', 'scheme': None, 'label': ...",https://www.oreilly.com/radar/?p=13876,False,"In the first article of this series, we discus...","{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/communal-computi...,0,https://www.oreilly.com/radar/communal-computi...
2,Thinking About Glue,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/thinking-about-g...,"Tue, 13 Jul 2021 13:28:28 +0000","(2021, 7, 13, 13, 28, 28, 1, 194, 0)",[{'name': 'Mike Loukides'}],Mike Loukides,{'name': 'Mike Loukides'},"[{'term': 'Software Engineering', 'scheme': No...",https://www.oreilly.com/radar/?p=13867,False,"In Glue: the Dark Matter of Software, Marcel W...","{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/thinking-about-g...,0,https://www.oreilly.com/radar/thinking-about-g...
3,Radar trends to watch: July 2021,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/radar-trends-to-...,"Tue, 06 Jul 2021 17:12:56 +0000","(2021, 7, 6, 17, 12, 56, 1, 187, 0)",[{'name': 'Mike Loukides'}],Mike Loukides,{'name': 'Mike Loukides'},"[{'term': 'Radar Trends', 'scheme': None, 'lab...",https://www.oreilly.com/radar/?p=13856,False,Certainly the biggest news of the past month h...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/radar-trends-to-...,0,https://www.oreilly.com/radar/radar-trends-to-...
4,Hand Labeling Considered Harmful,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/arguments-agains...,"Wed, 23 Jun 2021 12:34:40 +0000","(2021, 6, 23, 12, 34, 40, 2, 174, 0)",[{'name': 'Shayan Mohanty and Hugo Bowne-Ander...,Shayan Mohanty and Hugo Bowne-Anderson,{'name': 'Shayan Mohanty and Hugo Bowne-Anders...,"[{'term': 'Artificial Intelligence', 'scheme':...",https://www.oreilly.com/radar/?p=13825,False,We are traveling through the era of Software 2...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/arguments-agains...,0,https://www.oreilly.com/radar/arguments-agains...


### 10. Count the number of entries per author and sort them in descending order.

In [14]:
entries_per_author = entries_df[['author', 'title']]
count_entries_per_author = pd.DataFrame(entries_per_author.groupby('author').title.count()).sort_values(by='title', ascending=False)
count_entries_per_author

Unnamed: 0_level_0,title
author,Unnamed: 1_level_1
Mike Loukides,24
Nat Torkington,20
,4
Tim O’Reilly,3
Chris Butler,2
Hugo Bowne-Anderson,1
Justin Norman and Mike Loukides,1
Kevlin Henney,1
Nitesh Dhanjani,1
Patrick Hall and Ayoub Ouederni,1


### 11. Add a new column to the data frame that contains the length (number of characters) of each entry title. Return a data frame that contains the title, author, and title length of each entry in descending order (longest title length at the top).

In [15]:
title_length = entries_df[['title', 'author']]
title_length['n_char'] = entries_df['title'].str.len()
title_length.sort_values(by='n_char', ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  title_length['n_char'] = entries_df['title'].str.len()


Unnamed: 0,title,author,n_char
25,"Where Programming, Ops, AI, and the Cloud are ...",Mike Loukides,60
9,AI Powered Misinformation and Manipulation at ...,Nitesh Dhanjani,58
22,5 infrastructure and operations trends to watc...,,55
31,O’Reilly’s top 20 live online training courses...,,54
21,5 things on our data and AI radar for 2021,,42
26,Seven Legal Questions for Data Scientists,Patrick Hall and Ayoub Ouederni,41
17,The End of Silicon Valley as We Know It?,Tim O’Reilly,40
53,AI Product Management After Deployment,Justin Norman and Mike Loukides,38
46,Radar trends to watch: November 2020,Mike Loukides,36
24,Radar trends to watch: February 2021,Mike Loukides,36


### 12. Create a list of entry titles whose summary includes the phrase "machine learning."

In [16]:
summary = entries_df['summary'].tolist()
ml_summary = pd.DataFrame([re.findall(r'\.*machine\slearning.*', text, 
              re.IGNORECASE) for text in summary], columns=['summary']).dropna()
ml_summary

Unnamed: 0,summary
4,"machine learning models, rather than hard-code..."
9,Machine Learning (ML) model has been trained o...
15,"machine learning and AI, including a substanti..."
21,Machine Learning (ML) applications and the [&#...
26,"machine learning (ML), can create new value fo..."


In [17]:
ml_summary.merge(entries_df, left_on='summary', right_on='title')

Unnamed: 0,summary_x,title,title_detail,links,link,comments,published,published_parsed,authors,author,author_detail,tags,id,guidislink,summary_y,summary_detail,content,wfw_commentrss,slash_comments,feedburner_origlink
