# Working with RSS Feeds Lab

Complete the following set of exercises to solidify your knowledge of parsing RSS feeds and extracting information from them.

In [9]:
import feedparser
import pandas as pd

### 1. Use feedparser to parse the following RSS feed URL.

In [3]:
# Address of the feed that every later cell in this lab works with.
url = "http://feeds.feedburner.com/oreilly/radar/atom"

In [4]:
# Parse the feed at `url`; the parsed result is what the following cells inspect.
oreilly = feedparser.parse(url)

### 2. Obtain a list of components (keys) that are available for this feed.

In [5]:
# Top-level components of the parsed result.
oreilly.keys()

dict_keys(['feed', 'entries', 'bozo', 'headers', 'etag', 'updated', 'updated_parsed', 'href', 'status', 'encoding', 'version', 'namespaces'])

### 3. Obtain a list of components (keys) that are available for the *feed* component of this RSS feed.

In [6]:
# Components available under the `feed` part of the parsed result.
oreilly.feed.keys()

dict_keys(['title', 'title_detail', 'id', 'guidislink', 'link', 'updated', 'updated_parsed', 'subtitle', 'subtitle_detail', 'links', 'authors', 'author_detail', 'author', 'feedburner_info', 'geo_lat', 'geo_long', 'feedburner_emailserviceid', 'feedburner_feedburnerhostname'])

### 4. Extract and print the feed title, subtitle, author, and link.

In [17]:
# Pick only the requested fields out of the feed component and show them
# as a field -> value mapping.
keys = ['title', 'subtitle', 'author', 'link']
dict2 = dict((field, oreilly.feed[field]) for field in keys)
dict2

{'title': "All - O'Reilly Media",
 'subtitle': 'All of our Ideas and Learning material from all of our topics.',
 'author': "O'Reilly Media",
 'link': 'https://www.oreilly.com'}

### 5. Count the number of entries that are contained in this RSS feed.

In [22]:

# Number of entries in the parsed feed.
print(len(oreilly.entries))

60


### 6. Obtain a list of components (keys) available for an entry.

*Hint: Remember to index into the list of entries before requesting the keys.*

In [24]:
# Keys of the first entry; `entries` is indexed by position, so select one entry first.
oreilly.entries[0].keys()

dict_keys(['title', 'title_detail', 'updated', 'updated_parsed', 'id', 'guidislink', 'link', 'content', 'summary', 'links', 'authors', 'author_detail', 'author', 'feedburner_origlink'])

### 7. Extract a list of entry titles.

In [30]:
# Load the feed entries into a DataFrame; later cells reuse `df`.
df = pd.DataFrame(oreilly.entries)

# Collect the titles as a plain Python list and print them.
entries = [title for title in df["title"]]
print(entries)

['Four short links: 27 August 2019', 'Four short links: 26 August 2019', 'How organizations are sharpening their skills to better understand and use AI', 'Four short links: 23 August 2019', 'Four short links: 22 August 2019', 'Four short links: 21 August 2019', 'Four short links: 20 August 2019', 'Four short links: 19 August 2019', 'Antitrust regulators are using the wrong tools to break up Big Tech', 'Labeling, transforming, and structuring training data sets for machine learning', 'Four short links: 15 August 2019', 'Four short links: 14 August 2019', 'Four short links: 13 August 2019', 'Four short links: 12 August 2019', 'Blockchain solutions in enterprise', 'Four short links: 9 August 2019', 'Got speech? These guidelines will help you get started building voice applications', 'Four short links: 8 August 2019', 'New live online training courses', 'Four short links: 7 August 2019', 'Four short links: 6 August 2019', 'Four short links: 5 August 2019', 'Four short links: 2 August 2019'

### 8. Calculate the percentage of "Four short links" entry titles.

In [42]:
# NOTE(review): Counter and re are not used in this cell; the imports are kept
# so the names stay available to anything that may rely on them.
from collections import Counter
import re

# Titles that belong to the recurring "Four short links" series.
clean_list = [title for title in entries if title.startswith('Four short')]

# Share of those titles as a percentage of all titles. When no entries were
# parsed, `entries` is empty, so report 0.0 instead of raising
# ZeroDivisionError.
percent = len(clean_list) / len(entries) * 100 if entries else 0.0

# Printed with a percent sign, then shown as the cell's value.
print(str(percent) + "%")
percent

58.333333333333336%


58.333333333333336

### 9. Create a Pandas data frame from the feed's entries.

In [None]:
import pandas as pd

# Build the frame for this exercise explicitly: one row per feed entry.
df = pd.DataFrame(oreilly.entries)

In [43]:
# Preview the first ten rows of the entries frame.
df.head(10)

Unnamed: 0,author,author_detail,authors,content,feedburner_origlink,guidislink,id,link,links,summary,title,title_detail,updated,updated_parsed
0,Nat Torkington,{'name': 'Nat Torkington'},[{'name': 'Nat Torkington'}],"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/four-short-links...,True,"tag:www.oreilly.com,2019-08-27:/ideas/four-sho...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,"<p><em>Personal Information, Research Data, Ma...",Four short links: 27 August 2019,"{'type': 'text/plain', 'language': None, 'base...",2019-08-27T11:10:00Z,"(2019, 8, 27, 11, 10, 0, 1, 239, 0)"
1,Nat Torkington,{'name': 'Nat Torkington'},[{'name': 'Nat Torkington'}],"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/four-short-links...,True,"tag:www.oreilly.com,2019-08-26:/ideas/four-sho...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,"<p><em>Avoiding Sexual Predators, YouTube Radi...",Four short links: 26 August 2019,"{'type': 'text/plain', 'language': None, 'base...",2019-08-26T11:00:00Z,"(2019, 8, 26, 11, 0, 0, 0, 238, 0)"
2,Ben Lorica,{'name': 'Ben Lorica'},[{'name': 'Ben Lorica'}],"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/how-organization...,True,"tag:www.oreilly.com,2019-08-26:/ideas/how-orga...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,<p><img src='https://d3ucjech6zwjp8.cloudfront...,How organizations are sharpening their skills ...,"{'type': 'text/plain', 'language': None, 'base...",2019-08-26T11:00:00Z,"(2019, 8, 26, 11, 0, 0, 0, 238, 0)"
3,Nat Torkington,{'name': 'Nat Torkington'},[{'name': 'Nat Torkington'}],"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/four-short-links...,True,"tag:www.oreilly.com,2019-08-23:/ideas/four-sho...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,"<p><em>Open Source Economics, Program Synthesi...",Four short links: 23 August 2019,"{'type': 'text/plain', 'language': None, 'base...",2019-08-23T08:00:00Z,"(2019, 8, 23, 8, 0, 0, 4, 235, 0)"
4,Nat Torkington,{'name': 'Nat Torkington'},[{'name': 'Nat Torkington'}],"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/four-short-links...,True,"tag:www.oreilly.com,2019-08-22:/ideas/four-sho...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,"<p><em>I Don't Know, Map Quirks, UI Toolkit, a...",Four short links: 22 August 2019,"{'type': 'text/plain', 'language': None, 'base...",2019-08-22T12:55:00Z,"(2019, 8, 22, 12, 55, 0, 3, 234, 0)"
5,Nat Torkington,{'name': 'Nat Torkington'},[{'name': 'Nat Torkington'}],"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/four-short-links...,True,"tag:www.oreilly.com,2019-08-21:/ideas/four-sho...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,"<p><em>Competition vs. Convenience, Super-Cont...",Four short links: 21 August 2019,"{'type': 'text/plain', 'language': None, 'base...",2019-08-21T11:40:00Z,"(2019, 8, 21, 11, 40, 0, 2, 233, 0)"
6,Nat Torkington,{'name': 'Nat Torkington'},[{'name': 'Nat Torkington'}],"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/four-short-links...,True,"tag:www.oreilly.com,2019-08-20:/ideas/four-sho...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,"<p><em>Content Moderation, Robust Learning, Ar...",Four short links: 20 August 2019,"{'type': 'text/plain', 'language': None, 'base...",2019-08-20T11:00:00Z,"(2019, 8, 20, 11, 0, 0, 1, 232, 0)"
7,Nat Torkington,{'name': 'Nat Torkington'},[{'name': 'Nat Torkington'}],"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/four-short-links...,True,"tag:www.oreilly.com,2019-08-19:/ideas/four-sho...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,"<p><em>Developer Tool, Deep Fakes, DNA Tests, ...",Four short links: 19 August 2019,"{'type': 'text/plain', 'language': None, 'base...",2019-08-19T11:20:00Z,"(2019, 8, 19, 11, 20, 0, 0, 231, 0)"
8,Tim O'Reilly,{'name': 'Tim O'Reilly'},[{'name': 'Tim O'Reilly'}],"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/antitrust-regula...,True,"tag:www.oreilly.com,2019-08-19:/ideas/antitrus...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,<p><img src='https://d3ucjech6zwjp8.cloudfront...,Antitrust regulators are using the wrong tools...,"{'type': 'text/plain', 'language': None, 'base...",2019-08-19T11:00:00Z,"(2019, 8, 19, 11, 0, 0, 0, 231, 0)"
9,Ben Lorica,{'name': 'Ben Lorica'},[{'name': 'Ben Lorica'}],"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/labeling-transfo...,True,"tag:www.oreilly.com,2019-08-15:/ideas/labeling...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,<p><img src='https://d3ucjech6zwjp8.cloudfront...,"Labeling, transforming, and structuring traini...","{'type': 'text/plain', 'language': None, 'base...",2019-08-15T11:30:00Z,"(2019, 8, 15, 11, 30, 0, 3, 227, 0)"


### 10. Count the number of entries per author and sort them in descending order.

In [45]:
# Group the entries by author and count the `id` values in each group.
by_author = df.groupby('author', as_index=False)
authors = by_author.agg({'id': 'count'})

# Give the count column a descriptive name.
authors.columns = ['author', 'entries']

# Show the most prolific authors first.
authors.sort_values(by='entries', ascending=False)

Unnamed: 0,author,entries
12,Nat Torkington,35
5,Ben Lorica,5
11,Mike Loukides,2
0,Adam Jacob,1
10,"Michael Bradley, David Gorman, Matt Lucas, Mat...",1
17,Tim O'Reilly,1
16,Tiffani Bell,1
15,Roger Magoulas,1
14,Pete Skomoroch,1
13,"Pedro Cruz, Brad Topol",1


### 11. Add a new column to the data frame that contains the length (number of characters) of each entry title. Return a data frame that contains the title, author, and title length of each entry in descending order (longest title length at the top).

In [63]:
# Character count of every title, stored as a new `length` column on `df`.
title_lengths = df["title"].str.len()
df["length"] = title_lengths

# Keep only the columns asked for and show the longest titles first.
new_df = df[["title", "author", "length"]]
new_df.sort_values(by="length", ascending=False)

Unnamed: 0,title,author,length
16,Got speech? These guidelines will help you get...,"Ben Lorica, Yishay Carmiel",82
53,Managing machine learning in the enterprise: L...,"Ben Lorica, Harish Doddi, David Talby",81
9,"Labeling, transforming, and structuring traini...",Ben Lorica,79
48,Highlights from the O'Reilly Open Source Softw...,Mac Slocum,77
2,How organizations are sharpening their skills ...,Ben Lorica,77
40,O’Reilly Radar: Open source technology trends—...,Roger Magoulas,68
8,Antitrust regulators are using the wrong tools...,Tim O'Reilly,67
50,Built to last: Building and growing open sourc...,Kay Williams,59
32,One simple graphic: Researchers love PyTorch a...,Ben Lorica,59
49,The role of open source in mitigating natural ...,"Pedro Cruz, Brad Topol",55


### 12. Create a list of entry titles whose summary includes the phrase "machine learning".

In [79]:
# Flag entries whose summary contains the phrase. Matching is case-sensitive.
# regex=False treats the phrase as plain text rather than a pattern, and
# na=False counts a missing summary as "no match", so the mask holds only
# True/False and the row selection below cannot fail on NaN.
has_phrase = df['summary'].str.contains("machine learning", regex=False, na=False)

new_new_df = df[has_phrase]
list(new_new_df["title"])

['How organizations are sharpening their skills to better understand and use AI',
 'Labeling, transforming, and structuring training data sets for machine learning',
 'Four short links: 15 August 2019',
 'Got speech? These guidelines will help you get started building voice applications',
 'New live online training courses',
 'Four short links: 5 August 2019',
 'Learning from adversaries',
 'One simple graphic: Researchers love PyTorch and TensorFlow',
 'Acquiring and sharing high-quality data',
 "Highlights from the O'Reilly Open Source Software Conference in Portland 2019",
 'Managing machine learning in the enterprise: Lessons from banking and health care']