In [1]:
import feedparser
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup 
import urllib

### URL of News website RSS

In [2]:
#yahoo news rss
rss_url = "https://news.yahoo.com/rss/mostviewed"

### functions used in this code

In [3]:
# fetch the rss feed and return the parsed RSS
def parseRSS(url):
    return feedparser.parse(url) 

# return list of news' info in specific area 
def getInfo(data, area):
    info = []
    for news in data['entries']:
        try:
            info.append(news[area])
        except:
            info.append("")
    return info

#parse Html in link
def parseHTML(url):
    url = urllib.request.urlopen(url)
    content = url.read()
    soup = BeautifulSoup(content, "html.parser")
    return soup

### parse XML

In [4]:
data = parseRSS(rss_url)

### create DataFrame

In [5]:
df = pd.DataFrame()
areas = ["id", "title", "published", "summary", "link"]

In [6]:
for area in areas:
    df[area] = getInfo(data, area)

In [7]:
#rename columns
df.columns = ["id", "title", "date", "description", "link"]

In [8]:
df.head()

Unnamed: 0,id,title,date,description,link
0,senator-elizabeth-warrens-net-worth-151126747....,Senator Elizabeth Warren&#39;s net worth revea...,"Tue, 24 Sep 2019 11:11:26 -0400","<p><a href=""https://news.yahoo.com/senator-eli...",https://news.yahoo.com/senator-elizabeth-warre...
1,bill-oreilly-says-book-trump-162056672.html,Bill O&#39;Reilly says new book on Trump &#39;...,"Tue, 24 Sep 2019 12:20:56 -0400","<p><a href=""https://news.yahoo.com/bill-oreill...",https://news.yahoo.com/bill-oreilly-says-book-...
2,israeli-pm-rival-meet-final-110738975.html,Israel&#39;s Netanyahu given chance to form ne...,"Wed, 25 Sep 2019 14:46:04 -0400","<p><a href=""https://news.yahoo.com/israeli-pm-...",https://news.yahoo.com/israeli-pm-rival-meet-f...
3,irans-strange-navy-small-fast-114700802.html,"Iran&#39;s Strange Navy of Small, Fast Boats I...","Tue, 24 Sep 2019 07:47:00 -0400","<p><a href=""https://news.yahoo.com/irans-stran...",https://news.yahoo.com/irans-strange-navy-smal...
4,rashida-tlaib-argues-pro-vaping-213301525.html,Rashida Tlaib argues with pro-vaping witness d...,"Tue, 24 Sep 2019 17:33:01 -0400","<p><a href=""https://news.yahoo.com/rashida-tla...",https://news.yahoo.com/rashida-tlaib-argues-pr...


### process "title" to get rid of weird chars

In [9]:
for index, row in df.iterrows():
    row['title'] = row['title'].replace('&#39;', '')

In [10]:
df.head()

Unnamed: 0,id,title,date,description,link
0,senator-elizabeth-warrens-net-worth-151126747....,Senator Elizabeth Warrens net worth revealed a...,"Tue, 24 Sep 2019 11:11:26 -0400","<p><a href=""https://news.yahoo.com/senator-eli...",https://news.yahoo.com/senator-elizabeth-warre...
1,bill-oreilly-says-book-trump-162056672.html,Bill OReilly says new book on Trump will bring...,"Tue, 24 Sep 2019 12:20:56 -0400","<p><a href=""https://news.yahoo.com/bill-oreill...",https://news.yahoo.com/bill-oreilly-says-book-...
2,israeli-pm-rival-meet-final-110738975.html,Israels Netanyahu given chance to form new gov...,"Wed, 25 Sep 2019 14:46:04 -0400","<p><a href=""https://news.yahoo.com/israeli-pm-...",https://news.yahoo.com/israeli-pm-rival-meet-f...
3,irans-strange-navy-small-fast-114700802.html,"Irans Strange Navy of Small, Fast Boats Is No ...","Tue, 24 Sep 2019 07:47:00 -0400","<p><a href=""https://news.yahoo.com/irans-stran...",https://news.yahoo.com/irans-strange-navy-smal...
4,rashida-tlaib-argues-pro-vaping-213301525.html,Rashida Tlaib argues with pro-vaping witness d...,"Tue, 24 Sep 2019 17:33:01 -0400","<p><a href=""https://news.yahoo.com/rashida-tla...",https://news.yahoo.com/rashida-tlaib-argues-pr...


### process "description" html format to get text

In [11]:
for index, row in df.iterrows():
    row['description'] = BeautifulSoup(row['description']).getText()

In [12]:
df.head()

Unnamed: 0,id,title,date,description,link
0,senator-elizabeth-warrens-net-worth-151126747....,Senator Elizabeth Warrens net worth revealed a...,"Tue, 24 Sep 2019 11:11:26 -0400",The 2020 President Election is heating up -- a...,https://news.yahoo.com/senator-elizabeth-warre...
1,bill-oreilly-says-book-trump-162056672.html,Bill OReilly says new book on Trump will bring...,"Tue, 24 Sep 2019 12:20:56 -0400",Former Fox News host Bill O'Reilly latest book...,https://news.yahoo.com/bill-oreilly-says-book-...
2,israeli-pm-rival-meet-final-110738975.html,Israels Netanyahu given chance to form new gov...,"Wed, 25 Sep 2019 14:46:04 -0400",Israel's president on Wednesday asked Prime Mi...,https://news.yahoo.com/israeli-pm-rival-meet-f...
3,irans-strange-navy-small-fast-114700802.html,"Irans Strange Navy of Small, Fast Boats Is No ...","Tue, 24 Sep 2019 07:47:00 -0400","A ""mad max"" threat?",https://news.yahoo.com/irans-strange-navy-smal...
4,rashida-tlaib-argues-pro-vaping-213301525.html,Rashida Tlaib argues with pro-vaping witness d...,"Tue, 24 Sep 2019 17:33:01 -0400","""Are you a conspiracy theorist?"" Rashida Tlaib...",https://news.yahoo.com/rashida-tlaib-argues-pr...


### extract news full text from "link"

In [13]:
df["full text"] = ""
for index, row in df.iterrows():
    try:
        soup = parseHTML(row["link"]) #get website content from link
        content = soup.findAll("p") #get content with <p>...</p>
        text = ' '.join(list(map(lambda x: x.text, content))) #get text in <p>...</p>
        text = text.replace(u'\xa0', u' ') #remove '\xa0' in text
        row["full text"] = text
    except:
        continue

In [14]:
df.head()

Unnamed: 0,id,title,date,description,link,full text
0,senator-elizabeth-warrens-net-worth-151126747....,Senator Elizabeth Warrens net worth revealed a...,"Tue, 24 Sep 2019 11:11:26 -0400",The 2020 President Election is heating up -- a...,https://news.yahoo.com/senator-elizabeth-warre...,"The 2020 presidential race is well underway, a..."
1,bill-oreilly-says-book-trump-162056672.html,Bill OReilly says new book on Trump will bring...,"Tue, 24 Sep 2019 12:20:56 -0400",Former Fox News host Bill O'Reilly latest book...,https://news.yahoo.com/bill-oreilly-says-book-...,Bill O'Reilly says his new book is the first t...
2,israeli-pm-rival-meet-final-110738975.html,Israels Netanyahu given chance to form new gov...,"Wed, 25 Sep 2019 14:46:04 -0400",Israel's president on Wednesday asked Prime Mi...,https://news.yahoo.com/israeli-pm-rival-meet-f...,JERUSALEM (AP) — Israel's president on Wednesd...
3,irans-strange-navy-small-fast-114700802.html,"Irans Strange Navy of Small, Fast Boats Is No ...","Tue, 24 Sep 2019 07:47:00 -0400","A ""mad max"" threat?",https://news.yahoo.com/irans-strange-navy-smal...,Key point: Iran's navy could still threaten Am...
4,rashida-tlaib-argues-pro-vaping-213301525.html,Rashida Tlaib argues with pro-vaping witness d...,"Tue, 24 Sep 2019 17:33:01 -0400","""Are you a conspiracy theorist?"" Rashida Tlaib...",https://news.yahoo.com/rashida-tlaib-argues-pr...,In a House Oversight Committee hearing on vapi...


### output dataframe to csv

In [15]:
df.to_csv("yahoo_news.csv", encoding='utf-8')