In [1]:
import urllib
import urllib.request

import feedparser
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

### URL of the news website's RSS feed

In [2]:
# URL of the CNN top-stories RSS feed that the notebook downloads and parses
rss_url = "http://rss.cnn.com/rss/cnn_topstories.rss"

### functions used in this code

In [7]:
# fetch the rss feed and return the parsed RSS
def parseRSS(url):
    """Fetch the RSS feed at ``url`` and return feedparser's parsed result."""
    parsed_feed = feedparser.parse(url)
    return parsed_feed

# return a list holding the value of one field (area) for every news entry
def getInfo(data, area):
    """Collect one field from every entry of a parsed feed.

    Args:
        data: Mapping with an ``'entries'`` key holding the feed's entries
            (e.g. the object returned by ``parseRSS``).
        area: Name of the field to read from each entry, such as ``"title"``.

    Returns:
        list: One value per entry, in feed order. Entries that lack ``area``
        contribute ``""`` so the list always lines up with ``data['entries']``.
    """
    info = []
    for news in data['entries']:
        try:
            info.append(news[area])
        except KeyError:
            # Only a missing field is expected here; anything else is a real
            # error and should surface instead of silently becoming "".
            info.append("")
    return info

# fetch the page behind a link and parse its HTML
def parseHTML(url):
    """Download the page at ``url`` and parse it with BeautifulSoup.

    Args:
        url: Address passed to ``urllib.request.urlopen``.

    Returns:
        BeautifulSoup: The response body parsed with the ``"html.parser"``
        backend.

    Raises:
        Whatever ``urlopen`` or ``read`` raise on failure; nothing is caught
        here, so callers decide how to handle unreachable pages.
    """
    # The context manager closes the HTTP response even if read() fails,
    # so the underlying connection is not leaked.
    with urllib.request.urlopen(url) as response:
        content = response.read()
    return BeautifulSoup(content, "html.parser")

### parse XML

In [4]:
# Download and parse the feed configured in rss_url; the entries are read from `data` below.
data = parseRSS(rss_url)

### create DataFrame

In [10]:
# Feed fields to pull from every entry; each one becomes a DataFrame column.
areas = [
    "title",
    "published",
    "summary",
    "link",
]
# Start from an empty frame; the columns are attached one at a time afterwards.
df = pd.DataFrame()

In [11]:
# Add one column per requested feed field, in the order listed in `areas`.
for field in areas:
    column_values = getInfo(data, field)
    df[field] = column_values

In [12]:
# Rename columns positionally: "published" -> "date" and "summary" -> "description";
# "title" and "link" keep their names. This relies on the column order set by `areas`.
df.columns = ["title", "date", "description", "link"]

In [13]:
# Preview the first rows of the frame with its title, date, description and link columns.
df.head()

Unnamed: 0,title,date,description,link
0,The anonymous whistleblower has tentatively ag...,"Thu, 26 Sep 2019 01:17:26 GMT",The anonymous whistleblower who filed a compla...,http://rss.cnn.com/~r/rss/cnn_topstories/~3/ez...
1,Trump stays defiant: 'I didn't do it. There wa...,,"<div class=""feedflare"">\n<a href=""http://rss.c...",http://rss.cnn.com/~r/rss/cnn_topstories/~3/5P...
2,Reaction: Baldwin examines Trump's bizarre Pel...,"Wed, 25 Sep 2019 21:16:43 GMT",CNN's Brooke Baldwin reacts to President Donal...,http://rss.cnn.com/~r/rss/cnn_topstories/~3/bc...
3,Timeline: Tracking White House contacts and Uk...,"Thu, 26 Sep 2019 02:12:33 GMT","<div class=""feedflare"">\n<a href=""http://rss.c...",http://rss.cnn.com/~r/rss/cnn_topstories/~3/cs...
4,Washington Post: Acting spy chief threatened t...,"Wed, 25 Sep 2019 21:28:07 GMT",Acting Director of National Intelligence Josep...,http://rss.cnn.com/~r/rss/cnn_topstories/~3/TK...


### Strip the trailing HTML from "description" to keep only the text

In [14]:
# Keep only the text that precedes the feedflare <div> in each description.
# Assigning the whole column writes the result back to df; mutating the row
# yielded by iterrows() is not guaranteed to, because the row may be a copy.
feedflare_marker = '<div class="feedflare">'
df['description'] = [text.split(feedflare_marker)[0] for text in df['description']]

In [15]:
# Re-display the first rows to check the "description" column after the feedflare split.
df.head()

Unnamed: 0,title,date,description,link
0,The anonymous whistleblower has tentatively ag...,"Thu, 26 Sep 2019 01:17:26 GMT",The anonymous whistleblower who filed a compla...,http://rss.cnn.com/~r/rss/cnn_topstories/~3/ez...
1,Trump stays defiant: 'I didn't do it. There wa...,,,http://rss.cnn.com/~r/rss/cnn_topstories/~3/5P...
2,Reaction: Baldwin examines Trump's bizarre Pel...,"Wed, 25 Sep 2019 21:16:43 GMT",CNN's Brooke Baldwin reacts to President Donal...,http://rss.cnn.com/~r/rss/cnn_topstories/~3/bc...
3,Timeline: Tracking White House contacts and Uk...,"Thu, 26 Sep 2019 02:12:33 GMT",,http://rss.cnn.com/~r/rss/cnn_topstories/~3/cs...
4,Washington Post: Acting spy chief threatened t...,"Wed, 25 Sep 2019 21:28:07 GMT",Acting Director of National Intelligence Josep...,http://rss.cnn.com/~r/rss/cnn_topstories/~3/TK...


### extract news full text from "link"

In [27]:
# Default every row to an empty string so rows whose page cannot be
# fetched or parsed still have a value.
df["full text"] = ""
for index, link in df["link"].items():
    try:
        soup = parseHTML(link)  # download and parse the page behind the link
        # The article body is taken from every <div class="zn-body__paragraph">.
        paragraphs = soup.find_all("div", {"class": "zn-body__paragraph"})
        text = " ".join(paragraph.text for paragraph in paragraphs)
    except Exception as exc:
        # Best effort: one unreachable or malformed page must not stop the
        # whole scrape, but the failure is reported instead of being hidden.
        # Catching Exception (not a bare except) keeps Ctrl-C working.
        print("Skipping full text for row {} ({}): {}".format(index, link, exc))
        continue
    # Write through df.at so the value lands in the frame itself; assigning to
    # the row object yielded by iterrows() may only modify a temporary copy.
    df.at[index, "full text"] = text

In [28]:
# Re-display the first rows to inspect the new "full text" column.
df.head()

Unnamed: 0,title,date,description,link,full text
0,The anonymous whistleblower has tentatively ag...,"Thu, 26 Sep 2019 01:17:26 GMT",The anonymous whistleblower who filed a compla...,http://rss.cnn.com/~r/rss/cnn_topstories/~3/ez...,The meeting could take place on the condition ...
1,Trump stays defiant: 'I didn't do it. There wa...,,,http://rss.cnn.com/~r/rss/cnn_topstories/~3/5P...,"Sitting in a Manhattan hotel meeting room, the..."
2,Reaction: Baldwin examines Trump's bizarre Pel...,"Wed, 25 Sep 2019 21:16:43 GMT",CNN's Brooke Baldwin reacts to President Donal...,http://rss.cnn.com/~r/rss/cnn_topstories/~3/bc...,
3,Timeline: Tracking White House contacts and Uk...,"Thu, 26 Sep 2019 02:12:33 GMT",,http://rss.cnn.com/~r/rss/cnn_topstories/~3/cs...,The growing controversy stems from a whistlebl...
4,Washington Post: Acting spy chief threatened t...,"Wed, 25 Sep 2019 21:28:07 GMT",Acting Director of National Intelligence Josep...,http://rss.cnn.com/~r/rss/cnn_topstories/~3/TK...,Citing current and former US officials familia...


### output dataframe to csv

In [29]:
# Write the scraped table to cnn_news.csv (relative path) as UTF-8.
# The row index is written as well because `index` is left at its pandas default.
df.to_csv("cnn_news.csv", encoding='utf-8')