In [1]:
import urllib
import urllib.request

import feedparser
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

### URL of the news website's RSS feed

In [2]:
# CBC News RSS feed (the "canada-toronto" feed) that this notebook scrapes
rss_url = "https://www.cbc.ca/cmlink/rss-canada-toronto"

### Functions used in this notebook

In [3]:
def parseRSS(url):
    """Fetch the RSS feed at ``url`` and return the result of ``feedparser.parse``."""
    parsed_feed = feedparser.parse(url)
    return parsed_feed

def getInfo(data, area):
    """Collect one field from every entry of a parsed feed.

    Parameters
    ----------
    data : mapping
        Parsed feed; its ``'entries'`` item is iterated.
    area : str
        Key to read from each entry (for example ``"title"``).

    Returns
    -------
    list
        One value per entry, in feed order. An entry that lacks ``area``
        contributes ``""`` so the list always has one item per entry.
    """
    info = []
    for news in data['entries']:
        try:
            info.append(news[area])
        except KeyError:
            # A missing field is the only failure expected here; anything
            # else (including KeyboardInterrupt) should surface instead of
            # being silently replaced by an empty string.
            info.append("")
    return info

def parseHTML(url):
    """Download the page at ``url`` and return it as a BeautifulSoup tree.

    The markup is parsed with Python's built-in ``"html.parser"``.
    Exceptions raised while opening or reading the URL are not caught
    here; they propagate to the caller.
    """
    # The context manager closes the HTTP response even if read() fails,
    # and avoids rebinding the ``url`` argument to the response object.
    with urllib.request.urlopen(url) as response:
        content = response.read()
    return BeautifulSoup(content, "html.parser")

### parse XML

In [4]:
# Fetch and parse the feed once; the cells below read its 'entries'.
data = parseRSS(rss_url)

### create DataFrame

In [5]:
# Feed entry fields to collect; each one becomes a DataFrame column
areas = ["id", "title", "published", "author", "summary", "link"]
# Start from an empty frame that the following cell fills column by column
df = pd.DataFrame()

In [6]:
# Add one column per field, in the order the fields are listed in `areas`.
for area in areas:
    df[area] = getInfo(data, area)

In [7]:
# Rename columns by position: "published" -> "date" and "summary" -> "description";
# the other four names are unchanged. This relies on the column order set by `areas`.
df.columns = ["id", "title", "date", "author", "description", "link"]

In [8]:
# Preview the first rows; "description" still holds HTML at this point.
df.head()

Unnamed: 0,id,title,date,author,description,link
0,1.5297702,"Salauddin Chowdhury, 65, identified as victim ...","Wed, 25 Sep 2019 22:40:28 EDT",Julia Knope,<img src='https://i.cbc.ca/1.5297745.156946306...,https://www.cbc.ca/news/canada/toronto/salaudd...
1,1.5297192,Noticing more roadkill around Toronto? You're ...,"Wed, 25 Sep 2019 19:50:39 EDT",Julia Knope,<img src='https://i.cbc.ca/1.5297517.156944908...,https://www.cbc.ca/news/canada/toronto/roadkil...
2,1.5296397,"Less student supervision, less school cleaning...","Wed, 25 Sep 2019 08:36:32 EDT",CBC News,<img src='https://i.cbc.ca/1.5296401.156941455...,https://www.cbc.ca/news/canada/toronto/cupe-wo...
3,1.5297405,1 dead following industrial accident in North ...,"Wed, 25 Sep 2019 17:45:39 EDT",CBC News,<img src='https://i.cbc.ca/1.4538532.151878821...,https://www.cbc.ca/news/canada/toronto/one-die...
4,1.5296649,Leafs star Auston Matthews regrets 'distractio...,"Wed, 25 Sep 2019 12:00:34 EDT",Joshua Clipperton,<img src='https://i.cbc.ca/1.5296674.156942695...,https://www.cbc.ca/sports/hockey/nhl/auston-ma...


### Extract plain text from the HTML in "description"

In [9]:
def _description_text(html):
    """Return the text of the first <p> in an RSS description snippet.

    When the snippet has no <p> (an empty description, or a value that is
    already plain text because this cell was run before), the snippet's
    tag-stripped text is returned instead of raising AttributeError.
    """
    # Name the parser explicitly, matching parseHTML, so the result does not
    # depend on which optional parsers happen to be installed.
    soup = BeautifulSoup(html, "html.parser")
    paragraph = soup.find("p")
    if paragraph is None:
        return soup.get_text()
    return paragraph.get_text()


# Assign the whole column at once: values written to the row objects yielded
# by iterrows() are not guaranteed to be written back to the DataFrame.
df["description"] = df["description"].map(_description_text)

In [10]:
# Preview again to check that "description" now holds plain text.
df.head()

Unnamed: 0,id,title,date,author,description,link
0,1.5297702,"Salauddin Chowdhury, 65, identified as victim ...","Wed, 25 Sep 2019 22:40:28 EDT",Julia Knope,Family members have identified Salauddin Chowd...,https://www.cbc.ca/news/canada/toronto/salaudd...
1,1.5297192,Noticing more roadkill around Toronto? You're ...,"Wed, 25 Sep 2019 19:50:39 EDT",Julia Knope,When Louise Piper came across a dead raccoon o...,https://www.cbc.ca/news/canada/toronto/roadkil...
2,1.5296397,"Less student supervision, less school cleaning...","Wed, 25 Sep 2019 08:36:32 EDT",CBC News,A union representing education workers and the...,https://www.cbc.ca/news/canada/toronto/cupe-wo...
3,1.5297405,1 dead following industrial accident in North ...,"Wed, 25 Sep 2019 17:45:39 EDT",CBC News,One person is dead following an industrial acc...,https://www.cbc.ca/news/canada/toronto/one-die...
4,1.5296649,Leafs star Auston Matthews regrets 'distractio...,"Wed, 25 Sep 2019 12:00:34 EDT",Joshua Clipperton,Maple Leafs star Auston Matthews said he regre...,https://www.cbc.ca/sports/hockey/nhl/auston-ma...


### Extract each article's full text from its "link"

In [11]:
def _fetch_full_text(link):
    """Download one article and return the text of its first <div class="story">.

    Returns "" when the page cannot be downloaded or parsed, or when it has
    no such <div>, so a single bad link does not abort the whole run.
    """
    try:
        soup = parseHTML(link)  # get website content from link
    except Exception as err:
        # Best-effort scrape: report the failure instead of hiding it.
        # Catching Exception (not a bare except) lets KeyboardInterrupt
        # stop a long-running loop.
        print("Could not fetch {!r}: {}".format(link, err))
        return ""
    story = soup.find("div", {"class": "story"})  # first <div class="story">
    if story is None:
        return ""
    # get_text() drops the HTML tags; non-breaking spaces become plain spaces
    return story.get_text().replace(u'\xa0', u' ')


# Build the column in one assignment: values written to the row objects
# yielded by iterrows() are not guaranteed to be written back to the DataFrame.
df["full text"] = df["link"].map(_fetch_full_text)

In [12]:
# Preview the frame with the new "full text" column.
df.head()

Unnamed: 0,id,title,date,author,description,link,full text
0,1.5297702,"Salauddin Chowdhury, 65, identified as victim ...","Wed, 25 Sep 2019 22:40:28 EDT",Julia Knope,Family members have identified Salauddin Chowd...,https://www.cbc.ca/news/canada/toronto/salaudd...,Family members have identified Salauddin Chowd...
1,1.5297192,Noticing more roadkill around Toronto? You're ...,"Wed, 25 Sep 2019 19:50:39 EDT",Julia Knope,When Louise Piper came across a dead raccoon o...,https://www.cbc.ca/news/canada/toronto/roadkil...,When Louise Piper came across a dead raccoon o...
2,1.5296397,"Less student supervision, less school cleaning...","Wed, 25 Sep 2019 08:36:32 EDT",CBC News,A union representing education workers and the...,https://www.cbc.ca/news/canada/toronto/cupe-wo...,A union representing education workers and the...
3,1.5297405,1 dead following industrial accident in North ...,"Wed, 25 Sep 2019 17:45:39 EDT",CBC News,One person is dead following an industrial acc...,https://www.cbc.ca/news/canada/toronto/one-die...,One person is dead following an industrial acc...
4,1.5296649,Leafs star Auston Matthews regrets 'distractio...,"Wed, 25 Sep 2019 12:00:34 EDT",Joshua Clipperton,Maple Leafs star Auston Matthews said he regre...,https://www.cbc.ca/sports/hockey/nhl/auston-ma...,Moments after Auston Matthews delivered a 38-s...


### output dataframe to csv

In [13]:
# Write the scraped articles to cbc_news.csv in the working directory as UTF-8.
# index=False is not passed, so the row index is written as an unnamed first column.
df.to_csv("cbc_news.csv", encoding='utf-8')