In [1]:
import feedparser
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup 
import urllib

### URL of the news website's RSS feed

In [2]:
# BBC News RSS feed URL (note: the string ends with an empty '#' fragment)
rss_url = "http://feeds.bbci.co.uk/news/rss.xml#"

### Functions used in this notebook

In [3]:
def parseRSS(url):
    """Fetch the RSS feed at ``url`` and return feedparser's parsed result."""
    parsed_feed = feedparser.parse(url)
    return parsed_feed

def getInfo(data, area, default=""):
    """Collect one field from every entry of a parsed feed.

    Parameters
    ----------
    data : mapping
        Parsed feed; ``data['entries']`` must be an iterable of per-item
        mappings.
    area : str
        Key to read from each entry (e.g. ``"title"``).
    default : object, optional
        Placeholder stored when an entry has no ``area`` field.
        Defaults to ``""``.

    Returns
    -------
    list
        One value per entry, in the order the entries appear in the feed.

    Raises
    ------
    KeyError
        If ``data`` has no ``'entries'`` key.
    """
    info = []
    for news in data['entries']:
        try:
            info.append(news[area])
        except (LookupError, TypeError):
            # The entry lacks this field (or is not subscriptable). Append a
            # placeholder so the result stays aligned with the entries.
            # Only lookup failures are caught, so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            info.append(default)
    return info

def parseHTML(url):
    """Download the page at ``url`` and return it parsed as a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    BeautifulSoup
        The page parsed with Python's built-in ``"html.parser"``.

    Any error raised by ``urllib.request.urlopen`` (unreachable host, HTTP
    error status, ...) propagates to the caller.
    """
    # `import urllib` alone does not load the `request` submodule; import it
    # explicitly so this function does not rely on another library having
    # already done so.
    import urllib.request

    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(url) as response:
        content = response.read()
    return BeautifulSoup(content, "html.parser")

### parse XML

In [4]:
# Fetch and parse the feed; `data` is read by the later cells through getInfo(data, ...)
data = parseRSS(rss_url)

### create DataFrame

In [8]:
# Start from an empty frame; one column per feed field is added in the next cell
df = pd.DataFrame()
# Entry keys to pull out of every feed item
areas = ["title", "published", "summary", "link"]

In [9]:
# Add one column per feed field, named after the field key;
# getInfo returns one value per entry, so every column has the same length
for area in areas:
    df[area] = getInfo(data, area)

In [10]:
# Rename the feed field names to friendlier column names. Mapping by name
# (instead of assigning df.columns positionally) keeps this cell safe to
# re-run, including after extra columns such as "full text" have been added.
df = df.rename(columns={"published": "date", "summary": "description"})

In [44]:
# Display the frame (a bare last expression is rendered as a table).
# NOTE(review): the saved output already shows a "full text" column, which is
# only created in a later cell, so it reflects stale kernel state.
# Re-run the notebook top to bottom before trusting it.
df

Unnamed: 0,title,date,description,link,full text
0,Boris Johnson says Supreme Court 'wrong' to ru...,"Wed, 25 Sep 2019 20:07:40 GMT","In angry Commons scenes, the PM goads oppositi...",https://www.bbc.co.uk/news/uk-politics-49827803,Boris Johnson has told MPs the Supreme Court w...
1,MPs' fury at Boris Johnson's 'dangerous language',"Wed, 25 Sep 2019 22:03:25 GMT",Labour MPs refer to murdered MP Jo Cox as they...,https://www.bbc.co.uk/news/uk-politics-49833804,Boris Johnson has refused to moderate his lang...
2,Trump impeachment: Memo confirms president urg...,"Wed, 25 Sep 2019 18:53:54 GMT",The notes show Mr Trump asked the Ukrainian pr...,https://www.bbc.co.uk/news/world-us-canada-498...,The Trump administration has released details ...
3,Widnes dog attack: Elayne Stanley named as victim,"Wed, 25 Sep 2019 14:24:27 GMT","Neighbours say Elayne Stanley, 44, was killed ...",https://www.bbc.co.uk/news/uk-england-merseysi...,A woman has died after she was attacked by two...
4,'Seahorse' transgender man loses challenge to ...,"Wed, 25 Sep 2019 14:26:11 GMT","A court rules Freddy McConnell, who gave birth...",https://www.bbc.co.uk/news/uk-49828705,"A transgender man, who has given birth to a ch..."
5,Baby Archie meets Archbishop Tutu,"Wed, 25 Sep 2019 15:35:17 GMT",It is the first time the Duke and Duchess's so...,https://www.bbc.co.uk/news/uk-49825285,The Duke and Duchess of Sussex have introduced...
6,Stoke-On-Trent: The city divided by a youth club,"Wed, 25 Sep 2019 08:58:51 GMT",A proposed £3m youth hub in Stoke is leading t...,https://www.bbc.co.uk/news/stories-49812479,
7,Climate change: UN panel signals red alert on ...,"Wed, 25 Sep 2019 09:00:09 GMT",A top panel of scientists says the oceans and ...,https://www.bbc.co.uk/news/science-environment...,
8,Amazon Alexa gets Samuel L Jackson and other c...,"Wed, 25 Sep 2019 20:18:29 GMT",The firm has also addressed privacy concerns a...,https://www.bbc.co.uk/news/technology-49829391,
9,Worker dies in Tata Port Talbot steelworks acc...,"Wed, 25 Sep 2019 16:36:11 GMT",An air ambulance was sent to Tata after report...,https://www.bbc.co.uk/news/uk-wales-49830715,


### Extract each article's full text from its "link" URL

In [46]:
df["full text"] = ""
# Iterate over the link column and write results back with df.at: assigning
# into the `row` yielded by iterrows() is not guaranteed to reach `df`.
for index, link in df["link"].items():
    try:
        soup = parseHTML(link)  # download and parse the article page
        content = soup.find_all("div", {"class": "story-body"})  # <div class="story-body">
        text = content[0].find_all("p")  # every <p> inside the first such div
        # First <p> of the article proper; paragraphs before it are dropped.
        pivot = content[0].find_all("p", {"class": "story-body__introduction"})[0]
        output = text[text.index(pivot):]  # introduction paragraph and everything after it
        df.at[index, "full text"] = " ".join(p.text for p in output)
    except Exception as err:
        # Best effort: a page that cannot be fetched, or that lacks the
        # expected markup, keeps its empty "full text". The failure is
        # reported instead of being hidden, and KeyboardInterrupt is no
        # longer swallowed, so the loop can be stopped by hand.
        print("skipped {}: {!r}".format(link, err))

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


In [47]:
# Preview the first rows, now including the "full text" column
df.head()

Unnamed: 0,title,date,description,link,full text
0,Boris Johnson says Supreme Court 'wrong' to ru...,"Wed, 25 Sep 2019 20:07:40 GMT","In angry Commons scenes, the PM goads oppositi...",https://www.bbc.co.uk/news/uk-politics-49827803,Boris Johnson has told MPs the Supreme Court w...
1,MPs' fury at Boris Johnson's 'dangerous language',"Wed, 25 Sep 2019 22:03:25 GMT",Labour MPs refer to murdered MP Jo Cox as they...,https://www.bbc.co.uk/news/uk-politics-49833804,Boris Johnson has refused to moderate his lang...
2,Trump impeachment: Memo confirms president urg...,"Wed, 25 Sep 2019 18:53:54 GMT",The notes show Mr Trump asked the Ukrainian pr...,https://www.bbc.co.uk/news/world-us-canada-498...,The Trump administration has released details ...
3,Widnes dog attack: Elayne Stanley named as victim,"Wed, 25 Sep 2019 14:24:27 GMT","Neighbours say Elayne Stanley, 44, was killed ...",https://www.bbc.co.uk/news/uk-england-merseysi...,A woman has died after she was attacked by two...
4,'Seahorse' transgender man loses challenge to ...,"Wed, 25 Sep 2019 14:26:11 GMT","A court rules Freddy McConnell, who gave birth...",https://www.bbc.co.uk/news/uk-49828705,"A transgender man, who has given birth to a ch..."


### output dataframe to csv

In [48]:
# Write the frame to bbc_news.csv in the working directory, UTF-8 encoded.
# NOTE(review): the row index is written too (no index=False). The
# "Unnamed: 0" column in the saved tables above looks like the result of
# reading such a file back; consider index=False if the index is not needed.
df.to_csv("bbc_news.csv", encoding='utf-8')