# Scraping DBS press releases

## 1) Import libraries.

In [1]:
#import libraries
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import html
from selenium import webdriver
from selenium.webdriver.common.by import By
from tqdm import tqdm

## 2) The [website](https://www.dbs.com/media/news-list.page) is rendered with JavaScript, so use `selenium` to gather the URLs, headlines, and publication dates.

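This is a good chance to practise using `selenium`. We will scrape all press releases going back to the start of 2015. The loop stops on the first listing page whose last entry is dated 2014, so a few late-2014 releases from that page are included as well.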

In [2]:
#address of the DBS news listing page
NEWS_LIST_URL = 'https://www.dbs.com/media/news-list.page'

#start a chrome session and load the listing page in it
driver = webdriver.Chrome()
driver.get(NEWS_LIST_URL)

In [3]:
#scraping settings
MAX_PAGES = 156     #maximum number of listing pages to visit
STOP_YEAR = 2014    #stop once the last release collected is from this year or earlier
PAGE_LOAD_WAIT = 3  #seconds to wait for the next page to load after clicking

#xpaths of the elements on the news listing page
HEADLINE_XPATH = '//*[@id="newsLists"]/div/ul/li/a/h3'
DATE_XPATH = '//*[@id="newsLists"]/div/ul/li/div/span[2]/span[2]'
URL_XPATH = '//*[@id="newsLists"]/div/ul/li/a'
NEXT_BUTTON_XPATH = '//*[@id="newsListPagination"]/ul/li/a[@data-step="+"]'

#collect (headline, date, url) rows in a list; the dataframe is built once after the loop
#instead of being grown one row at a time
records = []

#scraping through the pages
for page_num in range(1, MAX_PAGES + 1):

    #find_elements(By.XPATH, ...) is used because find_elements_by_xpath was
    #deprecated and then removed in selenium 4
    headlines = [element.text for element in driver.find_elements(By.XPATH, HEADLINE_XPATH)]
    dates = [element.text for element in driver.find_elements(By.XPATH, DATE_XPATH)]
    urls = [element.get_attribute('href') for element in driver.find_elements(By.XPATH, URL_XPATH)]

    if len(headlines) == len(dates) == len(urls): #check if all headlines, dates, urls were scraped
        print('page {} scrapped successfully'.format(page_num))
        records.extend(zip(headlines, dates, urls))
    else:
        #lists of different lengths cannot be paired up reliably, so the page is skipped,
        #but report it instead of dropping it silently
        print('page {} skipped: found {} headlines, {} dates, {} urls'.format(
            page_num, len(headlines), len(dates), len(urls)))

    #year of the last release collected so far; stays empty when nothing has been
    #collected yet or the date text is blank, so the check below cannot raise
    last_date_parts = records[-1][1].split() if records else []
    last_year = last_date_parts[-1] if last_date_parts else ''

    #stop if press releases reach the year you want; "<=" also stops when the listing
    #jumps straight past STOP_YEAR to an older year
    if last_year.isdigit() and int(last_year) <= STOP_YEAR:
        print('scrape completed')
        break

    #otherwise click next button and continue
    next_buttons = driver.find_elements(By.XPATH, NEXT_BUTTON_XPATH)
    if not next_buttons:
        print('no next button found on page {}, stopping'.format(page_num))
        break
    next_buttons[0].click()
    time.sleep(PAGE_LOAD_WAIT) #wait for next page to load

#dataframe to store press release information
pressers = pd.DataFrame(records, columns=['headline', 'date', 'url'])

page 1 scrapped successfully
page 2 scrapped successfully
page 3 scrapped successfully
page 4 scrapped successfully
page 5 scrapped successfully
page 6 scrapped successfully
page 7 scrapped successfully
page 8 scrapped successfully
page 9 scrapped successfully
page 10 scrapped successfully
page 11 scrapped successfully
page 12 scrapped successfully
page 13 scrapped successfully
page 14 scrapped successfully
page 15 scrapped successfully
page 16 scrapped successfully
page 17 scrapped successfully
page 18 scrapped successfully
page 19 scrapped successfully
page 20 scrapped successfully
page 21 scrapped successfully
page 22 scrapped successfully
page 23 scrapped successfully
page 24 scrapped successfully
page 25 scrapped successfully
page 26 scrapped successfully
page 27 scrapped successfully
page 28 scrapped successfully
page 29 scrapped successfully
page 30 scrapped successfully
page 31 scrapped successfully
page 32 scrapped successfully
page 33 scrapped successfully
page 34 scrapped su

In [4]:
#show the dataframe with the headline, date and url columns collected above
pressers

Unnamed: 0,headline,date,url
0,Digital Duopoly: US and China Dominate Global ...,21 Aug 2020,https://www.dbs.com/newsroom/Digital_Duopoly_U...
1,DBS Bank provides SGD40 million loan facility ...,20 Aug 2020,https://www.dbs.com/newsroom/DBS_Bank_provides...
2,DBS Asian Insights Conference 2020: Navigating...,19 Aug 2020,https://www.dbs.com/newsroom/DBS_Asian_Insight...
3,DBS Private Bank's flagship portfolio hits new...,19 Aug 2020,https://www.dbs.com/newsroom/DBS_Private_Banks...
4,DBS on Covid-19's impact on Singapore resident...,18 Aug 2020,https://www.dbs.com/newsroom/DBS_on_Covid_19_s...
...,...,...,...
615,DBS/POSB – First bank in Singapore to introduc...,20 Jan 2015,https://www.dbs.com/newsroom/DBSPOSB__First_ba...
616,Launch of DBS-NUS Social Venture Challenge Asi...,19 Jan 2015,https://www.dbs.com/newsroom/Launch_of_DBSNUS_...
617,DBS Bank celebrates SG50 with 28th SEA Games,16 Nov 2014,https://www.dbs.com/newsroom/DBS_Bank_celebrat...
618,DBS RMB Index for VVinning Enterprises falls t...,11 Nov 2014,https://www.dbs.com/newsroom/DBS_RMB_Index_for...


## 3) We will now use `requests` and `lxml.html` to visit the scraped URLs and parse them for the text of the press releases.

In [5]:
#xpath selecting every text node inside the press release body of an article page
BODY_TEXT_XPATH = '//*[@id="bodywrapper"]/div[2]/section/div[1]/div//text()'
REQUEST_TIMEOUT = 30 #seconds to wait for a response before giving up on a url

#create an empty list to store the text
press_releases = []
#(url, reason) pairs for pages that could not be downloaded
failed_downloads = []

#one session for all requests so the connection to the site is reused
with requests.Session() as session:
    #iterate through the urls of the press releases
    for url in tqdm(pressers['url']):
        #a failed download keeps an empty string so the list stays aligned with the dataframe rows
        press_release = ''
        try:
            #without a timeout a stalled connection would block the loop indefinitely
            page = session.get(url, timeout=REQUEST_TIMEOUT)
            #treat http error responses as failures instead of parsing the error page
            page.raise_for_status()
            #html.fromstring raises on an empty document, so only parse non-empty bodies
            if page.content.strip():
                tree = html.fromstring(page.content)
                #the xpath returns the text as a list of strings. strip white space including line breaks and join them together
                press_release = " ".join([text.strip() for text in tree.xpath(BODY_TEXT_XPATH)])
        except requests.RequestException as error:
            failed_downloads.append((url, str(error)))
        press_releases.append(press_release)

#report the pages that could not be fetched so they can be retried
for url, reason in failed_downloads:
    print('failed to download {}: {}'.format(url, reason))

100%|██████████| 620/620 [03:50<00:00,  2.69it/s]


In [6]:
#turn the collected texts into an array, then store it as the new 'text' column
text_values = np.array(press_releases)
pressers['text'] = text_values

In [7]:
#show the dataframe again, now including the 'text' column
pressers

Unnamed: 0,headline,date,url,text
0,Digital Duopoly: US and China Dominate Global ...,21 Aug 2020,https://www.dbs.com/newsroom/Digital_Duopoly_U...,Indonesia . 21 Aug 2020 . 3 min read Indones...
1,DBS Bank provides SGD40 million loan facility ...,20 Aug 2020,https://www.dbs.com/newsroom/DBS_Bank_provides...,"Singapore . 20 Aug 2020 Singapore, 20 Aug 20..."
2,DBS Asian Insights Conference 2020: Navigating...,19 Aug 2020,https://www.dbs.com/newsroom/DBS_Asian_Insight...,Indonesia . 19 Aug 2020 . 3 min read Indones...
3,DBS Private Bank's flagship portfolio hits new...,19 Aug 2020,https://www.dbs.com/newsroom/DBS_Private_Banks...,"Singapore . 19 Aug 2020 Singapore, 19 Aug 20..."
4,DBS on Covid-19's impact on Singapore resident...,18 Aug 2020,https://www.dbs.com/newsroom/DBS_on_Covid_19_s...,"Singapore . 18 Aug 2020 Singapore, 18 Aug 20..."
...,...,...,...,...
615,DBS/POSB – First bank in Singapore to introduc...,20 Jan 2015,https://www.dbs.com/newsroom/DBSPOSB__First_ba...,"Singapore . 20 Jan 2015 Singapore, 20 Jan 20..."
616,Launch of DBS-NUS Social Venture Challenge Asi...,19 Jan 2015,https://www.dbs.com/newsroom/Launch_of_DBSNUS_...,"Singapore . 19 Jan 2015 Singapore, 19 Jan 20..."
617,DBS Bank celebrates SG50 with 28th SEA Games,16 Nov 2014,https://www.dbs.com/newsroom/DBS_Bank_celebrat...,"Singapore . 16 Nov 2014 Singapore, 16 Nov 20..."
618,DBS RMB Index for VVinning Enterprises falls t...,11 Nov 2014,https://www.dbs.com/newsroom/DBS_RMB_Index_for...,"Hong Kong . 11 Nov 2014 Hong Kong, 11 Nov 20..."


Let's save the country that issued the press release in a separate column. 

In [20]:
#Country is whatever comes before the first '.' in the text (it can be more than one word, e.g. Hong Kong).
#partition returns an empty separator when there is no '.', in which case the country is left empty
#instead of slicing with the -1 that str.find would return. strip removes the spaces around the name.
def extract_country(text):
    country, separator, _ = text.partition('.')
    return country.strip() if separator else ''

pressers['country'] = pressers['text'].apply(extract_country)

In [21]:
#show the dataframe again, now including the 'country' column
pressers

Unnamed: 0,headline,date,url,text,country
0,Digital Duopoly: US and China Dominate Global ...,21 Aug 2020,https://www.dbs.com/newsroom/Digital_Duopoly_U...,Indonesia . 21 Aug 2020 . 3 min read Indones...,Indonesia
1,DBS Bank provides SGD40 million loan facility ...,20 Aug 2020,https://www.dbs.com/newsroom/DBS_Bank_provides...,"Singapore . 20 Aug 2020 Singapore, 20 Aug 20...",Singapore
2,DBS Asian Insights Conference 2020: Navigating...,19 Aug 2020,https://www.dbs.com/newsroom/DBS_Asian_Insight...,Indonesia . 19 Aug 2020 . 3 min read Indones...,Indonesia
3,DBS Private Bank's flagship portfolio hits new...,19 Aug 2020,https://www.dbs.com/newsroom/DBS_Private_Banks...,"Singapore . 19 Aug 2020 Singapore, 19 Aug 20...",Singapore
4,DBS on Covid-19's impact on Singapore resident...,18 Aug 2020,https://www.dbs.com/newsroom/DBS_on_Covid_19_s...,"Singapore . 18 Aug 2020 Singapore, 18 Aug 20...",Singapore
...,...,...,...,...,...
615,DBS/POSB – First bank in Singapore to introduc...,20 Jan 2015,https://www.dbs.com/newsroom/DBSPOSB__First_ba...,"Singapore . 20 Jan 2015 Singapore, 20 Jan 20...",Singapore
616,Launch of DBS-NUS Social Venture Challenge Asi...,19 Jan 2015,https://www.dbs.com/newsroom/Launch_of_DBSNUS_...,"Singapore . 19 Jan 2015 Singapore, 19 Jan 20...",Singapore
617,DBS Bank celebrates SG50 with 28th SEA Games,16 Nov 2014,https://www.dbs.com/newsroom/DBS_Bank_celebrat...,"Singapore . 16 Nov 2014 Singapore, 16 Nov 20...",Singapore
618,DBS RMB Index for VVinning Enterprises falls t...,11 Nov 2014,https://www.dbs.com/newsroom/DBS_RMB_Index_for...,"Hong Kong . 11 Nov 2014 Hong Kong, 11 Nov 20...",Hong Kong


Since we already have the date and country in different columns, trim these from the text. 

In [23]:
#drop everything before the first '-' in the text. the '-' itself is kept, so running this cell
#again leaves the text unchanged. texts without any '-' are left untouched: str.find returns -1
#for them and slicing with it would keep only the last character.
pressers['text'] = pressers['text'].apply(lambda x: x[x.find('-'):] if '-' in x else x)

In [28]:
#put the columns in the wanted order, with the text last
column_order = ['headline', 'date', 'url', 'country', 'text']
pressers = pressers[column_order]

In [29]:
#show the dataframe with the reordered columns
pressers

Unnamed: 0,headline,date,url,country,text
0,Digital Duopoly: US and China Dominate Global ...,21 Aug 2020,https://www.dbs.com/newsroom/Digital_Duopoly_U...,Indonesia,- Technology development makes it easier for p...
1,DBS Bank provides SGD40 million loan facility ...,20 Aug 2020,https://www.dbs.com/newsroom/DBS_Bank_provides...,Singapore,- DBS Bank has provided a SGD40 million loan f...
2,DBS Asian Insights Conference 2020: Navigating...,19 Aug 2020,https://www.dbs.com/newsroom/DBS_Asian_Insight...,Indonesia,- DBS is proud to host its first digital editi...
3,DBS Private Bank's flagship portfolio hits new...,19 Aug 2020,https://www.dbs.com/newsroom/DBS_Private_Banks...,Singapore,- Despite widespread market volatility in the ...
4,DBS on Covid-19's impact on Singapore resident...,18 Aug 2020,https://www.dbs.com/newsroom/DBS_on_Covid_19_s...,Singapore,- Covid-19 has caused a severe disruption to g...
...,...,...,...,...,...
615,DBS/POSB – First bank in Singapore to introduc...,20 Jan 2015,https://www.dbs.com/newsroom/DBSPOSB__First_ba...,Singapore,"- Come this Lunar New Year, DBS/POSB customers..."
616,Launch of DBS-NUS Social Venture Challenge Asi...,19 Jan 2015,https://www.dbs.com/newsroom/Launch_of_DBSNUS_...,Singapore,"- Into its second year, the DBS-NUS Social Ven..."
617,DBS Bank celebrates SG50 with 28th SEA Games,16 Nov 2014,https://www.dbs.com/newsroom/DBS_Bank_celebrat...,Singapore,- DBS Bank celebrates Singapore’s 50th birthda...
618,DBS RMB Index for VVinning Enterprises falls t...,11 Nov 2014,https://www.dbs.com/newsroom/DBS_RMB_Index_for...,Hong Kong,- DBS Bank (Hong Kong) Limited is pleased to r...


## 4) Export to a `csv` file.

In [31]:
#write the dataframe to a csv file next to the notebook, leaving out the index column
output_path = r'./dbs_press_releases.csv'
pressers.to_csv(output_path, index=False)