# Webscraping code for SGInvestors
Website: https://sginvestors.io/news/publishers/latest/

### Import libraries

In [1]:
#!pip install webdriver-manager
from bs4 import BeautifulSoup
from selenium import webdriver      
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager

import csv
import numpy as np
import pandas as pd
from random import randint
import time
from datetime import datetime, timedelta

import warnings
warnings.filterwarnings("ignore")

In [2]:
print("Last update:",datetime.now().strftime("%d/%m/%Y %H:%M:%S"))

Last update: 09/03/2022 03:25:58


### Scrape information for all pages

In [3]:
# Obtain page URLs
page_url_list = ['https://sginvestors.io/news/publishers/latest/'] # 1st page
for i in range(2,9): # for 8 pages
    page_url_list.append('https://sginvestors.io/news/publishers/latest/0' + str(i))
    
page_url_list

['https://sginvestors.io/news/publishers/latest/',
 'https://sginvestors.io/news/publishers/latest/02',
 'https://sginvestors.io/news/publishers/latest/03',
 'https://sginvestors.io/news/publishers/latest/04',
 'https://sginvestors.io/news/publishers/latest/05',
 'https://sginvestors.io/news/publishers/latest/06',
 'https://sginvestors.io/news/publishers/latest/07',
 'https://sginvestors.io/news/publishers/latest/08']

In [4]:
# Scrape all pages

# Initialisations
news_source = []
news_header = []
updated_sg_time = []
url = []
num_pages = 1

# Scraping of pages
for page_url in page_url_list[:]:
    
    #driver = webdriver.Chrome('chromedriver')
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.get(page_url)
    page_soup = BeautifulSoup(driver.page_source,"html.parser")
    
    print('Page Number: ', num_pages)
    print('Page URL: ', page_url)
    
    # news_source
    for source in page_soup.findAll('img',{'class':'newschannelimg'}):
        source_link = source['src']
        news_source.append(source_link)
    
    # news header / title
    for header in page_soup.findAll('div',{'class':'newstitle'}):
        news_header.append(header.text)
        
    # updated sg time
    for time in page_soup.findAll('div',{'class':'updatedsgtime'}):
        updated_sg_time.append(time.text)
    
    # url
    link_container = page_soup.find('div',{'id':'articlelist'})
    for news_url in link_container.findAll('a',{'rel':'nofollow'}):
        href = news_url.get('href')
        url.append(href)
    
    num_pages += 1
    driver.quit()
    print('---------')
    
print('---Scraping done!!---')



Current google-chrome version is 99.0.4844
Get LATEST chromedriver version for 99.0.4844 google-chrome
There is no [mac64] chromedriver for browser  in cache
Trying to download new driver from https://chromedriver.storage.googleapis.com/99.0.4844.51/chromedriver_mac64.zip
Driver has been saved in cache [/Users/tohjiaxuan/.wdm/drivers/chromedriver/mac64/99.0.4844.51]


Page Number:  1
Page URL:  https://sginvestors.io/news/publishers/latest/




Current google-chrome version is 99.0.4844
Get LATEST chromedriver version for 99.0.4844 google-chrome


---------


Driver [/Users/tohjiaxuan/.wdm/drivers/chromedriver/mac64/99.0.4844.51/chromedriver] found in cache




Page Number:  2
Page URL:  https://sginvestors.io/news/publishers/latest/02
---------


Current google-chrome version is 99.0.4844
Get LATEST chromedriver version for 99.0.4844 google-chrome
Driver [/Users/tohjiaxuan/.wdm/drivers/chromedriver/mac64/99.0.4844.51/chromedriver] found in cache




Page Number:  3
Page URL:  https://sginvestors.io/news/publishers/latest/03
---------


Current google-chrome version is 99.0.4844
Get LATEST chromedriver version for 99.0.4844 google-chrome
Driver [/Users/tohjiaxuan/.wdm/drivers/chromedriver/mac64/99.0.4844.51/chromedriver] found in cache


Page Number:  4
Page URL:  https://sginvestors.io/news/publishers/latest/04




Current google-chrome version is 99.0.4844
Get LATEST chromedriver version for 99.0.4844 google-chrome


---------


Driver [/Users/tohjiaxuan/.wdm/drivers/chromedriver/mac64/99.0.4844.51/chromedriver] found in cache


Page Number:  5
Page URL:  https://sginvestors.io/news/publishers/latest/05




Current google-chrome version is 99.0.4844
Get LATEST chromedriver version for 99.0.4844 google-chrome


---------


Driver [/Users/tohjiaxuan/.wdm/drivers/chromedriver/mac64/99.0.4844.51/chromedriver] found in cache


Page Number:  6
Page URL:  https://sginvestors.io/news/publishers/latest/06




Current google-chrome version is 99.0.4844
Get LATEST chromedriver version for 99.0.4844 google-chrome


---------


Driver [/Users/tohjiaxuan/.wdm/drivers/chromedriver/mac64/99.0.4844.51/chromedriver] found in cache


Page Number:  7
Page URL:  https://sginvestors.io/news/publishers/latest/07




Current google-chrome version is 99.0.4844
Get LATEST chromedriver version for 99.0.4844 google-chrome


---------


Driver [/Users/tohjiaxuan/.wdm/drivers/chromedriver/mac64/99.0.4844.51/chromedriver] found in cache


Page Number:  8
Page URL:  https://sginvestors.io/news/publishers/latest/08
---------
---Scraping done!!---


In [5]:
print(len(news_source))
print(len(news_header))
print(len(updated_sg_time))
print(len(url))

288
288
288
288


In [6]:
print(news_source[0])
print(news_header[0])
print(updated_sg_time[0])
print(url[0])

https://lh3.googleusercontent.com/-XitgQTQ_Mzo/Voubk20tr0I/AAAAAAAAFMM/bVbkFFUmkiwEMq6HIdL_H8Zhx-GArdraACCo/s800/business-times-2016.png
Singapore contributes US$15m to Coalition for Epidemic Preparedness Innovations
2 hours ago
https://www.businesstimes.com.sg/government-economy/singapore-contributes-us15m-to-coalition-for-epidemic-preparedness-innovations


### Clean up data

In [7]:
# Clean up Source information
cleaned_source = []
for src in news_source:
    if 'cna' in src:
        cleaned_source.append('CNA')
    elif 'theedgegroup' in src:
        cleaned_source.append('The Edge')
    elif 'business-times' in src:
        cleaned_source.append('The Business Times')
    else:
        cleaned_source.append(src) # will need to double check

In [8]:
# Clean up Updated SG Time information (put in terms of date)
cleaned_time = []
current_time = datetime.today()
for time in updated_sg_time:
    if 'minute' in time:
        last_update = current_time - timedelta(minutes=int(time.split(' ')[0]))
    elif 'hour' in time:
        last_update = current_time - timedelta(hours=int(time.split(' ')[0]))
    elif 'day' in time:
        last_update = current_time - timedelta(days=int(time.split(' ')[0]))
    else:
        last_update = time # will need to double check
        
    #last_update = last_update.strftime('%d/%m/%Y %H:%M %p') # 27/02/2022 09:05 AM
    last_update = last_update.strftime('%d/%m/%Y') # 27/02/2022
    cleaned_time.append(last_update)

### Convert to DF and Export to CSV

In [14]:
# Convert to DataFrame
cols = ['Title','Date','Link','Source','Comments']
df = pd.DataFrame({'Title': news_header,
                   'Date': cleaned_time,
                   'Link': url,
                   'Source': cleaned_source,
                   'Comments': 'Featured on SGInvestors'}, columns=cols)
df.insert(0, 'Symbol', 'None (General News)')
df.head()

Unnamed: 0,Symbol,Title,Date,Link,Source,Comments
0,None (General News),Singapore contributes US$15m to Coalition for ...,09/03/2022,https://www.businesstimes.com.sg/government-ec...,The Business Times,Featured on SGInvestors
1,None (General News),Crypto exchanges staying in Russia will face g...,09/03/2022,https://www.channelnewsasia.com/business/crypt...,CNA,Featured on SGInvestors
2,None (General News),Primordial octopus was up in arms - 10 instead...,09/03/2022,https://www.channelnewsasia.com/world/primordi...,CNA,Featured on SGInvestors
3,None (General News),Oil surges as US and Britain cut off Russian c...,08/03/2022,https://www.channelnewsasia.com/business/oil-s...,CNA,Featured on SGInvestors
4,None (General News),Oil gains on expectations of US and British ba...,08/03/2022,https://www.channelnewsasia.com/business/oil-g...,CNA,Featured on SGInvestors


In [15]:
# Export to CSV
now = datetime.now()
current = now.strftime("%d/%m/%Y %H:%M:%S")
print('Current date and time',current)

df_dict = df.to_dict(orient='records')
keys = df_dict[0].keys()
with open('sginvestors_data.csv', 'w') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(df_dict)

print('CSV exported successfully!')

Current date and time 09/03/2022 03:29:23
CSV exported successfully!
