In [1]:
import os
os.chdir("../../../")

import urllib
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd

from scripts.python.scraper.utils import *
from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor
import multiprocessing
import httpx

In [2]:
target_dir = os.getcwd() + "/data/text/solomon_islands/"
if not os.path.exists(target_dir):
    os.mkdir(target_dir)

## Solomon Stars

In [4]:
def main():
    params = {
        "title_entry": "blog-content wf-td",
        "title": "entry-title",
        "date": "entry-date"
    }

    host = "https://www.solomonstarnews.com/category/news/news-national/page/"
    urls = [host + str(i) for i in range(1, 1421)]
    max_workers = multiprocessing.cpu_count() - 1
    output = []


    with tqdm(total=len(urls), unit="pages") as pbar:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_url = {
                executor.submit(extract_news_info, url, params=params, timeout=5):
                url
                for url in urls
            }
            for future in as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    data = future.result()
                except Exception as exc:
                    print('%r generated an exception: %s' % (url, exc))
                else:
                    output.append(data)
                    pbar.update(1)

    return output

if __name__ == '__main__':
    output = main()

100%|████████████████████████████████████████████████████████████████████████████| 1420/1420 [22:00<00:00,  1.08pages/s]


In [5]:
output_df = pd.DataFrame()
for out in output:
    out_df = pd.DataFrame(out)
    if output_df.empty:
        output_df = out_df
    else:
        output_df = pd.concat([output_df, out_df], axis=0)

In [7]:
output_df = output_df.reset_index().drop("index", axis=1)
output_df["date"] = pd.to_datetime(output_df["date"])
output_df.sample(10)
output_df.to_csv(target_dir+"solomon_stars_urls.csv", encoding="utf-8")

In [17]:
def extract_news(url):
    content = load_page(url, timeout=5)
    soup = BeautifulSoup(content, "html.parser")
    try: 
        raw_text = soup.find(class_="entry-content").text
    except:
        raw_text = soup.find(class_="content").text
    return url, raw_text

In [18]:
news_lst = []
with tqdm(total=len(output_df)) as pbar:
    with ThreadPoolExecutor(max_workers=multiprocessing.cpu_count()+4) as executor:  
        futures = {executor.submit(extract_news, url): url for url in output_df.url}
        for future in as_completed(futures):
            url = futures[future]
            try:
                url, news = future.result()
            except Exception as exc:
                print('%r generated an exception: %s' % (url, exc))
            else:
                news_lst.append((url, news))
                pbar.update(1)

 53%|████████████████████████████████████████▍                                   | 7553/14200 [49:26<1:52:23,  1.01s/it]

'https://www.solomonstarnews.com/honiara-goes-under-flood-waters-again/' generated an exception: 'NoneType' object has no attribute 'text'


 53%|█████████████████████████████████████████▌                                    | 7561/14200 [49:30<40:16,  2.75it/s]

'https://www.solomonstarnews.com/in-court-with-assumpta-buchanan-d56/' generated an exception: 'NoneType' object has no attribute 'text'
'https://www.solomonstarnews.com/stop-spreading-false-rumour-sasako-told/' generated an exception: 'NoneType' object has no attribute 'text'


 53%|█████████████████████████████████████████▌                                    | 7570/14200 [49:35<47:32,  2.32it/s]

'https://www.solomonstarnews.com/russians-here-to-lobby-says-gov-t/' generated an exception: 'NoneType' object has no attribute 'text'


100%|██████████████████████████████████████████████████████████████████████████▉| 14196/14200 [1:46:43<00:01,  2.22it/s]


In [19]:
news_df = pd.DataFrame(news_lst, columns=["url", "news"])
display(news_df.head(5))
output_df = output_df.merge(news_df, how="left", on="url")

Unnamed: 0,url,news
0,https://www.solomonstarnews.com/taro-airport-u...,\nALL Solomon Airlines flights to Taro Airport...
1,https://www.solomonstarnews.com/uspsa-si-succe...,\nTHE University of the South Pacific Students...
2,https://www.solomonstarnews.com/bus-stabber-aw...,\nTHE young man who allegedly stole another ma...
3,https://www.solomonstarnews.com/last-minute-mou/,\nMalaita Province to sign ‘Peace’ MOU with Ch...
4,https://www.solomonstarnews.com/pm-highlights-...,"\nPRIME Minister, Manasseh Sogavare, has highl..."


In [30]:
import math
missed_idx = output_df[output_df.news.isna() == True].index.tolist()
for idx in missed_idx:
    url = output_df["url"][idx]
    _ , news = extract_news(url)
    if math.isnan(output_df.iloc[idx, -1]):
        output_df.iloc[idx, -1] = news

In [35]:
if len(output_df[output_df.news.isna() == True]) == 0:
    output_df.to_csv(target_dir+"solomon_stars_news.csv", encoding="utf-8")

## Solomon Times

In [3]:
host_url = "https://www.solomontimes.com/news/latest/"

ym_lst = [(i, j) for i in range(2007, 2023) for j in range(1, 13)]
ym_lst.extend([(2023, m) for m in range(1, 9)])
ss_urls = [host_url + str(y) + "/" + str(m) for y, m in ym_lst]

In [4]:
def load_page(url, timeout):
    r = requests.get(url)
    return r.content

def extract_news_info_ss(url, params=None, timeout=5):
    content = load_page(url, timeout)
    soup = BeautifulSoup(content, "html.parser")
    info_dict = {"url": [], "title": [], "date": []}
    if params is None:
        params = {"title_entry": "entry-container",
                  "title": "entry-title",
                  "date": "entry-date"}
    for a in soup.find_all(class_=params["title_entry"]):
        entry_title = a.find(params["title"])
        if params["date"] is not None:
            info_dict["date"].append(a.find(class_=params["date"]).text)
        else:
            info_dict["date"].append(np.NaN)
        
        info_dict["url"].append(a.find("a").attrs["href"])
        
        if entry_title is not None:
            info_dict["title"].append(entry_title.text)
        else:
            info_dict["title"].append("Missing Title.")
    return info_dict

In [5]:
params = {
    "title_entry": "article-list-item",
    "title": 'h2',
    "date": None
}

urls_df = pd.DataFrame()
with tqdm(total=len(ss_urls), unit="pages") as pbar:
    for url, ym in zip(ss_urls, ym_lst):
        url_dict = extract_news_info_ss(url, params=params)
        url_df = pd.DataFrame(url_dict)
        url_df["date"] = str(ym)
        if urls_df.empty:
            urls_df = url_df
        else:
            urls_df = pd.concat([urls_df, url_df], axis=0)
        pbar.update(1)

100%|█████████████████████████████████████████████████████████████████████████████| 200/200 [05:12<00:00,  1.56s/pages]


In [6]:
urls_df_clean = urls_df.reset_index().drop("index", axis=1)
urls_df_clean["date"] = [
    d.replace("(", "").replace(")", "").replace(", ", "-").strip()
    for d in urls_df_clean.date
]
urls_df_clean["url"] = [
    "https://www.solomontimes.com" + url for url in urls_df_clean.url
]
urls_df_clean.to_csv(target_dir+"solomon_times_urls.csv", encoding="utf-8")

In [7]:
def extract_news_ss(url):
    content = load_page(url, 5)
    soup = BeautifulSoup(content)

    date = soup.find(class_="article-timestamp").find("span")["datetime"]
    text = soup.find(class_="article-body").text.strip()
    tags = soup.find(class_="tags").find_all("a")
    tags_text = "".join(t.text + "," if i < len(tags) - 1 else t.text
                        for i, t in enumerate(tags))

    return {"url": url, "date": date, "news": text, "tag": tags_text}

In [8]:
news_st = []
with tqdm(total=len(urls_df_clean)) as pbar:
    with ThreadPoolExecutor(max_workers=multiprocessing.cpu_count()+4) as executor:  
        futures = {executor.submit(extract_news_ss, url): url for url in urls_df_clean.url}
        for future in as_completed(futures):
            url = futures[future]
            try:
                news = future.result()
            except Exception as exc:
                print('%r generated an exception: %s' % (url, exc))
            else:
                news_st_dict.append(news)
                pbar.update(1)

100%|████████████████████████████████████████████████████████████████████████████| 11049/11049 [30:36<00:00,  6.02it/s]


In [14]:
st_news = pd.DataFrame(news_ss_dict)
st_news["date"] = pd.to_datetime(st_news["date"])
st_news.to_csv(target_dir + "solomon_times_news.csv", encoding="utf-8")
st_news.sample(10)

Unnamed: 0,url,date,news,tag
7052,https://www.solomontimes.com/news/pms-link-wit...,2017-01-19 22:19:00,The Parliamentary Opposition Group says text m...,"News,Politics"
8583,https://www.solomontimes.com/news/recent-arriv...,2020-09-17 00:58:00,"Nationals along with Chinese diplomats, techni...","News,Economy"
4624,https://www.solomontimes.com/news/urban-youth-...,2011-06-03 00:50:30,"JOINT PRESS RELEASE - [Suva, Fiji - 2nd June 2...","News,Regional"
4991,https://www.solomontimes.com/news/solomon-isla...,2011-11-29 21:09:30,The opening match of the Oceania Cup between N...,"News,Sports"
9234,https://www.solomontimes.com/news/government-c...,2021-06-23 21:35:00,Government officials have successfully complet...,"News,Economy"
4490,https://www.solomontimes.com/news/ashika-jury-...,2011-04-03 22:18:30,The jury for the case of the MV Princess Ashik...,"News,Regional"
8530,https://www.solomontimes.com/news/ofc-will-inv...,2020-08-04 22:06:00,OFC Will Invest an Unprecedented $43 million D...,"News,Sports"
10387,https://www.solomontimes.com/news/siaf-rsipf-p...,2022-08-10 04:05:00,"As part of their ongoing community engagement,...","News,Economy"
7645,https://www.solomontimes.com/news/leai-nets-41...,2019-08-22 04:07:00,Solomon Islands young football sensation Rapha...,"News,Sports"
171,https://www.solomontimes.com/news/sbd10-millio...,2007-06-24 23:07:30,Governor of Central Bank Rick Hou has praised ...,"News,Economy"
