In [1]:
import os
os.chdir("../../../")

import urllib
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd

from scripts.python.scraper.utils import *
from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor
import multiprocessing
import httpx

In [2]:
target_dir = os.getcwd() + "/data/text/solomon_islands/"
if not os.path.exists(target_dir):
    os.mkdir(target_dir)

In [4]:
def main():
    params = {
        "title_entry": "blog-content wf-td",
        "title": "entry-title",
        "date": "entry-date"
    }

    host = "https://www.solomonstarnews.com/category/news/news-national/page/"
    urls = [host + str(i) for i in range(1, 1421)]
    max_workers = multiprocessing.cpu_count() - 1
    output = []


    with tqdm(total=len(urls), unit="pages") as pbar:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_url = {
                executor.submit(extract_news_info, url, params=params, timeout=5):
                url
                for url in urls
            }
            for future in as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    data = future.result()
                except Exception as exc:
                    print('%r generated an exception: %s' % (url, exc))
                else:
                    output.append(data)
                    pbar.update(1)

    return output

if __name__ == '__main__':
    output = main()

100%|████████████████████████████████████████████████████████████████████████████| 1420/1420 [22:00<00:00,  1.08pages/s]


In [5]:
output_df = pd.DataFrame()
for out in output:
    out_df = pd.DataFrame(out)
    if output_df.empty:
        output_df = out_df
    else:
        output_df = pd.concat([output_df, out_df], axis=0)

In [7]:
output_df = output_df.reset_index().drop("index", axis=1)
output_df["date"] = pd.to_datetime(output_df["date"])
output_df.sample(10)
output_df.to_csv(target_dir+"solomon_stars_urls.csv", encoding="utf-8")

In [17]:
def extract_news(url):
    content = load_page(url, timeout=5)
    soup = BeautifulSoup(content, "html.parser")
    try: 
        raw_text = soup.find(class_="entry-content").text
    except:
        raw_text = soup.find(class_="content").text
    return url, raw_text

In [18]:
news_lst = []
with tqdm(total=len(output_df)) as pbar:
    with ThreadPoolExecutor(max_workers=multiprocessing.cpu_count()+4) as executor:  
        futures = {executor.submit(extract_news, url): url for url in output_df.url}
        for future in as_completed(futures):
            url = futures[future]
            try:
                url, news = future.result()
            except Exception as exc:
                print('%r generated an exception: %s' % (url, exc))
            else:
                news_lst.append((url, news))
                pbar.update(1)

 53%|████████████████████████████████████████▍                                   | 7553/14200 [49:26<1:52:23,  1.01s/it]

'https://www.solomonstarnews.com/honiara-goes-under-flood-waters-again/' generated an exception: 'NoneType' object has no attribute 'text'


 53%|█████████████████████████████████████████▌                                    | 7561/14200 [49:30<40:16,  2.75it/s]

'https://www.solomonstarnews.com/in-court-with-assumpta-buchanan-d56/' generated an exception: 'NoneType' object has no attribute 'text'
'https://www.solomonstarnews.com/stop-spreading-false-rumour-sasako-told/' generated an exception: 'NoneType' object has no attribute 'text'


 53%|█████████████████████████████████████████▌                                    | 7570/14200 [49:35<47:32,  2.32it/s]

'https://www.solomonstarnews.com/russians-here-to-lobby-says-gov-t/' generated an exception: 'NoneType' object has no attribute 'text'


100%|██████████████████████████████████████████████████████████████████████████▉| 14196/14200 [1:46:43<00:01,  2.22it/s]


In [19]:
news_df = pd.DataFrame(news_lst, columns=["url", "news"])
display(news_df.head(5))
output_df = output_df.merge(news_df, how="left", on="url")

Unnamed: 0,url,news
0,https://www.solomonstarnews.com/taro-airport-u...,\nALL Solomon Airlines flights to Taro Airport...
1,https://www.solomonstarnews.com/uspsa-si-succe...,\nTHE University of the South Pacific Students...
2,https://www.solomonstarnews.com/bus-stabber-aw...,\nTHE young man who allegedly stole another ma...
3,https://www.solomonstarnews.com/last-minute-mou/,\nMalaita Province to sign ‘Peace’ MOU with Ch...
4,https://www.solomonstarnews.com/pm-highlights-...,"\nPRIME Minister, Manasseh Sogavare, has highl..."


In [30]:
import math
missed_idx = output_df[output_df.news.isna() == True].index.tolist()
for idx in missed_idx:
    url = output_df["url"][idx]
    _ , news = extract_news(url)
    if math.isnan(output_df.iloc[idx, -1]):
        output_df.iloc[idx, -1] = news

In [35]:
if len(output_df[output_df.news.isna() == True]) == 0:
    output_df.to_csv(target_dir+"solomon_stars_news.csv", encoding="utf-8")