In [1]:
import os
os.chdir("../../../")

import urllib
from urllib3.exceptions import ConnectionError
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd

from scripts.python.scraper.utils import *
from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor
import multiprocessing
from pycookiecheat import chrome_cookies
import time
import random

In [2]:
# Directory that receives every CSV written by this notebook.
target_dir = os.getcwd() + "/data/text/solomon_islands/"
# makedirs rather than mkdir: mkdir raises FileNotFoundError when a parent
# such as data/text/ does not exist yet. exist_ok keeps the cell re-runnable.
os.makedirs(target_dir, exist_ok=True)

## Solomon Star

In [None]:
def main():
    """Fetch every Solomon Star national-news listing page on a thread pool.

    Each listing URL is passed to ``extract_news_info`` together with the
    CSS class names in ``params`` and a 5 second timeout.
    NOTE(review): ``extract_news_info`` is not defined in this notebook;
    presumably it comes from ``scripts.python.scraper.utils`` - confirm.

    Returns:
        list: one ``extract_news_info`` result per page that did not raise,
        in completion order (not page order).
    """
    params = {
        "title_entry": "blog-content wf-td",
        "title": "entry-title",
        "date": "entry-date"
    }

    host = "https://www.solomonstarnews.com/category/news/news-national/page/"
    urls = [host + str(i) for i in range(1, 1421)]
    # cpu_count() - 1 is 0 on a single-core machine, and ThreadPoolExecutor
    # raises ValueError for max_workers <= 0, so clamp to at least one worker.
    max_workers = max(1, multiprocessing.cpu_count() - 1)
    output = []

    with tqdm(total=len(urls), unit="pages") as pbar:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_url = {
                executor.submit(extract_news_info, url, params=params, timeout=5):
                url
                for url in urls
            }
            for future in as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    data = future.result()
                except Exception as exc:
                    print('%r generated an exception: %s' % (url, exc))
                else:
                    output.append(data)
                # Count failed pages too, otherwise the bar never reaches 100%.
                pbar.update(1)

    return output

if __name__ == '__main__':
    output = main()

In [None]:
# Turn each per-page result into a frame and stack them with one concat.
# Concatenating inside the loop re-copies all accumulated rows every
# iteration (quadratic in the number of pages).
page_frames = [pd.DataFrame(out) for out in output]
# pd.concat raises on an empty list, so keep the original empty-frame result.
output_df = pd.concat(page_frames, axis=0) if page_frames else pd.DataFrame()

In [None]:
# Replace the repeated per-page indices with a clean 0..n-1 index.
output_df = output_df.reset_index(drop=True)
output_df["date"] = pd.to_datetime(output_df["date"])
output_df.to_csv(target_dir+"solomon_stars_urls.csv", encoding="utf-8")
# The preview must be the last expression for the notebook to render it.
# sample(10) raises ValueError on fewer than 10 rows, so cap the size.
output_df.sample(min(10, len(output_df)))

In [None]:
def extract_news(url):
    """Download one article page and return its body text.

    The text comes from the first element with class ``entry-content``;
    when the page has none, from the first element with class ``content``.

    Args:
        url: article URL, passed to ``load_page`` with a 5 second timeout.

    Returns:
        tuple: ``(url, raw_text)``.

    Raises:
        ValueError: the page contains neither container.
    """
    content = load_page(url, timeout=5)
    soup = BeautifulSoup(content, "html.parser")
    # Explicit None checks replace a bare ``except``, which also swallowed
    # KeyboardInterrupt and hid the reason when the fallback failed as well.
    body = soup.find(class_="entry-content")
    if body is None:
        body = soup.find(class_="content")
    if body is None:
        raise ValueError("no article body found at %r" % (url,))
    return url, body.text

In [None]:
# Fetch the article text for every listed URL on a thread pool.
# news_lst collects (url, text) pairs for the pages that succeeded.
news_lst = []
with tqdm(total=len(output_df)) as pbar:
    with ThreadPoolExecutor(max_workers=multiprocessing.cpu_count()+4) as executor:
        futures = {executor.submit(extract_news, url): url for url in output_df.url}
        for future in as_completed(futures):
            url = futures[future]
            try:
                url, news = future.result()
            except Exception as exc:
                print('%r generated an exception: %s' % (url, exc))
            else:
                news_lst.append((url, news))
            # Tick on failures as well so the bar finishes at 100%.
            pbar.update(1)

In [None]:
news_df = pd.DataFrame(news_lst, columns=["url", "news"])
display(news_df.head(5))
# Keep one text per URL: a URL listed twice is fetched twice, and duplicate
# keys on the right side of a merge multiply the matching rows.
news_df = news_df.drop_duplicates(subset="url")
# Drop a "news" column left by an earlier run of this cell; merging again
# would otherwise produce news_x / news_y and break `output_df.news`.
output_df = (
    output_df
    .drop(columns="news", errors="ignore")
    .merge(news_df, how="left", on="url")
)

In [None]:
import math  # no longer used in this cell; kept in case other cells rely on it

# Sequential second pass over the articles whose text is still missing.
missed_idx = output_df.index[output_df["news"].isna()].tolist()
for idx in missed_idx:
    # Address rows by label and the column by name. The original mixed a
    # label lookup with positional iloc[idx, -1], which silently depends on
    # "news" being the last column and on a 0..n-1 index.
    url = output_df.loc[idx, "url"]
    try:
        _, news = extract_news(url)
    except Exception as exc:
        # One unreachable article must not abort the rest of the retry pass.
        print('%r generated an exception: %s' % (url, exc))
        continue
    output_df.loc[idx, "news"] = news

In [None]:
# Write the merged table only when no article is missing its text;
# otherwise nothing is saved.
if not output_df["news"].isna().any():
    output_df.to_csv(target_dir+"solomon_stars_news.csv", encoding="utf-8")

## Solomon Times

In [None]:
host_url = "https://www.solomontimes.com/news/latest/"

# Monthly archive keys as (year, month): all of 2007-2022, then Jan-Aug 2023.
ym_lst = [(year, month) for year in range(2007, 2023) for month in range(1, 13)]
ym_lst += [(2023, month) for month in range(1, 9)]

# One archive page per key: <host_url><year>/<month>
ss_urls = [f"{host_url}{year}/{month}" for year, month in ym_lst]

In [None]:
def load_page(url, timeout):
    """Fetch ``url`` with an HTTP GET and return the raw response body.

    NOTE(review): this redefinition replaces any ``load_page`` brought in by
    the star import from ``scripts.python.scraper.utils`` - confirm intended.

    Args:
        url: address to request.
        timeout: seconds ``requests`` waits before giving up.

    Returns:
        bytes: ``Response.content``. The HTTP status code is not checked.
    """
    # Pass the timeout through: it was accepted but ignored, so a stalled
    # server could block a worker thread indefinitely.
    r = requests.get(url, timeout=timeout)
    return r.content

def extract_news_info_ss(url, params=None, timeout=5):
    """Collect URL, title and date for every article entry on a listing page.

    Args:
        url: listing page, downloaded via ``load_page``.
        params: dict with keys ``"title_entry"`` (CSS class of each entry
            container), ``"title"`` (passed to ``find`` as a tag name) and
            ``"date"`` (CSS class of the date element, or ``None`` when the
            page carries no dates). Defaults to the
            entry-container / entry-title / entry-date set.
        timeout: forwarded to ``load_page``.

    Returns:
        dict: ``{"url": [...], "title": [...], "date": [...]}`` with one item
        per entry. A missing title becomes ``"Missing Title."`` and a missing
        date becomes NaN.

    Raises:
        AttributeError: an entry contains no ``<a>`` element.
    """
    if params is None:
        # NOTE(review): "entry-title" is used below as a tag name, not a
        # class - looks unintended for the default set; confirm.
        params = {"title_entry": "entry-container",
                  "title": "entry-title",
                  "date": "entry-date"}
    content = load_page(url, timeout)
    soup = BeautifulSoup(content, "html.parser")
    info_dict = {"url": [], "title": [], "date": []}
    for a in soup.find_all(class_=params["title_entry"]):
        entry_title = a.find(params["title"])

        date_entry = None
        if params["date"] is not None:
            date_entry = a.find(class_=params["date"])
        # float("nan") replaces np.NaN: `np` is not imported in this notebook
        # and the NaN alias was removed in NumPy 2.0. An entry without a date
        # element now yields NaN instead of raising AttributeError.
        info_dict["date"].append(
            date_entry.text if date_entry is not None else float("nan"))

        info_dict["url"].append(a.find("a").attrs["href"])

        if entry_title is not None:
            info_dict["title"].append(entry_title.text)
        else:
            info_dict["title"].append("Missing Title.")
    return info_dict

In [None]:
params = {
    "title_entry": "article-list-item",
    "title": 'h2',
    "date": None
}

# Scrape the monthly archive pages one by one. Frames are collected in a
# list and concatenated once; concatenating inside the loop re-copies every
# accumulated row on each iteration.
month_frames = []
with tqdm(total=len(ss_urls), unit="pages") as pbar:
    for url, ym in zip(ss_urls, ym_lst):
        url_dict = extract_news_info_ss(url, params=params)
        url_df = pd.DataFrame(url_dict)
        # The archive page has no per-article date, so stamp each row with
        # the page's (year, month) key as a string.
        url_df["date"] = str(ym)
        month_frames.append(url_df)
        pbar.update(1)
# pd.concat raises on an empty list, so fall back to an empty frame.
urls_df = pd.concat(month_frames, axis=0) if month_frames else pd.DataFrame()

In [None]:
# Fresh 0..n-1 index; the old per-page index is discarded.
urls_df_clean = urls_df.reset_index().drop(columns="index")
# "(2007, 1)" -> "2007-1"
urls_df_clean["date"] = urls_df_clean["date"].map(
    lambda raw: raw.replace("(", "").replace(")", "").replace(", ", "-").strip()
)
# The scraped hrefs are site-relative; prefix the host to make them absolute.
urls_df_clean["url"] = urls_df_clean["url"].map(
    lambda href: "https://www.solomontimes.com" + href
)
urls_df_clean.to_csv(target_dir+"solomon_times_urls.csv", encoding="utf-8")

In [None]:
def extract_news_ss(url):
    """Download one Solomon Times article and pull out date, body and tags.

    Args:
        url: article URL, fetched via ``load_page`` with a 5 second timeout.

    Returns:
        dict: ``{"url", "date", "news", "tag"}``. ``date`` is the ``datetime``
        attribute of the timestamp ``<span>``, ``news`` the stripped body
        text, and ``tag`` the tag names joined by commas (``""`` when the
        article has no tags).

    Raises:
        AttributeError / TypeError / KeyError: the timestamp or body element
        is missing from the page.
    """
    content = load_page(url, 5)
    # Name the parser explicitly: without it bs4 warns and picks whichever
    # parser is installed, so results can differ between machines.
    soup = BeautifulSoup(content, "html.parser")

    date = soup.find(class_="article-timestamp").find("span")["datetime"]
    text = soup.find(class_="article-body").text.strip()
    # An untagged article has no "tags" element; treat that as an empty list
    # rather than losing the whole article to AttributeError.
    tag_box = soup.find(class_="tags")
    tags = tag_box.find_all("a") if tag_box is not None else []
    tags_text = ",".join(t.text for t in tags)

    return {"url": url, "date": date, "news": text, "tag": tags_text}

In [None]:
# One dict per successfully scraped article. The list is named
# `news_ss_dict` because that is the name the DataFrame construction after
# this cell reads. The original defined `news_st` but appended to the
# undefined `news_st_dict`, raising NameError on the first success.
news_ss_dict = []
news_st = news_ss_dict  # original variable name, kept as an alias
with tqdm(total=len(urls_df_clean)) as pbar:
    with ThreadPoolExecutor(max_workers=multiprocessing.cpu_count()+4) as executor:
        futures = {executor.submit(extract_news_ss, url): url for url in urls_df_clean.url}
        for future in as_completed(futures):
            url = futures[future]
            try:
                news = future.result()
            except Exception as exc:
                print('%r generated an exception: %s' % (url, exc))
            else:
                news_ss_dict.append(news)
            # Tick on failures as well so the bar finishes at 100%.
            pbar.update(1)

In [None]:
# Assemble the scraped articles into a table and parse the timestamp column.
st_news = pd.DataFrame(news_ss_dict).assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)
st_news.to_csv(target_dir + "solomon_times_news.csv", encoding="utf-8")
# Random preview of ten rows.
st_news.sample(10)

## The Island Sun
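<b>Note: this site is behind Cloudflare. Requests only succeed when they send the cookies of a [Google Chrome](chrome://settings/content/siteDetails?site=https%3A%2F%2Ftheislandsun.com.sb%2F) session that has already passed the Cloudflare check, together with browser-like headers.</b>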

In [3]:
host_url = 'https://theislandsun.com.sb/'
# Chrome cookie database to read. The default is the original author's
# profile; set CHROME_COOKIES_PATH to run the notebook on another machine.
stored_cookies = os.environ.get(
    "CHROME_COOKIES_PATH",
    "/Users/czhang/Library/Application Support/Google/Chrome/Profile 7/Cookies",
)


def configure_cf(host_url, cookies_path):
    """Return the headers and cookies used to request the Cloudflare-fronted site.

    Args:
        host_url: site URL handed to ``chrome_cookies``.
        cookies_path: cookie file path handed to ``chrome_cookies``.

    Returns:
        tuple: ``(headers, cookies)`` - a fixed dict of browser-like request
        headers and whatever ``chrome_cookies`` returns for the site.
    """
    cookies = chrome_cookies(host_url, cookies_path)

    # Header set copied from a desktop Chrome 116 session on macOS.
    header_pairs = [
        ("User-Agent",
         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"),
        ("sec-ch-ua-platform", "MacOS"),
        ("upgrade-insecure-requests", "1"),
        ("dnt", "1"),
        ("sec-ch-ua",
         '"Not.A/Brand";v="24", "Chromium";v="116", "Google Chrome";v="116"'),
        ("Accept",
         "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"),
        ("cf-ray", "7fc606513b832000-iad"),
    ]
    headers = dict(header_pairs)

    return headers, cookies


def load_page_cf(url, headers, cookies, timeout=30):
    """GET ``url`` with the given headers and cookies and return the body.

    Args:
        url: page to request.
        headers: request headers dict.
        cookies: cookies sent with the request.
        timeout: seconds ``requests`` waits before giving up. New optional
            argument; without a timeout a stalled connection blocks its
            worker thread forever.

    Returns:
        bytes: response body when the status code is 200.

    Raises:
        ConnectionError: the server answered with any status other than 200.
    """
    r = requests.get(url, cookies=cookies, headers=headers, timeout=timeout)
    if r.status_code != 200:
        # Include URL and status so the caller's log line says what failed.
        raise ConnectionError("GET %s returned HTTP %s" % (url, r.status_code))
    return r.content


def extract_url_cf(content):
    """Parse a listing page into parallel lists of title, URL, date and tag.

    Args:
        content: HTML of one listing page.

    Returns:
        dict: ``{"title": [...], "url": [...], "date": [...], "tag": [...]}``
        with one item per ``item-details`` element that has a headline link.
        ``date`` / ``tag`` are ``None`` when the element lacks them.
    """
    news_info_dict = {"title": [], "url": [], "date": [], "tag": []}
    # Name the parser explicitly so results do not depend on which optional
    # parser happens to be installed.
    soup = BeautifulSoup(content, "html.parser")
    for ele in soup.find_all(class_="item-details"):
        title_entry = ele.find(class_="entry-title td-module-title")
        url_entry = title_entry.find("a") if title_entry is not None else None
        # The original guard `ele is not None` was always true, so a missing
        # sub-element raised AttributeError and discarded the whole page.
        # An item without a headline link has nothing to scrape: skip it.
        if url_entry is None or not url_entry.get("href"):
            continue
        tag_entry = ele.find(class_="td-post-category")
        date_entry = ele.find(class_="td-post-date")

        news_info_dict["title"].append(title_entry.text)
        news_info_dict["url"].append(url_entry["href"])
        news_info_dict["date"].append(
            date_entry.text if date_entry is not None else None)
        news_info_dict["tag"].append(
            tag_entry.text if tag_entry is not None else None)
    return news_info_dict


def scrape_cf(url, headers=None, cookies=None):
    """Download one listing page and extract its article entries.

    Args:
        url: listing page URL.
        headers: optional request headers. When omitted they are built by
            ``configure_cf``.
        cookies: optional cookies. When omitted they are read by
            ``configure_cf``.

    Returns:
        dict: the result of ``extract_url_cf`` for the downloaded page.

    Passing both ``headers`` and ``cookies`` lets a caller configure once and
    reuse the result, instead of invoking ``configure_cf`` for every page.
    """
    if headers is None or cookies is None:
        host_url = 'https://theislandsun.com.sb/'
        stored_cookies = "/Users/czhang/Library/Application Support/Google/Chrome/Profile 7/Cookies"
        default_headers, default_cookies = configure_cf(host_url, stored_cookies)
        if headers is None:
            headers = default_headers
        if cookies is None:
            cookies = default_cookies
    content = load_page_cf(url, headers=headers, cookies=cookies)
    return extract_url_cf(content)


def extract_text(url, cookies=None, headers=None):
    """Download one article and return the text of its paragraphs.

    Args:
        url: article URL.
        cookies: optional cookies. When omitted they come from
            ``configure_cf(host_url, stored_cookies)`` (module-level names).
        headers: optional request headers. When omitted they come from the
            same ``configure_cf`` call. New optional argument: supplying both
            avoids calling ``configure_cf`` for every article.

    Returns:
        dict: ``{"url": url, "news": text}`` where ``text`` is the content of
        every ``<p>`` inside the post container, concatenated with no
        separator.

    Raises:
        ValueError: the post container is missing from the page.
    """
    if headers is None or cookies is None:
        default_headers, default_cookies = configure_cf(host_url, stored_cookies)
        if headers is None:
            headers = default_headers
        if cookies is None:
            cookies = default_cookies
    content = load_page_cf(url, headers=headers, cookies=cookies)
    # Explicit parser so the output does not vary with installed parsers.
    soup = BeautifulSoup(content, "html.parser")
    p_container = soup.find(class_="td-post-content tagdiv-type")
    if p_container is None:
        # e.g. a challenge page or changed layout: fail with a clear reason
        # instead of "'NoneType' object has no attribute 'find_all'".
        raise ValueError("no post content container found at %r" % (url,))
    text = "".join(p.text for p in p_container.find_all("p"))
    return {"url": url, "news": text}

In [None]:
# Listing pages 1..902 of the news category.
page_urls = [
    host_url + "category/news/page/" + str(num) for num in range(1, 903)
]
# One extract_url_cf-style dict per page that was scraped successfully.
output = []
with tqdm(total=len(page_urls), unit="pages") as pbar:
    with ThreadPoolExecutor(max_workers=multiprocessing.cpu_count()+4) as executor:
        future_to_url = {
            executor.submit(scrape_cf, url):
            url
            for url in page_urls
        }
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
            except Exception as exc:
                print('%r generated an exception: %s' % (url, exc))
            else:
                output.append(data)
            # Tick on failures as well so the bar finishes at 100%.
            pbar.update(1)

In [None]:
# Stack the per-page dicts into one frame with a single concat; growing the
# frame inside the loop re-copies all accumulated rows on every iteration.
listing_frames = [pd.DataFrame(i) for i in output]
# pd.concat raises on an empty list, so fall back to an empty frame.
news_url_df = pd.concat(listing_frames, axis=0) if listing_frames else pd.DataFrame()

In [None]:
# Parse the dates, order oldest-first and renumber the rows.
news_url_df = (
    news_url_df
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .sort_values(by="date", ascending=True)
    .reset_index(drop=True)
)
news_url_df.to_csv(target_dir + "island_sun_urls.csv", encoding="utf-8")
# Random preview of five rows.
news_url_df.sample(5)

In [None]:
# from cryptography.hazmat.primitives.ciphers import Cipher, modes
# from cryptography.hazmat.primitives.ciphers.algorithms import AES
# from cryptography.hazmat.primitives.hashes import SHA1
# from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
# import sqlite3

# conn = sqlite3.connect(stored_cookies)
# cookies = conn.execute("SELECT * FROM cookies;").fetchall()

# coi = []
# for item in cookies:
#     if "island" in item[1]:
#         coi.append(item)
        
# conn.close()


# kdf = PBKDF2HMAC(algorithm=SHA1(), salt=b'saltysalt', iterations=1003, length=16)
# key = kdf.derive()
# def chrome_decrypt(encrypted_value: bytes,
#                    key: bytes,
#                    init_vector=b" " * 16) -> str:

#     encrypted_value = encrypted_value[3:]

#     cipher = Cipher(
#         algorithm=AES(key),
#         mode=modes.CBC(init_vector),
#     )
#     decryptor = cipher.decryptor()
#     decrypted = decryptor.update(encrypted_value) + decryptor.finalize()

#     return clean(decrypted)
# cipher = Cipher(
#     algorithm=AES(key),
#     mode=modes.CBC( b" " * 16),
# )
# decryptor = cipher.decryptor() 
# decrypted = decryptor.update(coi[0][5][3:]) + decryptor.finalize()
# def clean(decrypted: bytes) -> str:
#     last = decrypted[-1]
#     if isinstance(last, int):
#         return decrypted[:-last].decode("utf-8")
#     return decrypted[: -ord(last)].decode("utf-8")

# clean(decrypted)

In [21]:
# Reload the saved Island Sun articles. "Unnamed: 0" is the index column
# that to_csv wrote on the previous save, so it is dropped after reading.
# NOTE(review): no cell in this notebook creates island_sun_news.csv for the
# first time - confirm where the initial file comes from.
news_df = pd.read_csv(target_dir + "island_sun_news.csv").drop(columns=["Unnamed: 0"])

In [34]:
host_url = 'https://theislandsun.com.sb/'
# Default is the original author's Chrome profile; override with
# CHROME_COOKIES_PATH on other machines.
stored_cookies = os.environ.get(
    "CHROME_COOKIES_PATH",
    "/Users/czhang/Library/Application Support/Google/Chrome/Profile 7/Cookies",
)

headers, cookies = configure_cf(host_url, stored_cookies)

# Session cookies such as cf_clearance and __cf_bm are short-lived
# credentials and must not be pasted into the notebook source. To override
# what Chrome's cookie store provides, copy the browser's "Cookie" request
# header into ISLAND_SUN_COOKIE_HEADER ("name=value; name2=value2").
cookie_header = os.environ.get("ISLAND_SUN_COOKIE_HEADER", "")
cookies.update(
    (name.strip(), value)
    # Split on the first "=" only: cookie values may themselves contain "=".
    for name, sep, value in (
        pair.strip().partition("=") for pair in cookie_header.split(";")
    )
    if sep and name.strip()
)

In [38]:
# Retry pass: fetch text only for the articles that still have none.
news_urls = news_df.loc[news_df["news"].isna(), "url"].tolist()
with tqdm(total=len(news_urls), unit="pages") as pbar:
    with ThreadPoolExecutor(max_workers=multiprocessing.cpu_count()+4) as executor:
        future_to_url = {
            executor.submit(extract_text, url, cookies=cookies):
            url
            for url in news_urls
        }
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
            except Exception as exc:
                print('%r generated an exception: %s' % (url, exc))
            else:
                # Results are written back here, in the thread consuming
                # as_completed, not inside the worker threads.
                news_df.loc[news_df.url == url, "news"] = data["news"]
            # Tick on failures as well so the bar finishes at 100%.
            pbar.update(1)

100%|█████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.81s/pages]


In [41]:
# Persist the updated table. The index is written too, which is what shows
# up as the "Unnamed: 0" column when the file is read back.
news_df.to_csv(path_or_buf=target_dir + "island_sun_news.csv", encoding="utf-8")