In [1]:
import os
# Move the working directory three levels up from wherever the kernel was
# started (presumably to the project root -- confirm). Later cells build
# their output paths from os.getcwd(), so this call decides where data lands.
# NOTE: the path is relative, so re-running this cell moves up another three
# levels; restart the kernel before running it again.
os.chdir("../../../")

import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup
import urllib
import urllib.request

from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

In [2]:
# Directory that receives the scraped CSV files, resolved against the current
# working directory.
checked_path = os.getcwd() + "/data/text/png"
# exist_ok=True makes the creation idempotent and removes the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(checked_path, exist_ok=True)

In [3]:
def load_page(url, timeout):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: Address handed to ``urllib.request.urlopen``.
        timeout: Timeout value passed through to ``urlopen``.

    Returns:
        Whatever ``read()`` yields for the opened response.
    """
    response = urllib.request.urlopen(url, timeout=timeout)
    # The context manager closes the connection once the body has been read.
    with response:
        return response.read()
def extract_info(url, timeout):
    """Collect the link, title and date of every entry on one listing page.

    Args:
        url: Address of the listing page to download.
        timeout: Timeout forwarded to ``load_page``.

    Returns:
        dict with the keys ``"url"``, ``"title"`` and ``"date"``. Each key maps
        to a list holding one item per collected entry; the three lists always
        have the same length. A missing title is recorded as
        ``"Missing Title."`` and a missing date as ``"Missing Date."``.
        Entries without any link carrying an ``href`` are left out.

    Note:
        Advances the module-level progress bar ``pbar`` once per
        ``entry-container`` element, so ``pbar`` must exist when this runs.
    """
    content = load_page(url, timeout)
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning, so the parsed tree
    # can differ from one machine to the next.
    soup = BeautifulSoup(content, "html.parser")
    info_dict = {"url": [], "title": [], "date": []}
    for entry in soup.find_all(class_="entry-container"):
        entry_title = entry.find(class_="entry-title")
        entry_date = entry.find(class_="entry-date")

        # Prefer the link inside the title element; otherwise fall back to the
        # first link anywhere in the entry.
        link = entry_title.find("a") if entry_title is not None else None
        if link is None:
            link = entry.find("a")
        href = link.get("href") if link is not None else None

        if href is not None:
            # Append the three fields together so the lists stay aligned even
            # when one field of an entry is missing.
            info_dict["url"].append(href)
            info_dict["title"].append(
                entry_title.text if entry_title is not None else "Missing Title."
            )
            info_dict["date"].append(
                entry_date.text if entry_date is not None else "Missing Date."
            )
        # An entry with no usable link is skipped rather than failing the
        # whole page, but it still counts towards the progress bar.
        pbar.update(1)
    return info_dict

In [4]:
# Crawl the listing pages of the national-news archive in parallel and keep
# each page's result dictionary in `output`.
host = "https://www.postcourier.com.pg/national-news/page/"
urls = []
for page_number in range(1, 1550):
    urls.append(host + str(page_number))
output = []
# The bar is sized at ten ticks per listing page; extract_info advances it
# once per entry it sees.
with tqdm(total=len(urls) * 10) as pbar, ThreadPoolExecutor(max_workers=5) as executor:
    future_to_url = {}
    for page_url in urls:
        future_to_url[executor.submit(extract_info, page_url, 10)] = page_url
    for future in as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
        except Exception as exc:
            # Report the failing page and carry on with the remaining ones.
            print('%r generated an exception: %s' % (url, exc))
            continue
        output.append(data)

100%|██████████████████████████████████████████████████████████████████████████▉| 15481/15490 [02:58<00:00, 86.73it/s]


In [5]:
# Combine the per-page dictionaries into one table of unique articles.
# Every frame is built first and concatenated once, instead of re-copying and
# re-deduplicating the accumulated table on each loop iteration.
page_frames = [
    frame
    for frame in (pd.DataFrame(out) for out in output)
    if not frame.empty
]
if page_frames:
    news_urls = (pd.concat(page_frames, axis=0)
                 .drop_duplicates()
                 .reset_index(drop=True))
else:
    # Nothing was scraped: keep the expected columns so that later cells
    # reading `news_urls.url` still work on the empty table.
    news_urls = pd.DataFrame(columns=["url", "title", "date"])

news_urls.to_csv(checked_path + "/post_courier_urls.csv", encoding="utf-8")
news_urls.head(5)

Unnamed: 0,url,title,date
0,https://www.postcourier.com.pg/law-students-go...,Law students go to court training,"July 13, 2023"
1,https://www.postcourier.com.pg/mou-signed-to-p...,MOU signed to promote quality nurse training i...,"July 13, 2023"
2,https://www.postcourier.com.pg/pm-backs-icac-s...,PM backs ICAC set up,"July 13, 2023"
3,https://www.postcourier.com.pg/22kg-marijuana-...,22kg marijuana bust,"July 13, 2023"
4,https://www.postcourier.com.pg/fresh-vegetable...,Fresh vegetables galore at Mt Hagen market,"July 13, 2023"


In [34]:
def extract_news(url, timeout):
    """Download one article page and pull out its body text and tag line.

    Args:
        url: Address of the article page.
        timeout: Timeout forwarded to ``load_page``.

    Returns:
        Tuple ``(url, raw_text, tag_text)``. ``raw_text`` is the text of the
        ``entry-content`` element; ``tag_text`` is the text of the
        ``tags-links`` element, or ``"No Tag"`` when the page has none.

    Raises:
        AttributeError: If the page has no ``entry-content`` element.

    Note:
        Advances the module-level progress bar ``pbar`` by one after a
        successful extraction, so ``pbar`` must exist when this runs.
    """
    content = load_page(url, timeout)
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning, so the parsed tree
    # can differ from one machine to the next.
    soup = BeautifulSoup(content, "html.parser")
    raw_text = soup.find(class_="entry-content").text
    tag = soup.find(class_="tags-links")
    tag_text = tag.text if tag is not None else "No Tag"
    pbar.update(1)
    return url, raw_text, tag_text

In [46]:
import multiprocessing
num_cpus = multiprocessing.cpu_count()
# Leave one CPU free, but never ask for fewer than one worker:
# ThreadPoolExecutor raises ValueError when max_workers <= 0, which is what
# `num_cpus - 1` evaluates to on a single-CPU machine.
max_workers = max(1, num_cpus - 1)

# Download every article listed in `news_urls` in parallel and collect
# (url, news, tag) tuples for the pages that could be parsed.
news_lst = []
with tqdm(total=len(news_urls)) as pbar:
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(extract_news, url, 10): url for url in news_urls.url}
        for future in as_completed(futures):
            url = futures[future]
            try:
                url, news, tag = future.result()
            except Exception as exc:
                # Report the failing article and carry on with the rest.
                print('%r generated an exception: %s' % (url, exc))
            else:
                news_lst.append((url, news, tag))

27999it [30:44, 14.25it/s]                                                                                            

In [47]:
# Turn the collected (url, news, tag) tuples into a table and write it to the
# output directory as CSV.
news_columns = ["url", "news", "tag"]
news_df = pd.DataFrame(data=news_lst, columns=news_columns)
news_csv_path = checked_path + "/post_courier_news.csv"
news_df.to_csv(news_csv_path, encoding="utf-8")