**Purpose**: Scraping for Stand News articles from June 1, 2019 to Dec 30, 2019.

**Goal**: a set of csv files with all Stand News articles between the specified date range.

In [1]:
import bs4 as BeautifulSoup
from urllib.request import Request, urlopen
import re
import pandas as pd
import time
import datetime

In [2]:
stand_news_base_url = "https://www.thestandnews.com"
stand_news_folder = 'C:/Users/sunny/Desktop/news_nlp/stand_news_collection/'

In [3]:
# formatted current time

def current_datetime():
    return datetime.datetime.now().strftime("%Y/%m/%d - %H:%M:%S")

In [4]:
# read the html page to soup

def page2soup(url):
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    page = urlopen(req).read()
    soup = BeautifulSoup.BeautifulSoup(page, "html.parser")
    return soup

In [5]:
# extract content from the article url

def extract_content(url):
    content_text=''
    try:
        soup = page2soup(url)
        time.sleep(1)
        for content in soup.find_all("div", class_="article-content"):
            for p in content.find_all('p'):
                content_text = content_text + p.text    # combine all paragraphs to a single string
        return(content_text)
    except:
        print('Error777: Can not extract content: ' + url)
        return('Error777: Can not extract content.')    # standard error message that goes into the raw data
                                                        # will need to either back fill it or remove this article later

In [6]:
# return a list of articles
# each with its title, author, category, date, link and content
# from the archive page

def thestandnews_raw_article(url):
    article_ls=[]
    counter=0
    soup = page2soup(url)
    article_count=len(soup.find_all("div",class_=re.compile("^article-block")))
    print(str(article_count) + ' articles to be collected.')
    
    for a in soup.find_all("div",class_=re.compile("^article-block")):
        title = a.find(class_="article-title").text.strip()
        author = a.find(class_="author").text.strip()[1:]
        category = a.find(class_="category-title").text.strip()
        date = a.find(class_="date").text.strip()
        for url in a.find('h3'):
            link = stand_news_base_url + url.get('href')
        content=extract_content(link)
        #print(title,author,category,date,link,content)
        
        one_article=[title,author,category,date,link,content]
        article_ls.append(one_article)
        #counter +=1
        #print(str(counter) + ' out of ' + str(article_count) + ' collected.')
    
    return article_ls
    
    print(current_datetime() + ' : '+ str(len(article_ls)) + ' articles collected.')

In [7]:
# save articles to csv files

def export_to_csv(news_collection,date):
    df = pd.DataFrame(news_collection, columns=['Title', 'Author', 'Category', 'Date', 'Link', 'Content'])
    filename = stand_news_folder + 'news_' + date.strftime('%Y%m%d')+ '.csv'
    export_csv = df.to_csv(filename, index=None, header=True)
    print(current_datetime() + ' : Exported '+ date.strftime('%Y%m%d') +' news collection to csv.')

In [8]:
# loop through the archive
# collect all articles on each archive page
# then save the collection to a csv file

def archive_news_collection(start_date,end_date,archive_domain):

    print(current_datetime() + ' : Start extracting news from ' + start_date.strftime('%Y/%m/%d') + ' to ' + end_date.strftime('%Y/%m/%d') +'.')

    delta = datetime.timedelta(days=1)
    
    archive_date = start_date

    while archive_date <= end_date:
        news_collection = [] 
        try:
            print(current_datetime() + ' : Start extracting news on '+ archive_date.strftime('%Y/%m/%d') + '.')
            archive = archive_domain+ start_date.strftime('%Y%m%d')
            a_day_of_news = thestandnews_raw_article(archive)
            news_collection.extend(a_day_of_news)
            # print("finish one day--- %s seconds ---" % (time.time() - start_time))
            #print(current_datetime() + ' : Finished extracting news on '+ archive_date.strftime('%Y/%m/%d') + '.')
            export_to_csv(news_collection,archive_date)
            archive_date += delta
        except:
            print('run into error : ' + current_datetime())
            time.sleep(10)    # maybe blocked by host
            continue

In [9]:
# scrape for articles

start_date = datetime.date(2019,6,1)
end_date = datetime.date(2019,6,2)
#end_date = datetime.date(2019,12,30)

archive_domain = 'https://www.thestandnews.com/archive/'

archive_news_collection(start_date,end_date,archive_domain)

2021/03/25 - 02:32:42 : Start extracting news from 2019/06/01 to 2019/06/02.
2021/03/25 - 02:32:42 : Start extracting news on 2019/06/01.
29 articles to be collected.
2021/03/25 - 02:34:32 : Exported 20190601 news collection to csv.
2021/03/25 - 02:34:32 : Start extracting news on 2019/06/02.
29 articles to be collected.
2021/03/25 - 02:36:21 : Exported 20190602 news collection to csv.


Above shows an example on scraping for 2 days of worth of news.
The final dataset scraped all news from Jun 1, 2019 to Dec 30, 2020.