In [6]:
from selenium import webdriver
import time
import json
from requests_html import HTMLSession

In [7]:
# Listing pages to crawl, one per news category. The last path segment of each
# URL is what the crawl loop uses as the category name.
categories = [
    'https://vietnamnews.vn/politics-laws',
    'https://vietnamnews.vn/society',
    'https://vietnamnews.vn/economy',
    'https://vietnamnews.vn/sports',
    'https://vietnamnews.vn/environment',
]

# Individual names for each entry, in the same order as the list above.
politics_url, society_url, economy_url, sports_url, environment_url = categories

In [8]:
def get_article_information(article_url, session):
    """Fetch one article page and extract its body text, title and timestamp.

    Parameters
    ----------
    article_url : str
        URL of the article page to download.
    session : object
        Anything whose ``get(url)`` returns a response exposing the parsed
        page as ``.html`` with a ``find(selector, first=True)`` method
        (the notebook passes a ``requests_html.HTMLSession``).

    Returns
    -------
    tuple of (str, str, str)
        ``(text, title, article_time)`` -- note that the body text comes
        first. Each value has been passed through ``repr()``, so it is
        wrapped in quotes and tabs/newlines appear as escape sequences,
        which keeps every value on a single line of the tab-separated output.

    Raises
    ------
    ValueError
        If the page lacks any of the three expected elements. The message
        names the selector that was not found.
    """
    page = session.get(article_url).html

    def _text_of(selector):
        # find(..., first=True) gives None when nothing matches; fail with a
        # message that names the selector instead of an AttributeError on None.
        element = page.find(selector, first=True)
        if element is None:
            raise ValueError(f'No element matching {selector!r} found in article page')
        return element.text

    title = _text_of('.vnnews-tt-post')
    text = _text_of('.vnnews-text-post')
    article_time = _text_of('.vnnews-time-post')

    return repr(text), repr(title), repr(article_time)

In [9]:
def is_in_time(article_time, time_list):
    """Report whether ``article_time`` contains any entry of ``time_list``.

    Parameters
    ----------
    article_time : str
        Timestamp text to search in.
    time_list : iterable of str
        Substrings to look for (the crawl loop passes year strings).

    Returns
    -------
    bool
        True as soon as one entry occurs as a substring, False if none do
        (including when ``time_list`` is empty).
    """
    return any(marker in article_time for marker in time_list)

In [10]:
# Crawl every category listing page by page and write one tab-separated row
# (id, title, text, label) per accepted article to ../data/csv/vietnamnews.csv.

n_failed_articles = 0
n_successful_articles = 0

n_duplicate_articles = 0
n_invalid_times = 0
n_invalid_text = 0 # NULL or very short text

total_invalid_articles = 0

title_set = set()
# Distinct failure reasons, stored as repr() strings: exception instances hash
# by identity, so adding the instances themselves would never de-duplicate.
errors_set = set()
urls_set = set()

num_pages = 150
# An article is kept only if its timestamp text mentions one of these years.
valid_years = ['2017', '2018', '2019']
# Compared against the repr()'d text returned by get_article_information, so
# the surrounding quotes and any escape sequences count towards the length.
min_text_len = 20

session = HTMLSession()

# The with-block guarantees the output file is flushed and closed even if the
# crawl aborts part-way through.
with open('../data/csv/vietnamnews.csv', 'w', encoding='utf-8') as csv_file:
    csv_file.write('id\ttitle\ttext\tlabel\n')
    start_time = time.time()

    # label_id is not used below; the label column holds the category name.
    for label_id, category_url in enumerate(categories):
        category_name = category_url.split('/')[-1]
        print(f'... Crawling {category_name} ...')
        browser = webdriver.Chrome('./chromedriver')
        try:
            browser.get(category_url)

            for i in range(num_pages):
                try:
                    articles = browser.find_element_by_class_name('vnnews-list-news').find_elements_by_tag_name("li")
                    for article in articles:
                        url = article.find_element_by_tag_name('a').get_attribute('href')
                        urls_set.add(url)
                        try:
                            text, title, article_time = get_article_information(url, session)
                            in_time = is_in_time(article_time, valid_years)
                            long_enough = len(text) > min_text_len
                            is_duplicate = title in title_set

                            if in_time and not is_duplicate and long_enough:
                                title_set.add(title)
                                csv_file.write(f'{n_successful_articles}\t{title}\t{text}\t{category_name}\n')
                                n_successful_articles += 1
                            else:
                                # One rejected article can count towards several reasons.
                                if not in_time:
                                    n_invalid_times += 1
                                if not long_enough:
                                    n_invalid_text += 1
                                if is_duplicate:
                                    n_duplicate_articles += 1

                                total_invalid_articles += 1
                        except Exception as e:
                            # Best effort: a single bad article must not stop the crawl.
                            errors_set.add(repr(e))
                            n_failed_articles += 1

                    if (i+1) % 10 == 0:
                        print('='*50)
                        print("Crawled {} pages of {}".format(i+1, category_name))
                        print('Number of urls: ', len(urls_set))
                        print('Number of successful articles: ', n_successful_articles)
                        print('Number of duplicate articles: ', n_duplicate_articles)
                        print('Number of invalid time articles: ', n_invalid_times)
                        print('Number of invalid text articles: ', n_invalid_text)
                        print('Total invalid articles: ', total_invalid_articles)
                        print('Number of failed articles: ', n_failed_articles)
                        print('Fail reasons: ', errors_set)

                        elapsed_time = time.time() - start_time
                        print('Time elapsed: {}m {}s'.format(int(elapsed_time//60), int(elapsed_time%60)))
                        print('='*50)
                    # NOTE(review): the fifth pager link is presumably "next page" --
                    # confirm against the site's markup.
                    browser.find_element_by_class_name("vnnews-paging").find_elements_by_tag_name("a")[4].click()
                except Exception as e:
                    # Any listing/paging failure ends this category; move on to the next one.
                    print(e)
                    print('Terminate at {} pages'.format(i+1))
                    break
        finally:
            # quit() ends the WebDriver session (and its driver process); close()
            # would only close the current window and was skipped on errors.
            browser.quit()

elapsed_time = time.time() - start_time
print('Finish after: {}m {}s'.format(int(elapsed_time//60), int(elapsed_time%60)))

... Crawling politics-laws ...
Crawled 10 pages of politics-laws
Number of urls:  90
Number of successful articles:  83
Number of duplicate articles:  0
Number of invalid time articles:  7
Number of invalid text articles:  0
Total invalid articles:  7
Number of failed articles:  0
Fail reasons:  set()
Time elapsed: 7m 51s
Crawled 20 pages of politics-laws
Number of urls:  180
Number of successful articles:  173
Number of duplicate articles:  0
Number of invalid time articles:  7
Number of invalid text articles:  0
Total invalid articles:  7
Number of failed articles:  0
Fail reasons:  set()
Time elapsed: 17m 14s
Crawled 30 pages of politics-laws
Number of urls:  270
Number of successful articles:  261
Number of duplicate articles:  2
Number of invalid time articles:  7
Number of invalid text articles:  0
Total invalid articles:  9
Number of failed articles:  0
Fail reasons:  set()
Time elapsed: 25m 28s
Crawled 40 pages of politics-laws
Number of urls:  360
Number of successful articles