In [1]:
import requests
from bs4 import BeautifulSoup
import time
from functools import reduce
from tqdm import tqdm
import json
from datetime import datetime
import csv
import unicodedata
import threading

### Object classes
Classes that define an Article object.

In [2]:
class Article:
  """
  A news article together with its metadata and user comments.

  Parameters:
    id: article identifier (concatenated with ``source`` for hashing).
    source: name of the news source.
    title, date, author, excerpt, content, url, category: optional metadata;
      ``date`` is expected to offer ``isoformat()`` when it is set.
    tags: optional list of tags; a new empty list is created when omitted.
    comments: optional list of Comment objects; a new empty list is created
      when omitted.
    likes: optional like count.
  """

  def __init__(self, id, source, title=None, date=None, tags=None, author=None,
               excerpt=None, content=None, url=None, comments=None,
               category=None, likes=None):

    # NOTE: the assignment order below defines the key order of to_dict(),
    # so it must stay stable.
    self.id = id
    self.title = title
    self.date = date
    self.source = source
    self.author = author
    self.excerpt = excerpt
    self.content = content
    self.url = url
    # A fresh list per instance: a mutable default argument would be shared
    # by every Article created without explicit comments/tags.
    self.comments = comments if comments is not None else []  # List of Comment objects
    self.tags = tags if tags is not None else []
    self.category = category
    self.likes = likes  # Like count

  def __str__(self):
    """Return a multi-line summary: the metadata, then the content."""
    return ("ID: {}\nTitle: {}\nExcerpt: {}\nDate: {}\nAuthor: {}\n" \
        + "Source: {}\nCategory: {}\nURL: {}\nTags: {}\n" \
        + "Likes: {}\nComments: {}\n-----\n{}") \
        .format(self.id, self.title, self.excerpt, self.date, self.author,
                self.source, self.category, self.url, self.tags, self.likes,
                len(self.comments), self.content)

  def __hash__(self):
    """Hash on the concatenation of source and id."""
    return hash(self.source + self.id)

  def to_dict(self):
    """
    Return a copy of the instance attributes as a dictionary, with the date
    converted to an ISO-8601 string (or None) and every comment converted
    through its own to_dict().
    """

    date = self.date.isoformat() if self.date else None

    comments = [comment.to_dict() for comment in self.comments]

    _dict = self.__dict__.copy()

    _dict.update(date=date, comments=comments)

    return _dict

In [3]:
class Comment:
  """
  User comment in an article.

  Parameters:
    id: comment identifier (used for hashing).
    author, content: optional author name and comment text.
    date: optional date; expected to offer ``isoformat()`` when it is set.
    replies: optional list of Comment objects; a new empty list is created
      when omitted.
    likes: optional like count.
  """

  def __init__(self, id, author=None, content=None, date=None, replies=None,
               likes=None):
    # NOTE: the assignment order below defines the key order of to_dict().
    self.id = id
    self.author = author
    self.content = content
    self.date = date
    # A fresh list per instance: a mutable default argument would be shared
    # by every Comment created without explicit replies.
    self.replies = replies if replies is not None else []  # list of Comment objects
    self.likes = likes

  def __str__(self):
    """Return a multi-line summary of the comment."""
    return \
      "ID: {}\nAuthor: {}\nContent: {}\nLikes: {}\nDate: {}\nReplies: {}" \
        .format(self.id, self.author, self.content, self.likes,
                self.date, len(self.replies))

  def __hash__(self):
    """Hash on the comment id."""
    return hash(self.id)

  def __getitem__(self, key):
    """Index into the replies of this comment."""
    return self.replies[key]

  def to_dict(self):
    """
    Return a copy of the instance attributes as a dictionary, with the date
    converted to an ISO-8601 string (or None) and the replies converted
    recursively through to_dict().
    """

    replies = [reply.to_dict() for reply in self.replies]

    date = self.date.isoformat() if self.date else None

    _dict = self.__dict__.copy()

    _dict.update(date=date, replies=replies)

    return _dict

### Crawl data
Crawl article (title, content, tags...) and its user comments.
Skip this section if only loading data from file.

#### Crawler class

In [4]:
class TuoiTreCrawler:
  """
  Crawl articles from TuoiTre news. Support crawling articles from
  sub-categories (e.g "the-thao" as in "https://tuoitre.vn/the-thao.htm").
  """

  BASE_URL = "https://tuoitre.vn"
  API_URL = "https://id.tuoitre.vn/api"
  LIKE_COUNT_URL = "https://s1.tuoitre.vn/count-object.htm"
  SOURCE_NAME = "Tuổi Trẻ"

  class Category:
    """
    Sub-categories ids extracted from the URL
    (e.g. https://tuoitre.vn/timeline/11/trang-12.htm).
    Not a comprehensive list. Add more as needed.

    To find the id of a category, find the value with the key "category_id"
    in the HTML source. Or monitor network traffic when "See more" is pressed.
    """

    MOI_NHAT = 0
    THE_GIOI = 2
    THOI_SU = 3
    SUC_KHOE = 12
    VAN_HOA = 200017
    CONG_NGHE = 200029
    THE_THAO = 1209
    GIAO_DUC = 13

  def __init__(self, category: Category=None, crawl_comment=True, delay=0.5,
               skip_these=None, newer_only=False):
    """
    Parameters:
      category: category id used to build the timeline URLs (see Category).
      crawl_comment: whether crawl_articles() also fetches user comments.
      delay: seconds to sleep after each request.
      skip_these: set of article ids that must not be crawled again;
        a new empty set is created when omitted.
      newer_only: stop collecting URLs at the first article id that is
        already in ``skip_these``.
    """

    self.category = category
    self.crawl_comment = crawl_comment

    # A set of article ids to skip crawling.
    # Can be used to enlarge the existing data (skip the ones that were
    # already crawled). Created per instance so crawlers never share one
    # default set.
    self.skip_these = skip_these if skip_these is not None else set()
    # Only get the articles newer than the existing ones in self.skip_these
    # regardless of the limit.
    self.newer_only = newer_only

    # Amount of delay in seconds after each request
    # (to avoid overloading the server).
    self.delay = delay

  def get_page_url(self, cursor: int):
    """
    Return the URL of the newspaper index page for the given cursor
    (page number). Requires ``self.category`` to be set.
    """

    assert self.category is not None

    return TuoiTreCrawler.BASE_URL + "/timeline/{}/trang-{}.htm" \
                                          .format(self.category, cursor)

  def get_id(self, url: str):
    """
    Return the article ID given the URL: the part after the last "-" of the
    last path segment, without the file extension.
    """

    return url.split("/")[-1].split(".")[0].split("-")[-1]

  def normalize_unicode(self, unicode_str):
    """
    Normalize unicode string (e.g. remove \xa0 characters).
    """
    return unicodedata.normalize("NFKC", unicode_str)

  def find_article_urls(self, url: str, limit=float("inf")):
    """
    Find articles in an index page given its URL.

    Return a tuple ``(article_urls, stop)``: a set of article URLs that are
    not in ``self.skip_these`` (at most ``limit`` of them; a falsy limit
    disables the cap) and a flag telling the caller to stop paging.
    """

    print("Getting article URLs from", url)
    response = requests.get(url)
    response_soup = BeautifulSoup(response.text, 'html.parser')

    news_items = response_soup.find_all("li", class_="news-item")

    article_urls = set()

    # Signal the caller to stop. A page without any news item means there is
    # nothing left to page through; without this the caller would request
    # empty pages forever when fewer than `limit` articles exist.
    stop = len(news_items) == 0

    for item in news_items:

      if limit and limit <= len(article_urls):
        break

      a_tag = item.find("a", recursive=False)
      # Skip items without a direct link instead of aborting the whole crawl.
      if a_tag is None or not a_tag.get("href"):
        continue

      article_url = TuoiTreCrawler.BASE_URL + a_tag["href"]

      if self.get_id(article_url) not in self.skip_these:
        article_urls.add(article_url)

      elif self.newer_only:
        # Reached an article that is already known: everything after it is
        # not newer than the existing data.
        stop = True
        break

    return article_urls, stop

  def crawl_article_urls(self, limit=15):
    """
    Try to find as many article URLs as possible, up to ``limit``, by paging
    through the category timeline.
    Return a set of article URLs (whatever was collected if an error occurs).
    """

    article_urls = set()
    cursor = 1

    try:
      while len(article_urls) < limit:

        page_url = self.get_page_url(cursor)

        new_urls, stop = self.find_article_urls(page_url, limit - len(article_urls))
        article_urls.update(new_urls)

        if stop:
          break

        print("Found", len(article_urls), "/", limit, "article URLs")

        cursor += 1

        time.sleep(self.delay)
    except Exception as e:
      # Best effort: report the problem and return the URLs found so far.
      print("\nError while getting article URLs:", e, "\n")

    return article_urls

  def get_likes_count(self, id, likes=None):
    """
    Get the number of likes an article has received given its id.

    ``likes`` is an optional one-element list that also receives the count,
    which lets a caller read the result after running this in a thread.
    Return the like count, or None when it could not be retrieved.
    """

    count = None

    try:

      url = TuoiTreCrawler.LIKE_COUNT_URL + "?newsId=" + id

      response = requests.get(url, headers={"Origin": "https://tuoitre.vn"})

      count = int(response.text)

      if likes is not None:
        likes[0] = count

    except Exception as e:
      print("\nError while getting likes for article with id", id, ":", e, "\n")

    # Return the parsed value only; re-reading the response here would raise
    # when the request itself failed.
    return count

  def get_article(self, url: str):
    """
    Get an article given its URL. The like count is fetched in a parallel
    thread while the article page is downloaded and parsed.
    Return an Article object, or None when the page could not be parsed.
    """

    id = self.get_id(url)

    likes = [None]  # A list makes it easier to pass data across threads

    get_like_thread = threading.Thread(target=self.get_likes_count, args=[id, likes])
    get_like_thread.start()

    source = TuoiTreCrawler.SOURCE_NAME

    try:
      response = requests.get(url)
      response_soup = BeautifulSoup(response.text, 'html.parser')

      try:
        title = response_soup.find("meta", {"property": "og:title"})["content"]
        author = response_soup.find("meta", {"name": "author"})["content"]
        excerpt = response_soup.find("meta", {"name": "description"})["content"]
        category = response_soup.find("meta", {"property": "article:section"})["content"]

        tags = response_soup.find("meta", {"name": "keywords"})["content"]
        if tags:
          tags = tags.split(",")

        # Format 2021-11-13T13:02:00+07:00
        date = response_soup.find("meta", {"name": "pubdate"})["content"]
        date = date.split("+")[0] + "+0700"
        date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S%z")

        paragraphs = response_soup.find(id="main-detail-body") \
          .find_all("p", recursive=False)
        # Can be an empty string because some articles
        # might have no textual content (e.g. video, infographic)
        content = "".join(p.get_text().strip() + "\n" for p in paragraphs)
        content = self.normalize_unicode(content)
      except Exception as e:
        print("\nError while getting article with id", id, ":", e)
        print("The article could be an unexpected format (Infographic, video...).")
        return None
    finally:
      # Always wait for the like-count thread, also on the failure paths.
      get_like_thread.join()

    return Article(id, source, title, date, tags, author, excerpt, content,
                   url, category=category, likes=likes[0])

  def json_to_comment(self, json_comment: dict):
    """
    Convert a JSON comment (as returned by the comment API) to a Comment
    object, converting its child comments recursively.
    """

    id = json_comment["id"]
    author = json_comment["sender_fullname"]
    content = BeautifulSoup(json_comment["content"], "html.parser").text
    likes = int(json_comment["likes"])

    # E.g. 2021-11-12T09:32:32
    str_date = json_comment["created_date"]
    date = datetime.strptime(str_date, "%Y-%m-%dT%H:%M:%S")

    replies = []
    if json_comment["child_comments"] is not None:
      replies = [self.json_to_comment(reply)
                    for reply in json_comment["child_comments"]]

    return Comment(id, author, content, date, replies, likes)

  def _crawl_comments(self, id, cursor=1, limit=20):
    """
    Get one page of comments of an article given its ID.

    ``cursor`` is the page index and ``limit`` the page size sent to the API.
    Return a list of Comment objects.
    Raise AssertionError when the API does not report success.
    """

    api_url = self.API_URL + "/getlist-comment.api?"

    url = api_url + "objId={}&pageindex={}&pagesize={}&objType=1&sort=1" \
                          .format(id, cursor, limit)

    response = requests.get(url)
    response_json = response.json()

    # Raised explicitly (rather than with an `assert` statement) so the
    # check is not stripped when Python runs with -O.
    if response_json["Success"] != True:
      raise AssertionError(
          "Comment API reported failure for article id {}".format(id))

    data = json.loads(response_json["Data"])

    return [self.json_to_comment(comment) for comment in data]

  def crawl_comments(self, id, limit=float("inf"), thread_return=None):
    """
    Get comments of an article given its ID, paging until an empty page is
    returned, the API reports failure, or ``limit`` comments were collected.

    ``thread_return`` is an optional one-element list that also receives the
    result, which lets a caller read it after running this in a thread.
    Return a list of at most ``limit`` Comment objects.
    """

    comments = []
    cursor = 1

    try:
      while len(comments) < limit:
        new_comments = self._crawl_comments(id, cursor)

        time.sleep(self.delay)

        # Reached the end of the comments
        if len(new_comments) == 0:
          break

        comments += new_comments
        cursor += 1

    except AssertionError:
      # No more comments
      pass

    except Exception as e:
      # Best effort: keep whatever was collected so far.
      print("\nError while getting crawling comments of post id", id, ":", e, "\n")

    # Pages are fetched whole, so the last page may overshoot the limit.
    if len(comments) > limit:
      comments = comments[:int(limit)]

    if thread_return is not None:
      thread_return[0] = comments

    return comments

  def crawl_articles(self, limit):
    """
    Crawl up to ``limit`` articles (and their comments when
    ``self.crawl_comment`` is set).
    Return a list of Article objects.
    """

    articles = []
    article_urls = self.crawl_article_urls(limit)
    loss = 0

    print("Getting", len(article_urls), "articles...")
    try:
      for url in tqdm(article_urls, mininterval=0.5):

        get_comments_thread = None
        comments = [None] # A list makes it easier to pass data across threads

        if self.crawl_comment:

          id = self.get_id(url)
          get_comments_thread = threading.Thread(target=self.crawl_comments,
                                  kwargs={'id': id, 'thread_return': comments})
          get_comments_thread.start()

        try:
          a = self.get_article(url)
        finally:
          # Wait for the comment thread even when the article failed, so no
          # worker is left running behind the loop.
          if get_comments_thread:
            get_comments_thread.join()

        if a:
          articles.append(a)
          if get_comments_thread:
            a.comments = comments[0] or []

        else:
          loss += 1

        time.sleep(self.delay)

    except Exception as e:
      print("Error while getting articles:", e)

    # Report against the `limit` argument, not a notebook-level global.
    print("\nSuccess:", len(articles), "/", limit, "\tLoss:", loss)

    return articles

#### Config

In [5]:
# Mount Google Drive at /content/drive (uses the Colab-specific
# `google.colab` module).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Path of the CSV file used to store and load the crawled articles
FILE_PATH = "/content/drive/MyDrive/data/suc_khoe_articles.csv" #@param {type:"string"}

In [7]:
# Maximum number of articles to crawl
LIMIT =  1000#@param {type:"integer"}

# Delay in seconds between GET requests (to avoid overloading the news server)
DELAY = 0.1 #@param {type:"number"}

# Category whose timeline is crawled
CATEGORY = TuoiTreCrawler.Category.SUC_KHOE

#### Start crawling articles
Start crawling from scratch.

**To enlarge an existing dataset instead, refer to the section "Enlarge the existing data" below.**

In [8]:
# Crawl up to LIMIT articles of CATEGORY, including their comments.
crawler = TuoiTreCrawler(category=CATEGORY,
                         delay=DELAY, crawl_comment=True)

articles = crawler.crawl_articles(limit=LIMIT)

Getting article URLs from https://tuoitre.vn/timeline/12/trang-1.htm
Found 15 / 1000 article URLs
Getting article URLs from https://tuoitre.vn/timeline/12/trang-2.htm
Found 23 / 1000 article URLs
Getting article URLs from https://tuoitre.vn/timeline/12/trang-3.htm
Found 38 / 1000 article URLs
Getting article URLs from https://tuoitre.vn/timeline/12/trang-4.htm
Found 53 / 1000 article URLs
Getting article URLs from https://tuoitre.vn/timeline/12/trang-5.htm
Found 68 / 1000 article URLs
Getting article URLs from https://tuoitre.vn/timeline/12/trang-6.htm
Found 83 / 1000 article URLs
Getting article URLs from https://tuoitre.vn/timeline/12/trang-7.htm
Found 98 / 1000 article URLs
Getting article URLs from https://tuoitre.vn/timeline/12/trang-8.htm
Found 113 / 1000 article URLs
Getting article URLs from https://tuoitre.vn/timeline/12/trang-9.htm
Found 128 / 1000 article URLs
Getting article URLs from https://tuoitre.vn/timeline/12/trang-10.htm
Found 143 / 1000 article URLs
Getting article 

 65%|██████▍   | 646/1000 [20:34<08:50,  1.50s/it]


Error while getting article with id 2021110513110478 : 'NoneType' object has no attribute 'find_all'
The article could be an unexpected format (Infographic, video...).


 70%|██████▉   | 696/1000 [22:16<09:46,  1.93s/it]


Error while getting article with id 20211002013543091 : 'NoneType' object has no attribute 'find_all'
The article could be an unexpected format (Infographic, video...).


 72%|███████▏  | 722/1000 [23:06<07:39,  1.65s/it]


Error while getting article with id 20210925192946811 : 'NoneType' object has no attribute 'find_all'
The article could be an unexpected format (Infographic, video...).


 74%|███████▍  | 745/1000 [23:49<07:10,  1.69s/it]


Error while getting article with id 20210915202950649 : 'NoneType' object has no attribute 'find_all'
The article could be an unexpected format (Infographic, video...).


100%|██████████| 1000/1000 [32:07<00:00,  1.93s/it]


Success: 996 / 1000 	Loss: 4





##### Preview crawled data

Preview the first and the last article of the batch.

In [9]:
# Titles of all crawled articles, in crawl order
titles = [article.title for article in articles]

# Print the preview of the set of the articles
def print_list_preview(l: list):
  """
  Print a short preview of a list: its first item and, when the list holds
  more than one item, an ellipsis line followed by its last item.
  Each item is prefixed with its 1-based position. Prints nothing for an
  empty list.
  """

  size = len(l)

  if size == 0:
    return

  print("[", 1, "]", l[0])

  if size > 1:
    print("...")
    print("[", size, "]", l[-1])

# Show the first and the last crawled title.
print_list_preview(titles)

[ 1 ] 'Bão cytokine' tấn công phổi, tim, thận, gan... người trẻ F0
...
[ 996 ] Bác sĩ Lương Lễ Hoàng qua đời


Preview the first article content in the batch.

In [10]:
# Article.__str__ renders the metadata followed by the article content.
print(articles[0])

ID: 20210919223447809
Title: 'Bão cytokine' tấn công phổi, tim, thận, gan... người trẻ F0
Excerpt: TTO - Theo thống kê của Trung tâm hồi sức Bệnh viện Bạch Mai đặt tại Bệnh viện dã chiến số 16 (quận 7, TP.HCM), với công suất 500 giường bệnh thì có đến 70% bệnh nhân COVID-19 mắc phải 'cơn bão cytokine'.
Date: 2021-09-20 06:20:00+07:00
Author: TUOI TRE ONLINE
Source: Tuổi Trẻ
Category: Sức khỏe
URL: https://tuoitre.vn/bao-cytokine-tan-cong-phoi-tim-than-gan-nguoi-tre-f0-20210919223447809.htm
Tags: ['bão cytokine', 'covid-19', 'f0', 'bệnh viện dã chiến', 'virus tấn công', 'bệnh nhân trẻ']
Likes: 141
Comments: 4
-----
"Cơn bão cytokine" là hiện tượng hệ miễn dịch cơ thể phản ứng quá mức, giải phóng ồ ạt cytokine gây viêm, khiến các cơ quan nội tạng suy kiệt. Người khỏe mạnh khi bị virus tấn công, hệ miễn dịch sẽ phản ứng, cơ thể tiết ra chất cytokine để ức chế virus. Nhưng ở một số người, đặc biệt là người trẻ tiết ra quá nhiều cytokine, gây ảnh hưởng đến phủ tạng.
Người trẻ nhất là 17 tuổ

Preview comments from a random article.

In [11]:
# Fetch a single page of comments (page size 2) for a fixed article id and
# print the first one. No category is needed for the comment API call.
comments = TuoiTreCrawler()._crawl_comments(id="202111131227554", limit=2)
print(comments[0])

ID: 290c3b60-4515-11ec-8e3d-295feb285a8a
Author: Nguyễn Ân
Content: Cho nghỉ việc được rồi, ý thức tệ đến thế là cùng.
Likes: 6
Date: 2021-11-14 13:36:12
Replies: 0


### Serialize, store and load
Serialization and Deserialization.

Format: CSV, with the `comments` and `tags` columns stored as JSON strings.

#### Helper classes
- **ArticleSerialization**: Serialize Article object to dictionary. Deserialize dictionary to Article object.
- **FileStorage**: Read/Write serialized objects.

In [12]:
class ArticleSerialization:
  """
  Serialize and deserialize Article object.
  """

  @staticmethod
  def serialize(obj):
    """
    Transform an object to a serializable dictionary.
    The comments and the tags are stored as JSON strings so that each of
    them fits into a single CSV column.
    Return a dictionary.
    """

    article_dict = obj.to_dict()

    # to_dict() already converted every comment to a dictionary, so the
    # result is dumped directly instead of converting the comments again.
    article_dict["comments"] = json.dumps(article_dict["comments"],
                                          ensure_ascii=False)

    article_dict["tags"] = json.dumps(obj.tags, ensure_ascii=False)

    return article_dict

  @staticmethod
  def json_to_comment(json_comment: dict):
    """
    Rebuild a Comment object (and, recursively, its replies) from the
    dictionary produced by Comment.to_dict().
    """
    comment_dict = json_comment.copy()

    comment_dict["replies"] = [ArticleSerialization.json_to_comment(reply)
                                 for reply in json_comment["replies"]]

    # A missing like count or date was stored as null; keep it as None.
    if comment_dict["likes"] is not None:
      comment_dict["likes"] = int(comment_dict["likes"])

    # 2021-11-08T18:18:45
    if comment_dict["date"] is not None:
      comment_dict["date"] = datetime.strptime(comment_dict["date"],
                                                "%Y-%m-%dT%H:%M:%S")
    return Comment(**comment_dict)

  @staticmethod
  def deserialize(_dict):
    """
    Deserialize an object from a dictionary (e.g. one CSV row).
    Empty or missing ``likes`` and ``date`` values become None.
    Return an Article object.
    """

    obj_dict = _dict.copy()

    comments = [ArticleSerialization.json_to_comment(comment)
                  for comment in json.loads(obj_dict.pop("comments"))]

    tags = json.loads(obj_dict.pop("tags"))

    # 2021-11-08T18:18:45+07:00
    raw_date = obj_dict.pop("date")
    date = datetime.strptime(raw_date, "%Y-%m-%dT%H:%M:%S%z") if raw_date else None

    # Compare by value: `is not ''` tests object identity, which is not a
    # reliable way to detect an empty string.
    raw_likes = obj_dict.pop("likes")
    likes = int(raw_likes) if raw_likes not in ("", None) else None

    return Article(**obj_dict, comments=comments, tags=tags, date=date,
                   likes=likes)

In [13]:
class FileStorage:
  """
  Store and retrieve objects to/from a CSV file.
  """

  ENCODING = "utf-8-sig"
  NEW_LINE = ""

  @staticmethod
  def store(objects, file_path, mode="w", sort=True):
    """
    Store Article objects to a file, one CSV row per object.

    Parameters:
      objects: objects to store; each must provide ``to_dict()`` (and a
        ``date`` attribute when ``sort`` is set).
      file_path: path of the CSV file.
      mode: file mode passed to ``open`` ("w" overwrites, "a" appends).
      sort: sort the objects by date before writing.

    Nothing is written (and the file is left untouched) when ``objects`` is
    empty. Return the file path.
    """
    rows = list(objects)

    # Without any object there are no column names to derive; return before
    # opening the file so that mode "w" does not truncate existing data.
    if not rows:
      return file_path

    if sort:
      rows = sorted(rows, key=lambda o: o.date)

    with open(file_path, mode, newline=FileStorage.NEW_LINE,
              encoding=FileStorage.ENCODING) as f:

      writer = csv.DictWriter(f, fieldnames=list(rows[0].to_dict().keys()))

      # When appending, the header is only needed if the file is still
      # empty (position 0); otherwise load() would treat the first data row
      # as the header.
      if "a" not in mode or f.tell() == 0:
        writer.writeheader()

      for obj in rows:
        writer.writerow(ArticleSerialization.serialize(obj))

    return file_path

  @staticmethod
  def load(file_path):
    """
    Load Article objects from a CSV file written by store().
    Return a list of objects.
    """

    with open(file_path, "r", newline=FileStorage.NEW_LINE,
              encoding=FileStorage.ENCODING) as f:

      return [ArticleSerialization.deserialize(row)
                for row in csv.DictReader(f)]

#### Store crawled data
Skip this if only loading the existing file.

In [14]:
# CAUTION: mode="w" overwrites the existing content of the file
FileStorage.store(articles, FILE_PATH, mode="w")

'/content/drive/MyDrive/data/suc_khoe_articles.csv'

#### Load data from file and deserialize

In [23]:
# Read every stored row back into Article objects and report how many.
loaded_articles = FileStorage.load(FILE_PATH)
print("Size:", len(loaded_articles), "entries")

Size: 1008 entries


### Enlarge the existing data
Attempt to crawl new articles and skip ones that are already stored.

*Load the data first (see the section above) before proceeding.*

In [16]:
# Ids of the articles that are already stored; the crawler skips them.
crawled_ids = set([a.id for a in loaded_articles])

# Maximum number of new articles to crawl.
LIMIT =  16#@param {type:"integer"}

# Delay in seconds between GET requests (to avoid overloading the news server)
DELAY = 0.1 #@param {type:"number"}

# Whether to get only the articles that are newer than the existing ones.
NEWER_ONLY = False #@param {type:"boolean"}

# Category whose timeline is crawled
CATEGORY = TuoiTreCrawler.Category.SUC_KHOE

suckhoe_crawler = TuoiTreCrawler(category=CATEGORY, delay=DELAY,
                                 crawl_comment=True, skip_these=crawled_ids,
                                 newer_only=NEWER_ONLY)

new_articles = suckhoe_crawler.crawl_articles(limit=LIMIT)


Getting article URLs from https://tuoitre.vn/timeline/12/trang-1.htm
Found 0 / 16 article URLs
Getting article URLs from https://tuoitre.vn/timeline/12/trang-2.htm
Found 1 / 16 article URLs
Getting article URLs from https://tuoitre.vn/timeline/12/trang-3.htm
Found 1 / 16 article URLs
Getting article URLs from https://tuoitre.vn/timeline/12/trang-4.htm
Found 1 / 16 article URLs
Getting article URLs from https://tuoitre.vn/timeline/12/trang-5.htm
Found 1 / 16 article URLs
Getting article URLs from https://tuoitre.vn/timeline/12/trang-6.htm
Found 1 / 16 article URLs
Getting article URLs from https://tuoitre.vn/timeline/12/trang-7.htm
Found 1 / 16 article URLs
Getting article URLs from https://tuoitre.vn/timeline/12/trang-8.htm
Found 1 / 16 article URLs
Getting article URLs from https://tuoitre.vn/timeline/12/trang-9.htm
Found 1 / 16 article URLs
Getting article URLs from https://tuoitre.vn/timeline/12/trang-10.htm
Found 1 / 16 article URLs
Getting article URLs from https://tuoitre.vn/time

 25%|██▌       | 4/16 [00:07<00:20,  1.71s/it]


Error while getting article with id 20210915202950649 : 'NoneType' object has no attribute 'find_all'
The article could be an unexpected format (Infographic, video...).


 69%|██████▉   | 11/16 [00:20<00:08,  1.68s/it]


Error while getting article with id 20210925192946811 : 'NoneType' object has no attribute 'find_all'
The article could be an unexpected format (Infographic, video...).


 81%|████████▏ | 13/16 [00:22<00:04,  1.57s/it]


Error while getting article with id 2021110513110478 : 'NoneType' object has no attribute 'find_all'
The article could be an unexpected format (Infographic, video...).


100%|██████████| 16/16 [00:27<00:00,  1.70s/it]


Error while getting article with id 20211002013543091 : 'NoneType' object has no attribute 'find_all'
The article could be an unexpected format (Infographic, video...).

Success: 12 / 16 	Loss: 4





In [18]:
# Append the newly crawled articles to the existing file (mode="a")
FileStorage.store(new_articles, FILE_PATH, mode="a")

'/content/drive/MyDrive/data/suc_khoe_articles.csv'

### Sort the whole file.

In [21]:
# Reload every stored article and rewrite the whole file sorted by date.
loaded_articles = FileStorage.load(FILE_PATH)
FileStorage.store(loaded_articles, FILE_PATH, mode="w", sort=True)


'/content/drive/MyDrive/data/suc_khoe_articles.csv'

### Duplication test

In [22]:
# Every stored article id must be unique.
assert len({a.id for a in loaded_articles}) == len(loaded_articles)