<a href="https://colab.research.google.com/github/thaitruong018/Web_Scraping_Selenium_Python/blob/main/VNExpress_Scraping_Selenium.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Problem Statement
- Build a Web Scraper to collect data about articles on [https://vnexpress.net/](https://vnexpress.net/).
- Required information:
  - Title
  - Description
  - Link to the Article
  - Link to Thumbnail Image

![](https://i.imgur.com/sI6Slxi.png)

### Set-up

In [None]:
# install selenium and other resources for crawling data
!pip install selenium
!apt-get update
# install other resources for doing crawling
!apt install chromium-chromedriver

Collecting selenium
  Downloading selenium-3.141.0-py2.py3-none-any.whl (904 kB)
[K     |████████████████████████████████| 904 kB 4.0 MB/s 
Installing collected packages: selenium
Successfully installed selenium-3.141.0
Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [697 B]
Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release.gpg [836 B]
Get:8 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Hit:10 http://archive.ubuntu.com/

In [None]:
# import library
from selenium import webdriver
# Notebook-wide handle to the shared Chrome WebDriver.
# None means "no browser running": initialize_driver() creates the browser
# only while this is None, and close_driver() sets it back to None.
DRIVER = None

In [None]:
# create two functions
# initialize driver
def initialize_driver():
    """Start the shared headless Chrome driver unless one already exists.

    The browser handle is stored in the module-level ``DRIVER``. Calling this
    again while ``DRIVER`` is set does nothing (and prints nothing), so the
    cell can be re-run without spawning extra browser processes.
    """
    global DRIVER
    if DRIVER is not None:
        # A driver is already available; reuse it.
        return

    print('Initiating driver...')
    opts = webdriver.ChromeOptions()
    # -headless: run Chrome without a visible window.
    # -no-sandbox: disable Chrome's sandbox (presumably required by the
    # hosted notebook container -- confirm before reusing elsewhere).
    for flag in ('-headless', '-no-sandbox'):
        opts.add_argument(flag)
    # 'chromedriver' is resolved through PATH.
    DRIVER = webdriver.Chrome('chromedriver', options=opts)
    print('Finished!')
# close driver
def close_driver():
    """Quit the shared Chrome driver, if any, and reset ``DRIVER`` to ``None``.

    Safe to call when no driver is running (it is then a no-op).

    Raises:
        Whatever ``DRIVER.quit()`` raises. The handle is cleared first, so a
        failed shutdown never leaves a dead driver behind for
        ``initialize_driver()`` to reuse.
    """
    global DRIVER
    if DRIVER is None:
        return
    try:
        DRIVER.quit()
    finally:
        # Drop the handle even when quit() fails (e.g. the browser process is
        # already gone); otherwise the `DRIVER is None` check in
        # initialize_driver() would keep returning the broken instance.
        DRIVER = None


In [None]:
# Shut down any driver left over from an earlier run so the next cell starts
# from a clean state; this is a no-op when DRIVER is already None.
close_driver()

In [None]:
# Start the shared headless Chrome driver (does nothing if one already exists).
initialize_driver()

Initiating driver...
Finished!


In [None]:
# Load the VnExpress home page in the shared browser.
DRIVER.get('https://vnexpress.net/')

In [None]:
# Show the URL the browser is currently on, as a quick check that the page
# request did not end up somewhere unexpected.
DRIVER.current_url

'https://vnexpress.net/'

### Scraping

In [None]:
# Collect every element carrying the 'item-news' class and count them.
# NOTE: find_elements_by_class_name is a Selenium 3 locator helper; it was
# removed in Selenium 4.3.
all_news_elements = DRIVER.find_elements_by_class_name('item-news')
len(all_news_elements)

177

In [None]:
# Print the raw HTML of one sample article to see which tags and classes hold
# the title, link, description and thumbnail.
# (Index 4 appears to be an arbitrary sample -- any index below the count works.)
print(all_news_elements[4].get_attribute('outerHTML'))

<article class="item-news item-news-common " data-id="4329020">
<h3 class="title-news">
<a href="https://vnexpress.net/de-nghi-di-doi-hai-khu-dan-cu-trong-quy-hoach-song-hong-4329020.html" title="Đề nghị di dời hai khu dân cư trong quy hoạch sông Hồng" data-medium="Item-6" data-thumb="1" data-itm-source="#vn_source=Home&amp;vn_campaign=ThuongVien&amp;vn_medium=Item-6&amp;vn_term=Desktop&amp;vn_thumb=1" data-itm-added="1">Đề nghị di dời hai khu dân cư trong quy hoạch sông Hồng</a>
</h3>
<div class="thumb-art">
<a href="https://vnexpress.net/de-nghi-di-doi-hai-khu-dan-cu-trong-quy-hoach-song-hong-4329020.html" class="thumb thumb-5x3" title="Đề nghị di dời hai khu dân cư trong quy hoạch sông Hồng" data-medium="Item-6" data-thumb="1" data-itm-source="#vn_source=Home&amp;vn_campaign=ThuongVien&amp;vn_medium=Item-6&amp;vn_term=Desktop&amp;vn_thumb=1" data-itm-added="1">
<picture>
<!--[if IE 9]><video style="display: none;"><![endif]-->
<source data-srcset="https://vcdn1-vnexpress.vnecdn.net/

In [None]:
# create functions
# create a func to get title, desc, and link
def get_title_desc_link(article_element):
    """Extract the link, title and description of one article element.

    Args:
        article_element: A Selenium element for one ``item-news`` article. It
            must support ``find_element_by_class_name``.

    Returns:
        A ``(link, title, desc)`` tuple -- note the order, which differs from
        the function name. Any field that cannot be located keeps its
        placeholder text ("No links found", "No titles found",
        "No descs found").
    """
    link = "No links found"
    title = "No titles found"
    desc = "No descs found"

    # The title block and the description block are looked up independently:
    # an article without a ``title-news`` block can still yield its
    # description, and vice versa. Lookup errors are swallowed on purpose so
    # that one malformed article never aborts the whole scrape.
    try:
        # Both the URL and the headline live on the same <a> inside
        # ``title-news``; fetch that anchor once and read both attributes.
        anchor = (
            article_element
            .find_element_by_class_name("title-news")
            .find_element_by_tag_name("a")
        )
        link = anchor.get_attribute("href")
        title = anchor.get_attribute("title")
    except Exception:
        pass

    try:
        desc = (
            article_element
            .find_element_by_class_name("description")
            .find_element_by_tag_name("a")
            .text
        )
    except Exception:
        pass

    return link, title, desc

# create a func to get thumb art
def get_thumb_art(article_element):
    """Return the thumbnail image URL of one article element.

    Args:
        article_element: A Selenium element for one ``item-news`` article.

    Returns:
        The ``src`` attribute of the first ``<img>`` inside the article's
        ``thumb-art`` block, or "No src found" when any lookup step fails.
    """
    try:
        # Locate the thumb-art container, then the image inside it.
        thumb_container = article_element.find_element_by_class_name("thumb-art")
        image = thumb_container.find_element_by_tag_name("img")
        # NOTE(review): lazily loaded images may carry a placeholder in `src`
        # and the real URL in another attribute -- confirm against the page.
        return image.get_attribute("src")
    except Exception:
        # Best effort: articles without a thumbnail yield the placeholder text.
        return "No src found"


In [None]:
# combine above funcs into a main func
def scrape_vnexpress(DRIVER):
    """Scrape every ``item-news`` article on the page loaded in ``DRIVER``.

    Args:
        DRIVER: A Selenium driver that already has the target page loaded.
            (The parameter deliberately shadows the notebook-level ``DRIVER``.)

    Returns:
        A list with one entry per article, each a 4-item list:
        ``[article link, title, description, thumbnail link]``. Fields that
        could not be found hold the placeholder text produced by
        ``get_title_desc_link`` / ``get_thumb_art``.
    """
    def _to_row(article):
        # One article element -> [link, title, description, thumbnail src].
        link, title, desc = get_title_desc_link(article)
        return [link, title, desc, get_thumb_art(article)]

    return [_to_row(item) for item in DRIVER.find_elements_by_class_name('item-news')]

        

In [None]:
# Run the scraper against the page currently loaded in the shared driver.
results = scrape_vnexpress(DRIVER)

In [None]:
# Preview the first five scraped rows; each row is
# [link, title, description, thumbnail src].
results[:5]

[['https://vnexpress.net/malaysia-dat-cuoc-vao-chien-luoc-xet-nghiem-ncov-tai-nha-4328587.html',
  'Malaysia đặt cược vào chiến lược xét nghiệm nCoV tại nhà',
  'Giữa lúc số ca nhiễm mới hàng ngày lên mức kỷ lục 11.000, Malaysia tuần trước phê duyệt có điều kiện hai loại kit tự xét nghiệm nCoV tại nhà.',
  'https://vcdn1-vnexpress.vnecdn.net/2021/07/22/malays-1626951151-4095-1626951262.jpg?w=120&h=72&q=100&dpr=2&fit=crop&s=8plCgmFwz3xnul4M_gBf3Q'],
 ['https://vnexpress.net/xin-lam-nguoi-xa-la-4329072.html',
  'Xin làm người xa lạ',
  'Khẩu trang có thay thế việc duy trì khoảng cách vật lý hai mét với người bên cạnh không? Không!',
  'https://vcdn1-vnexpress.vnecdn.net/2019/01/28/nguyendanganhthipng-1548661469.png?w=100&h=100&q=100&dpr=1&fit=crop&s=brknxOD9EQCcD90bjLXBfg'],
 ['https://vnexpress.net/benh-vien-da-chien-da-nang-hoan-thanh-sau-ba-ngay-thi-cong-4329076.html',
  'Bệnh viện dã chiến Đà Nẵng hoàn thành sau ba ngày thi công',
  'Sau ba ngày thi công suốt ngày đêm, bệnh viện dã c

### Save the results to a CSV file with pandas

In [None]:
# import library
import pandas as pd

In [None]:
# Build a DataFrame from the scraped rows. The column order matches the
# [link, title, description, thumbnail src] layout of each row in `results`.
df = pd.DataFrame(results,columns=['link','title','description','thumbnail_link'])

In [None]:
# Display the first 5 rows of the DataFrame as a sanity check before saving.
df.head()

Unnamed: 0,link,title,description,thumbnail_link
0,https://vnexpress.net/malaysia-dat-cuoc-vao-ch...,Malaysia đặt cược vào chiến lược xét nghiệm nC...,Giữa lúc số ca nhiễm mới hàng ngày lên mức kỷ ...,https://vcdn1-vnexpress.vnecdn.net/2021/07/22/...
1,https://vnexpress.net/xin-lam-nguoi-xa-la-4329...,Xin làm người xa lạ,Khẩu trang có thay thế việc duy trì khoảng các...,https://vcdn1-vnexpress.vnecdn.net/2019/01/28/...
2,https://vnexpress.net/benh-vien-da-chien-da-na...,Bệnh viện dã chiến Đà Nẵng hoàn thành sau ba n...,"Sau ba ngày thi công suốt ngày đêm, bệnh viện ...",https://vcdn1-vnexpress.vnecdn.net/2021/07/22/...
3,https://video.vnexpress.net/tin-tuc/thoi-su/du...,Đường nối cao tốc TP HCM - Long Thành sắp hoàn...,ĐỒNG NAIĐường 319 dự kiến đưa vào khai thác c...,https://vcdn1-vnexpress.vnecdn.net/2021/07/14/...
4,https://vnexpress.net/nhung-dieu-can-biet-khi-...,Những điều cần biết khi tiêm vaccine Sinopharm,Các phản ứng sau tiêm vaccine Sinopharm như đa...,https://vcdn1-suckhoe.vnecdn.net/2021/07/20/di...


In [None]:
# Write the DataFrame to a UTF-8 encoded CSV file in the current working
# directory; index=False keeps the DataFrame's row index out of the file.
df.to_csv('vnexpress_scraped1.csv',index=False, encoding="utf-8")