In [1]:
import sys
sys.path.append('..')

from selenium import webdriver
from fake_useragent import UserAgent
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import multiprocessing
import json
import pandas as pd
import time
import os
import math
import argparse
import gc
from crawler.crawler_master import CrawlerMaster
import random
import psutil
from utils.utils import convert_to_nosymbol, gen_uuid_list, random_time_sleep
from constant import options

In [2]:
# Toy frame with repeated 'x' values, used to check how duplicated() behaves.
test = pd.DataFrame({
    'x': [1, 2, 1, 1, 3],
    'y': [2, 3, 1, 1, 2],
})

In [8]:
# Keep only the first row seen for each distinct 'x' value.
test[~test.duplicated(subset=['x'], keep='first')]

Unnamed: 0,x,y
0,1,2
1,2,3
4,3,2


In [2]:
# --- Data locations -------------------------------------------------------
# PUBLISHER / RAW_DATA_PATH are not referenced by the cells shown in this notebook.
PUBLISHER = 'vnexpress'
RAW_DATA_PATH = os.path.join('../data/raw_data', PUBLISHER)

# --- Crawl timing ---------------------------------------------------------
# Pause used by CrawlerSlave.crawl() after logging in (passed to time.sleep).
TIME_SLEEP = 2
# Page-load timeout handed to driver.set_page_load_timeout().
TIME_OUT = 40
# Not referenced by the cells shown in this notebook.
MAX_COUNT_LOOP = 100

# --- Housekeeping ---------------------------------------------------------
# Shell command string; it is not executed anywhere in the cells shown here.
KILL_FIREFOX_CMD = "pkill firefox"

In [3]:
class CrawlerMaster():
    """Fan foody.vn URL crawling out over a pool of worker processes.

    One task is created per (city, order option) pair taken from
    ``options.Location.city`` and ``options.Option.order_option``; every task
    is handled by its own ``CrawlerSlave``.
    """

    def __init__(self):
        self.url = 'https://www.foody.vn'
        self.web_driver_options = self.set_option_web_driver()
        # Filled by create_foody_option(): list of {'city', 'order_option'} dicts.
        self.foody_options = []

    def set_option_web_driver(self):
        """Build the headless Firefox options shared by every slave.

        Returns:
            selenium ``Options`` with a fixed user agent, a 1920x1080 window
            size argument and ``--headless``.
        """
        # Named differently from the imported `options` module so the module
        # is not shadowed inside this method.
        firefox_options = Options()
        user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1'
        firefox_options.add_argument(f'user-agent={user_agent}')
        firefox_options.add_argument("window-size=1920,1080")
        firefox_options.add_argument('--headless')
        return firefox_options

    def create_foody_option(self):
        """(Re)build ``self.foody_options`` as every city x order-option pair.

        The list is rebuilt from scratch, so calling this more than once does
        not duplicate tasks.
        """
        self.foody_options = [
            {'city': city, 'order_option': order_option}
            for city in options.Location.city
            for order_option in options.Option.order_option
        ]

    def control_url_slave(self, id):
        """Run one crawl task in the current process.

        Args:
            id: index into ``self.foody_options`` selecting the task.
        """
        foody_option = self.foody_options[id]
        print('Crawl URL: {} - {}'.format(foody_option['city'], foody_option['order_option']))
        slave = CrawlerSlave(url=self.url,
                             city=foody_option['city'],
                             order_option=foody_option['order_option'],
                             web_driver_options=self.web_driver_options)
        slave.crawl()

    def control_crawling_urls(self):
        """Crawl every (city, order option) pair and wait for all of them.

        Uses at most 8 worker processes (fewer when the machine has fewer
        CPUs). The call blocks until every task has finished; an exception
        raised by a worker is re-raised here instead of being lost.
        """
        self.create_foody_option()
        if not self.foody_options:
            return
        nrof_processes = min(multiprocessing.cpu_count(), 8)
        # The context manager guarantees the pool is torn down; relying on
        # garbage collection of an un-joined pool is not supported.
        with multiprocessing.Pool(processes=nrof_processes) as pool:
            pool.map(self.control_url_slave, range(len(self.foody_options)))

In [4]:
def random_time_sleep(level):
    """Return a random pause length for the given level (it does not sleep).

    Callers in this notebook pass the result to ``time.sleep``.

    NOTE(review): a function with the same name is also imported from
    ``utils.utils`` at the top of the notebook; this definition replaces it.

    Args:
        level: numeric pause tier. Values below 1 give 0.2-0.5, values from 1
            up to (not including) 2 give 0.5-1, and 2 or more gives 1-2.

    Returns:
        float drawn uniformly from the range of the selected tier.
    """
    # Ranges instead of equality tests, so negative or fractional levels still
    # yield a number rather than None (which time.sleep would reject).
    if level < 1:
        low, high = 0.2, 0.5
    elif level < 2:
        low, high = 0.5, 1
    else:
        low, high = 1, 2
    return random.uniform(low, high)

class CrawlerSlave():
    """One headless Firefox session collecting result URLs from foody.vn.

    A slave handles a single (city, order option) pair: it selects the city,
    logs in, picks the order option, keeps loading more results, then writes
    the de-duplicated URLs to ``../data/urls/urls_<city>_<order_option>.csv``.

    Login credentials are read from the ``FOODY_EMAIL`` and ``FOODY_PASSWORD``
    environment variables.
    """

    def __init__(self, url, city, order_option, web_driver_options):
        """Start Firefox and open ``url``.

        If loading the page fails, 'Time out' is printed, the browser is shut
        down and ``self.driver`` is left as ``None``.
        """
        self.city     = city
        self.order_option = order_option
        # `id` and `comments` are not used by the methods of this class.
        self.id       = 0
        self.comments = []
        self.urls     = []
        self.driver   = webdriver.Firefox(executable_path='../web_driver/geckodriver', options=web_driver_options)
        # set page load timeout
        self.driver.set_page_load_timeout(TIME_OUT)
        try:
            self.driver.get(url)
        except Exception:
            print('Time out')
            # Quit the browser instead of only dropping the reference,
            # otherwise the Firefox process stays alive.
            self.close()

    def close(self):
        """Quit the browser if one is open; safe to call more than once."""
        driver = getattr(self, 'driver', None)
        self.driver = None
        if driver is None:
            return
        try:
            driver.quit()
        except Exception:
            # Best-effort cleanup: a browser that already died must not mask
            # the crawl result.
            pass

    @staticmethod
    def _absolute_url(href):
        """Return ``href`` with the foody.vn prefix added when it is missing.

        Returns ``None`` for a missing or empty ``href``.
        """
        if not href:
            return None
        prefix_url = 'https://www.foody.vn'
        if prefix_url not in href:
            return prefix_url + href
        return href

    def log_in(self):
        """Open the account form and submit the configured credentials.

        Raises:
            RuntimeError: if FOODY_EMAIL or FOODY_PASSWORD is not set.
        """
        email = os.environ.get('FOODY_EMAIL')
        password = os.environ.get('FOODY_PASSWORD')
        if not email or not password:
            raise RuntimeError('Set the FOODY_EMAIL and FOODY_PASSWORD environment variables before crawling')
        log_in_button = WebDriverWait(self.driver, 30).\
                        until(EC.presence_of_element_located((By.CLASS_NAME, "account-manage")))
        log_in_button.click()
        time.sleep(random_time_sleep(level=1))
        user_name_block = WebDriverWait(self.driver, 30).\
                          until(EC.presence_of_element_located((By.XPATH, '//*[@id="Email"]')))
        user_name_block.send_keys(email)
        time.sleep(random_time_sleep(level=1))
        pass_word_block = WebDriverWait(self.driver, 30).\
                          until(EC.presence_of_element_located((By.XPATH, '//*[@id="Password"]')))
        pass_word_block.send_keys(password)
        pass_word_block.submit()
        time.sleep(random_time_sleep(level=1))

    def page_down(self, times):
        """Send PAGE_DOWN to the page body ``times`` times, pausing briefly after each."""
        body = WebDriverWait(self.driver, 30).\
               until(EC.presence_of_element_located((By.CSS_SELECTOR, "body")))
        for _ in range(times):
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(random_time_sleep(level=0))

    def load_more_result(self):
        """Click the "load more" button until clicking fails or memory runs low.

        Stops when system memory usage exceeds 90% or when the click (or the
        following scroll) raises. If the button never appears, the method
        returns without clicking.
        """
        # Local import: only this method needs the exception type.
        from selenium.common.exceptions import TimeoutException
        try:
            load_more_button = WebDriverWait(self.driver, 30).\
                               until(EC.presence_of_element_located((By.CLASS_NAME, "fd-btn-more")))
        except TimeoutException:
            # No button within 30s is treated as "nothing more to load", so
            # the results already on the page can still be collected.
            return
        while True:
            try:
                memory_percent = psutil.virtual_memory().percent
                if memory_percent > 90:
                    print('Stop because memory > 90%')
                    break
                load_more_button.click()
                self.page_down(times=1)
            except Exception:
                # A failing click is used as the end-of-results signal.
                break

    def choose_location_option(self):
        """Type ``self.city`` into the province picker and click the first match.

        Raises:
            IndexError: if the picker lists no matching entry.
        """
        location_options_box = WebDriverWait(self.driver, 30).\
                               until(EC.presence_of_element_located((By.ID, "head-province")))
        location_options_box.click()
        input_location_box   = WebDriverWait(self.driver, 30).\
                               until(EC.presence_of_element_located((By.CLASS_NAME, "loc-query")))
        input_location_box.send_keys(self.city)
        chosen_location_box = WebDriverWait(self.driver, 30).\
                              until(EC.presence_of_element_located((By.CLASS_NAME, "flp-countries")))
        candidates = chosen_location_box.find_elements(By.CLASS_NAME, 'ng-scope')
        if not candidates:
            raise IndexError('Not found "{}" location'.format(self.city))
        candidates[0].click()

    def choose_order_option(self):
        """Click the navigation entry whose text equals ``self.order_option``.

        Raises:
            Exception: if no entry has that text.
        """
        order_options_box = WebDriverWait(self.driver, 30).\
                            until(EC.presence_of_element_located((By.CLASS_NAME, "list-nav")))
        for option in order_options_box.find_elements(By.TAG_NAME, 'li'):
            if option.text == self.order_option:
                option.click()
                break
        else:
            raise Exception('Not found "{}" order option'.format(self.order_option))

    def get_urls(self):
        """Collect the link of every result item on the page into ``self.urls``.

        Items whose link has no ``href`` are skipped.
        """
        self.urls = []
        result_block = WebDriverWait(self.driver, 30).\
                       until(EC.presence_of_element_located((By.CLASS_NAME, 'content-container')))
        for element in result_block.find_elements(By.CLASS_NAME, 'content-item'):
            link = element.find_element(By.CLASS_NAME, 'title').find_element(By.TAG_NAME, 'a')
            url = self._absolute_url(link.get_attribute('href'))
            if url is not None:
                self.urls.append(url)

    def save_urls(self):
        """Write ``self.urls`` (first occurrence of each URL) to the per-task CSV.

        Columns: url, city, order_option, id. The output directory is created
        when it does not exist yet.
        """
        url_table = pd.DataFrame()
        url_table['url'] = self.urls
        url_table['city'] = self.city
        url_table['order_option'] = self.order_option
        url_table['id'] = gen_uuid_list(len(self.urls))
        url_table = url_table.drop_duplicates(subset=['url'], keep='first').reset_index(drop=True)
        output_path = '../data/urls/urls_{}_{}.csv'.format(convert_to_nosymbol(self.city), convert_to_nosymbol(self.order_option))
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        url_table.to_csv(output_path, index=False)

    def crawl(self):
        """Run the whole task for this slave and always shut the browser down.

        Failures are reported on stdout (with the exception) rather than
        raised, so one bad task does not stop the others.
        """
        if getattr(self, 'driver', None) is None:
            print('Error {} - {}: browser is not available'.format(self.city, self.order_option))
            return
        try:
            self.choose_location_option()
            self.log_in()
            # wait for loading page
            time.sleep(TIME_SLEEP)
            self.page_down(times=10)
            self.choose_order_option()
            self.load_more_result()
            self.get_urls()
            self.save_urls()
        except Exception as error:
            print('Error {} - {}: {!r}'.format(self.city, self.order_option, error))
        finally:
            # Without this every task leaves a Firefox process behind.
            self.close()

In [5]:
# Start the crawl: builds one task per (city, order option) pair and hands
# them to a multiprocessing pool.
# NOTE: this uses the CrawlerMaster class defined in this notebook, which
# shadows the one imported from crawler.crawler_master in the first cell.
master = CrawlerMaster()
master.control_crawling_urls()

Crawl URL: Bình Phước - Giao hàng
Crawl URL: An Giang - Ở đâu
Crawl URL: Bắc Ninh - Ở đâu
Crawl URL: Bạc Liêu - Đặt bàn
Crawl URL: Gia Lai - Đặt bàn
Crawl URL: Cao Bằng - Giao hàng
Crawl URL: Bạc Liêu - Giao hàng
Crawl URL: Gia Lai - Giao hàng
Crawl URL: Bình Thuận - Ở đâu
Crawl URL: Cà Mau - Ở đâu
Crawl URL: Bắc Ninh - Đặt bàn
Crawl URL: An Giang - Đặt bàn
Crawl URL: Bắc Giang - Ở đâu
Crawl URL: Bình Thuận - Đặt bàn
Crawl URL: Cà Mau - Đặt bàn
Crawl URL: Bắc Ninh - Giao hàng
Crawl URL: Huế - Ở đâu
Crawl URL: An Giang - Giao hàng
Crawl URL: Bình Thuận - Giao hàng
Crawl URL: Cà Mau - Giao hàng
Crawl URL: Bắc Giang - Đặt bàn
Crawl URL: Huế - Đặt bàn
Crawl URL: Bến Tre - Ở đâu
Crawl URL: Bình Dương - Ở đâu
Crawl URL: Bình Định - Ở đâu
Crawl URL: Cần Thơ - Ở đâu
Crawl URL: Bến Tre - Đặt bàn
Crawl URL: Bắc Giang - Giao hàng
Crawl URL: Huế - Giao hàng
Crawl URL: Bình Dương - Đặt bàn
Crawl URL: Bình Định - Đặt bàn
Crawl URL: Cần Thơ - Đặt bàn
Crawl URL: Hà Giang - Ở đâu
Crawl URL: Bắc Kạn - Ở

Process ForkPoolWorker-4:


Time out
Time out


Process ForkPoolWorker-1:


Time out
Time out
Crawl URL: Bắc Kạn - Đặt bàn
Crawl URL: Bình Định - Giao hàng
Crawl URL: Cần Thơ - Giao hàng
Crawl URL: Hà Giang - Đặt bàn


Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/ngogiatien/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py", line 377, in _make_request
    httplib_response = conn.getresponse(buffering=True)
  File "/home/ngogiatien/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py", line 377, in _make_request
    httplib_response = conn.getresponse(buffering=True)
TypeError: getresponse() got an unexpected keyword argument 'buffering'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
TypeError: getresponse() got an unexpected keyword argument 'buffering'
  File "/home/ngogiatien/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ngogiatien/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.

Crawl URL: Bạc Liêu - Ở đâu
Crawl URL: Bắc Kạn - Giao hàng
Crawl URL: Gia Lai - Ở đâu
Crawl URL: Hà Giang - Giao hàng
Crawl URL: Hải Dương - Đặt bàn
Crawl URL: Khánh Hoà - Ở đâu
Crawl URL: Kon Tum - Giao hàng
Crawl URL: Lào Cai - Đặt bàn
Crawl URL: Hải Dương - Giao hàng
Crawl URL: Khánh Hoà - Đặt bàn
Crawl URL: Lào Cai - Giao hàng
Crawl URL: Lai Châu - Ở đâu
Crawl URL: Hải Phòng - Ở đâu
Crawl URL: Khánh Hoà - Giao hàng
Crawl URL: Lâm Đồng - Ở đâu
Crawl URL: Lai Châu - Đặt bàn
Crawl URL: Hải Phòng - Đặt bàn
Crawl URL: Kiên Giang - Ở đâu
Crawl URL: Lâm Đồng - Đặt bàn
Crawl URL: Hải Phòng - Giao hàng
Crawl URL: Kiên Giang - Đặt bàn
Crawl URL: Lai Châu - Giao hàng
Crawl URL: Lâm Đồng - Giao hàng
Crawl URL: Hậu Giang - Ở đâu
Crawl URL: Kiên Giang - Giao hàng
Crawl URL: Long An - Ở đâu
Time out
Time out
Crawl URL: Lạng Sơn - Ở đâu
Crawl URL: Kon Tum - Ở đâu


Process ForkPoolWorker-2:
Process ForkPoolWorker-5:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/ngogiatien/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py", line 377, in _make_request
    httplib_response = conn.getresponse(buffering=True)
  File "/home/ngogiatien/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py", line 377, in _make_request
    httplib_response = conn.getresponse(buffering=True)
TypeError: getresponse() got an unexpected keyword argument 'buffering'
TypeError: getresponse() got an unexpected keyword argument 'buffering'

During handling of the above exception, another exception occurred:


During handling of the above exception, another exception occurred:

Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/ngogiatien/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/ngogiatien/anaconda3/lib/python3.7/multiproce

In [6]:
# slave.get_urls()

In [7]:
# NOTE(review): `slave` is not assigned in any cell shown here; this cell
# depends on an object (presumably a CrawlerSlave) left in the kernel by code
# that is no longer in the notebook — confirm before Restart & Run All.
slave.urls

['https://www.foody.vn/ha-noi/com-tri-giao-com-van-phong-online',
 'https://www.foody.vn/ha-noi/poeme-home-made-cake-van-cao',
 'https://www.foody.vn/ha-noi/nem-khoai-77-online-nguyen-huy-tuong',
 'https://www.foody.vn/ha-noi/ha-thu-nuoc-ep-trai-cay-nguyen-chat',
 'https://www.foody.vn/ha-noi/xuan-anh-bun-ngan',
 'https://www.foody.vn/ha-noi/yihetang-tea-coffee-cau-giay',
 'https://www.foody.vn/ha-noi/tra-chanh-big-tran-phu',
 'https://www.foody.vn/ha-noi/royaltea-nguyen-van-cu',
 'https://www.foody.vn/ha-noi/dua-dam-hai-phong-kem-bo-da-nang-shop-online',
 'https://www.foody.vn/ha-noi/potato-king-the-gioi-ga-khoai-online',
 'https://www.foody.vn/ha-noi/sua-chua-tran-chau-ha-long-thai-phien',
 'https://www.foody.vn/ha-noi/cine-cafe-3-cafe-phim-chua-lang',
 'https://www.foody.vn/ha-noi/pho-kcc-phuc-dien',
 'https://www.foody.vn/ha-noi/banh-cuon-ba-luong',
 'https://www.foody.vn/ha-noi/che-ngon-truyen-thong-to-hieu',
 'https://www.foody.vn/ha-noi/co-le-nem-nuong-nha-trang',
 'https://www.

In [8]:
# Number of entries in slave.urls. NOTE(review): relies on the same
# kernel-resident `slave`, which no cell shown here defines.
len(slave.urls)

1005