In [1]:
# flake8: noqa
import traceback
import pickle
import re
import os
import json
import time
import uuid
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service

import pandas as pd

# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.support.wait import WebDriverWait

class PageLoadingWorker:
    def __init__(self,current_page=None, start="0,KB", end="5,KB"):
        self.current_page = 0 if current_page is None else current_page
        self.url = "https://www.kaggle.com/datasets?fileType=csv&sizeStart="+start+"&sizeEnd="+end+"&page={}"

    def __next__(self):
        self.current_page += 1
        return self.url.format(self.current_page)
    
    def reverse(self, steps=1):
        self.current_page -= steps
    
    def save(self):
        with open("page.bin", "w") as f:
            f.write(str(self.current_page))
    

class CrawlError(Exception):
    pass

class Crawler:    
    def __init__(self, pageloader, run_headless=False, proxy=None,page_load_strategy="none", 
                 wait_time_load=6, wait_time_parse = 0.0,
                 agent = None
                ):
        assert page_load_strategy in ["normal", "eager", "none"],\
                f'page_load_strategy need get value either\
                "normal" for complete load\
                "eager" for interactive load\
                "none" for no strategy\
                but got {page_load_strategy}'
        
        self.pageloader = pageloader
    
        options = webdriver.ChromeOptions()
        options.add_argument("--disable-blink-features")
        options.add_argument("--disable-blink-features=AutomationControlled")
        if proxy is not None:
            options.add_argument(f'--proxy-server={proxy}')
        if run_headless:
            options.add_argument("--headless")
        if agent is not None:
            print(f'user-agent={agent}')
            options.add_argument(f'user-agent={agent}')
        
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        
        caps = DesiredCapabilities().CHROME
        caps["pageLoadStrategy"] = page_load_strategy
    
        self.caps_status = caps["pageLoadStrategy"]
        self.wait_time_load = wait_time_load
        self.wait_time_parse = wait_time_parse
        s=Service("./chromedriver")
        print("start new browser")
        self.driver = webdriver.Chrome(service=s,options=options,desired_capabilities=caps)
        self.tab_handeler=[0] #0 = main tab
    
    def reset(self):
        try:
            self.driver.quit()
        finally:
            self.__init__()
            
    def __switch_tab(self, tab_ID=0):
        self.driver.switch_to.window(self.driver.window_handles[tab_ID])
        
    def open_new_tab(self):
        self.driver.execute_script("window.open();")
        tabID = self.tab_handeler[-1]+1
        self.__switch_tab(tabID)
        self.tab_handeler.append(tabID)
        
    def quit(self):
        self.driver.quit()
        print("Quit browser !!!")
    
    def close_last_tab(self):
        self.driver.close()
        self.tab_handeler.pop()
        self.__switch_tab(self.tab_handeler[-1])
    
    def get_urls(self, retries=5):
        page_results = {"dataset name": [], "url" : []}
        try:
            element = self.driver.find_element(By.XPATH,f'//ul[@class="km-list km-list--three-line"]')
            elements = element.find_elements(By.TAG_NAME, "li")
            print(f"[INFO] getting {len(elements)}")
            for li in elements:
                page_results["dataset name"].append(li.get_attribute("aria-label"))
                page_results["url"].append(li.find_element(By.TAG_NAME, "a")\
                                                      .get_attribute("href")\
                                                      .replace("https://www.kaggle.com","")
                                          )
            return page_results
        except:
            if retries > 0:
                self.driver.implicitly_wait(self.wait_time_load)
                self.driver.refresh()
                return self.get_urls(retries=retries - 1)
            else:
                return None
    
    def get_data_set_link(self, num_tab:int, sleeptime_pertab = 0):
        urls,outputs = [],[]
        for _ in range(num_tab):
            self.open_new_tab()
            url = next(self.pageloader)
            urls.append(url)
        for i in range(num_tab):
            self.__switch_tab(self.tab_handeler[i+1])
            self.driver.get(urls[i])
        print(f'[INFO] opening {len(self.tab_handeler) - 1} tabs')
        if self.caps_status == "none":
            self.driver.implicitly_wait(self.wait_time_load)
        time.sleep(6)
        for i in range(num_tab):
            # self.driver.refresh()
            output = self.get_urls(retries=5)
            if output != None:
                outputs.append(output)
            elif i == num_tab:
                break
            self.close_last_tab()
            time.sleep(sleeptime_pertab)
        return outputs

In [2]:
proxies = [ None,
            "140.227.65.129:58888", #JP
            "14.225.5.21:3128", "183.89.117.235:8080","183.88.7.145:8080", "110.77.242.180:8080", #Thai
          ]

run_headless = False #False
agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36' #None
wait_time_load = 7
wait_time_parse = 0.0
sleeptime_pertab = 0.0

tab_size = 10
save_level = 1 # 20 batch save one time and rest for 60 sencond avoid being banned from Google
restart_level = 10*save_level # every save 5 times then restart driver 1 time

In [3]:
select_idx = 0

#0,KB-5,KB; 5,KB-50,KB; 50,KB-400,KB; 400,KB-2,MB, extend 2,MB-25,MB
start="2,MB"
end="25,MB"

pl = PageLoadingWorker(start=start, end=end)

while (select_idx < len(proxies)):
    selected_proxy = proxies[select_idx]
    print(f'using {selected_proxy} for VPN address')
    timer = 0
    browser = Crawler(pl, run_headless=run_headless, proxy=selected_proxy,page_load_strategy="none", 
                        wait_time_load=wait_time_load, wait_time_parse = wait_time_parse,
                        agent = agent
                    )
    results = []
    while(True):
        try:
            results.append(browser.get_data_set_link(10))
            timer += 1
            
            if timer%save_level==0:
                dfs = []
                for session in results:
                    dfs += [pd.DataFrame(tab) for tab in session]
                if dfs == []:
                    raise
                if os.path.isfile(f"./datasets_{start}_{end}.csv"):
                    dfs = [pd.read_csv(f"./datasets_{start}_{end}.csv")] + dfs
                pd.concat(dfs).to_csv(f"./datasets_{start}_{end}.csv", index=False)
                del(dfs, results)
                results = []

            if timer%restart_level==0:
                browser.quit()
                time.sleep(20)
                browser = Crawler(pl, run_headless=run_headless, proxy=selected_proxy,page_load_strategy="none", 
                            wait_time_load=wait_time_load, wait_time_parse = wait_time_parse,
                            agent = agent
                        )
            
        except:
            dfs = []
            for session in results:
                dfs += [pd.DataFrame(tab) for tab in session]
            if os.path.isfile(f"./datasets_{start}_{end}.csv"):
                dfs = [pd.read_csv(f"./datasets_{start}_{end}.csv")] + dfs
            pd.concat(dfs).to_csv(f"./datasets_{start}_{end}.csv", index=False)
            traceback.print_exc()
            browser.quit()
            raise

using None for VPN address
user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36
start new browser
[INFO] opening 10 tabs
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] opening 10 tabs
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] opening 10 tabs
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] opening 10 tabs
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] opening 10 tabs
[IN

[INFO] getting 20
[INFO] opening 10 tabs
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] opening 10 tabs
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] opening 10 tabs
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
Quit browser !!!
user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36
start new browser
[INFO] opening 10 tabs
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] getting 20
[INFO] opening 10 

Traceback (most recent call last):
  File "/tmp/ipykernel_51734/113989113.py", line 28, in <module>
    raise
RuntimeError: No active exception to reraise


RuntimeError: No active exception to reraise