In [2]:
import re
import os
import time
import json
import string
import random
import threading
import datetime as dt

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By

In [2]:
# Pixiv home page; usually does not need to be modified.
HOME_PAGE = "https://www.pixiv.net/"

# Tag URL: illustration search for one tag (the URL-encoded tag below decodes to "女の子").
TAG_HOME_PAGE = "https://www.pixiv.net/tags/%E5%A5%B3%E3%81%AE%E5%AD%90/illustrations"
# Page URL prefix: sorted by popularity, safe mode; append the page number to complete it.
PAGE_URL = TAG_HOME_PAGE + "?order=popular_d&mode=safe&p="

# MODIFY PARAMETER: USER_NAME, used to log in to Pixiv.
# The PIXIV_USERNAME environment variable takes precedence, so credentials need not be stored in the notebook.
USER_NAME = os.environ.get("PIXIV_USERNAME", "ENTER YOUR PIXIV USERNAME HERE.")
# MODIFY PARAMETER: PASSWORD, used to log in to Pixiv (PIXIV_PASSWORD environment variable takes precedence).
PASSWORD = os.environ.get("PIXIV_PASSWORD", "ENTER YOUR PIXIV PASSWORD HERE.")

# MODIFY PARAMETER: JSON_PATH, folder where the image info JSON files are saved.
JSON_PATH = "J:/Pixiv_Images/"
# MODIFY PARAMETER: CHROME_DRIVER_PATH, Selenium browser driver executable.
CHROME_DRIVER_PATH = "./chromedriver.exe"
# Global flag polled by every spider thread (True = all threads should stop).
GLOBAL_CLOSE_ALL_THREAD = False

# Take one timestamp so the date and the time always describe the same instant.
# Explicit directives are used because '%F' / '%T' are not available on every platform.
_created_at = dt.datetime.now()
create_date = _created_at.strftime('%Y-%m-%d')
create_time = _created_at.strftime('%H:%M:%S')

# Random 8-character suffix that keeps log files from different runs apart.
char_set = string.ascii_letters + "0123456789"
log_id = "".join(random.choices(char_set, k = 8))

# MODIFY PARAMETER: log_name & log_path, the file name and path of the log file.
log_dir = "./log/"
log_name = f"SpiderLog_{create_date}_{log_id}.log"
log_path = log_dir + log_name

# MODIFY PARAMETER: save_path, images are saved in this folder.
save_path = "J:/Pixiv_Images/"

# Create the log folder on first use; opening the file would fail otherwise.
os.makedirs(log_dir, exist_ok = True)
with open(log_path, 'w', encoding = 'utf-8') as init:
    init.write(f"Log file create at {create_date} {create_time}.\n")

In [3]:
class SpiderThread(threading.Thread):
    """Worker thread that drives one Chrome session through Pixiv search result pages.

    A thread logs in, walks the result pages ``start_page`` .. ``end_page - 1`` and,
    for every artwork link found on a page, opens the artwork in a temporary tab,
    records its metadata and tags, and saves a screenshot of the displayed element.

    Module-level names read by this class: HOME_PAGE, PAGE_URL, USER_NAME, PASSWORD,
    JSON_PATH, CHROME_DRIVER_PATH, save_path, log_path and GLOBAL_CLOSE_ALL_THREAD.
    """

    # Artwork part of a link path; a leading two-letter segment (e.g. "en/") is tolerated.
    _ARTWORK_PATH = re.compile(r"(?:[a-z]{2}/)?(artworks/\d+)")

    def __init__(self, thread_name, start_page, end_page, start_index = None):
        """Store the page range and reset all counters; no browser is started here.

        Args:
            thread_name: label used in log lines and in the JSON file names.
            start_page: first result page to process.
            end_page: exclusive upper bound of the page range.
            start_index: number used as the file name of the first screenshot.
                When omitted it is derived from ``start_page``.
        """
        super().__init__()

        self.tname = thread_name
        self.is_initalize = False
        self.driver = None

        self.img_info = []
        self.img_tags = []
        self.rs_dict = {}
        self.tag_list = []

        if start_index is None:
            # presumably 59 is the number of artworks on one result page — TODO confirm
            self.img_name = (start_page - 1) * 59 + 1
        else:
            self.img_name = start_index

        self.total_img_count = 0
        self.page = start_page
        self.start_page = start_page
        self.end_page = end_page

    def _appendLog(self, line, display):
        """Append one line to the shared log file and optionally echo it."""
        with open(log_path, 'a', encoding = 'utf-8') as log:
            log.write(line + "\n")
        if display:
            print(line)

    def _quitDriver(self):
        """Quit the browser at most once and forget the driver."""
        driver, self.driver = self.driver, None
        self.is_initalize = False
        if driver is not None:
            driver.quit()

    def createLog(self, message_type, message, display = False):
        """Write a timestamped entry to the log file.

        Args:
            message_type: short category label placed before the message.
            message: text of the entry.
            display: when True the entry is also printed.

        Logging is best effort: if the entry cannot be written, an EXCEPT entry
        describing the failure is attempted, and if that fails too the EXCEPT
        entry is printed instead of raising.
        """
        # Computed outside the try block so the fallback entry can always use it.
        stamp = dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        entry = f"[{stamp}](Thread {self.tname}) {message_type}: {message}"
        try:
            self._appendLog(entry, display)
        except Exception as e:
            fallback = f"[{stamp}(Thread {self.tname})] EXCEPT: {e}"
            try:
                self._appendLog(fallback, display)
            except Exception:
                print(fallback)

    def driverInitalize(self):
        """Start Chrome, log in to Pixiv and open the first result page.

        Does nothing except log when the thread is already initialized.
        """
        if self.is_initalize:
            self.createLog("INITALIZE", f"Thread {self.name} is already initalized.")
            return
        self.createLog("INITALIZE", "Starting initalize.")

        # NOTE(review): passing the driver path positionally is the older Selenium
        # call form — confirm it matches the installed Selenium version.
        self.driver = webdriver.Chrome(CHROME_DRIVER_PATH)
        driver = self.driver

        driver.get(HOME_PAGE)
        driver.maximize_window()
        driver.find_element(By.CLASS_NAME, "signup-form__submit--login").click()
        time.sleep(1)

        driver.find_element(By.XPATH, '//*[@id="LoginComponent"]/form/div[1]/div[1]/input').send_keys(USER_NAME)
        time.sleep(0.5)
        driver.find_element(By.XPATH, '//*[@id="LoginComponent"]/form/div[1]/div[2]/input').send_keys(PASSWORD)
        time.sleep(0.5)

        driver.find_element(By.XPATH, '//*[@id="LoginComponent"]/form/button').click()
        self.createLog("INITALIZE", "Login to the pixiv.")
        time.sleep(2)

        # Sort order and safe mode are part of PAGE_URL, so no filter clicks are needed.
        driver.get(PAGE_URL + str(self.start_page))
        self.is_initalize = True
        self.createLog("INITALIZE", "Initalize complete.")
        time.sleep(1)

    def getLinkFromObject(self):
        """Collect the artwork links of the page currently shown.

        Returns:
            dict mapping a relative link ("artworks/<id>") to the first anchor
            element that points to it, in document order.

        Anchors without an href, links outside HOME_PAGE and non-artwork links
        are skipped; repeated links to the same artwork are collapsed.
        """
        link_object_dict = {}
        for anchor in self.driver.find_elements(By.TAG_NAME, "a"):
            href = anchor.get_attribute("href")
            # An anchor may have no href at all; a prefix test replaces the old
            # str.strip(), which removes a set of characters rather than a prefix.
            if not href or not href.startswith(HOME_PAGE):
                continue
            found = self._ARTWORK_PATH.match(href[len(HOME_PAGE):])
            if found:
                link_object_dict.setdefault(found.group(1), anchor)
        self.createLog("PAGE " + str(self.page), f"Successfully get {len(link_object_dict)} links.")

        return link_object_dict

    def print_rs(self, rs_dict = None, tag_list = None):
        """Log the metadata and tags of one artwork, or a failure note for each missing part."""
        if rs_dict is not None:
            name = rs_dict["author_name"]
            link = rs_dict["author_link"]
            title = rs_dict["info_title"]

            likes = rs_dict["info_likes"]
            bookmarks = rs_dict["info_bookmarks"]
            viewing = rs_dict["info_viewing"]

            self.createLog("IMG_INFO", f"===> Title: {title}, Author: {name} ({link})")
            self.createLog("        ", f"===> Likes: {likes}, Bookmarks: {bookmarks}, Viewing Count: {viewing}")
        else:
            self.createLog("IMG_INFO", f"===> Get Image info failed.")

        if tag_list is not None:
            self.createLog("        ", f"===> Tags: {tag_list}")
        else:
            self.createLog("IMG_INFO", f"===> Get Image tags failed.")

    def getImgInfo(self):
        """Read author, title and counters from the artwork page in the current tab.

        Returns:
            dict with the keys author_link, author_name, info_title, info_likes,
            info_bookmarks and info_viewing, or None when any lookup fails.
        """
        driver = self.driver

        try:
            rs_dict = {}
            # Author: taken from the <div> inside the first <h2> of the page.
            div_ele = driver.find_elements(By.TAG_NAME, "h2")[0].find_element(By.TAG_NAME, "div")
            a_ele = div_ele.find_element(By.TAG_NAME, "a")
            rs_dict["author_link"] = a_ele.get_attribute("href")
            rs_dict["author_name"] = div_ele.text

            # Title: text of the first <h1>.
            rs_dict["info_title"] = driver.find_element(By.TAG_NAME, "h1").text

            # The first three <dd> elements are read as likes, bookmarks and views.
            count = driver.find_elements(By.TAG_NAME, "dd")
            rs_dict["info_likes"] = int(count[0].text.replace(",", ""))
            rs_dict["info_bookmarks"] = int(count[1].text.replace(",", ""))
            rs_dict["info_viewing"] = int(count[2].text.replace(",", ""))
            return rs_dict
        except Exception:
            # Best effort: the caller logs the missing info.
            return None

    def getImgTags(self):
        """Read the tag names from the first <ul> of the artwork page.

        Returns:
            list of tag texts (entries without a span > a link are skipped),
            or None when the tag list itself cannot be located.
        """
        driver = self.driver
        try:
            tag_list = []
            tag = driver.find_elements(By.TAG_NAME, "ul")[0]
            for item in tag.find_elements(By.TAG_NAME, "li"):
                try:
                    tag_list.append(item.find_element(By.TAG_NAME, "span").find_element(By.TAG_NAME, "a").text)
                except Exception:
                    # Not every list entry carries a tag link.
                    continue
            return tag_list
        except Exception:
            return None

    def saveJSON(self):
        """Write all collected info and tags to JSON files for the page just finished.

        The files go to ``JSON_PATH/Page<self.page - 1>/`` because the page counter
        has already been advanced when this is called. Entries are numbered from 1.
        """
        img_info_dict = dict(enumerate(self.img_info, start = 1))
        img_tags_dict = dict(enumerate(self.img_tags, start = 1))

        path = JSON_PATH + f"Page{str(self.page - 1)}/"
        # JSON_PATH can differ from save_path, so the folder may not exist yet.
        os.makedirs(path, exist_ok = True)

        json_info = f"{path}img_info_{self.tname}.json"
        with open(json_info, "w", encoding ='utf-8') as f:
            json.dump(img_info_dict, f, indent = 4, ensure_ascii = False)
        self.createLog("JSON", f"JSON info file saved at {json_info}")

        json_tags = f"{path}img_tags_{self.tname}.json"
        with open(json_tags, "w", encoding ='utf-8') as f:
            json.dump(img_tags_dict, f, indent = 4, ensure_ascii = False)
        self.createLog("JSON", f"JSON tags file saved at {json_tags}")

    def waitElement(self, retry_time = 6):
        """Poll the current tab for the artwork element.

        Args:
            retry_time: number of retries after the first poll; polls are 0.3 s apart.

        Returns:
            the first <canvas> element if one exists, otherwise the first image
            under figure>div>div>div>a, or None when neither appears in time.
        """
        retry_count = 0
        while retry_count <= retry_time:
            gif = self.driver.find_elements(By.TAG_NAME, "canvas")
            img = self.driver.find_elements(By.CSS_SELECTOR, "figure>div>div>div>a>img")
            if gif:
                self.createLog("DETECTOR", f"===> Animation {str(gif[0])} found.")
                return gif[0]
            if img:
                self.createLog("DETECTOR", f"===> Image {str(img[0])} found.")
                return img[0]
            self.createLog("DETECTOR", f"===> Image wait {retry_count} / {retry_time}.")
            retry_count += 1
            time.sleep(0.3)
        self.createLog("DETECTOR", f"===> TIMEOUT, Image not found.")
        return None

    def processPage(self):
        """Process every artwork linked from the result page currently shown.

        Each artwork is opened in a new tab, its info and tags are appended to
        ``img_info`` / ``img_tags``, and a screenshot is saved under
        ``save_path/Page<page>/``. Returns early when GLOBAL_CLOSE_ALL_THREAD is set.

        Raises:
            RuntimeError: when more than one extra tab is open, since the tab
                belonging to the artwork can then no longer be identified.
        """
        driver = self.driver
        recive_dict = self.getLinkFromObject()
        img_sum, img_count = len(recive_dict), 1
        main_window = driver.current_window_handle

        path = save_path + f"Page{self.page}/"
        os.makedirs(path, exist_ok = True)

        for link in recive_dict:
            if GLOBAL_CLOSE_ALL_THREAD:
                return None

            # The URL is passed as an argument instead of being pasted into the script text.
            driver.execute_script("window.open(arguments[0]);", HOME_PAGE + link)
            time.sleep(1)

            sub_windows = [handle for handle in driver.window_handles if handle != main_window]
            if not sub_windows:
                self.createLog("ERROR", f"Artwork tab for {link} did not open, skipped.", display = True)
                continue
            if len(sub_windows) > 1:
                # A thread cannot join itself; raising ends the thread and lets run() clean up.
                self.createLog("ERROR", f"Too many windows cause thread {self.name} joined.", display = True)
                raise RuntimeError(f"Too many windows in thread {self.tname}.")

            driver.switch_to.window(sub_windows[0])
            try:
                rs_dict = self.getImgInfo()
                self.img_info.append(rs_dict)
                tag_list = self.getImgTags()
                self.img_tags.append(tag_list)

                self.print_rs(rs_dict, tag_list)

                img_object = self.waitElement()

                try:
                    if img_object is not None:
                        driver.execute_script("arguments[0].scrollIntoView();", img_object)
                        img_object.screenshot(path + str(self.img_name) + ".png")
                        self.createLog("SAVE", f"===> File {img_count}/{img_sum}, total {self.total_img_count} saved.")
                except Exception as e:
                    self.createLog("EXCEPT", f"===> Save the file fail, cause by {e}")

                self.total_img_count += 1
                self.img_name += 1
                img_count += 1
            finally:
                # Always close the artwork tab so a failure cannot leave stray tabs behind.
                driver.close()
                driver.switch_to.window(main_window)

            time.sleep(0.5)

    def startSpider(self):
        """Process the pages ``start_page`` .. ``end_page - 1`` one after another.

        After each page the JSON files are written first and only then is the next
        page loaded, so a navigation problem cannot lose collected data. The next
        page is opened through PAGE_URL, the same way the first page is opened.
        Stops early, closing the browser, when GLOBAL_CLOSE_ALL_THREAD is set.
        """
        for _ in range(self.start_page, self.end_page):
            self.processPage()
            if GLOBAL_CLOSE_ALL_THREAD:
                self._quitDriver()
                self.createLog("STOP", f"Thread [{self.tname}] stoped.")
                break
            self.page += 1
            self.saveJSON()
            # No navigation after the last page of the range.
            if self.page < self.end_page:
                time.sleep(3)
                self.driver.get(PAGE_URL + str(self.page))
                time.sleep(3)
        self.createLog("FINISHED", f"Thread [{self.tname}] finished.")

    def closeSpider(self):
        """Close the browser and abort by raising.

        Raises:
            Exception: always, after the browser has been closed.
        """
        self._quitDriver()
        raise Exception(f"Thread[{self.tname}] stoped.")

    def run(self):
        """Thread entry point: initialize, crawl, and always close the browser.

        Any exception is logged and re-raised; the browser is closed either way.
        """
        try:
            self.driverInitalize()
            self.startSpider()
        except Exception as e:
            self.createLog("EXCEPT", f"Thread [{self.tname}] aborted, cause by {e}", display = True)
            raise
        finally:
            self._quitDriver()

In [4]:
def splitPage(split_range, split_count):
    """Split an inclusive page range into ``split_count`` contiguous inclusive ranges.

    Args:
        split_range: ``(start_page, end_page)``, both inclusive.
        split_count: number of ranges to produce; must be at least 1.

    Returns:
        list of exactly ``split_count`` ``(start, end)`` tuples covering the range
        in order. Range sizes differ by at most one page, with the larger ranges
        first. When there are fewer pages than ranges, the surplus entries are
        empty and have ``end == start - 1``.

    Raises:
        ValueError: if ``split_count`` is smaller than 1.
    """
    if split_count < 1:
        raise ValueError("split_count must be at least 1.")

    start_page, end_page = split_range
    total_pages = max(end_page - start_page + 1, 0)
    # Spread the remainder over the first ranges so no range runs past end_page.
    base_length, remainder = divmod(total_pages, split_count)

    rs_list, start = [], start_page
    for i in range(split_count):
        length = base_length + (1 if i < remainder else 0)
        rs_list.append((start, start + length - 1))
        start += length
    return rs_list

def getThreadList(page_range, spider_count):
    """Create one SpiderThread per non-empty page range; the threads are not started.

    Args:
        page_range: ``(start_page, end_page)``, both inclusive.
        spider_count: number of ranges the pages are split into.

    Returns:
        list of SpiderThread objects named ``SpiderThread[<n>]``, where ``n`` is the
        1-based position of the range. Ranges without pages get no thread, so the
        list can be shorter than ``spider_count``.
    """
    thread_list = []
    page_list = splitPage(page_range, spider_count)

    for thread_id, (start_page, end_page) in enumerate(page_list, start = 1):
        # An empty range would still start a browser and log in for nothing.
        if start_page > end_page:
            continue
        thread_name = f"SpiderThread[{thread_id}]"
        # SpiderThread treats end_page as exclusive, hence the + 1.
        thread_list.append(SpiderThread(thread_name, start_page, end_page + 1))

    return thread_list

def startAllThread(thread_list):
    """Call ``start()`` on every thread of ``thread_list``, in list order."""
    for worker in thread_list:
        worker.start()

def closeAllThread(thread_list):
    """Call ``join()`` on every thread of ``thread_list``, in list order.

    This only waits for the threads; it does not ask them to stop.
    """
    for worker in thread_list:
        worker.join()
        
def stopAllThread(wait_seconds = 5):
    """Raise the global stop flag, wait, then clear it again.

    Args:
        wait_seconds: how long the flag stays raised. A thread only notices the
            flag when it polls it during this window, so a thread busy with a
            longer step can miss the request.
    """
    # Without this declaration the assignments would only create a local
    # variable and the module-level flag would never change.
    global GLOBAL_CLOSE_ALL_THREAD
    GLOBAL_CLOSE_ALL_THREAD = True
    try:
        time.sleep(wait_seconds)
    finally:
        # Clear the flag even if the wait is interrupted, so later runs are not blocked.
        GLOBAL_CLOSE_ALL_THREAD = False

In [5]:
# page_range: inclusive range of result pages; (1, 3) processes pages 1, 2 and 3, and (1, 1) processes only page 1.
# spider_count: number of spiders (threads) to run concurrently; no more than 10 is recommended.
thread_list = getThreadList(
    page_range = (1, 1),
    spider_count = 1,
)
startAllThread(thread_list)

In [6]:
# Run the function below to stop all threads.
stopAllThread()