
bilibili has been updated and the scraper code no longer works — how do I get past bilibili's search box (the ::before element)? #6

Open
AdvancingStone opened this issue Dec 23, 2019 · 4 comments

Comments

@AdvancingStone

No description provided.

@wistbean wistbean reopened this May 9, 2020
@ToddCombs

It really can't be scraped anymore — the login dialog keeps popping up. Do we need to simulate a login to make it work?
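
Before going all the way to a simulated login, one thing worth trying is simply closing the dialog before touching the search box. This is only a sketch: the `.bili-mini-close-icon` selector is a guess and should be checked against the live page in DevTools.

```python
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def dismiss_login_popup(browser, timeout=5):
    """Click the login dialog's close button if the dialog shows up."""
    try:
        close_btn = WebDriverWait(browser, timeout).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, ".bili-mini-close-icon"))  # assumed selector
        )
        close_btn.click()
    except TimeoutException:
        pass  # no popup appeared within the timeout; carry on
```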

@McChickenNuggets

```python
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import xlwt

browser = webdriver.Chrome()
browser.get("https://www.bilibili.com/")
WAIT = WebDriverWait(browser, 10)
browser.set_window_size(1400, 900)

# Spreadsheet for the results (xlwt writes the legacy .xls format)
book = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = book.add_sheet('蔡徐坤篮球', cell_overwrite_ok=True)
sheet.write(0, 0, '名称')      # title
sheet.write(0, 1, '地址')      # link
sheet.write(0, 2, '描述')      # description
sheet.write(0, 3, '观看次数')  # view count
sheet.write(0, 4, '弹幕数')    # danmaku count
sheet.write(0, 5, '发布时间')  # upload date
n = 1  # next spreadsheet row to write


def search():
    try:
        print('Opening bilibili...')
        browser.get("https://www.bilibili.com/")

        input = WAIT.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#nav_searchform > input")))
        submit = WAIT.until(EC.element_to_be_clickable(
            (By.XPATH, '/html/body/div[2]/div/div[1]/div[1]/div/div[2]/div/form/div/button')))
        input.send_keys('蔡徐坤 篮球')
        submit.click()

        print('Switching to the results window')
        all_h = browser.window_handles
        browser.switch_to.window(all_h[1])
        get_source()

        # The "last page" button holds the total number of result pages
        total = WAIT.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR,
             "#all-list > div.flow-loader > div.page-wrap > div > ul > li.page-item.last > button")))
        return int(total.text)

    except TimeoutException:
        return search()


def next_page(page_num):
    try:
        print('Loading the next page of results')
        next_btn = WAIT.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#all-list > div.flow-loader > div.page-wrap > div > ul > li.page-item.next > button')))
        next_btn.click()
        WAIT.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#all-list > div.flow-loader > div.page-wrap > div > ul > li.page-item.active > button'),
            str(page_num)))
        get_source()

    except TimeoutException:
        return next_page(page_num)


def save_to_excel(soup):
    video_list = soup.find(class_='video-list clearfix').find_all(class_='video-item matrix')

    for item in video_list:
        item_title = item.find('a').get('title')
        item_link = item.find('a').get('href')
        item_dec = item.find(class_='des hide').text
        item_view = item.find(class_='so-icon watch-num').text
        item_biubiu = item.find(class_='so-icon hide').text   # danmaku count
        item_date = item.find(class_='so-icon time').text

        print('Scraping: ' + item_title)

        global n

        sheet.write(n, 0, item_title)
        sheet.write(n, 1, item_link)
        sheet.write(n, 2, item_dec)
        sheet.write(n, 3, item_view)
        sheet.write(n, 4, item_biubiu)
        sheet.write(n, 5, item_date)

        n = n + 1


def get_source():
    WAIT.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '#all-list > div.flow-loader > div.filter-wrap')))

    html = browser.page_source
    soup = BeautifulSoup(html, 'lxml')
    print('Results page loaded')

    save_to_excel(soup)


def main():
    try:
        total = search()
        print(total)

        for i in range(2, int(total + 1)):
            next_page(i)

    finally:
        browser.close()


if __name__ == '__main__':
    main()
    book.save('蔡徐坤篮球.xls')  # xlwt can only produce .xls, not .xlsx
```
Use this one instead.

@rainrae

rainrae commented May 23, 2022

> It really can't be scraped anymore — the login dialog keeps popping up. Do we need to simulate a login to make it work?

I'm running into the same problem: the script just keeps reloading the homepage over and over. Any idea how to change it?
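
One way to "simulate login" without driving the login form is to reuse cookies from a browser session where you are already signed in. This is a minimal sketch, assuming you have exported the cookies to a `cookies.json` file (a list of `{"name": ..., "value": ...}` objects); the exact cookie names (bilibili appears to use `SESSDATA` for the session) are assumptions to verify yourself.

```python
import json

from selenium import webdriver

browser = webdriver.Chrome()
browser.get("https://www.bilibili.com/")  # must be on the domain before adding cookies

# Load cookies exported from a logged-in browser session
with open("cookies.json", encoding="utf-8") as f:
    for cookie in json.load(f):
        browser.add_cookie({
            "name": cookie["name"],
            "value": cookie["value"],
            "domain": ".bilibili.com",
        })

browser.refresh()  # reload so the site sees the session cookies and skips the login prompt
```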

@ls-6414

ls-6414 commented Aug 4, 2022

After locating the search input box, you need to call `.click()` on it first and then `.send_keys()`.

```python
def search():
    try:
        print('start visit bilibili...')
        browser.get('https://www.bilibili.com/')

        search_input = WAIT.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "#nav-searchform > div.nav-search-content > input")))
        search_input.click()
        search_input.send_keys('蔡徐坤篮球')
        search_submit = WAIT.until(EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="nav-searchform"]/div[2]')))
        search_submit.click()
        print('jump to new window')
        all_h = browser.window_handles
        browser.switch_to.window(all_h[1])
    except TimeoutException:
        return search()
```
