-
Notifications
You must be signed in to change notification settings - Fork 0
/
helion_scraper.py
78 lines (65 loc) · 3.15 KB
/
helion_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from playwright.sync_api import sync_playwright
import time
import re
import pandas as pd
import datetime
# Scrape the helion.pl search results for "python" and dump one row per book
# into a tab-separated, timestamped CSV file in the current directory.

# Placeholder stored when an element is absent from a book entry.
_MISSING = "null"

# Values 0..57 are substituted into the `nrs` query parameter of the search URL.
_PAGE_COUNT = 58

# One <li> per book in the search-result listing.
_BOOK_LIST_SELECTOR = "#right-big-col > div.book-list-container.multi-line.padding-top.padding-top-search > div > ul > li"

# Candidate locations of the current price, tried in order; first hit wins.
# example path to book actual price - "p > #text","a > ins","a","p a #text",
_PRICE_SELECTORS = ("p[class*='price price-add']", "p[class*='price price-time']", "a > ins")

# Matches the format marker inside a class attribute and captures its suffix
# ("book", "ebook" or "online"). Compiled once instead of per list item.
_FORMAT_CLASS_RE = re.compile(r"\btype-(book|ebook|online)\b")


def _text_or_default(node, selector, default=_MISSING):
    """Return the inner text of the first *selector* match under *node*.

    ``query_selector`` yields ``None`` when nothing matches; in that case
    *default* is returned instead of raising ``AttributeError``.
    """
    handle = node.query_selector(selector)
    return default if handle is None else handle.inner_text()


def _current_price(book):
    """Return the book's current price text, or the placeholder if none is found.

    Newlines inside the price text are flattened to spaces so the value stays
    on a single CSV line.
    """
    for selector in _PRICE_SELECTORS:
        handle = book.query_selector(selector)
        if handle is not None:
            return handle.inner_text().replace("\n", " ")
    return _MISSING


def _formats(book):
    """Return the space-separated formats found in the entry's nested list items.

    Each nested ``ul > li`` contributes at most one format, taken from the
    first ``type-*`` marker in its class attribute. Duplicates are dropped
    while keeping first-seen order, so the result is the same on every run.
    """
    found = []
    for item in book.query_selector_all("ul > li"):
        class_attr = item.get_attribute("class")
        # get_attribute returns None when the <li> has no class attribute.
        match = _FORMAT_CLASS_RE.search(class_attr) if class_attr else None
        if match:
            found.append(match.group(1))
    return " ".join(dict.fromkeys(found))


def _book_row(book):
    """Build the CSV row (a dict keyed by column name) for one book entry."""
    link = book.query_selector("a")
    return {
        "book_title": _text_or_default(book, "h3 a:first-of-type"),
        "book_author": _text_or_default(book, "p[class*='author']"),
        "book_tags": _text_or_default(book, "p.tags"),
        "book_format": _formats(book),
        "book_actual_price": _current_price(book),
        "book_first_price": _text_or_default(book, "del"),
        "book_url_link": link.get_attribute("href") if link is not None else None,
    }


data_record = []
with sync_playwright() as p:
    browser = p.chromium.launch(headless=True, slow_mo=50)
    try:
        page = browser.new_page()

        # Open the home page and submit the search form once before walking
        # the paginated result URLs directly.
        page.goto("https://helion.pl/")
        print(page.title())
        page.type("input#inputSearch", "python")
        time.sleep(0.3)
        page.click("#szukanie > fieldset > a > button")

        for no_page in range(_PAGE_COUNT):
            numerate_page = f"https://helion.pl/search?szukaj=python&nrs={no_page}&serwisyall=0&wsprzed=1&wprzed=0&wprzyg=0&wyczerp=0&sortby=wd&qa=&nr=&ceny=&formaty=&wydawca=&jezyk=&promocja="
            page.goto(numerate_page)
            # Pause before reading the DOM so the pause can actually benefit
            # the query (and to throttle requests between pages).
            time.sleep(2)
            for single_book in page.query_selector_all(_BOOK_LIST_SELECTOR):
                data_row = _book_row(single_book)
                print(data_row)
                data_record.append(data_row)
            print(no_page)
    finally:
        # Closing the browser also closes its pages; do it even if scraping fails.
        browser.close()

df = pd.DataFrame(data_record)
now = datetime.datetime.now()
timestamp = now.strftime("%Y_%m_%d_%H_%M_%S")
df.to_csv(f"helion_ksiazki_{timestamp}.csv", sep="\t")