/
of_utils.py
100 lines (88 loc) · 2.83 KB
/
of_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import importlib
import inspect
import logging
import os
import sys
import time
import traceback
from datetime import datetime
from selenium import webdriver
sys.path.append('.')
import of_spider
def sleep(seconds):
    """Suspend the calling thread for ``seconds`` seconds.

    Thin convenience wrapper so callers can pause without importing
    ``time`` themselves.

    Args:
        seconds: Duration passed straight through to ``time.sleep``.
    """
    time.sleep(seconds)
def create_chrome_driver():
    """Start a maximized Chrome WebDriver session that ignores certificate errors.

    Returns:
        The new ``webdriver.Chrome`` instance. This function never closes the
        session; the caller is expected to ``quit()`` the driver when done.
    """
    options = webdriver.ChromeOptions()
    # Optional switches that are intentionally left disabled:
    #   options.add_argument('--headless')
    #   options.add_argument('--disable-gpu')
    options.add_argument('--ignore-certificate-errors')
    # Optional preference (disabled) that stops Chrome from loading images:
    #   prefs = {'profile.managed_default_content_settings.images': 2}
    #   options.add_experimental_option('prefs', prefs)

    # ``options=`` replaces the deprecated ``chrome_options=`` keyword, which
    # recent Selenium 4 releases no longer accept.
    driver = webdriver.Chrome(options=options)
    driver.maximize_window()
    return driver
def find_element_by_css_selector(element, selector):
    """Return the first match for a CSS selector under ``element``, or ``None``.

    Args:
        element: Object exposing Selenium's ``find_element(by, value)`` method
            (presumably a WebDriver or WebElement; confirm against callers).
        selector: CSS selector string.

    Returns:
        Whatever ``element.find_element`` returns, or ``None`` when the lookup
        raises any ``Exception`` (this includes ``element`` being ``None``).
    """
    try:
        # 'css selector' is the locator-strategy string behind By.CSS_SELECTOR.
        # The generic find_element() exists in Selenium 3 and 4 alike, whereas
        # find_element_by_css_selector() was removed in Selenium 4.3; a bare
        # ``except`` would turn that AttributeError into a silent None.
        return element.find_element('css selector', selector)
    except Exception:
        # Best-effort lookup: "not found" is an expected outcome here.
        # KeyboardInterrupt / SystemExit are deliberately allowed through.
        return None
def find_elements_by_css_selector(element, selector):
    """Return every match for a CSS selector under ``element``, or ``[]``.

    Args:
        element: Object exposing Selenium's ``find_elements(by, value)`` method
            (presumably a WebDriver or WebElement; confirm against callers).
        selector: CSS selector string.

    Returns:
        Whatever ``element.find_elements`` returns, or an empty list when the
        lookup raises any ``Exception`` (this includes ``element`` being
        ``None``).
    """
    try:
        # 'css selector' is the locator-strategy string behind By.CSS_SELECTOR.
        # The generic find_elements() exists in Selenium 3 and 4 alike, whereas
        # find_elements_by_css_selector() was removed in Selenium 4.3; a bare
        # ``except`` would turn that AttributeError into a silent [].
        return element.find_elements('css selector', selector)
    except Exception:
        # Best-effort lookup; KeyboardInterrupt / SystemExit are deliberately
        # allowed through.
        return []
def create_flogger(filename, level=logging.INFO):
    """Return a logger named ``filename`` that writes to a dated, per-process file.

    The log file is ``<filename>_<YYYY-MM-DD>_<pid>.log``. Calling this again
    with the same ``filename`` reuses the handler that is already attached
    instead of stacking a second one, which would write every record twice.

    Args:
        filename: Logger name and path prefix of the log file.
        level: Threshold applied to both the logger and its file handler.

    Returns:
        The configured ``logging.Logger``.
    """
    logger = logging.getLogger(filename)
    logger.setLevel(level)

    log_path = '%s_%s_%s.log' % (
        filename, datetime.now().strftime('%Y-%m-%d'), os.getpid())
    # FileHandler stores its target as an absolute path in ``baseFilename``.
    target = os.path.abspath(log_path)
    for handler in logger.handlers:
        if isinstance(handler, logging.FileHandler) and \
                handler.baseFilename == target:
            handler.setLevel(level)
            return logger

    fh = logging.FileHandler(log_path)
    fh.setLevel(level)
    fh.setFormatter(logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(fh)
    return logger
def get_domain(url):
    """Return the host part of ``url`` (including any ``user@`` or ``:port``).

    Unlike a plain ``split('://')[1]``, this accepts URLs without a scheme and
    stops at ``?`` or ``#`` as well as ``/``, so ``http://example.com?x=1``
    yields ``example.com`` rather than ``example.com?x=1``.

    Args:
        url: URL string, with or without a leading ``scheme://``.

    Returns:
        The text between the scheme separator (or the start of the string)
        and the first ``/``, ``?`` or ``#``.
    """
    head, separator, tail = url.partition('://')
    # Only treat "://" as the scheme separator when nothing path-like comes
    # before it; otherwise it belongs to a nested URL inside the path/query.
    if separator and not any(ch in head for ch in '/?#'):
        authority = tail
    else:
        authority = url
    for delimiter in ('/', '?', '#'):
        authority = authority.partition(delimiter)[0]
    return authority
def load_spiders(path, logger):
    """Import the modules in ``path`` and instantiate the spiders they define.

    Every ``*.py`` file directly inside ``path`` (except ``__init__*``) is
    imported as ``<package>.<module>``, where ``<package>`` is ``path`` with
    its separators turned into dots. Each class in the module that is a
    subclass of ``of_spider.Spider`` is constructed with ``logger`` and stored
    under its lower-cased attribute name.

    Args:
        path: Relative directory of the spider package, importable from
            ``sys.path`` (e.g. ``'spiders'`` or ``'pkg/spiders'``).
        logger: Passed to every spider constructor. If it has an
            ``exception`` method (as ``logging.Logger`` does), construction
            failures are reported through it.

    Returns:
        dict mapping lower-cased class name to spider instance. When two
        modules expose the same name, the one sorted last wins.

    Raises:
        ImportError (or whatever the module raises) if a spider module
        cannot be imported; import failures are not swallowed.
    """
    # 'spiders/' and './spiders' normalize to 'spiders'; 'a/b' becomes 'a.b'.
    normalized = os.path.normpath(path)
    package = '' if normalized == os.curdir else normalized.replace(os.sep, '.')
    report_failure = getattr(logger, 'exception', None)

    spiders = {}
    # Sorted so that name collisions resolve the same way on every run.
    for filename in sorted(os.listdir(path)):
        if not filename.endswith('.py') or filename.startswith('__init__'):
            continue
        if not os.path.isfile(os.path.join(path, filename)):
            continue

        stem = filename[:-3]
        module_name = '%s.%s' % (package, stem) if package else stem
        mod = importlib.import_module(module_name)

        # NOTE(review): the base class itself passes issubclass(), so a module
        # that does ``from of_spider import Spider`` also registers 'spider'.
        # Kept as-is; confirm whether callers rely on it.
        for attr_name, obj in inspect.getmembers(mod, inspect.isclass):
            if not issubclass(obj, of_spider.Spider):
                continue
            try:
                spiders[attr_name.lower()] = obj(logger)
            except Exception:
                # Still best-effort (one broken spider must not block the
                # rest), but report it instead of hiding it.
                if report_failure is not None:
                    report_failure('failed to instantiate spider %s.%s',
                                   module_name, attr_name)
    return spiders
def get_base_url(url):
    """Return ``url`` cut off just before its first ``?``.

    A URL without ``?`` is returned unchanged.
    """
    base, _, _ = url.partition('?')
    return base
def get_url_parameters(url):
    """Parse the query string of ``url`` into a dict of raw strings.

    Keys and values are returned exactly as they appear: nothing is
    percent-decoded, and a trailing ``#fragment`` stays part of the last
    value. If a key repeats, the last occurrence wins.

    Args:
        url: URL string; the text after its last ``?`` is taken as the query.

    Returns:
        dict of ``{key: value}``. Empty when the URL has no ``?`` or nothing
        follows it. A key without ``=`` maps to ``''``.
    """
    _, marker, query = url.rpartition('?')
    if not marker:
        # No query string at all: without this guard the whole URL would be
        # parsed as a single "key=value" pair and fail to unpack.
        return {}

    parameters = {}
    for pair in query.split('&'):
        if not pair:
            # Skip empty segments such as those from "a=1&&b=2" or a trailing "&".
            continue
        # Split on the first "=" only so values may contain "=" themselves
        # (e.g. base64 padding).
        key, _, value = pair.partition('=')
        parameters[key] = value
    return parameters
def convert_price(price):
    """Convert a scraped price such as ``'¥1,299.00'`` to a whole-number int.

    Currency markers and thousands separators are removed, then the rest is
    parsed as a float and truncated toward zero.

    Args:
        price: Price text. Non-string values (e.g. an ``int`` or ``float``)
            are converted with ``str()`` first instead of being rejected.

    Returns:
        The integer part of the price, or ``0`` if it cannot be parsed (the
        parse error is printed, as before).
    """
    text = price if isinstance(price, str) else str(price)
    # Yen/yuan sign in both its half-width (U+00A5) and full-width (U+FFE5)
    # forms -- written as escapes because the two glyphs look identical.
    for token in ('\u00a5', '\uffe5', ',', '人民币', 'RMB', 'CNY'):
        text = text.replace(token, '')
    try:
        return int(float(text))
    except (ValueError, OverflowError) as e:
        # ValueError: not a number (or NaN); OverflowError: infinity.
        print(e)
        return 0
def find_element_by_xpath(element, selector):
    """Return the first match for an XPath expression under ``element``, or ``None``.

    Args:
        element: Object exposing Selenium's ``find_element(by, value)`` method
            (presumably a WebDriver or WebElement; confirm against callers).
        selector: XPath expression string.

    Returns:
        Whatever ``element.find_element`` returns, or ``None`` when the lookup
        raises any ``Exception`` (this includes ``element`` being ``None``).
    """
    try:
        # 'xpath' is the locator-strategy string behind By.XPATH. The generic
        # find_element() exists in Selenium 3 and 4 alike, whereas
        # find_element_by_xpath() was removed in Selenium 4.3; a bare
        # ``except`` would turn that AttributeError into a silent None.
        return element.find_element('xpath', selector)
    except Exception:
        # Best-effort lookup: "not found" is an expected outcome here.
        # KeyboardInterrupt / SystemExit are deliberately allowed through.
        return None
def find_elements_by_xpath(element, selector):
    """Return every match for an XPath expression under ``element``, or ``[]``.

    Args:
        element: Object exposing Selenium's ``find_elements(by, value)`` method
            (presumably a WebDriver or WebElement; confirm against callers).
        selector: XPath expression string.

    Returns:
        Whatever ``element.find_elements`` returns, or an empty list when the
        lookup raises any ``Exception`` (this includes ``element`` being
        ``None``).
    """
    try:
        # 'xpath' is the locator-strategy string behind By.XPATH. The generic
        # find_elements() exists in Selenium 3 and 4 alike, whereas
        # find_elements_by_xpath() was removed in Selenium 4.3; a bare
        # ``except`` would turn that AttributeError into a silent [].
        return element.find_elements('xpath', selector)
    except Exception:
        # Best-effort lookup; KeyboardInterrupt / SystemExit are deliberately
        # allowed through.
        return []